aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/00-INDEX10
-rw-r--r--Documentation/ABI/stable/sysfs-devices-node2
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-hugepages2
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-ksm2
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-slab4
-rw-r--r--Documentation/RCU/whatisRCU.txt2
-rw-r--r--Documentation/admin-guide/bcache.rst (renamed from Documentation/bcache.txt)0
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst (renamed from Documentation/cgroup-v2.txt)0
-rw-r--r--Documentation/admin-guide/index.rst3
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt169
-rw-r--r--Documentation/admin-guide/mm/concepts.rst222
-rw-r--r--Documentation/admin-guide/mm/hugetlbpage.rst (renamed from Documentation/vm/hugetlbpage.txt)263
-rw-r--r--Documentation/admin-guide/mm/idle_page_tracking.rst (renamed from Documentation/vm/idle_page_tracking.txt)56
-rw-r--r--Documentation/admin-guide/mm/index.rst36
-rw-r--r--Documentation/admin-guide/mm/ksm.rst189
-rw-r--r--Documentation/admin-guide/mm/numa_memory_policy.rst495
-rw-r--r--Documentation/admin-guide/mm/pagemap.rst (renamed from Documentation/vm/pagemap.txt)184
-rw-r--r--Documentation/admin-guide/mm/soft-dirty.rst (renamed from Documentation/vm/soft-dirty.txt)20
-rw-r--r--Documentation/admin-guide/mm/transhuge.rst418
-rw-r--r--Documentation/admin-guide/mm/userfaultfd.rst (renamed from Documentation/vm/userfaultfd.txt)66
-rw-r--r--Documentation/admin-guide/ramoops.rst2
-rw-r--r--Documentation/arm/Marvell/README15
-rw-r--r--Documentation/block/cmdline-partition.txt21
-rw-r--r--Documentation/core-api/atomic_ops.rst13
-rw-r--r--Documentation/core-api/cachetlb.rst (renamed from Documentation/cachetlb.txt)0
-rw-r--r--Documentation/core-api/circular-buffers.rst (renamed from Documentation/circular-buffers.txt)0
-rw-r--r--Documentation/core-api/gfp_mask-from-fs-io.rst66
-rw-r--r--Documentation/core-api/index.rst3
-rw-r--r--Documentation/core-api/kernel-api.rst60
-rw-r--r--Documentation/core-api/refcount-vs-atomic.rst2
-rw-r--r--Documentation/crypto/index.rst1
-rw-r--r--Documentation/dev-tools/kasan.rst2
-rw-r--r--Documentation/dev-tools/kselftest.rst5
-rw-r--r--Documentation/devicetree/bindings/hwmon/gpio-fan.txt2
-rw-r--r--Documentation/devicetree/bindings/hwmon/ltc2990.txt36
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt11
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt17
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt3
-rw-r--r--Documentation/driver-api/clk.rst (renamed from Documentation/clk.txt)0
-rw-r--r--Documentation/driver-api/device_connection.rst2
-rw-r--r--Documentation/driver-api/gpio/driver.rst6
-rw-r--r--Documentation/driver-api/index.rst2
-rw-r--r--Documentation/driver-api/uio-howto.rst3
-rw-r--r--Documentation/features/core/cBPF-JIT/arch-support.txt (renamed from Documentation/features/lib/strncasecmp/arch-support.txt)14
-rw-r--r--Documentation/features/core/eBPF-JIT/arch-support.txt (renamed from Documentation/features/core/BPF-JIT/arch-support.txt)8
-rw-r--r--Documentation/features/core/generic-idle-thread/arch-support.txt4
-rw-r--r--Documentation/features/core/jump-labels/arch-support.txt2
-rw-r--r--Documentation/features/core/tracehook/arch-support.txt2
-rw-r--r--Documentation/features/debug/KASAN/arch-support.txt4
-rw-r--r--Documentation/features/debug/gcov-profile-all/arch-support.txt2
-rw-r--r--Documentation/features/debug/kgdb/arch-support.txt4
-rw-r--r--Documentation/features/debug/kprobes-on-ftrace/arch-support.txt2
-rw-r--r--Documentation/features/debug/kprobes/arch-support.txt4
-rw-r--r--Documentation/features/debug/kretprobes/arch-support.txt4
-rw-r--r--Documentation/features/debug/optprobes/arch-support.txt4
-rw-r--r--Documentation/features/debug/stackprotector/arch-support.txt2
-rw-r--r--Documentation/features/debug/uprobes/arch-support.txt6
-rw-r--r--Documentation/features/debug/user-ret-profiler/arch-support.txt2
-rw-r--r--Documentation/features/io/dma-api-debug/arch-support.txt31
-rw-r--r--Documentation/features/io/dma-contiguous/arch-support.txt4
-rw-r--r--Documentation/features/io/sg-chain/arch-support.txt2
-rw-r--r--Documentation/features/locking/cmpxchg-local/arch-support.txt4
-rw-r--r--Documentation/features/locking/lockdep/arch-support.txt4
-rw-r--r--Documentation/features/locking/queued-rwlocks/arch-support.txt10
-rw-r--r--Documentation/features/locking/queued-spinlocks/arch-support.txt8
-rw-r--r--Documentation/features/locking/rwsem-optimized/arch-support.txt10
-rw-r--r--Documentation/features/perf/kprobes-event/arch-support.txt6
-rw-r--r--Documentation/features/perf/perf-regs/arch-support.txt4
-rw-r--r--Documentation/features/perf/perf-stackdump/arch-support.txt4
-rw-r--r--Documentation/features/sched/membarrier-sync-core/arch-support.txt2
-rw-r--r--Documentation/features/sched/numa-balancing/arch-support.txt6
-rwxr-xr-xDocumentation/features/scripts/features-refresh.sh98
-rw-r--r--Documentation/features/seccomp/seccomp-filter/arch-support.txt6
-rw-r--r--Documentation/features/time/arch-tick-broadcast/arch-support.txt4
-rw-r--r--Documentation/features/time/clockevents/arch-support.txt4
-rw-r--r--Documentation/features/time/context-tracking/arch-support.txt2
-rw-r--r--Documentation/features/time/irq-time-acct/arch-support.txt4
-rw-r--r--Documentation/features/time/modern-timekeeping/arch-support.txt2
-rw-r--r--Documentation/features/time/virt-cpuacct/arch-support.txt2
-rw-r--r--Documentation/features/vm/ELF-ASLR/arch-support.txt4
-rw-r--r--Documentation/features/vm/PG_uncached/arch-support.txt2
-rw-r--r--Documentation/features/vm/THP/arch-support.txt2
-rw-r--r--Documentation/features/vm/TLB/arch-support.txt2
-rw-r--r--Documentation/features/vm/huge-vmap/arch-support.txt2
-rw-r--r--Documentation/features/vm/ioremap_prot/arch-support.txt2
-rw-r--r--Documentation/features/vm/numa-memblock/arch-support.txt4
-rw-r--r--Documentation/features/vm/pte_special/arch-support.txt2
-rw-r--r--Documentation/filesystems/Locking9
-rw-r--r--Documentation/filesystems/proc.txt8
-rw-r--r--Documentation/filesystems/tmpfs.txt5
-rw-r--r--Documentation/filesystems/vfs.txt15
-rw-r--r--Documentation/hwmon/hwmon-kernel-api.txt3
-rw-r--r--Documentation/hwmon/ltc299024
-rw-r--r--Documentation/index.rst3
-rw-r--r--Documentation/ioctl/botching-up-ioctls.txt4
-rw-r--r--Documentation/memory-barriers.txt19
-rw-r--r--Documentation/process/2.Process.rst72
-rw-r--r--Documentation/process/5.Posting.rst16
-rw-r--r--Documentation/process/index.rst1
-rw-r--r--Documentation/process/maintainer-pgp-guide.rst39
-rw-r--r--Documentation/process/submitting-patches.rst2
-rw-r--r--Documentation/scheduler/sched-deadline.txt25
-rw-r--r--Documentation/security/index.rst2
-rw-r--r--Documentation/sound/alsa-configuration.rst4
-rw-r--r--Documentation/sound/soc/codec.rst2
-rw-r--r--Documentation/sound/soc/platform.rst2
-rw-r--r--Documentation/sysctl/vm.txt6
-rw-r--r--Documentation/trace/coresight.txt103
-rw-r--r--Documentation/trace/ftrace-uses.rst4
-rw-r--r--Documentation/trace/ftrace.rst7
-rw-r--r--Documentation/translations/ko_KR/memory-barriers.txt54
-rw-r--r--Documentation/vfio.txt5
-rw-r--r--Documentation/vm/00-INDEX58
-rw-r--r--Documentation/vm/active_mm.rst91
-rw-r--r--Documentation/vm/active_mm.txt83
-rw-r--r--Documentation/vm/balance.rst (renamed from Documentation/vm/balance)15
-rw-r--r--Documentation/vm/cleancache.rst (renamed from Documentation/vm/cleancache.txt)105
-rw-r--r--Documentation/vm/conf.py10
-rw-r--r--Documentation/vm/frontswap.rst (renamed from Documentation/vm/frontswap.txt)59
-rw-r--r--Documentation/vm/highmem.rst (renamed from Documentation/vm/highmem.txt)87
-rw-r--r--Documentation/vm/hmm.rst (renamed from Documentation/vm/hmm.txt)78
-rw-r--r--Documentation/vm/hugetlbfs_reserv.rst (renamed from Documentation/vm/hugetlbfs_reserv.txt)220
-rw-r--r--Documentation/vm/hwpoison.rst (renamed from Documentation/vm/hwpoison.txt)141
-rw-r--r--Documentation/vm/index.rst50
-rw-r--r--Documentation/vm/ksm.rst87
-rw-r--r--Documentation/vm/ksm.txt178
-rw-r--r--Documentation/vm/mmu_notifier.rst99
-rw-r--r--Documentation/vm/mmu_notifier.txt93
-rw-r--r--Documentation/vm/numa.rst (renamed from Documentation/vm/numa)6
-rw-r--r--Documentation/vm/numa_memory_policy.txt452
-rw-r--r--Documentation/vm/overcommit-accounting80
-rw-r--r--Documentation/vm/overcommit-accounting.rst87
-rw-r--r--Documentation/vm/page_frags.rst (renamed from Documentation/vm/page_frags)5
-rw-r--r--Documentation/vm/page_migration.rst (renamed from Documentation/vm/page_migration)149
-rw-r--r--Documentation/vm/page_owner.rst (renamed from Documentation/vm/page_owner.txt)34
-rw-r--r--Documentation/vm/remap_file_pages.rst (renamed from Documentation/vm/remap_file_pages.txt)6
-rw-r--r--Documentation/vm/slub.rst361
-rw-r--r--Documentation/vm/slub.txt342
-rw-r--r--Documentation/vm/split_page_table_lock.rst (renamed from Documentation/vm/split_page_table_lock)12
-rw-r--r--Documentation/vm/swap_numa.rst (renamed from Documentation/vm/swap_numa.txt)55
-rw-r--r--Documentation/vm/transhuge.rst197
-rw-r--r--Documentation/vm/transhuge.txt527
-rw-r--r--Documentation/vm/unevictable-lru.rst (renamed from Documentation/vm/unevictable-lru.txt)117
-rw-r--r--Documentation/vm/z3fold.rst (renamed from Documentation/vm/z3fold.txt)6
-rw-r--r--Documentation/vm/zsmalloc.rst (renamed from Documentation/vm/zsmalloc.txt)60
-rw-r--r--Documentation/vm/zswap.rst (renamed from Documentation/vm/zswap.txt)71
-rw-r--r--Documentation/x86/intel_rdt_ui.txt75
-rw-r--r--Documentation/x86/x86_64/boot-options.txt13
-rw-r--r--LICENSES/exceptions/Linux-syscall-note2
-rw-r--r--LICENSES/other/Apache-2.0183
-rw-r--r--LICENSES/other/CC-BY-SA-4.0397
-rw-r--r--LICENSES/other/CDDL-1.0364
-rw-r--r--LICENSES/other/Linux-OpenIB26
-rw-r--r--LICENSES/other/X1137
-rw-r--r--LICENSES/preferred/GPL-2.06
-rw-r--r--MAINTAINERS15
-rw-r--r--arch/Kconfig18
-rw-r--r--arch/alpha/Kconfig16
-rw-r--r--arch/alpha/include/asm/Kbuild1
-rw-r--r--arch/alpha/include/asm/pci.h5
-rw-r--r--arch/alpha/include/uapi/asm/Kbuild4
-rw-r--r--arch/alpha/include/uapi/asm/ipcbuf.h2
-rw-r--r--arch/alpha/include/uapi/asm/msgbuf.h28
-rw-r--r--arch/alpha/include/uapi/asm/sembuf.h23
-rw-r--r--arch/alpha/include/uapi/asm/shmbuf.h39
-rw-r--r--arch/alpha/include/uapi/asm/siginfo.h14
-rw-r--r--arch/alpha/kernel/osf_sys.c11
-rw-r--r--arch/alpha/kernel/signal.c20
-rw-r--r--arch/alpha/kernel/traps.c79
-rw-r--r--arch/alpha/mm/fault.c13
-rw-r--r--arch/arc/Kconfig11
-rw-r--r--arch/arc/include/asm/Kbuild2
-rw-r--r--arch/arc/include/asm/dma-mapping.h21
-rw-r--r--arch/arc/include/asm/pci.h6
-rw-r--r--arch/arc/mm/dma.c162
-rw-r--r--arch/arc/mm/fault.c2
-rw-r--r--arch/arm/Kconfig15
-rw-r--r--arch/arm/boot/dts/stm32mp157-pinctrl.dtsi4
-rw-r--r--arch/arm/boot/dts/stm32mp157c.dtsi7
-rw-r--r--arch/arm/include/asm/Kbuild1
-rw-r--r--arch/arm/include/asm/pci.h7
-rw-r--r--arch/arm/kernel/ptrace.c1
-rw-r--r--arch/arm/kernel/setup.c2
-rw-r--r--arch/arm/kernel/swp_emulate.c1
-rw-r--r--arch/arm/kernel/traps.c5
-rw-r--r--arch/arm/mach-axxia/Kconfig1
-rw-r--r--arch/arm/mach-bcm/Kconfig1
-rw-r--r--arch/arm/mach-exynos/Kconfig1
-rw-r--r--arch/arm/mach-highbank/Kconfig1
-rw-r--r--arch/arm/mach-rockchip/Kconfig1
-rw-r--r--arch/arm/mach-shmobile/Kconfig1
-rw-r--r--arch/arm/mach-tegra/Kconfig1
-rw-r--r--arch/arm/mm/Kconfig7
-rw-r--r--arch/arm/mm/alignment.c1
-rw-r--r--arch/arm/mm/dma-mapping-nommu.c9
-rw-r--r--arch/arm/mm/dma-mapping.c9
-rw-r--r--arch/arm/mm/fault.c4
-rw-r--r--arch/arm/vfp/vfpmodule.c3
-rw-r--r--arch/arm64/Kconfig22
-rw-r--r--arch/arm64/include/asm/compat.h43
-rw-r--r--arch/arm64/include/asm/pci.h5
-rw-r--r--arch/arm64/include/asm/spinlock.h5
-rw-r--r--arch/arm64/include/asm/stat.h1
-rw-r--r--arch/arm64/kernel/fpsimd.c2
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c1
-rw-r--r--arch/arm64/kernel/perf_regs.c2
-rw-r--r--arch/arm64/kernel/sys_compat.c1
-rw-r--r--arch/arm64/kernel/traps.c1
-rw-r--r--arch/arm64/mm/dma-mapping.c10
-rw-r--r--arch/arm64/mm/fault.c18
-rw-r--r--arch/c6x/Kconfig4
-rw-r--r--arch/c6x/include/asm/Kbuild2
-rw-r--r--arch/c6x/include/asm/dma-mapping.h28
-rw-r--r--arch/c6x/include/asm/setup.h2
-rw-r--r--arch/c6x/kernel/Makefile2
-rw-r--r--arch/c6x/kernel/dma.c149
-rw-r--r--arch/c6x/kernel/traps.c9
-rw-r--r--arch/c6x/mm/dma-coherent.c40
-rw-r--r--arch/h8300/include/asm/Kbuild1
-rw-r--r--arch/h8300/include/asm/pci.h2
-rw-r--r--arch/hexagon/Kconfig4
-rw-r--r--arch/hexagon/include/asm/Kbuild1
-rw-r--r--arch/hexagon/kernel/dma.c1
-rw-r--r--arch/hexagon/kernel/traps.c9
-rw-r--r--arch/hexagon/mm/vm_fault.c20
-rw-r--r--arch/ia64/Kconfig25
-rw-r--r--arch/ia64/hp/common/sba_iommu.c3
-rw-r--r--arch/ia64/include/asm/Kbuild1
-rw-r--r--arch/ia64/include/asm/hardirq.h2
-rw-r--r--arch/ia64/include/asm/pci.h17
-rw-r--r--arch/ia64/include/uapi/asm/Kbuild4
-rw-r--r--arch/ia64/include/uapi/asm/ipcbuf.h2
-rw-r--r--arch/ia64/include/uapi/asm/msgbuf.h28
-rw-r--r--arch/ia64/include/uapi/asm/sembuf.h23
-rw-r--r--arch/ia64/include/uapi/asm/shmbuf.h39
-rw-r--r--arch/ia64/include/uapi/asm/siginfo.h7
-rw-r--r--arch/ia64/kernel/brl_emu.c1
-rw-r--r--arch/ia64/kernel/dma-mapping.c10
-rw-r--r--arch/ia64/kernel/setup.c12
-rw-r--r--arch/ia64/kernel/signal.c2
-rw-r--r--arch/ia64/kernel/traps.c31
-rw-r--r--arch/ia64/kernel/unaligned.c1
-rw-r--r--arch/ia64/mm/fault.c4
-rw-r--r--arch/ia64/sn/kernel/io_common.c5
-rw-r--r--arch/m68k/68000/timers.c4
-rw-r--r--arch/m68k/apollo/config.c8
-rw-r--r--arch/m68k/configs/amiga_defconfig20
-rw-r--r--arch/m68k/configs/apollo_defconfig20
-rw-r--r--arch/m68k/configs/atari_defconfig20
-rw-r--r--arch/m68k/configs/bvme6000_defconfig20
-rw-r--r--arch/m68k/configs/hp300_defconfig20
-rw-r--r--arch/m68k/configs/mac_defconfig20
-rw-r--r--arch/m68k/configs/multi_defconfig20
-rw-r--r--arch/m68k/configs/mvme147_defconfig20
-rw-r--r--arch/m68k/configs/mvme16x_defconfig20
-rw-r--r--arch/m68k/configs/q40_defconfig20
-rw-r--r--arch/m68k/configs/sun3_defconfig20
-rw-r--r--arch/m68k/configs/sun3x_defconfig20
-rw-r--r--arch/m68k/include/asm/Kbuild1
-rw-r--r--arch/m68k/include/asm/delay.h11
-rw-r--r--arch/m68k/include/asm/pci.h6
-rw-r--r--arch/m68k/include/asm/uaccess_mm.h16
-rw-r--r--arch/m68k/kernel/dma.c10
-rw-r--r--arch/m68k/kernel/signal.c23
-rw-r--r--arch/m68k/kernel/time.c25
-rw-r--r--arch/m68k/kernel/traps.c58
-rw-r--r--arch/m68k/mac/config.c2
-rw-r--r--arch/m68k/mm/fault.c25
-rw-r--r--arch/m68k/mm/kmap.c3
-rw-r--r--arch/m68k/mvme147/config.c4
-rw-r--r--arch/m68k/mvme16x/config.c4
-rw-r--r--arch/m68k/sun3/intersil.c8
-rw-r--r--arch/m68k/sun3x/time.c8
-rw-r--r--arch/microblaze/Kconfig1
-rw-r--r--arch/microblaze/include/asm/Kbuild1
-rw-r--r--arch/microblaze/include/asm/pci.h6
-rw-r--r--arch/microblaze/kernel/dma.c11
-rw-r--r--arch/microblaze/kernel/exceptions.c8
-rw-r--r--arch/microblaze/mm/fault.c12
-rw-r--r--arch/mips/Kconfig24
-rw-r--r--arch/mips/cavium-octeon/Kconfig12
-rw-r--r--arch/mips/include/asm/compat.h51
-rw-r--r--arch/mips/include/asm/pci.h7
-rw-r--r--arch/mips/include/uapi/asm/msgbuf.h57
-rw-r--r--arch/mips/include/uapi/asm/sembuf.h15
-rw-r--r--arch/mips/include/uapi/asm/shmbuf.h23
-rw-r--r--arch/mips/kernel/signal32.c2
-rw-r--r--arch/mips/kernel/traps.c65
-rw-r--r--arch/mips/loongson64/Kconfig15
-rw-r--r--arch/mips/mm/dma-default.c10
-rw-r--r--arch/mips/mm/fault.c18
-rw-r--r--arch/mips/netlogic/Kconfig6
-rw-r--r--arch/nds32/Kconfig3
-rw-r--r--arch/nds32/include/asm/Kbuild2
-rw-r--r--arch/nds32/include/asm/dma-mapping.h14
-rw-r--r--arch/nds32/kernel/dma.c196
-rw-r--r--arch/nds32/kernel/traps.c35
-rw-r--r--arch/nds32/mm/fault.c18
-rw-r--r--arch/nios2/include/asm/Kbuild1
-rw-r--r--arch/nios2/kernel/traps.c8
-rw-r--r--arch/openrisc/include/asm/Kbuild1
-rw-r--r--arch/openrisc/kernel/dma.c11
-rw-r--r--arch/openrisc/kernel/traps.c30
-rw-r--r--arch/openrisc/mm/fault.c18
-rw-r--r--arch/parisc/Kconfig8
-rw-r--r--arch/parisc/include/asm/compat.h43
-rw-r--r--arch/parisc/include/asm/hardirq.h8
-rw-r--r--arch/parisc/include/asm/pci.h23
-rw-r--r--arch/parisc/include/uapi/asm/msgbuf.h33
-rw-r--r--arch/parisc/include/uapi/asm/sembuf.h16
-rw-r--r--arch/parisc/include/uapi/asm/shmbuf.h19
-rw-r--r--arch/parisc/kernel/ptrace.c10
-rw-r--r--arch/parisc/kernel/setup.c5
-rw-r--r--arch/parisc/kernel/traps.c61
-rw-r--r--arch/parisc/kernel/unaligned.c15
-rw-r--r--arch/parisc/math-emu/driver.c8
-rw-r--r--arch/parisc/mm/fault.c54
-rw-r--r--arch/powerpc/Kconfig27
-rw-r--r--arch/powerpc/include/asm/compat.h43
-rw-r--r--arch/powerpc/include/asm/hardirq.h7
-rw-r--r--arch/powerpc/include/asm/pci.h18
-rw-r--r--arch/powerpc/include/uapi/asm/msgbuf.h18
-rw-r--r--arch/powerpc/include/uapi/asm/sembuf.h14
-rw-r--r--arch/powerpc/include/uapi/asm/shmbuf.h19
-rw-r--r--arch/powerpc/include/uapi/asm/siginfo.h15
-rw-r--r--arch/powerpc/kernel/asm-offsets.c2
-rw-r--r--arch/powerpc/kernel/dma.c3
-rw-r--r--arch/powerpc/kernel/process.c1
-rw-r--r--arch/powerpc/kernel/traps.c13
-rw-r--r--arch/powerpc/mm/fault.c1
-rw-r--r--arch/powerpc/oprofile/backtrace.c1
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype1
-rw-r--r--arch/powerpc/platforms/cell/spufs/fault.c2
-rw-r--r--arch/riscv/Kconfig44
-rw-r--r--arch/riscv/include/asm/dma-mapping.h15
-rw-r--r--arch/riscv/include/asm/pci.h3
-rw-r--r--arch/riscv/kernel/setup.c2
-rw-r--r--arch/riscv/kernel/traps.c16
-rw-r--r--arch/s390/Kconfig17
-rw-r--r--arch/s390/hypfs/hypfs_sprp.c1
-rw-r--r--arch/s390/include/asm/compat.h43
-rw-r--r--arch/s390/include/asm/elf.h4
-rw-r--r--arch/s390/include/asm/hardirq.h2
-rw-r--r--arch/s390/include/asm/pci.h2
-rw-r--r--arch/s390/include/uapi/asm/Kbuild3
-rw-r--r--arch/s390/include/uapi/asm/msgbuf.h38
-rw-r--r--arch/s390/include/uapi/asm/sembuf.h30
-rw-r--r--arch/s390/include/uapi/asm/shmbuf.h49
-rw-r--r--arch/s390/kernel/traps.c29
-rw-r--r--arch/s390/kvm/priv.c1
-rw-r--r--arch/s390/mm/fault.c21
-rw-r--r--arch/s390/pci/pci_clp.c1
-rw-r--r--arch/s390/pci/pci_dma.c11
-rw-r--r--arch/sh/Kconfig10
-rw-r--r--arch/sh/include/asm/Kbuild1
-rw-r--r--arch/sh/include/asm/pci.h6
-rw-r--r--arch/sh/kernel/dma-nommu.c1
-rw-r--r--arch/sh/kernel/hw_breakpoint.c9
-rw-r--r--arch/sh/kernel/irq.c2
-rw-r--r--arch/sh/kernel/traps_32.c20
-rw-r--r--arch/sh/math-emu/math.c8
-rw-r--r--arch/sh/mm/consistent.c9
-rw-r--r--arch/sh/mm/fault.c9
-rw-r--r--arch/sparc/Kconfig18
-rw-r--r--arch/sparc/include/asm/compat.h47
-rw-r--r--arch/sparc/include/asm/hardirq_64.h5
-rw-r--r--arch/sparc/include/asm/iommu-common.h (renamed from include/linux/iommu-common.h)0
-rw-r--r--arch/sparc/include/asm/iommu_64.h2
-rw-r--r--arch/sparc/include/asm/pci_32.h4
-rw-r--r--arch/sparc/include/asm/pci_64.h6
-rw-r--r--arch/sparc/include/uapi/asm/msgbuf.h22
-rw-r--r--arch/sparc/include/uapi/asm/sembuf.h16
-rw-r--r--arch/sparc/include/uapi/asm/shmbuf.h21
-rw-r--r--arch/sparc/include/uapi/asm/siginfo.h7
-rw-r--r--arch/sparc/kernel/Makefile4
-rw-r--r--arch/sparc/kernel/dma.c13
-rw-r--r--arch/sparc/kernel/iommu-common.c (renamed from lib/iommu-common.c)5
-rw-r--r--arch/sparc/kernel/iommu.c2
-rw-r--r--arch/sparc/kernel/ldc.c2
-rw-r--r--arch/sparc/kernel/pci_sun4v.c2
-rw-r--r--arch/sparc/kernel/process_64.c9
-rw-r--r--arch/sparc/kernel/sys_sparc_32.c8
-rw-r--r--arch/sparc/kernel/sys_sparc_64.c8
-rw-r--r--arch/sparc/kernel/traps_32.c104
-rw-r--r--arch/sparc/kernel/traps_64.c131
-rw-r--r--arch/sparc/kernel/unaligned_32.c11
-rw-r--r--arch/sparc/mm/fault_32.c12
-rw-r--r--arch/sparc/mm/fault_64.c8
-rw-r--r--arch/um/include/asm/Kbuild1
-rw-r--r--arch/um/kernel/ptrace.c13
-rw-r--r--arch/um/kernel/trap.c62
-rw-r--r--arch/unicore32/Kconfig5
-rw-r--r--arch/unicore32/include/asm/Kbuild1
-rw-r--r--arch/unicore32/kernel/fpu-ucf64.c8
-rw-r--r--arch/unicore32/mm/Kconfig11
-rw-r--r--arch/unicore32/mm/fault.c3
-rw-r--r--arch/x86/Kconfig37
-rw-r--r--arch/x86/boot/compressed/cmdline.c2
-rw-r--r--arch/x86/boot/compressed/eboot.c114
-rw-r--r--arch/x86/boot/compressed/head_64.S1
-rw-r--r--arch/x86/boot/compressed/kaslr.c4
-rw-r--r--arch/x86/boot/compressed/misc.h6
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c14
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/entry/vdso/Makefile11
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c2
-rw-r--r--arch/x86/events/amd/ibs.c2
-rw-r--r--arch/x86/events/amd/uncore.c21
-rw-r--r--arch/x86/events/core.c2
-rw-r--r--arch/x86/events/intel/pt.c4
-rw-r--r--arch/x86/events/intel/uncore.c76
-rw-r--r--arch/x86/events/intel/uncore.h127
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c2
-rw-r--r--arch/x86/events/intel/uncore_snb.c101
-rw-r--r--arch/x86/events/intel/uncore_snbep.c82
-rw-r--r--arch/x86/hyperv/Makefile3
-rw-r--r--arch/x86/hyperv/hv_apic.c256
-rw-r--r--arch/x86/hyperv/hv_init.c32
-rw-r--r--arch/x86/hyperv/mmu.c75
-rw-r--r--arch/x86/include/asm/cacheinfo.h7
-rw-r--r--arch/x86/include/asm/compat.h43
-rw-r--r--arch/x86/include/asm/dma-mapping.h5
-rw-r--r--arch/x86/include/asm/ftrace.h2
-rw-r--r--arch/x86/include/asm/hardirq.h8
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h30
-rw-r--r--arch/x86/include/asm/intel_mid_vrtc.h4
-rw-r--r--arch/x86/include/asm/io.h8
-rw-r--r--arch/x86/include/asm/mc146818rtc.h4
-rw-r--r--arch/x86/include/asm/mshyperv.h44
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h4
-rw-r--r--arch/x86/include/asm/pci.h3
-rw-r--r--arch/x86/include/asm/pgalloc.h4
-rw-r--r--arch/x86/include/asm/pgtable.h12
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h2
-rw-r--r--arch/x86/include/asm/pgtable_64.h2
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h25
-rw-r--r--arch/x86/include/asm/processor.h9
-rw-r--r--arch/x86/include/asm/pvclock.h2
-rw-r--r--arch/x86/include/asm/qspinlock.h21
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h3
-rw-r--r--arch/x86/include/asm/smp.h1
-rw-r--r--arch/x86/include/asm/sparsemem.h4
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/string_64.h10
-rw-r--r--arch/x86/include/asm/uaccess_64.h14
-rw-r--r--arch/x86/include/asm/x86_init.h6
-rw-r--r--arch/x86/include/uapi/asm/sembuf.h11
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c36
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c (renamed from arch/x86/kernel/cpu/intel_cacheinfo.c)46
-rw-r--r--arch/x86/kernel/cpu/centaur.c53
-rw-r--r--arch/x86/kernel/cpu/common.c35
-rw-r--r--arch/x86/kernel/cpu/cpu.h10
-rw-r--r--arch/x86/kernel/cpu/intel.c34
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c50
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h18
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c24
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_monitor.c170
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c33
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c18
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c15
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c (renamed from arch/x86/kernel/cpu/mtrr/main.c)37
-rw-r--r--arch/x86/kernel/cpu/topology.c8
-rw-r--r--arch/x86/kernel/dumpstack.c144
-rw-r--r--arch/x86/kernel/e820.c32
-rw-r--r--arch/x86/kernel/early-quirks.c8
-rw-r--r--arch/x86/kernel/head64.c25
-rw-r--r--arch/x86/kernel/hpet.c5
-rw-r--r--arch/x86/kernel/jailhouse.c2
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kernel/machine_kexec_64.c3
-rw-r--r--arch/x86/kernel/pci-dma.c58
-rw-r--r--arch/x86/kernel/perf_regs.c10
-rw-r--r--arch/x86/kernel/process_32.c8
-rw-r--r--arch/x86/kernel/ptrace.c2
-rw-r--r--arch/x86/kernel/pvclock.c15
-rw-r--r--arch/x86/kernel/rtc.c10
-rw-r--r--arch/x86/kernel/signal_compat.c2
-rw-r--r--arch/x86/kernel/smpboot.c7
-rw-r--r--arch/x86/kernel/sys_x86_64.c2
-rw-r--r--arch/x86/kernel/traps.c3
-rw-r--r--arch/x86/kernel/umip.c1
-rw-r--r--arch/x86/kernel/uprobes.c4
-rw-r--r--arch/x86/kernel/vmlinux.lds.S4
-rw-r--r--arch/x86/kvm/mmu.c1
-rw-r--r--arch/x86/lib/memcpy_64.S102
-rw-r--r--arch/x86/lib/usercopy_64.c30
-rw-r--r--arch/x86/mm/dump_pagetables.c6
-rw-r--r--arch/x86/mm/fault.c12
-rw-r--r--arch/x86/mm/ident_map.c2
-rw-r--r--arch/x86/mm/init_64.c8
-rw-r--r--arch/x86/mm/kasan_init_64.c14
-rw-r--r--arch/x86/mm/kaslr.c8
-rw-r--r--arch/x86/mm/numa.c22
-rw-r--r--arch/x86/mm/tlb.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c233
-rw-r--r--arch/x86/platform/efi/efi_64.c2
-rw-r--r--arch/x86/platform/intel-mid/intel_mid_vrtc.c12
-rw-r--r--arch/x86/power/hibernate_64.c2
-rw-r--r--arch/x86/um/vdso/Makefile4
-rw-r--r--arch/x86/xen/efi.c57
-rw-r--r--arch/x86/xen/time.c10
-rw-r--r--arch/xtensa/Kconfig1
-rw-r--r--arch/xtensa/include/asm/Kbuild1
-rw-r--r--arch/xtensa/include/asm/pci.h2
-rw-r--r--arch/xtensa/include/uapi/asm/msgbuf.h25
-rw-r--r--arch/xtensa/include/uapi/asm/sembuf.h17
-rw-r--r--arch/xtensa/include/uapi/asm/shmbuf.h37
-rw-r--r--arch/xtensa/kernel/pci-dma.c9
-rw-r--r--arch/xtensa/kernel/traps.c9
-rw-r--r--arch/xtensa/mm/fault.c18
-rw-r--r--crypto/af_alg.c14
-rw-r--r--crypto/algif_aead.c4
-rw-r--r--crypto/algif_hash.c2
-rw-r--r--crypto/algif_rng.c1
-rw-r--r--crypto/algif_skcipher.c4
-rw-r--r--drivers/amba/bus.c5
-rw-r--r--drivers/base/dma-mapping.c31
-rw-r--r--drivers/base/platform-msi.c3
-rw-r--r--drivers/base/platform.c18
-rw-r--r--drivers/base/regmap/regmap-mmio.c3
-rw-r--r--drivers/base/regmap/regmap-slimbus.c2
-rw-r--r--drivers/bcma/main.c2
-rw-r--r--drivers/bus/fsl-mc/fsl-mc-msi.c2
-rw-r--r--drivers/char/random.c29
-rw-r--r--drivers/clocksource/Kconfig4
-rw-r--r--drivers/clocksource/arc_timer.c14
-rw-r--r--drivers/clocksource/mips-gic-timer.c18
-rw-r--r--drivers/clocksource/mxs_timer.c28
-rw-r--r--drivers/clocksource/timer-imx-gpt.c28
-rw-r--r--drivers/clocksource/timer-imx-tpm.c13
-rw-r--r--drivers/dma/qcom/hidma_mgmt.c2
-rw-r--r--drivers/firmware/efi/Kconfig5
-rw-r--r--drivers/firmware/efi/Makefile1
-rw-r--r--drivers/firmware/efi/capsule-loader.c14
-rw-r--r--drivers/firmware/efi/cper-arm.c6
-rw-r--r--drivers/firmware/efi/cper-x86.c356
-rw-r--r--drivers/firmware/efi/cper.c16
-rw-r--r--drivers/firmware/efi/libstub/secureboot.c3
-rw-r--r--drivers/firmware/efi/libstub/tpm.c2
-rw-r--r--drivers/gpu/host1x/bus.c9
-rw-r--r--drivers/hwmon/Kconfig7
-rw-r--r--drivers/hwmon/asus_atk0110.c117
-rw-r--r--drivers/hwmon/fschmd.c2
-rw-r--r--drivers/hwmon/hwmon.c3
-rw-r--r--drivers/hwmon/k10temp.c60
-rw-r--r--drivers/hwmon/ltc2990.c212
-rw-r--r--drivers/hwmon/mc13783-adc.c60
-rw-r--r--drivers/ide/ide-dma.c2
-rw-r--r--drivers/ide/ide-lib.c26
-rw-r--r--drivers/ide/ide-probe.c6
-rw-r--r--drivers/iommu/Kconfig1
-rw-r--r--drivers/irqchip/Makefile2
-rw-r--r--drivers/irqchip/irq-gic-v3-mbi.c331
-rw-r--r--drivers/irqchip/irq-gic-v3.c7
-rw-r--r--drivers/irqchip/irq-meson-gpio.c5
-rw-r--r--drivers/irqchip/irq-mvebu-gicp.c38
-rw-r--r--drivers/irqchip/irq-mvebu-gicp.h12
-rw-r--r--drivers/irqchip/irq-mvebu-icu.c33
-rw-r--r--drivers/irqchip/irq-stm32-exti.c683
-rw-r--r--drivers/isdn/mISDN/socket.c3
-rw-r--r--drivers/mfd/mc13xxx-core.c15
-rw-r--r--drivers/mtd/devices/Kconfig1
-rw-r--r--drivers/mtd/devices/m25p80.c238
-rw-r--r--drivers/net/ethernet/sfc/efx.c5
-rw-r--r--drivers/net/ethernet/sfc/falcon/efx.c5
-rw-r--r--drivers/net/ppp/pppoe.c2
-rw-r--r--drivers/net/ppp/pptp.c1
-rw-r--r--drivers/nvdimm/claim.c3
-rw-r--r--drivers/nvdimm/pmem.c6
-rw-r--r--drivers/nvme/host/core.c2
-rw-r--r--drivers/of/device.c6
-rw-r--r--drivers/of/of_reserved_mem.c2
-rw-r--r--drivers/parisc/Kconfig5
-rw-r--r--drivers/parisc/ccio-dma.c2
-rw-r--r--drivers/parisc/sba_iommu.c2
-rw-r--r--drivers/pci/Kconfig4
-rw-r--r--drivers/pci/bus.c4
-rw-r--r--drivers/pci/msi.c3
-rw-r--r--drivers/pci/pci-driver.c33
-rw-r--r--drivers/pinctrl/stm32/pinctrl-stm32.c13
-rw-r--r--drivers/platform/chrome/Kconfig11
-rw-r--r--drivers/platform/chrome/Makefile1
-rw-r--r--drivers/platform/chrome/chromeos_laptop.c315
-rw-r--r--drivers/platform/chrome/chromeos_tbmc.c111
-rw-r--r--drivers/platform/chrome/cros_ec_lightbar.c21
-rw-r--r--drivers/platform/chrome/cros_ec_lpc.c13
-rw-r--r--drivers/platform/chrome/cros_ec_sysfs.c2
-rw-r--r--drivers/platform/chrome/cros_ec_vbc.c9
-rw-r--r--drivers/s390/block/dasd_ioctl.c1
-rw-r--r--drivers/s390/char/fs3270.c1
-rw-r--r--drivers/s390/char/sclp_ctl.c1
-rw-r--r--drivers/s390/char/vmcp.c1
-rw-r--r--drivers/s390/cio/chsc_sch.c1
-rw-r--r--drivers/s390/net/qeth_core_main.c2
-rw-r--r--drivers/scsi/scsi_lib.c24
-rw-r--r--drivers/spi/Kconfig27
-rw-r--r--drivers/spi/Makefile2
-rw-r--r--drivers/spi/internals.h43
-rw-r--r--drivers/spi/spi-bcm-qspi.c162
-rw-r--r--drivers/spi/spi-bcm53xx.c360
-rw-r--r--drivers/spi/spi-bcm53xx.h73
-rw-r--r--drivers/spi/spi-bcm63xx-hsspi.c25
-rw-r--r--drivers/spi/spi-cadence.c6
-rw-r--r--drivers/spi/spi-fsl-lpspi.c21
-rw-r--r--drivers/spi/spi-imx.c22
-rw-r--r--drivers/spi/spi-mem.c410
-rw-r--r--drivers/spi/spi-meson-spicc.c11
-rw-r--r--drivers/spi/spi-mpc52xx.c2
-rw-r--r--drivers/spi/spi-mxs.c48
-rw-r--r--drivers/spi/spi-omap2-mcspi.c111
-rw-r--r--drivers/spi/spi-pxa2xx-dma.c28
-rw-r--r--drivers/spi/spi-pxa2xx.c257
-rw-r--r--drivers/spi/spi-pxa2xx.h17
-rw-r--r--drivers/spi/spi-s3c64xx.c160
-rw-r--r--drivers/spi/spi-sh-msiof.c73
-rw-r--r--drivers/spi/spi-stm32.c2
-rw-r--r--drivers/spi/spi-ti-qspi.c87
-rw-r--r--drivers/spi/spi-zynqmp-gqspi.c92
-rw-r--r--drivers/spi/spi.c145
-rw-r--r--drivers/staging/comedi/drivers/serial2002.c4
-rw-r--r--drivers/staging/ipx/af_ipx.c2
-rw-r--r--drivers/staging/ncpfs/dir.c42
-rw-r--r--drivers/vfio/virqfd.c2
-rw-r--r--drivers/vhost/vhost.c2
-rw-r--r--drivers/w1/w1_io.c1
-rw-r--r--drivers/zorro/zorro.c3
-rw-r--r--fs/9p/vfs_inode.c35
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/adfs/dir.c24
-rw-r--r--fs/affs/super.c1
-rw-r--r--fs/aio.c731
-rw-r--r--fs/attr.c36
-rw-r--r--fs/bfs/dir.c43
-rw-r--r--fs/btrfs/btrfs_inode.h22
-rw-r--r--fs/btrfs/compression.c7
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/ctree.c123
-rw-r--r--fs/btrfs/ctree.h76
-rw-r--r--fs/btrfs/delayed-inode.c9
-rw-r--r--fs/btrfs/delayed-ref.c275
-rw-r--r--fs/btrfs/delayed-ref.h5
-rw-r--r--fs/btrfs/dev-replace.c150
-rw-r--r--fs/btrfs/disk-io.c391
-rw-r--r--fs/btrfs/extent-tree.c253
-rw-r--r--fs/btrfs/extent_io.c62
-rw-r--r--fs/btrfs/extent_io.h20
-rw-r--r--fs/btrfs/extent_map.c6
-rw-r--r--fs/btrfs/extent_map.h3
-rw-r--r--fs/btrfs/free-space-cache.c6
-rw-r--r--fs/btrfs/free-space-tree.c192
-rw-r--r--fs/btrfs/free-space-tree.h8
-rw-r--r--fs/btrfs/inode.c1349
-rw-r--r--fs/btrfs/ioctl.c1132
-rw-r--r--fs/btrfs/locking.c34
-rw-r--r--fs/btrfs/lzo.c76
-rw-r--r--fs/btrfs/ordered-data.c14
-rw-r--r--fs/btrfs/print-tree.c21
-rw-r--r--fs/btrfs/qgroup.c69
-rw-r--r--fs/btrfs/raid56.c38
-rw-r--r--fs/btrfs/relocation.c8
-rw-r--r--fs/btrfs/scrub.c1
-rw-r--r--fs/btrfs/send.c46
-rw-r--r--fs/btrfs/super.c7
-rw-r--r--fs/btrfs/sysfs.c52
-rw-r--r--fs/btrfs/sysfs.h4
-rw-r--r--fs/btrfs/tests/btrfs-tests.c4
-rw-r--r--fs/btrfs/tests/btrfs-tests.h6
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c56
-rw-r--r--fs/btrfs/tests/extent-io-tests.c75
-rw-r--r--fs/btrfs/tests/extent-map-tests.c90
-rw-r--r--fs/btrfs/tests/free-space-tests.c177
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c129
-rw-r--r--fs/btrfs/tests/inode-tests.c312
-rw-r--r--fs/btrfs/tests/qgroup-tests.c100
-rw-r--r--fs/btrfs/transaction.c15
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/tree-log.c28
-rw-r--r--fs/btrfs/uuid-tree.c10
-rw-r--r--fs/btrfs/volumes.c506
-rw-r--r--fs/btrfs/volumes.h24
-rw-r--r--fs/cifs/Makefile7
-rw-r--r--fs/cifs/cifs_debug.c43
-rw-r--r--fs/cifs/cifs_debug.h2
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifsfs.c48
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h47
-rw-r--r--fs/cifs/cifsproto.h16
-rw-r--r--fs/cifs/cifssmb.c23
-rw-r--r--fs/cifs/connect.c93
-rw-r--r--fs/cifs/dir.c40
-rw-r--r--fs/cifs/file.c75
-rw-r--r--fs/cifs/inode.c13
-rw-r--r--fs/cifs/misc.c7
-rw-r--r--fs/cifs/netmisc.c2
-rw-r--r--fs/cifs/readdir.c6
-rw-r--r--fs/cifs/smb2glob.h5
-rw-r--r--fs/cifs/smb2inode.c43
-rw-r--r--fs/cifs/smb2maperror.c11
-rw-r--r--fs/cifs/smb2misc.c126
-rw-r--r--fs/cifs/smb2ops.c314
-rw-r--r--fs/cifs/smb2pdu.c340
-rw-r--r--fs/cifs/smb2pdu.h84
-rw-r--r--fs/cifs/smb2proto.h9
-rw-r--r--fs/cifs/smb2transport.c10
-rw-r--r--fs/cifs/trace.c18
-rw-r--r--fs/cifs/trace.h429
-rw-r--r--fs/cifs/transport.c4
-rw-r--r--fs/cramfs/inode.c5
-rw-r--r--fs/dax.c2
-rw-r--r--fs/dlm/lowcomms.c16
-rw-r--r--fs/eventfd.c15
-rw-r--r--fs/eventpoll.c5
-rw-r--r--fs/fcntl.c15
-rw-r--r--fs/freevxfs/vxfs_lookup.c8
-rw-r--r--fs/gfs2/aops.c69
-rw-r--r--fs/gfs2/bmap.c428
-rw-r--r--fs/gfs2/bmap.h6
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/log.h7
-rw-r--r--fs/gfs2/ops_fstype.c19
-rw-r--r--fs/gfs2/quota.c5
-rw-r--r--fs/gfs2/rgrp.c2
-rw-r--r--fs/gfs2/trans.c27
-rw-r--r--fs/hfs/dir.c20
-rw-r--r--fs/hfs/inode.c4
-rw-r--r--fs/hfsplus/dir.c3
-rw-r--r--fs/ioctl.c4
-rw-r--r--fs/minix/namei.c8
-rw-r--r--fs/namei.c16
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/omfs/dir.c7
-rw-r--r--fs/openpromfs/inode.c3
-rw-r--r--fs/orangefs/namei.c64
-rw-r--r--fs/pipe.c22
-rw-r--r--fs/proc/base.c136
-rw-r--r--fs/proc/fd.c138
-rw-r--r--fs/proc/generic.c3
-rw-r--r--fs/proc/internal.h4
-rw-r--r--fs/proc/namespaces.c24
-rw-r--r--fs/proc/proc_sysctl.c15
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/qnx4/namei.c8
-rw-r--r--fs/qnx6/namei.c8
-rw-r--r--fs/romfs/super.c36
-rw-r--r--fs/select.c85
-rw-r--r--fs/signalfd.c93
-rw-r--r--fs/sysv/namei.c9
-rw-r--r--fs/timerfd.c22
-rw-r--r--fs/ubifs/dir.c43
-rw-r--r--fs/xfs/xfs_iops.c16
-rw-r--r--include/asm-generic/atomic-long.h19
-rw-r--r--include/asm-generic/barrier.h27
-rw-r--r--include/asm-generic/compat.h3
-rw-r--r--include/asm-generic/dma-mapping.h9
-rw-r--r--include/asm-generic/pci.h8
-rw-r--r--include/asm-generic/qspinlock.h4
-rw-r--r--include/asm-generic/qspinlock_types.h32
-rw-r--r--include/crypto/if_alg.h3
-rw-r--r--include/linux/aio.h2
-rw-r--r--include/linux/atomic.h2
-rw-r--r--include/linux/compat.h19
-rw-r--r--include/linux/compat_time.h23
-rw-r--r--include/linux/cper.h4
-rw-r--r--include/linux/delayacct.h2
-rw-r--r--include/linux/device.h11
-rw-r--r--include/linux/dma-debug.h6
-rw-r--r--include/linux/dma-direct.h7
-rw-r--r--include/linux/dma-iommu.h1
-rw-r--r--include/linux/dma-mapping.h19
-rw-r--r--include/linux/dma-noncoherent.h47
-rw-r--r--include/linux/efi.h6
-rw-r--r--include/linux/fs.h4
-rw-r--r--include/linux/hmm.h2
-rw-r--r--include/linux/ide.h2
-rw-r--r--include/linux/interrupt.h13
-rw-r--r--include/linux/iommu-helper.h13
-rw-r--r--include/linux/irq.h1
-rw-r--r--include/linux/irq_cpustat.h10
-rw-r--r--include/linux/irq_sim.h13
-rw-r--r--include/linux/irqchip/arm-gic-v3.h1
-rw-r--r--include/linux/irqdomain.h8
-rw-r--r--include/linux/memremap.h4
-rw-r--r--include/linux/mfd/cros_ec.h2
-rw-r--r--include/linux/mfd/mc13xxx.h2
-rw-r--r--include/linux/mmu_notifier.h2
-rw-r--r--include/linux/msi.h2
-rw-r--r--include/linux/mutex.h3
-rw-r--r--include/linux/net.h1
-rw-r--r--include/linux/of_device.h8
-rw-r--r--include/linux/pci.h2
-rw-r--r--include/linux/perf_event.h10
-rw-r--r--include/linux/platform_device.h2
-rw-r--r--include/linux/poll.h14
-rw-r--r--include/linux/ptrace.h1
-rw-r--r--include/linux/rcupdate.h5
-rw-r--r--include/linux/rcutiny.h1
-rw-r--r--include/linux/rcutree.h2
-rw-r--r--include/linux/regmap.h19
-rw-r--r--include/linux/restart_block.h7
-rw-r--r--include/linux/sched.h9
-rw-r--r--include/linux/sched/mm.h42
-rw-r--r--include/linux/signal.h3
-rw-r--r--include/linux/skbuff.h3
-rw-r--r--include/linux/spi/spi-mem.h249
-rw-r--r--include/linux/spi/spi.h60
-rw-r--r--include/linux/spinlock.h18
-rw-r--r--include/linux/srcu.h36
-rw-r--r--include/linux/string.h4
-rw-r--r--include/linux/swait.h15
-rw-r--r--include/linux/swap.h2
-rw-r--r--include/linux/syscalls.h25
-rw-r--r--include/linux/time.h4
-rw-r--r--include/linux/time32.h18
-rw-r--r--include/linux/time64.h17
-rw-r--r--include/linux/timekeeping.h91
-rw-r--r--include/linux/timekeeping32.h70
-rw-r--r--include/linux/tracehook.h1
-rw-r--r--include/linux/uio.h15
-rw-r--r--include/net/bluetooth/bluetooth.h2
-rw-r--r--include/net/busy_poll.h15
-rw-r--r--include/net/iucv/af_iucv.h2
-rw-r--r--include/net/sctp/sctp.h3
-rw-r--r--include/net/sock.h2
-rw-r--r--include/net/tcp.h3
-rw-r--r--include/net/udp.h2
-rw-r--r--include/trace/events/btrfs.h323
-rw-r--r--include/trace/events/rcu.h13
-rw-r--r--include/uapi/asm-generic/msgbuf.h27
-rw-r--r--include/uapi/asm-generic/posix_types.h1
-rw-r--r--include/uapi/asm-generic/sembuf.h26
-rw-r--r--include/uapi/asm-generic/shmbuf.h41
-rw-r--r--include/uapi/asm-generic/siginfo.h3
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--include/uapi/linux/aio_abi.h12
-rw-r--r--include/uapi/linux/btrfs.h97
-rw-r--r--include/uapi/linux/signalfd.h6
-rw-r--r--include/uapi/linux/time.h7
-rw-r--r--include/uapi/linux/types.h4
-rw-r--r--ipc/mqueue.c86
-rw-r--r--ipc/msg.c20
-rw-r--r--ipc/sem.c27
-rw-r--r--ipc/shm.c14
-rw-r--r--ipc/syscall.c13
-rw-r--r--ipc/util.h4
-rw-r--r--kernel/compat.c52
-rw-r--r--kernel/delayacct.c17
-rw-r--r--kernel/events/core.c44
-rw-r--r--kernel/irq/irq_sim.c7
-rw-r--r--kernel/irq/msi.c33
-rw-r--r--kernel/locking/lockdep.c70
-rw-r--r--kernel/locking/mcs_spinlock.h10
-rw-r--r--kernel/locking/mutex.c3
-rw-r--r--kernel/locking/qspinlock.c247
-rw-r--r--kernel/locking/qspinlock_paravirt.h49
-rw-r--r--kernel/locking/qspinlock_stat.h9
-rw-r--r--kernel/locking/rwsem-xadd.c25
-rw-r--r--kernel/rcu/rcu.h12
-rw-r--r--kernel/rcu/rcu_segcblist.c18
-rw-r--r--kernel/rcu/rcu_segcblist.h2
-rw-r--r--kernel/rcu/rcuperf.c2
-rw-r--r--kernel/rcu/rcutorture.c15
-rw-r--r--kernel/rcu/srcutiny.c9
-rw-r--r--kernel/rcu/srcutree.c30
-rw-r--r--kernel/rcu/tree.c364
-rw-r--r--kernel/rcu/tree.h36
-rw-r--r--kernel/rcu/tree_exp.h235
-rw-r--r--kernel/rcu/tree_plugin.h98
-rw-r--r--kernel/rcu/update.c50
-rw-r--r--kernel/sched/core.c53
-rw-r--r--kernel/sched/cpufreq_schedutil.c17
-rw-r--r--kernel/sched/fair.c117
-rw-r--r--kernel/sched/sched.h6
-rw-r--r--kernel/signal.c181
-rw-r--r--kernel/softirq.c7
-rw-r--r--kernel/stop_machine.c24
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/time/clocksource.c35
-rw-r--r--kernel/time/hrtimer.c10
-rw-r--r--kernel/time/posix-stubs.c12
-rw-r--r--kernel/time/posix-timers.c24
-rw-r--r--kernel/time/tick-common.c3
-rw-r--r--kernel/time/time.c60
-rw-r--r--kernel/time/timekeeping.c73
-rw-r--r--kernel/time/timer.c14
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/trace_benchmark.c4
-rw-r--r--lib/Kconfig43
-rw-r--r--lib/Kconfig.debug19
-rw-r--r--lib/Makefile3
-rw-r--r--lib/dma-debug.c65
-rw-r--r--lib/dma-direct.c29
-rw-r--r--lib/dma-noncoherent.c102
-rw-r--r--lib/iommu-helper.c14
-rw-r--r--lib/iov_iter.c61
-rw-r--r--lib/swiotlb.c11
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/cleancache.c2
-rw-r--r--mm/frontswap.c2
-rw-r--r--mm/hmm.c2
-rw-r--r--mm/huge_memory.c4
-rw-r--r--mm/hugetlb.c4
-rw-r--r--mm/ksm.c23
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/rmap.c6
-rw-r--r--mm/util.c2
-rw-r--r--net/9p/trans_fd.c18
-rw-r--r--net/appletalk/ddp.c2
-rw-r--r--net/atm/common.c11
-rw-r--r--net/atm/common.h2
-rw-r--r--net/atm/pvc.c2
-rw-r--r--net/atm/svc.c2
-rw-r--r--net/ax25/af_ax25.c2
-rw-r--r--net/bluetooth/af_bluetooth.c7
-rw-r--r--net/bluetooth/bnep/sock.c1
-rw-r--r--net/bluetooth/cmtp/sock.c1
-rw-r--r--net/bluetooth/hci_sock.c2
-rw-r--r--net/bluetooth/hidp/sock.c1
-rw-r--r--net/bluetooth/l2cap_sock.c2
-rw-r--r--net/bluetooth/rfcomm/sock.c2
-rw-r--r--net/bluetooth/sco.c2
-rw-r--r--net/caif/caif_socket.c12
-rw-r--r--net/can/bcm.c2
-rw-r--r--net/can/raw.c2
-rw-r--r--net/core/datagram.c13
-rw-r--r--net/core/dev.c20
-rw-r--r--net/core/sock.c6
-rw-r--r--net/dccp/dccp.h3
-rw-r--r--net/dccp/ipv4.c2
-rw-r--r--net/dccp/ipv6.c2
-rw-r--r--net/dccp/proto.c13
-rw-r--r--net/decnet/af_decnet.c6
-rw-r--r--net/ieee802154/socket.c4
-rw-r--r--net/ipv4/af_inet.c8
-rw-r--r--net/ipv4/tcp.c23
-rw-r--r--net/ipv4/udp.c10
-rw-r--r--net/ipv6/af_inet6.c4
-rw-r--r--net/ipv6/raw.c4
-rw-r--r--net/iucv/af_iucv.c7
-rw-r--r--net/kcm/kcmsock.c10
-rw-r--r--net/key/af_key.c2
-rw-r--r--net/l2tp/l2tp_ip.c2
-rw-r--r--net/l2tp/l2tp_ip6.c2
-rw-r--r--net/l2tp/l2tp_ppp.c2
-rw-r--r--net/llc/af_llc.c2
-rw-r--r--net/netlink/af_netlink.c2
-rw-r--r--net/netrom/af_netrom.c2
-rw-r--r--net/nfc/llcp_sock.c9
-rw-r--r--net/nfc/rawsock.c4
-rw-r--r--net/packet/af_packet.c9
-rw-r--r--net/phonet/socket.c9
-rw-r--r--net/qrtr/qrtr.c2
-rw-r--r--net/rose/af_rose.c2
-rw-r--r--net/rxrpc/af_rxrpc.c10
-rw-r--r--net/sctp/ipv6.c2
-rw-r--r--net/sctp/protocol.c2
-rw-r--r--net/sctp/socket.c4
-rw-r--r--net/socket.c55
-rw-r--r--net/tipc/socket.c14
-rw-r--r--net/unix/af_unix.c30
-rw-r--r--net/vmw_vsock/af_vsock.c19
-rw-r--r--net/x25/af_x25.c2
-rwxr-xr-xscripts/documentation-file-ref-check125
-rwxr-xr-xscripts/faddr2line18
-rwxr-xr-xscripts/spdxcheck.py284
-rw-r--r--security/commoncap.c8
-rw-r--r--security/integrity/evm/evm_crypto.c3
-rw-r--r--tools/arch/x86/include/asm/cpufeatures.h20
-rw-r--r--tools/include/linux/compiler-gcc.h3
-rw-r--r--tools/include/uapi/linux/prctl.h12
-rw-r--r--tools/lib/api/fs/tracing_path.c40
-rw-r--r--tools/lib/api/fs/tracing_path.h9
-rw-r--r--tools/lib/symbol/kallsyms.c6
-rw-r--r--tools/lib/symbol/kallsyms.h2
-rw-r--r--tools/memory-model/Documentation/cheatsheet.txt7
-rw-r--r--tools/memory-model/Documentation/explanation.txt221
-rw-r--r--tools/memory-model/Documentation/references.txt17
-rw-r--r--tools/memory-model/README2
-rw-r--r--tools/memory-model/linux-kernel.bell4
-rw-r--r--tools/memory-model/linux-kernel.cat41
-rw-r--r--tools/memory-model/linux-kernel.def34
-rw-r--r--tools/memory-model/litmus-tests/.gitignore1
-rw-r--r--tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus2
-rw-r--r--tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus35
-rw-r--r--tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus34
-rw-r--r--tools/memory-model/litmus-tests/README19
-rw-r--r--tools/memory-model/litmus-tests/WRC+pooncerelease+rmbonceonce+Once.litmus4
-rw-r--r--tools/memory-model/lock.cat107
-rw-r--r--tools/memory-model/scripts/checkalllitmus.sh73
-rw-r--r--tools/memory-model/scripts/checklitmus.sh86
-rw-r--r--tools/perf/Documentation/Makefile29
-rw-r--r--tools/perf/Documentation/asciidoctor-extensions.rb29
-rw-r--r--tools/perf/Documentation/perf-buildid-cache.txt7
-rw-r--r--tools/perf/Documentation/perf-stat.txt16
-rw-r--r--tools/perf/Makefile.config14
-rw-r--r--tools/perf/Makefile.perf10
-rw-r--r--tools/perf/arch/arm/tests/dwarf-unwind.c2
-rw-r--r--tools/perf/arch/arm64/tests/dwarf-unwind.c2
-rw-r--r--tools/perf/arch/powerpc/tests/dwarf-unwind.c2
-rw-r--r--tools/perf/arch/powerpc/util/skip-callchain-idx.c3
-rw-r--r--tools/perf/arch/x86/tests/dwarf-unwind.c2
-rw-r--r--tools/perf/arch/x86/util/Build2
-rw-r--r--tools/perf/arch/x86/util/event.c76
-rw-r--r--tools/perf/arch/x86/util/machine.c103
-rw-r--r--tools/perf/builtin-annotate.c9
-rw-r--r--tools/perf/builtin-buildid-cache.c81
-rw-r--r--tools/perf/builtin-inject.c4
-rw-r--r--tools/perf/builtin-kallsyms.c2
-rw-r--r--tools/perf/builtin-kmem.c6
-rw-r--r--tools/perf/builtin-report.c27
-rw-r--r--tools/perf/builtin-script.c56
-rw-r--r--tools/perf/builtin-stat.c92
-rw-r--r--tools/perf/builtin-timechart.c8
-rw-r--r--tools/perf/builtin-top.c9
-rw-r--r--tools/perf/builtin-trace.c11
-rwxr-xr-xtools/perf/check-headers.sh30
-rw-r--r--tools/perf/examples/bpf/5sec.c49
-rw-r--r--tools/perf/examples/bpf/empty.c3
-rw-r--r--tools/perf/include/bpf/bpf.h13
-rw-r--r--tools/perf/perf.c24
-rw-r--r--tools/perf/tests/builtin-test.c9
-rw-r--r--tools/perf/tests/code-reading.c5
-rw-r--r--tools/perf/tests/hists_common.c6
-rw-r--r--tools/perf/tests/mmap-thread-lookup.c7
-rw-r--r--tools/perf/tests/parse-events.c22
-rwxr-xr-xtools/perf/tests/shell/record+probe_libc_inet_pton.sh12
-rw-r--r--tools/perf/tests/vmlinux-kallsyms.c20
-rwxr-xr-xtools/perf/trace/beauty/prctl_option.sh2
-rw-r--r--tools/perf/ui/browsers/annotate.c8
-rw-r--r--tools/perf/ui/browsers/map.c2
-rw-r--r--tools/perf/ui/stdio/hist.c3
-rw-r--r--tools/perf/util/Build2
-rw-r--r--tools/perf/util/annotate.c57
-rw-r--r--tools/perf/util/annotate.h11
-rw-r--r--tools/perf/util/auxtrace.c12
-rw-r--r--tools/perf/util/build-id.c4
-rw-r--r--tools/perf/util/config.c16
-rw-r--r--tools/perf/util/config.h1
-rw-r--r--tools/perf/util/cs-etm.c4
-rw-r--r--tools/perf/util/db-export.c7
-rw-r--r--tools/perf/util/dso.c34
-rw-r--r--tools/perf/util/dso.h37
-rw-r--r--tools/perf/util/env.c31
-rw-r--r--tools/perf/util/env.h3
-rw-r--r--tools/perf/util/event.c73
-rw-r--r--tools/perf/util/event.h8
-rw-r--r--tools/perf/util/evlist.c15
-rw-r--r--tools/perf/util/evlist.h3
-rw-r--r--tools/perf/util/evsel.c2
-rw-r--r--tools/perf/util/genelf.c2
-rw-r--r--tools/perf/util/intel-bts.c3
-rw-r--r--tools/perf/util/intel-pt-decoder/insn.h18
-rw-r--r--tools/perf/util/intel-pt.c8
-rw-r--r--tools/perf/util/llvm-utils.c19
-rw-r--r--tools/perf/util/machine.c355
-rw-r--r--tools/perf/util/machine.h72
-rw-r--r--tools/perf/util/map.c121
-rw-r--r--tools/perf/util/map.h74
-rw-r--r--tools/perf/util/parse-events.c73
-rw-r--r--tools/perf/util/probe-event.c29
-rw-r--r--tools/perf/util/probe-file.c3
-rw-r--r--tools/perf/util/session.c13
-rw-r--r--tools/perf/util/sort.c10
-rw-r--r--tools/perf/util/sort.h4
-rw-r--r--tools/perf/util/srcline.c1
-rw-r--r--tools/perf/util/stat.h3
-rw-r--r--tools/perf/util/symbol-elf.c494
-rw-r--r--tools/perf/util/symbol-minimal.c3
-rw-r--r--tools/perf/util/symbol.c264
-rw-r--r--tools/perf/util/symbol.h24
-rw-r--r--tools/perf/util/symbol_fprintf.c4
-rw-r--r--tools/perf/util/thread.c35
-rw-r--r--tools/perf/util/thread.h13
-rw-r--r--tools/perf/util/trace-event-info.c11
-rw-r--r--tools/perf/util/trace-event.c8
-rw-r--r--tools/perf/util/unwind-libdw.c23
-rw-r--r--tools/perf/util/unwind-libunwind-local.c19
-rw-r--r--tools/perf/util/util.c34
-rw-r--r--tools/perf/util/util.h4
-rw-r--r--tools/perf/util/vdso.c6
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-find-errors.sh56
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh12
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck.sh4
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh1
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-console.sh115
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-torture.sh105
-rw-r--r--tools/virtio/linux/dma-mapping.h2
-rw-r--r--virt/kvm/arm/mmu.c1
-rw-r--r--virt/kvm/eventfd.c2
1099 files changed, 23443 insertions, 16081 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 708dc4c166e4..2754fe83f0d4 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -64,8 +64,6 @@ auxdisplay/
- misc. LCD driver documentation (cfag12864b, ks0108).
backlight/
- directory with info on controlling backlights in flat panel displays
-bcache.txt
- - Block-layer cache on fast SSDs to improve slow (raid) I/O performance.
block/
- info on the Block I/O (BIO) layer.
blockdev/
@@ -78,18 +76,10 @@ bus-devices/
- directory with info on TI GPMC (General Purpose Memory Controller)
bus-virt-phys-mapping.txt
- how to access I/O mapped memory from within device drivers.
-cachetlb.txt
- - describes the cache/TLB flushing interfaces Linux uses.
cdrom/
- directory with information on the CD-ROM drivers that Linux has.
cgroup-v1/
- cgroups v1 features, including cpusets and memory controller.
-cgroup-v2.txt
- - cgroups v2 features, including cpusets and memory controller.
-circular-buffers.txt
- - how to make use of the existing circular buffer infrastructure
-clk.txt
- - info on the common clock framework
cma/
- Continuous Memory Area (CMA) debugfs interface.
conf.py
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 5b2d0f08867c..3e90e1f3bf0a 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -90,4 +90,4 @@ Date: December 2009
Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
Description:
The node's huge page size control/query attributes.
- See Documentation/vm/hugetlbpage.txt \ No newline at end of file
+ See Documentation/admin-guide/mm/hugetlbpage.rst \ No newline at end of file
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
index e21c00571cf4..fdaa2162fae1 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
@@ -12,4 +12,4 @@ Description:
free_hugepages
surplus_hugepages
resv_hugepages
- See Documentation/vm/hugetlbpage.txt for details.
+ See Documentation/admin-guide/mm/hugetlbpage.rst for details.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
index 73e653ee2481..dfc13244cda3 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
@@ -40,7 +40,7 @@ Description: Kernel Samepage Merging daemon sysfs interface
sleep_millisecs: how many milliseconds ksm should sleep between
scans.
- See Documentation/vm/ksm.txt for more information.
+ See Documentation/vm/ksm.rst for more information.
What: /sys/kernel/mm/ksm/merge_across_nodes
Date: January 2013
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 2cc0a72b64be..29601d93a1c2 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -37,7 +37,7 @@ Description:
The alloc_calls file is read-only and lists the kernel code
locations from which allocations for this cache were performed.
The alloc_calls file only contains information if debugging is
- enabled for that cache (see Documentation/vm/slub.txt).
+ enabled for that cache (see Documentation/vm/slub.rst).
What: /sys/kernel/slab/cache/alloc_fastpath
Date: February 2008
@@ -219,7 +219,7 @@ Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
Description:
The free_calls file is read-only and lists the locations of
object frees if slab debugging is enabled (see
- Documentation/vm/slub.txt).
+ Documentation/vm/slub.rst).
What: /sys/kernel/slab/cache/free_fastpath
Date: February 2008
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index a27fbfb0efb8..65eb856526b7 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -1,3 +1,5 @@
+What is RCU? -- "Read, Copy, Update"
+
Please note that the "What is RCU?" LWN series is an excellent place
to start learning about RCU:
diff --git a/Documentation/bcache.txt b/Documentation/admin-guide/bcache.rst
index c0ce64d75bbf..c0ce64d75bbf 100644
--- a/Documentation/bcache.txt
+++ b/Documentation/admin-guide/bcache.rst
diff --git a/Documentation/cgroup-v2.txt b/Documentation/admin-guide/cgroup-v2.rst
index 74cdeaed9f7a..74cdeaed9f7a 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/admin-guide/cgroup-v2.rst
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 5bb9161dbe6a..48d70af11652 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -48,6 +48,7 @@ configure specific aspects of kernel behavior to your liking.
:maxdepth: 1
initrd
+ cgroup-v2
serial-console
braille-console
parport
@@ -60,9 +61,11 @@ configure specific aspects of kernel behavior to your liking.
mono
java
ras
+ bcache
pm/index
thunderbolt
LSM/index
+ mm/index
.. only:: subproject and html
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f2040d46f095..9d699c85d8fe 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -106,11 +106,11 @@
use by PCI
Format: <irq>,<irq>...
- acpi_mask_gpe= [HW,ACPI]
+ acpi_mask_gpe= [HW,ACPI]
Due to the existence of _Lxx/_Exx, some GPEs triggered
by unsupported hardware/firmware features can result in
- GPE floodings that cannot be automatically disabled by
- the GPE dispatcher.
+ GPE floodings that cannot be automatically disabled by
+ the GPE dispatcher.
This facility can be used to prevent such uncontrolled
GPE floodings.
Format: <int>
@@ -472,10 +472,10 @@
for platform specific values (SB1, Loongson3 and
others).
- ccw_timeout_log [S390]
+ ccw_timeout_log [S390]
See Documentation/s390/CommonIO for details.
- cgroup_disable= [KNL] Disable a particular controller
+ cgroup_disable= [KNL] Disable a particular controller
Format: {name of the controller(s) to disable}
The effects of cgroup_disable=foo are:
- foo isn't auto-mounted if you mount all cgroups in
@@ -518,7 +518,7 @@
those clocks in any way. This parameter is useful for
debug and development, but should not be needed on a
platform with proper driver support. For more
- information, see Documentation/clk.txt.
+ information, see Documentation/driver-api/clk.rst.
clock= [BUGS=X86-32, HW] gettimeofday clocksource override.
[Deprecated]
@@ -587,11 +587,6 @@
Sets the size of memory pool for coherent, atomic dma
allocations, by default set to 256K.
- code_bytes [X86] How many bytes of object code to print
- in an oops report.
- Range: 0 - 8192
- Default: 64
-
com20020= [HW,NET] ARCnet - COM20020 chipset
Format:
<io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]]
@@ -641,8 +636,8 @@
hvc<n> Use the hypervisor console device <n>. This is for
both Xen and PowerPC hypervisors.
- If the device connected to the port is not a TTY but a braille
- device, prepend "brl," before the device type, for instance
+ If the device connected to the port is not a TTY but a braille
+ device, prepend "brl," before the device type, for instance
console=brl,ttyS0
For now, only VisioBraille is supported.
@@ -662,7 +657,7 @@
consoleblank= [KNL] The console blank (screen saver) timeout in
seconds. A value of 0 disables the blank timer.
- Defaults to 0.
+ Defaults to 0.
coredump_filter=
[KNL] Change the default value for
@@ -730,7 +725,7 @@
or memory reserved is below 4G.
cryptomgr.notests
- [KNL] Disable crypto self-tests
+ [KNL] Disable crypto self-tests
cs89x0_dma= [HW,NET]
Format: <dma>
@@ -746,7 +741,7 @@
Format: <port#>,<type>
See also Documentation/input/devices/joystick-parport.rst
- ddebug_query= [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
+ ddebug_query= [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
time. See
Documentation/admin-guide/dynamic-debug-howto.rst for
details. Deprecated, see dyndbg.
@@ -833,7 +828,7 @@
causing system reset or hang due to sending
INIT from AP to BSP.
- disable_ddw [PPC/PSERIES]
+ disable_ddw [PPC/PSERIES]
Disable Dynamic DMA Window support. Use this if
to workaround buggy firmware.
@@ -1188,7 +1183,7 @@
parameter will force ia64_sal_cache_flush to call
ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
- forcepae [X86-32]
+ forcepae [X86-32]
Forcefully enable Physical Address Extension (PAE).
Many Pentium M systems disable PAE but may have a
functionally usable PAE implementation.
@@ -1247,7 +1242,7 @@
gamma= [HW,DRM]
- gart_fix_e820= [X86_64] disable the fix e820 for K8 GART
+ gart_fix_e820= [X86_64] disable the fix e820 for K8 GART
Format: off | on
default: on
@@ -1341,23 +1336,32 @@
x86-64 are 2M (when the CPU supports "pse") and 1G
(when the CPU supports the "pdpe1gb" cpuinfo flag).
- hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
- terminal devices. Valid values: 0..8
- hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs.
- If specified, z/VM IUCV HVC accepts connections
- from listed z/VM user IDs only.
+ hung_task_panic=
+ [KNL] Should the hung task detector generate panics.
+ Format: <integer>
+ A nonzero value instructs the kernel to panic when a
+ hung task is detected. The default value is controlled
+ by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time
+ option. The value selected by this boot parameter can
+ be changed later by the kernel.hung_task_panic sysctl.
+
+ hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
+ terminal devices. Valid values: 0..8
+ hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs.
+ If specified, z/VM IUCV HVC accepts connections
+ from listed z/VM user IDs only.
keep_bootcon [KNL]
Do not unregister boot console at start. This is only
useful for debugging when something happens in the window
between unregistering the boot console and initializing
the real console.
- i2c_bus= [HW] Override the default board specific I2C bus speed
- or register an additional I2C bus that is not
- registered from board initialization code.
- Format:
- <bus_id>,<clkrate>
+ i2c_bus= [HW] Override the default board specific I2C bus speed
+ or register an additional I2C bus that is not
+ registered from board initialization code.
+ Format:
+ <bus_id>,<clkrate>
i8042.debug [HW] Toggle i8042 debug mode
i8042.unmask_kbd_data
@@ -1386,7 +1390,7 @@
Default: only on s2r transitions on x86; most other
architectures force reset to be always executed
i8042.unlock [HW] Unlock (ignore) the keylock
- i8042.kbdreset [HW] Reset device connected to KBD port
+ i8042.kbdreset [HW] Reset device connected to KBD port
i810= [HW,DRM]
@@ -1548,13 +1552,13 @@
programs exec'd, files mmap'd for exec, and all files
opened for read by uid=0.
- ima_template= [IMA]
+ ima_template= [IMA]
Select one of defined IMA measurements template formats.
Formats: { "ima" | "ima-ng" | "ima-sig" }
Default: "ima-ng"
ima_template_fmt=
- [IMA] Define a custom template format.
+ [IMA] Define a custom template format.
Format: { "field1|...|fieldN" }
ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage
@@ -1597,7 +1601,7 @@
inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver
Format: <irq>
- int_pln_enable [x86] Enable power limit notification interrupt
+ int_pln_enable [x86] Enable power limit notification interrupt
integrity_audit=[IMA]
Format: { "0" | "1" }
@@ -1650,39 +1654,39 @@
0 disables intel_idle and fall back on acpi_idle.
1 to 9 specify maximum depth of C-state.
- intel_pstate= [X86]
- disable
- Do not enable intel_pstate as the default
- scaling driver for the supported processors
- passive
- Use intel_pstate as a scaling driver, but configure it
- to work with generic cpufreq governors (instead of
- enabling its internal governor). This mode cannot be
- used along with the hardware-managed P-states (HWP)
- feature.
- force
- Enable intel_pstate on systems that prohibit it by default
- in favor of acpi-cpufreq. Forcing the intel_pstate driver
- instead of acpi-cpufreq may disable platform features, such
- as thermal controls and power capping, that rely on ACPI
- P-States information being indicated to OSPM and therefore
- should be used with caution. This option does not work with
- processors that aren't supported by the intel_pstate driver
- or on platforms that use pcc-cpufreq instead of acpi-cpufreq.
- no_hwp
- Do not enable hardware P state control (HWP)
- if available.
- hwp_only
- Only load intel_pstate on systems which support
- hardware P state control (HWP) if available.
- support_acpi_ppc
- Enforce ACPI _PPC performance limits. If the Fixed ACPI
- Description Table, specifies preferred power management
- profile as "Enterprise Server" or "Performance Server",
- then this feature is turned on by default.
- per_cpu_perf_limits
- Allow per-logical-CPU P-State performance control limits using
- cpufreq sysfs interface
+ intel_pstate= [X86]
+ disable
+ Do not enable intel_pstate as the default
+ scaling driver for the supported processors
+ passive
+ Use intel_pstate as a scaling driver, but configure it
+ to work with generic cpufreq governors (instead of
+ enabling its internal governor). This mode cannot be
+ used along with the hardware-managed P-states (HWP)
+ feature.
+ force
+ Enable intel_pstate on systems that prohibit it by default
+ in favor of acpi-cpufreq. Forcing the intel_pstate driver
+ instead of acpi-cpufreq may disable platform features, such
+ as thermal controls and power capping, that rely on ACPI
+ P-States information being indicated to OSPM and therefore
+ should be used with caution. This option does not work with
+ processors that aren't supported by the intel_pstate driver
+ or on platforms that use pcc-cpufreq instead of acpi-cpufreq.
+ no_hwp
+ Do not enable hardware P state control (HWP)
+ if available.
+ hwp_only
+ Only load intel_pstate on systems which support
+ hardware P state control (HWP) if available.
+ support_acpi_ppc
+ Enforce ACPI _PPC performance limits. If the Fixed ACPI
+ Description Table, specifies preferred power management
+ profile as "Enterprise Server" or "Performance Server",
+ then this feature is turned on by default.
+ per_cpu_perf_limits
+ Allow per-logical-CPU P-State performance control limits using
+ cpufreq sysfs interface
intremap= [X86-64, Intel-IOMMU]
on enable Interrupt Remapping (default)
@@ -1705,7 +1709,6 @@
nopanic
merge
nomerge
- forcesac
soft
pt [x86, IA-64]
nobypass [PPC/POWERNV]
@@ -2027,7 +2030,7 @@
* [no]ncqtrim: Turn off queued DSM TRIM.
* nohrst, nosrst, norst: suppress hard, soft
- and both resets.
+ and both resets.
* rstonce: only attempt one reset during
hot-unplug link recovery
@@ -2215,7 +2218,7 @@
[KNL,SH] Allow user to override the default size for
per-device physically contiguous DMA buffers.
- memhp_default_state=online/offline
+ memhp_default_state=online/offline
[KNL] Set the initial state for the memory hotplug
onlining policy. If not specified, the default value is
set according to the
@@ -2600,6 +2603,9 @@
emulation library even if a 387 maths coprocessor
is present.
+ no5lvl [X86-64] Disable 5-level paging mode. Forces
+ kernel to use 4-level paging instead.
+
no_console_suspend
[HW] Never suspend the console
Disable suspending of consoles during suspend and
@@ -2765,7 +2771,7 @@
[X86,PV_OPS] Disable paravirtualized VMware scheduler
clock and use the default one.
- no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting.
+ no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting.
steal time is computed, but won't influence scheduler
behaviour
@@ -2826,7 +2832,7 @@
notsc [BUGS=X86-32] Disable Time Stamp Counter
nowatchdog [KNL] Disable both lockup detectors, i.e.
- soft-lockup and NMI watchdog (hard-lockup).
+ soft-lockup and NMI watchdog (hard-lockup).
nowb [ARM]
@@ -2846,7 +2852,7 @@
If the dependencies are under your control, you can
turn on cpu0_hotplug.
- nps_mtm_hs_ctr= [KNL,ARC]
+ nps_mtm_hs_ctr= [KNL,ARC]
This parameter sets the maximum duration, in
cycles, each HW thread of the CTOP can run
without interruptions, before HW switches it.
@@ -2987,7 +2993,7 @@
pci=option[,option...] [PCI] various PCI subsystem options:
earlydump [X86] dump PCI config space before the kernel
- changes anything
+ changes anything
off [X86] don't probe for the PCI bus
bios [X86-32] force use of PCI BIOS, don't access
the hardware directly. Use this if your machine
@@ -3075,7 +3081,7 @@
is enabled by default. If you need to use this,
please report a bug.
nocrs [X86] Ignore PCI host bridge windows from ACPI.
- If you need to use this, please report a bug.
+ If you need to use this, please report a bug.
routeirq Do IRQ routing for all PCI devices.
This is normally done in pci_enable_device(),
so this option is a temporary workaround
@@ -3918,7 +3924,7 @@
cache (risks via metadata attacks are mostly
unchanged). Debug options disable merging on their
own.
- For more information see Documentation/vm/slub.txt.
+ For more information see Documentation/vm/slub.rst.
slab_max_order= [MM, SLAB]
Determines the maximum allowed order for slabs.
@@ -3932,7 +3938,7 @@
slub_debug can create guard zones around objects and
may poison objects when not in use. Also tracks the
last alloc / free. For more information see
- Documentation/vm/slub.txt.
+ Documentation/vm/slub.rst.
slub_memcg_sysfs= [MM, SLUB]
Determines whether to enable sysfs directories for
@@ -3946,7 +3952,7 @@
Determines the maximum allowed order for slabs.
A high setting may cause OOMs due to memory
fragmentation. For more information see
- Documentation/vm/slub.txt.
+ Documentation/vm/slub.rst.
slub_min_objects= [MM, SLUB]
The minimum number of objects per slab. SLUB will
@@ -3955,12 +3961,12 @@
the number of objects indicated. The higher the number
of objects the smaller the overhead of tracking slabs
and the less frequently locks need to be acquired.
- For more information see Documentation/vm/slub.txt.
+ For more information see Documentation/vm/slub.rst.
slub_min_order= [MM, SLUB]
Determines the minimum page order for slabs. Must be
lower than slub_max_order.
- For more information see Documentation/vm/slub.txt.
+ For more information see Documentation/vm/slub.rst.
slub_nomerge [MM, SLUB]
Same with slab_nomerge. This is supported for legacy.
@@ -4358,7 +4364,8 @@
Format: [always|madvise|never]
Can be used to control the default behavior of the system
with respect to transparent hugepages.
- See Documentation/vm/transhuge.txt for more details.
+ See Documentation/admin-guide/mm/transhuge.rst
+ for more details.
tsc= Disable clocksource stability checks for TSC.
Format: <string>
@@ -4436,7 +4443,7 @@
usbcore.initial_descriptor_timeout=
[USB] Specifies timeout for the initial 64-byte
- USB_REQ_GET_DESCRIPTOR request in milliseconds
+ USB_REQ_GET_DESCRIPTOR request in milliseconds
(default 5000 = 5.0 seconds).
usbcore.nousb [USB] Disable the USB subsystem
diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst
new file mode 100644
index 000000000000..291699c810d4
--- /dev/null
+++ b/Documentation/admin-guide/mm/concepts.rst
@@ -0,0 +1,222 @@
+.. _mm_concepts:
+
+=================
+Concepts overview
+=================
+
+The memory management in Linux is complex system that evolved over the
+years and included more and more functionality to support variety of
+systems from MMU-less microcontrollers to supercomputers. The memory
+management for systems without MMU is called ``nommu`` and it
+definitely deserves a dedicated document, which hopefully will be
+eventually written. Yet, although some of the concepts are the same,
+here we assume that MMU is available and CPU can translate a virtual
+address to a physical address.
+
+.. contents:: :local:
+
+Virtual Memory Primer
+=====================
+
+The physical memory in a computer system is a limited resource and
+even for systems that support memory hotplug there is a hard limit on
+the amount of memory that can be installed. The physical memory is not
+necessary contiguous, it might be accessible as a set of distinct
+address ranges. Besides, different CPU architectures, and even
+different implementations of the same architecture have different view
+how these address ranges defined.
+
+All this makes dealing directly with physical memory quite complex and
+to avoid this complexity a concept of virtual memory was developed.
+
+The virtual memory abstracts the details of physical memory from the
+application software, allows to keep only needed information in the
+physical memory (demand paging) and provides a mechanism for the
+protection and controlled sharing of data between processes.
+
+With virtual memory, each and every memory access uses a virtual
+address. When the CPU decodes the an instruction that reads (or
+writes) from (or to) the system memory, it translates the `virtual`
+address encoded in that instruction to a `physical` address that the
+memory controller can understand.
+
+The physical system memory is divided into page frames, or pages. The
+size of each page is architecture specific. Some architectures allow
+selection of the page size from several supported values; this
+selection is performed at the kernel build time by setting an
+appropriate kernel configuration option.
+
+Each physical memory page can be mapped as one or more virtual
+pages. These mappings are described by page tables that allow
+translation from virtual address used by programs to real address in
+the physical memory. The page tables organized hierarchically.
+
+The tables at the lowest level of the hierarchy contain physical
+addresses of actual pages used by the software. The tables at higher
+levels contain physical addresses of the pages belonging to the lower
+levels. The pointer to the top level page table resides in a
+register. When the CPU performs the address translation, it uses this
+register to access the top level page table. The high bits of the
+virtual address are used to index an entry in the top level page
+table. That entry is then used to access the next level in the
+hierarchy with the next bits of the virtual address as the index to
+that level page table. The lowest bits in the virtual address define
+the offset inside the actual page.
+
+Huge Pages
+==========
+
+The address translation requires several memory accesses and memory
+accesses are slow relatively to CPU speed. To avoid spending precious
+processor cycles on the address translation, CPUs maintain a cache of
+such translations called Translation Lookaside Buffer (or
+TLB). Usually TLB is pretty scarce resource and applications with
+large memory working set will experience performance hit because of
+TLB misses.
+
+Many modern CPU architectures allow mapping of the memory pages
+directly by the higher levels in the page table. For instance, on x86,
+it is possible to map 2M and even 1G pages using entries in the second
+and the third level page tables. In Linux such pages are called
+`huge`. Usage of huge pages significantly reduces pressure on TLB,
+improves TLB hit-rate and thus improves overall system performance.
+
+There are two mechanisms in Linux that enable mapping of the physical
+memory with the huge pages. The first one is `HugeTLB filesystem`, or
+hugetlbfs. It is a pseudo filesystem that uses RAM as its backing
+store. For the files created in this filesystem the data resides in
+the memory and mapped using huge pages. The hugetlbfs is described at
+:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`.
+
+Another, more recent, mechanism that enables use of the huge pages is
+called `Transparent HugePages`, or THP. Unlike the hugetlbfs that
+requires users and/or system administrators to configure what parts of
+the system memory should and can be mapped by the huge pages, THP
+manages such mappings transparently to the user and hence the
+name. See
+:ref:`Documentation/admin-guide/mm/transhuge.rst <admin_guide_transhuge>`
+for more details about THP.
+
+Zones
+=====
+
+Often hardware poses restrictions on how different physical memory
+ranges can be accessed. In some cases, devices cannot perform DMA to
+all the addressable memory. In other cases, the size of the physical
+memory exceeds the maximal addressable size of virtual memory and
+special actions are required to access portions of the memory. Linux
+groups memory pages into `zones` according to their possible
+usage. For example, ZONE_DMA will contain memory that can be used by
+devices for DMA, ZONE_HIGHMEM will contain memory that is not
+permanently mapped into kernel's address space and ZONE_NORMAL will
+contain normally addressed pages.
+
+The actual layout of the memory zones is hardware dependent as not all
+architectures define all zones, and requirements for DMA are different
+for different platforms.
+
+Nodes
+=====
+
+Many multi-processor machines are NUMA - Non-Uniform Memory Access -
+systems. In such systems the memory is arranged into banks that have
+different access latency depending on the "distance" from the
+processor. Each bank is referred as `node` and for each node Linux
+constructs an independent memory management subsystem. A node has it's
+own set of zones, lists of free and used pages and various statistics
+counters. You can find more details about NUMA in
+:ref:`Documentation/vm/numa.rst <numa>` and in
+:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`.
+
+Page cache
+==========
+
+The physical memory is volatile and the common case for getting data
+into the memory is to read it from files. Whenever a file is read, the
+data is put into the `page cache` to avoid expensive disk access on
+the subsequent reads. Similarly, when one writes to a file, the data
+is placed in the page cache and eventually gets into the backing
+storage device. The written pages are marked as `dirty` and when Linux
+decides to reuse them for other purposes, it makes sure to synchronize
+the file contents on the device with the updated data.
+
+Anonymous Memory
+================
+
+The `anonymous memory` or `anonymous mappings` represent memory that
+is not backed by a filesystem. Such mappings are implicitly created
+for program's stack and heap or by explicit calls to mmap(2) system
+call. Usually, the anonymous mappings only define virtual memory areas
+that the program is allowed to access. The read accesses will result
+in creation of a page table entry that references a special physical
+page filled with zeroes. When the program performs a write, regular
+physical page will be allocated to hold the written data. The page
+will be marked dirty and if the kernel will decide to repurpose it,
+the dirty page will be swapped out.
+
+Reclaim
+=======
+
+Throughout the system lifetime, a physical page can be used for storing
+different types of data. It can be kernel internal data structures,
+DMA'able buffers for device drivers use, data read from a filesystem,
+memory allocated by user space processes etc.
+
+Depending on the page usage it is treated differently by the Linux
+memory management. The pages that can be freed at any time, either
+because they cache the data available elsewhere, for instance, on a
+hard disk, or because they can be swapped out, again, to the hard
+disk, are called `reclaimable`. The most notable categories of the
+reclaimable pages are page cache and anonymous memory.
+
+In most cases, the pages holding internal kernel data and used as DMA
+buffers cannot be repurposed, and they remain pinned until freed by
+their user. Such pages are called `unreclaimable`. However, in certain
+circumstances, even pages occupied with kernel data structures can be
+reclaimed. For instance, in-memory caches of filesystem metadata can
+be re-read from the storage device and therefore it is possible to
+discard them from the main memory when system is under memory
+pressure.
+
+The process of freeing the reclaimable physical memory pages and
+repurposing them is called (surprise!) `reclaim`. Linux can reclaim
+pages either asynchronously or synchronously, depending on the state
+of the system. When system is not loaded, most of the memory is free
+and allocation request will be satisfied immediately from the free
+pages supply. As the load increases, the amount of the free pages goes
+down and when it reaches a certain threshold (high watermark), an
+allocation request will awaken the ``kswapd`` daemon. It will
+asynchronously scan memory pages and either just free them if the data
+they contain is available elsewhere, or evict to the backing storage
+device (remember those dirty pages?). As memory usage increases even
+more and reaches another threshold - min watermark - an allocation
+will trigger the `direct reclaim`. In this case allocation is stalled
+until enough memory pages are reclaimed to satisfy the request.
+
+Compaction
+==========
+
+As the system runs, tasks allocate and free the memory and it becomes
+fragmented. Although with virtual memory it is possible to present
+scattered physical pages as virtually contiguous range, sometimes it is
+necessary to allocate large physically contiguous memory areas. Such
+need may arise, for instance, when a device driver requires large
+buffer for DMA, or when THP allocates a huge page. Memory `compaction`
+addresses the fragmentation issue. This mechanism moves occupied pages
+from the lower part of a memory zone to free pages in the upper part
+of the zone. When a compaction scan is finished free pages are grouped
+together at the beginning of the zone and allocations of large
+physically contiguous areas become possible.
+
+Like reclaim, the compaction may happen asynchronously in ``kcompactd``
+daemon or synchronously as a result of memory allocation request.
+
+OOM killer
+==========
+
+It may happen, that on a loaded machine memory will be exhausted. When
+the kernel detects that the system runs out of memory (OOM) it invokes
+`OOM killer`. Its mission is simple: all it has to do is to select a
+task to sacrifice for the sake of the overall system health. The
+selected task is killed in a hope that after it exits enough memory
+will be freed to continue normal operation.
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/admin-guide/mm/hugetlbpage.rst
index faf077d50d42..1cc0bc78d10e 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -1,3 +1,11 @@
+.. _hugetlbpage:
+
+=============
+HugeTLB Pages
+=============
+
+Overview
+========
The intent of this file is to give a brief summary of hugetlbpage support in
the Linux kernel. This support is built on top of multiple page size support
@@ -18,53 +26,59 @@ First the Linux kernel needs to be built with the CONFIG_HUGETLBFS
automatically when CONFIG_HUGETLBFS is selected) configuration
options.
-The /proc/meminfo file provides information about the total number of
+The ``/proc/meminfo`` file provides information about the total number of
persistent hugetlb pages in the kernel's huge page pool. It also displays
default huge page size and information about the number of free, reserved
and surplus huge pages in the pool of huge pages of default size.
The huge page size is needed for generating the proper alignment and
size of the arguments to system calls that map huge page regions.
-The output of "cat /proc/meminfo" will include lines like:
+The output of ``cat /proc/meminfo`` will include lines like::
-.....
-HugePages_Total: uuu
-HugePages_Free: vvv
-HugePages_Rsvd: www
-HugePages_Surp: xxx
-Hugepagesize: yyy kB
-Hugetlb: zzz kB
+ HugePages_Total: uuu
+ HugePages_Free: vvv
+ HugePages_Rsvd: www
+ HugePages_Surp: xxx
+ Hugepagesize: yyy kB
+ Hugetlb: zzz kB
where:
-HugePages_Total is the size of the pool of huge pages.
-HugePages_Free is the number of huge pages in the pool that are not yet
- allocated.
-HugePages_Rsvd is short for "reserved," and is the number of huge pages for
- which a commitment to allocate from the pool has been made,
- but no allocation has yet been made. Reserved huge pages
- guarantee that an application will be able to allocate a
- huge page from the pool of huge pages at fault time.
-HugePages_Surp is short for "surplus," and is the number of huge pages in
- the pool above the value in /proc/sys/vm/nr_hugepages. The
- maximum number of surplus huge pages is controlled by
- /proc/sys/vm/nr_overcommit_hugepages.
-Hugepagesize is the default hugepage size (in Kb).
-Hugetlb is the total amount of memory (in kB), consumed by huge
- pages of all sizes.
- If huge pages of different sizes are in use, this number
- will exceed HugePages_Total * Hugepagesize. To get more
- detailed information, please, refer to
- /sys/kernel/mm/hugepages (described below).
-
-
-/proc/filesystems should also show a filesystem of type "hugetlbfs" configured
-in the kernel.
-
-/proc/sys/vm/nr_hugepages indicates the current number of "persistent" huge
+
+HugePages_Total
+ is the size of the pool of huge pages.
+HugePages_Free
+ is the number of huge pages in the pool that are not yet
+ allocated.
+HugePages_Rsvd
+ is short for "reserved," and is the number of huge pages for
+ which a commitment to allocate from the pool has been made,
+ but no allocation has yet been made. Reserved huge pages
+ guarantee that an application will be able to allocate a
+ huge page from the pool of huge pages at fault time.
+HugePages_Surp
+ is short for "surplus," and is the number of huge pages in
+ the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
+ maximum number of surplus huge pages is controlled by
+ ``/proc/sys/vm/nr_overcommit_hugepages``.
+Hugepagesize
+ is the default hugepage size (in Kb).
+Hugetlb
+ is the total amount of memory (in kB), consumed by huge
+ pages of all sizes.
+ If huge pages of different sizes are in use, this number
+ will exceed HugePages_Total \* Hugepagesize. To get more
+ detailed information, please, refer to
+ ``/sys/kernel/mm/hugepages`` (described below).
+
+
+``/proc/filesystems`` should also show a filesystem of type "hugetlbfs"
+configured in the kernel.
+
+``/proc/sys/vm/nr_hugepages`` indicates the current number of "persistent" huge
pages in the kernel's huge page pool. "Persistent" huge pages will be
returned to the huge page pool when freed by a task. A user with root
privileges can dynamically allocate more or free some persistent huge pages
-by increasing or decreasing the value of 'nr_hugepages'.
+by increasing or decreasing the value of ``nr_hugepages``.
Pages that are used as huge pages are reserved inside the kernel and cannot
be used for other purposes. Huge pages cannot be swapped out under
@@ -73,7 +87,7 @@ memory pressure.
Once a number of huge pages have been pre-allocated to the kernel huge page
pool, a user with appropriate privilege can use either the mmap system call
or shared memory system calls to use the huge pages. See the discussion of
-Using Huge Pages, below.
+:ref:`Using Huge Pages <using_huge_pages>`, below.
The administrator can allocate persistent huge pages on the kernel boot
command line by specifying the "hugepages=N" parameter, where 'N' = the
@@ -86,10 +100,10 @@ with a huge page size selection parameter "hugepagesz=<size>". <size> must
be specified in bytes with optional scale suffix [kKmMgG]. The default huge
page size may be selected with the "default_hugepagesz=<size>" boot parameter.
-When multiple huge page sizes are supported, /proc/sys/vm/nr_hugepages
+When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
indicates the current number of pre-allocated huge pages of the default size.
Thus, one can use the following command to dynamically allocate/deallocate
-default sized persistent huge pages:
+default sized persistent huge pages::
echo 20 > /proc/sys/vm/nr_hugepages
@@ -98,11 +112,12 @@ huge page pool to 20, allocating or freeing huge pages, as required.
On a NUMA platform, the kernel will attempt to distribute the huge page pool
over all the set of allowed nodes specified by the NUMA memory policy of the
-task that modifies nr_hugepages. The default for the allowed nodes--when the
+task that modifies ``nr_hugepages``. The default for the allowed nodes--when the
task has default memory policy--is all on-line nodes with memory. Allowed
nodes with insufficient available, contiguous memory for a huge page will be
-silently skipped when allocating persistent huge pages. See the discussion
-below of the interaction of task memory policy, cpusets and per node attributes
+silently skipped when allocating persistent huge pages. See the
+:ref:`discussion below <mem_policy_and_hp_alloc>`
+of the interaction of task memory policy, cpusets and per node attributes
with the allocation and freeing of persistent huge pages.
The success or failure of huge page allocation depends on the amount of
@@ -117,51 +132,52 @@ init files. This will enable the kernel to allocate huge pages early in
the boot process when the possibility of getting physical contiguous pages
is still very high. Administrators can verify the number of huge pages
actually allocated by checking the sysctl or meminfo. To check the per node
-distribution of huge pages in a NUMA system, use:
+distribution of huge pages in a NUMA system, use::
cat /sys/devices/system/node/node*/meminfo | fgrep Huge
-/proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of
-huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are
+``/proc/sys/vm/nr_overcommit_hugepages`` specifies how large the pool of
+huge pages can grow, if more huge pages than ``/proc/sys/vm/nr_hugepages`` are
requested by applications. Writing any non-zero value into this file
indicates that the hugetlb subsystem is allowed to try to obtain that
number of "surplus" huge pages from the kernel's normal page pool, when the
persistent huge page pool is exhausted. As these surplus huge pages become
unused, they are freed back to the kernel's normal page pool.
-When increasing the huge page pool size via nr_hugepages, any existing surplus
-pages will first be promoted to persistent huge pages. Then, additional
+When increasing the huge page pool size via ``nr_hugepages``, any existing
+surplus pages will first be promoted to persistent huge pages. Then, additional
huge pages will be allocated, if necessary and if possible, to fulfill
the new persistent huge page pool size.
The administrator may shrink the pool of persistent huge pages for
-the default huge page size by setting the nr_hugepages sysctl to a
+the default huge page size by setting the ``nr_hugepages`` sysctl to a
smaller value. The kernel will attempt to balance the freeing of huge pages
-across all nodes in the memory policy of the task modifying nr_hugepages.
+across all nodes in the memory policy of the task modifying ``nr_hugepages``.
Any free huge pages on the selected nodes will be freed back to the kernel's
normal page pool.
-Caveat: Shrinking the persistent huge page pool via nr_hugepages such that
+Caveat: Shrinking the persistent huge page pool via ``nr_hugepages`` such that
it becomes less than the number of huge pages in use will convert the balance
of the in-use huge pages to surplus huge pages. This will occur even if
-the number of surplus pages it would exceed the overcommit value. As long as
-this condition holds--that is, until nr_hugepages+nr_overcommit_hugepages is
+the number of surplus pages would exceed the overcommit value. As long as
+this condition holds--that is, until ``nr_hugepages+nr_overcommit_hugepages`` is
increased sufficiently, or the surplus huge pages go out of use and are freed--
no more surplus huge pages will be allowed to be allocated.
With support for multiple huge page pools at run-time available, much of
-the huge page userspace interface in /proc/sys/vm has been duplicated in sysfs.
-The /proc interfaces discussed above have been retained for backwards
-compatibility. The root huge page control directory in sysfs is:
+the huge page userspace interface in ``/proc/sys/vm`` has been duplicated in
+sysfs.
+The ``/proc`` interfaces discussed above have been retained for backwards
+compatibility. The root huge page control directory in sysfs is::
/sys/kernel/mm/hugepages
For each huge page size supported by the running kernel, a subdirectory
-will exist, of the form:
+will exist, of the form::
hugepages-${size}kB
-Inside each of these directories, the same set of files will exist:
+Inside each of these directories, the same set of files will exist::
nr_hugepages
nr_hugepages_mempolicy
@@ -172,37 +188,39 @@ Inside each of these directories, the same set of files will exist:
which function as described above for the default huge page-sized case.
+.. _mem_policy_and_hp_alloc:
Interaction of Task Memory Policy with Huge Page Allocation/Freeing
===================================================================
-Whether huge pages are allocated and freed via the /proc interface or
-the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA
-nodes from which huge pages are allocated or freed are controlled by the
-NUMA memory policy of the task that modifies the nr_hugepages_mempolicy
-sysctl or attribute. When the nr_hugepages attribute is used, mempolicy
+Whether huge pages are allocated and freed via the ``/proc`` interface or
+the ``/sysfs`` interface using the ``nr_hugepages_mempolicy`` attribute, the
+NUMA nodes from which huge pages are allocated or freed are controlled by the
+NUMA memory policy of the task that modifies the ``nr_hugepages_mempolicy``
+sysctl or attribute. When the ``nr_hugepages`` attribute is used, mempolicy
is ignored.
The recommended method to allocate or free huge pages to/from the kernel
-huge page pool, using the nr_hugepages example above, is:
+huge page pool, using the ``nr_hugepages`` example above, is::
numactl --interleave <node-list> echo 20 \
>/proc/sys/vm/nr_hugepages_mempolicy
-or, more succinctly:
+or, more succinctly::
numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy
-This will allocate or free abs(20 - nr_hugepages) to or from the nodes
+This will allocate or free ``abs(20 - nr_hugepages)`` to or from the nodes
specified in <node-list>, depending on whether number of persistent huge pages
is initially less than or greater than 20, respectively. No huge pages will be
allocated nor freed on any node not included in the specified <node-list>.
-When adjusting the persistent hugepage count via nr_hugepages_mempolicy, any
+When adjusting the persistent hugepage count via ``nr_hugepages_mempolicy``, any
memory policy mode--bind, preferred, local or interleave--may be used. The
resulting effect on persistent huge page allocation is as follows:
-1) Regardless of mempolicy mode [see Documentation/vm/numa_memory_policy.txt],
+#. Regardless of mempolicy mode [see
+ :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`],
persistent huge pages will be distributed across the node or nodes
specified in the mempolicy as if "interleave" had been specified.
However, if a node in the policy does not contain sufficient contiguous
@@ -212,7 +230,7 @@ resulting effect on persistent huge page allocation is as follows:
possibly, allocation of persistent huge pages on nodes not allowed by
the task's memory policy.
-2) One or more nodes may be specified with the bind or interleave policy.
+#. One or more nodes may be specified with the bind or interleave policy.
If more than one node is specified with the preferred policy, only the
lowest numeric id will be used. Local policy will select the node where
the task is running at the time the nodes_allowed mask is constructed.
@@ -222,20 +240,20 @@ resulting effect on persistent huge page allocation is as follows:
indeterminate. Thus, local policy is not very useful for this purpose.
Any of the other mempolicy modes may be used to specify a single node.
-3) The nodes allowed mask will be derived from any non-default task mempolicy,
+#. The nodes allowed mask will be derived from any non-default task mempolicy,
whether this policy was set explicitly by the task itself or one of its
ancestors, such as numactl. This means that if the task is invoked from a
shell with non-default policy, that policy will be used. One can specify a
node list of "all" with numactl --interleave or --membind [-m] to achieve
interleaving over all nodes in the system or cpuset.
-4) Any task mempolicy specified--e.g., using numactl--will be constrained by
+#. Any task mempolicy specified--e.g., using numactl--will be constrained by
the resource limits of any cpuset in which the task runs. Thus, there will
be no way for a task with non-default policy running in a cpuset with a
subset of the system nodes to allocate huge pages outside the cpuset
without first moving to a cpuset that contains all of the desired nodes.
-5) Boot-time huge page allocation attempts to distribute the requested number
+#. Boot-time huge page allocation attempts to distribute the requested number
of huge pages over all on-lines nodes with memory.
Per Node Hugepages Attributes
@@ -243,22 +261,22 @@ Per Node Hugepages Attributes
A subset of the contents of the root huge page control directory in sysfs,
described above, will be replicated under each the system device of each
-NUMA node with memory in:
+NUMA node with memory in::
/sys/devices/system/node/node[0-9]*/hugepages/
Under this directory, the subdirectory for each supported huge page size
-contains the following attribute files:
+contains the following attribute files::
nr_hugepages
free_hugepages
surplus_hugepages
-The free_' and surplus_' attribute files are read-only. They return the number
+The free\_' and surplus\_' attribute files are read-only. They return the number
of free and surplus [overcommitted] huge pages, respectively, on the parent
node.
-The nr_hugepages attribute returns the total number of huge pages on the
+The ``nr_hugepages`` attribute returns the total number of huge pages on the
specified node. When this attribute is written, the number of persistent huge
pages on the parent node will be adjusted to the specified value, if sufficient
resources exist, regardless of the task's mempolicy or cpuset constraints.
@@ -267,43 +285,58 @@ Note that the number of overcommit and reserve pages remain global quantities,
as we don't know until fault time, when the faulting task's mempolicy is
applied, from which node the huge page allocation will be attempted.
+.. _using_huge_pages:
Using Huge Pages
================
If the user applications are going to request huge pages using mmap system
call, then it is required that system administrator mount a file system of
-type hugetlbfs:
+type hugetlbfs::
mount -t hugetlbfs \
-o uid=<value>,gid=<value>,mode=<value>,pagesize=<value>,size=<value>,\
min_size=<value>,nr_inodes=<value> none /mnt/huge
This command mounts a (pseudo) filesystem of type hugetlbfs on the directory
-/mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid
-options sets the owner and group of the root of the file system. By default
-the uid and gid of the current process are taken. The mode option sets the
-mode of root of file system to value & 01777. This value is given in octal.
-By default the value 0755 is picked. If the platform supports multiple huge
-page sizes, the pagesize option can be used to specify the huge page size and
-associated pool. pagesize is specified in bytes. If pagesize is not specified
-the platform's default huge page size and associated pool will be used. The
-size option sets the maximum value of memory (huge pages) allowed for that
-filesystem (/mnt/huge). The size option can be specified in bytes, or as a
-percentage of the specified huge page pool (nr_hugepages). The size is
-rounded down to HPAGE_SIZE boundary. The min_size option sets the minimum
-value of memory (huge pages) allowed for the filesystem. min_size can be
-specified in the same way as size, either bytes or a percentage of the
-huge page pool. At mount time, the number of huge pages specified by
-min_size are reserved for use by the filesystem. If there are not enough
-free huge pages available, the mount will fail. As huge pages are allocated
-to the filesystem and freed, the reserve count is adjusted so that the sum
-of allocated and reserved huge pages is always at least min_size. The option
-nr_inodes sets the maximum number of inodes that /mnt/huge can use. If the
-size, min_size or nr_inodes option is not provided on command line then
-no limits are set. For pagesize, size, min_size and nr_inodes options, you
-can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For example, size=2K
-has the same meaning as size=2048.
+``/mnt/huge``. Any file created on ``/mnt/huge`` uses huge pages.
+
+The ``uid`` and ``gid`` options sets the owner and group of the root of the
+file system. By default the ``uid`` and ``gid`` of the current process
+are taken.
+
+The ``mode`` option sets the mode of root of file system to value & 01777.
+This value is given in octal. By default the value 0755 is picked.
+
+If the platform supports multiple huge page sizes, the ``pagesize`` option can
+be used to specify the huge page size and associated pool. ``pagesize``
+is specified in bytes. If ``pagesize`` is not specified the platform's
+default huge page size and associated pool will be used.
+
+The ``size`` option sets the maximum value of memory (huge pages) allowed
+for that filesystem (``/mnt/huge``). The ``size`` option can be specified
+in bytes, or as a percentage of the specified huge page pool (``nr_hugepages``).
+The size is rounded down to HPAGE_SIZE boundary.
+
+The ``min_size`` option sets the minimum value of memory (huge pages) allowed
+for the filesystem. ``min_size`` can be specified in the same way as ``size``,
+either bytes or a percentage of the huge page pool.
+At mount time, the number of huge pages specified by ``min_size`` are reserved
+for use by the filesystem.
+If there are not enough free huge pages available, the mount will fail.
+As huge pages are allocated to the filesystem and freed, the reserve count
+is adjusted so that the sum of allocated and reserved huge pages is always
+at least ``min_size``.
+
+The option ``nr_inodes`` sets the maximum number of inodes that ``/mnt/huge``
+can use.
+
+If the ``size``, ``min_size`` or ``nr_inodes`` option is not provided on
+command line then no limits are set.
+
+For ``pagesize``, ``size``, ``min_size`` and ``nr_inodes`` options, you can
+use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo.
+For example, size=2K has the same meaning as size=2048.
While read system calls are supported on files that reside on hugetlb
file systems, write system calls are not.
@@ -313,12 +346,12 @@ used to change the file attributes on hugetlbfs.
Also, it is important to note that no such mount command is required if
applications are going to use only shmat/shmget system calls or mmap with
-MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see map_hugetlb
-below.
+MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see
+:ref:`map_hugetlb <map_hugetlb>` below.
-Users who wish to use hugetlb memory via shared memory segment should be a
-member of a supplementary group and system admin needs to configure that gid
-into /proc/sys/vm/hugetlb_shm_group. It is possible for same or different
+Users who wish to use hugetlb memory via shared memory segment should be
+members of a supplementary group and system admin needs to configure that gid
+into ``/proc/sys/vm/hugetlb_shm_group``. It is possible for same or different
applications to use any combination of mmaps and shm* calls, though the mount of
filesystem will be required for using mmap calls without MAP_HUGETLB.
@@ -332,20 +365,18 @@ a hugetlb page and the length is smaller than the hugepage size.
Examples
========
-1) map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c
+.. _map_hugetlb:
-2) hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c
+``map_hugetlb``
+ see tools/testing/selftests/vm/map_hugetlb.c
-3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c
+``hugepage-shm``
+ see tools/testing/selftests/vm/hugepage-shm.c
-4) The libhugetlbfs (https://github.com/libhugetlbfs/libhugetlbfs) library
- provides a wide range of userspace tools to help with huge page usability,
- environment setup, and control.
+``hugepage-mmap``
+ see tools/testing/selftests/vm/hugepage-mmap.c
-Kernel development regression testing
-=====================================
+The `libhugetlbfs`_ library provides a wide range of userspace tools
+to help with huge page usability, environment setup, and control.
-The most complete set of hugetlb tests are in the libhugetlbfs repository.
-If you modify any hugetlb related code, use the libhugetlbfs test suite
-to check for regressions. In addition, if you add any new hugetlb
-functionality, please add appropriate tests to libhugetlbfs.
+.. _libhugetlbfs: https://github.com/libhugetlbfs/libhugetlbfs
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/admin-guide/mm/idle_page_tracking.rst
index 85dcc3bb85dc..6f7b7ca1add3 100644
--- a/Documentation/vm/idle_page_tracking.txt
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -1,4 +1,11 @@
-MOTIVATION
+.. _idle_page_tracking:
+
+==================
+Idle Page Tracking
+==================
+
+Motivation
+==========
The idle page tracking feature allows to track which memory pages are being
accessed by a workload and which are idle. This information can be useful for
@@ -8,10 +15,14 @@ or deciding where to place the workload within a compute cluster.
It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
-USER API
+.. _user_api:
-The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
-it consists of the only read-write file, /sys/kernel/mm/page_idle/bitmap.
+User API
+========
+
+The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
+Currently, it consists of the only read-write file,
+``/sys/kernel/mm/page_idle/bitmap``.
The file implements a bitmap where each bit corresponds to a memory page. The
bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
@@ -19,8 +30,9 @@ mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is
set, the corresponding page is idle.
A page is considered idle if it has not been accessed since it was marked idle
-(for more details on what "accessed" actually means see the IMPLEMENTATION
-DETAILS section). To mark a page idle one has to set the bit corresponding to
+(for more details on what "accessed" actually means see the :ref:`Implementation
+Details <impl_details>` section).
+To mark a page idle one has to set the bit corresponding to
the page by writing to the file. A value written to the file is OR-ed with the
current bitmap value.
@@ -30,9 +42,9 @@ page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
and hence such pages are never reported idle.
For huge pages the idle flag is set only on the head page, so one has to read
-/proc/kpageflags in order to correctly count idle huge pages.
+``/proc/kpageflags`` in order to correctly count idle huge pages.
-Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
+Reading from or writing to ``/sys/kernel/mm/page_idle/bitmap`` will return
-EINVAL if you are not starting the read/write on an 8-byte boundary, or
if the size of the read/write is not a multiple of 8 bytes. Writing to
this file beyond max PFN will return -ENXIO.
@@ -41,21 +53,26 @@ That said, in order to estimate the amount of pages that are not used by a
workload one should:
1. Mark all the workload's pages as idle by setting corresponding bits in
- /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
- /proc/pid/pagemap if the workload is represented by a process, or by
- filtering out alien pages using /proc/kpagecgroup in case the workload is
- placed in a memory cgroup.
+ ``/sys/kernel/mm/page_idle/bitmap``. The pages can be found by reading
+ ``/proc/pid/pagemap`` if the workload is represented by a process, or by
+ filtering out alien pages using ``/proc/kpagecgroup`` in case the workload
+ is placed in a memory cgroup.
2. Wait until the workload accesses its working set.
- 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If
- one wants to ignore certain types of pages, e.g. mlocked pages since they
- are not reclaimable, he or she can filter them out using /proc/kpageflags.
+ 3. Read ``/sys/kernel/mm/page_idle/bitmap`` and count the number of bits set.
+ If one wants to ignore certain types of pages, e.g. mlocked pages since they
+ are not reclaimable, he or she can filter them out using
+ ``/proc/kpageflags``.
+
+See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
+information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
+``/proc/kpagecgroup``.
-See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
-/proc/kpageflags, and /proc/kpagecgroup.
+.. _impl_details:
-IMPLEMENTATION DETAILS
+Implementation Details
+======================
The kernel internally keeps track of accesses to user memory pages in order to
reclaim unreferenced pages first on memory shortage conditions. A page is
@@ -77,7 +94,8 @@ When a dirty page is written to swap or disk as a result of memory reclaim or
exceeding the dirty memory limit, it is not marked referenced.
The idle memory tracking feature adds a new page flag, the Idle flag. This flag
-is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
+is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
+:ref:`User API <user_api>`
section), and cleared automatically whenever a page is referenced as defined
above.
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
new file mode 100644
index 000000000000..ceead68c2df7
--- /dev/null
+++ b/Documentation/admin-guide/mm/index.rst
@@ -0,0 +1,36 @@
+=================
+Memory Management
+=================
+
+Linux memory management subsystem is responsible, as the name implies,
+for managing the memory in the system. This includes implemnetation of
+virtual memory and demand paging, memory allocation both for kernel
+internal structures and user space programms, mapping of files into
+processes address space and many other cool things.
+
+Linux memory management is a complex system with many configurable
+settings. Most of these settings are available via ``/proc``
+filesystem and can be quired and adjusted using ``sysctl``. These APIs
+are described in Documentation/sysctl/vm.txt and in `man 5 proc`_.
+
+.. _man 5 proc: http://man7.org/linux/man-pages/man5/proc.5.html
+
+Linux memory management has its own jargon and if you are not yet
+familiar with it, consider reading
+:ref:`Documentation/admin-guide/mm/concepts.rst <mm_concepts>`.
+
+Here we document in detail how to interact with various mechanisms in
+the Linux memory management.
+
+.. toctree::
+ :maxdepth: 1
+
+ concepts
+ hugetlbpage
+ idle_page_tracking
+ ksm
+ numa_memory_policy
+ pagemap
+ soft-dirty
+ transhuge
+ userfaultfd
diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst
new file mode 100644
index 000000000000..9303786632d1
--- /dev/null
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -0,0 +1,189 @@
+.. _admin_guide_ksm:
+
+=======================
+Kernel Samepage Merging
+=======================
+
+Overview
+========
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation,
+and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
+
+KSM was originally developed for use with KVM (where it was known as
+Kernel Shared Memory), to fit more virtual machines into physical memory,
+by sharing the data common between them. But it can be useful to any
+application which generates many instances of the same data.
+
+The KSM daemon ksmd periodically scans those areas of user memory
+which have been registered with it, looking for pages of identical
+content which can be replaced by a single write-protected page (which
+is automatically copied if a process later wants to update its
+content). The amount of pages that KSM daemon scans in a single pass
+and the time between the passes are configured using :ref:`sysfs
+intraface <ksm_sysfs>`
+
+KSM only merges anonymous (private) pages, never pagecache (file) pages.
+KSM's merged pages were originally locked into kernel memory, but can now
+be swapped out just like other user pages (but sharing is broken when they
+are swapped back in: ksmd must rediscover their identity and merge again).
+
+Controlling KSM with madvise
+============================
+
+KSM only operates on those areas of address space which an application
+has advised to be likely candidates for merging, by using the madvise(2)
+system call::
+
+ int madvise(addr, length, MADV_MERGEABLE)
+
+The app may call
+
+::
+
+ int madvise(addr, length, MADV_UNMERGEABLE)
+
+to cancel that advice and restore unshared pages: whereupon KSM
+unmerges whatever it merged in that range. Note: this unmerging call
+may suddenly require more memory than is available - possibly failing
+with EAGAIN, but more probably arousing the Out-Of-Memory killer.
+
+If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
+and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was
+built with CONFIG_KSM=y, those calls will normally succeed: even if the
+the KSM daemon is not currently running, MADV_MERGEABLE still registers
+the range for whenever the KSM daemon is started; even if the range
+cannot contain any pages which KSM could actually merge; even if
+MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
+
+If a region of memory must be split into at least one new MADV_MERGEABLE
+or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
+will exceed ``vm.max_map_count`` (see Documentation/sysctl/vm.txt).
+
+Like other madvise calls, they are intended for use on mapped areas of
+the user address space: they will report ENOMEM if the specified range
+includes unmapped gaps (though working on the intervening mapped areas),
+and might fail with EAGAIN if not enough memory for internal structures.
+
+Applications should be considerate in their use of MADV_MERGEABLE,
+restricting its use to areas likely to benefit. KSM's scans may use a lot
+of processing power: some installations will disable KSM for that reason.
+
+.. _ksm_sysfs:
+
+KSM daemon sysfs interface
+==========================
+
+The KSM daemon is controlled by sysfs files in ``/sys/kernel/mm/ksm/``,
+readable by all but writable only by root:
+
+pages_to_scan
+ how many pages to scan before ksmd goes to sleep
+ e.g. ``echo 100 > /sys/kernel/mm/ksm/pages_to_scan``.
+
+ Default: 100 (chosen for demonstration purposes)
+
+sleep_millisecs
+ how many milliseconds ksmd should sleep before next scan
+ e.g. ``echo 20 > /sys/kernel/mm/ksm/sleep_millisecs``
+
+ Default: 20 (chosen for demonstration purposes)
+
+merge_across_nodes
+ specifies if pages from different NUMA nodes can be merged.
+ When set to 0, ksm merges only pages which physically reside
+ in the memory area of same NUMA node. That brings lower
+ latency to access of shared pages. Systems with more nodes, at
+ significant NUMA distances, are likely to benefit from the
+ lower latency of setting 0. Smaller systems, which need to
+ minimize memory usage, are likely to benefit from the greater
+ sharing of setting 1 (default). You may wish to compare how
+ your system performs under each setting, before deciding on
+ which to use. ``merge_across_nodes`` setting can be changed only
+ when there are no ksm shared pages in the system: set run 2 to
+ unmerge pages first, then to 1 after changing
+ ``merge_across_nodes``, to remerge according to the new setting.
+
+ Default: 1 (merging across nodes as in earlier releases)
+
+run
+ * set to 0 to stop ksmd from running but keep merged pages,
+ * set to 1 to run ksmd e.g. ``echo 1 > /sys/kernel/mm/ksm/run``,
+ * set to 2 to stop ksmd and unmerge all pages currently merged, but
+ leave mergeable areas registered for next run.
+
+ Default: 0 (must be changed to 1 to activate KSM, except if
+ CONFIG_SYSFS is disabled)
+
+use_zero_pages
+ specifies whether empty pages (i.e. allocated pages that only
+ contain zeroes) should be treated specially. When set to 1,
+ empty pages are merged with the kernel zero page(s) instead of
+ with each other as it would happen normally. This can improve
+ the performance on architectures with coloured zero pages,
+ depending on the workload. Care should be taken when enabling
+ this setting, as it can potentially degrade the performance of
+ KSM for some workloads, for example if the checksums of pages
+ candidate for merging match the checksum of an empty
+ page. This setting can be changed at any time, it is only
+ effective for pages merged after the change.
+
+ Default: 0 (normal KSM behaviour as in earlier releases)
+
+max_page_sharing
+ Maximum sharing allowed for each KSM page. This enforces a
+ deduplication limit to avoid high latency for virtual memory
+ operations that involve traversal of the virtual mappings that
+ share the KSM page. The minimum value is 2 as a newly created
+ KSM page will have at least two sharers. The higher this value
+ the faster KSM will merge the memory and the higher the
+ deduplication factor will be, but the slower the worst case
+ virtual mappings traversal could be for any given KSM
+ page. Slowing down this traversal means there will be higher
+ latency for certain virtual memory operations happening during
+ swapping, compaction, NUMA balancing and page migration, in
+ turn decreasing responsiveness for the caller of those virtual
+ memory operations. The scheduler latency of other tasks not
+ involved with the VM operations doing the virtual mappings
+ traversal is not affected by this parameter as these
+ traversals are always schedule friendly themselves.
+
+stable_node_chains_prune_millisecs
+ specifies how frequently KSM checks the metadata of the pages
+ that hit the deduplication limit for stale information.
+ Smaller milllisecs values will free up the KSM metadata with
+ lower latency, but they will make ksmd use more CPU during the
+ scan. It's a noop if not a single KSM page hit the
+ ``max_page_sharing`` yet.
+
+The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``:
+
+pages_shared
+ how many shared pages are being used
+pages_sharing
+ how many more sites are sharing them i.e. how much saved
+pages_unshared
+ how many pages unique but repeatedly checked for merging
+pages_volatile
+ how many pages changing too fast to be placed in a tree
+full_scans
+ how many times all mergeable areas have been scanned
+stable_node_chains
+ the number of KSM pages that hit the ``max_page_sharing`` limit
+stable_node_dups
+ number of duplicated KSM pages
+
+A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good
+sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing``
+indicates wasted effort. ``pages_volatile`` embraces several
+different kinds of activity, but a high proportion there would also
+indicate poor use of madvise MADV_MERGEABLE.
+
+The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the
+``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
+be increased accordingly.
+
+--
+Izik Eidus,
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
new file mode 100644
index 000000000000..d78c5b315f72
--- /dev/null
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -0,0 +1,495 @@
+.. _numa_memory_policy:
+
+==================
+NUMA Memory Policy
+==================
+
+What is NUMA Memory Policy?
+============================
+
+In the Linux kernel, "memory policy" determines from which node the kernel will
+allocate memory in a NUMA system or in an emulated NUMA system. Linux has
+supported platforms with Non-Uniform Memory Access architectures since 2.4.?.
+The current memory policy support was added to Linux 2.6 around May 2004. This
+document attempts to describe the concepts and APIs of the 2.6 memory policy
+support.
+
+Memory policies should not be confused with cpusets
+(``Documentation/cgroup-v1/cpusets.txt``)
+which is an administrative mechanism for restricting the nodes from which
+memory may be allocated by a set of processes. Memory policies are a
+programming interface that a NUMA-aware application can take advantage of. When
+both cpusets and policies are applied to a task, the restrictions of the cpuset
+takes priority. See :ref:`Memory Policies and cpusets <mem_pol_and_cpusets>`
+below for more details.
+
+Memory Policy Concepts
+======================
+
+Scope of Memory Policies
+------------------------
+
+The Linux kernel supports _scopes_ of memory policy, described here from
+most general to most specific:
+
+System Default Policy
+ this policy is "hard coded" into the kernel. It is the policy
+ that governs all page allocations that aren't controlled by
+ one of the more specific policy scopes discussed below. When
+ the system is "up and running", the system default policy will
+ use "local allocation" described below. However, during boot
+ up, the system default policy will be set to interleave
+ allocations across all nodes with "sufficient" memory, so as
+ not to overload the initial boot node with boot-time
+ allocations.
+
+Task/Process Policy
+ this is an optional, per-task policy. When defined for a
+ specific task, this policy controls all page allocations made
+ by or on behalf of the task that aren't controlled by a more
+ specific scope. If a task does not define a task policy, then
+ all page allocations that would have been controlled by the
+ task policy "fall back" to the System Default Policy.
+
+ The task policy applies to the entire address space of a task. Thus,
+ it is inheritable, and indeed is inherited, across both fork()
+ [clone() w/o the CLONE_VM flag] and exec*(). This allows a parent task
+ to establish the task policy for a child task exec()'d from an
+ executable image that has no awareness of memory policy. See the
+ :ref:`Memory Policy APIs <memory_policy_apis>` section,
+ below, for an overview of the system call
+ that a task may use to set/change its task/process policy.
+
+ In a multi-threaded task, task policies apply only to the thread
+ [Linux kernel task] that installs the policy and any threads
+ subsequently created by that thread. Any sibling threads existing
+ at the time a new task policy is installed retain their current
+ policy.
+
+ A task policy applies only to pages allocated after the policy is
+ installed. Any pages already faulted in by the task when the task
+ changes its task policy remain where they were allocated based on
+ the policy at the time they were allocated.
+
+.. _vma_policy:
+
+VMA Policy
+ A "VMA" or "Virtual Memory Area" refers to a range of a task's
+ virtual address space. A task may define a specific policy for a range
+ of its virtual address space. See the
+ :ref:`Memory Policy APIs <memory_policy_apis>` section,
+ below, for an overview of the mbind() system call used to set a VMA
+ policy.
+
+ A VMA policy will govern the allocation of pages that back
+ this region of the address space. Any regions of the task's
+ address space that don't have an explicit VMA policy will fall
+ back to the task policy, which may itself fall back to the
+ System Default Policy.
+
+ VMA policies have a few complicating details:
+
+ * VMA policy applies ONLY to anonymous pages. These include
+ pages allocated for anonymous segments, such as the task
+ stack and heap, and any regions of the address space
+ mmap()ed with the MAP_ANONYMOUS flag. If a VMA policy is
+ applied to a file mapping, it will be ignored if the mapping
+ used the MAP_SHARED flag. If the file mapping used the
+ MAP_PRIVATE flag, the VMA policy will only be applied when
+ an anonymous page is allocated on an attempt to write to the
+ mapping-- i.e., at Copy-On-Write.
+
+ * VMA policies are shared between all tasks that share a
+ virtual address space--a.k.a. threads--independent of when
+ the policy is installed; and they are inherited across
+ fork(). However, because VMA policies refer to a specific
+ region of a task's address space, and because the address
+ space is discarded and recreated on exec*(), VMA policies
+ are NOT inheritable across exec(). Thus, only NUMA-aware
+ applications may use VMA policies.
+
+ * A task may install a new VMA policy on a sub-range of a
+ previously mmap()ed region. When this happens, Linux splits
+ the existing virtual memory area into 2 or 3 VMAs, each with
+ it's own policy.
+
+ * By default, VMA policy applies only to pages allocated after
+ the policy is installed. Any pages already faulted into the
+ VMA range remain where they were allocated based on the
+ policy at the time they were allocated. However, since
+ 2.6.16, Linux supports page migration via the mbind() system
+ call, so that page contents can be moved to match a newly
+ installed policy.
+
+Shared Policy
+ Conceptually, shared policies apply to "memory objects" mapped
+ shared into one or more tasks' distinct address spaces. An
+ application installs shared policies the same way as VMA
+ policies--using the mbind() system call specifying a range of
+ virtual addresses that map the shared object. However, unlike
+ VMA policies, which can be considered to be an attribute of a
+ range of a task's address space, shared policies apply
+ directly to the shared object. Thus, all tasks that attach to
+ the object share the policy, and all pages allocated for the
+ shared object, by any task, will obey the shared policy.
+
+ As of 2.6.22, only shared memory segments, created by shmget() or
+ mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. When shared
+ policy support was added to Linux, the associated data structures were
+ added to hugetlbfs shmem segments. At the time, hugetlbfs did not
+ support allocation at fault time--a.k.a lazy allocation--so hugetlbfs
+ shmem segments were never "hooked up" to the shared policy support.
+ Although hugetlbfs segments now support lazy allocation, their support
+ for shared policy has not been completed.
+
+ As mentioned above in :ref:`VMA policies <vma_policy>` section,
+ allocations of page cache pages for regular files mmap()ed
+ with MAP_SHARED ignore any VMA policy installed on the virtual
+ address range backed by the shared file mapping. Rather,
+ shared page cache pages, including pages backing private
+ mappings that have not yet been written by the task, follow
+ task policy, if any, else System Default Policy.
+
+ The shared policy infrastructure supports different policies on subset
+ ranges of the shared object. However, Linux still splits the VMA of
+ the task that installs the policy for each range of distinct policy.
+ Thus, different tasks that attach to a shared memory segment can have
+ different VMA configurations mapping that one shared object. This
+ can be seen by examining the /proc/<pid>/numa_maps of tasks sharing
+ a shared memory region, when one task has installed shared policy on
+ one or more ranges of the region.
+
+Components of Memory Policies
+-----------------------------
+
+A NUMA memory policy consists of a "mode", optional mode flags, and
+an optional set of nodes. The mode determines the behavior of the
+policy, the optional mode flags determine the behavior of the mode,
+and the optional set of nodes can be viewed as the arguments to the
+policy behavior.
+
+Internally, memory policies are implemented by a reference counted
+structure, struct mempolicy. Details of this structure will be
+discussed in context, below, as required to explain the behavior.
+
+NUMA memory policy supports the following 4 behavioral modes:
+
+Default Mode--MPOL_DEFAULT
+ This mode is only used in the memory policy APIs. Internally,
+ MPOL_DEFAULT is converted to the NULL memory policy in all
+ policy scopes. Any existing non-default policy will simply be
+ removed when MPOL_DEFAULT is specified. As a result,
+ MPOL_DEFAULT means "fall back to the next most specific policy
+ scope."
+
+ For example, a NULL or default task policy will fall back to the
+ system default policy. A NULL or default vma policy will fall
+ back to the task policy.
+
+ When specified in one of the memory policy APIs, the Default mode
+ does not use the optional set of nodes.
+
+ It is an error for the set of nodes specified for this policy to
+ be non-empty.
+
+MPOL_BIND
+ This mode specifies that memory must come from the set of
+ nodes specified by the policy. Memory will be allocated from
+ the node in the set with sufficient free memory that is
+ closest to the node where the allocation takes place.
+
+MPOL_PREFERRED
+ This mode specifies that the allocation should be attempted
+ from the single node specified in the policy. If that
+ allocation fails, the kernel will search other nodes, in order
+ of increasing distance from the preferred node based on
+ information provided by the platform firmware.
+
+ Internally, the Preferred policy uses a single node--the
+ preferred_node member of struct mempolicy. When the internal
+ mode flag MPOL_F_LOCAL is set, the preferred_node is ignored
+ and the policy is interpreted as local allocation. "Local"
+ allocation policy can be viewed as a Preferred policy that
+ starts at the node containing the cpu where the allocation
+ takes place.
+
+ It is possible for the user to specify that local allocation
+ is always preferred by passing an empty nodemask with this
+ mode. If an empty nodemask is passed, the policy cannot use
+ the MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags
+ described below.
+
+MPOL_INTERLEAVED
+ This mode specifies that page allocations be interleaved, on a
+ page granularity, across the nodes specified in the policy.
+ This mode also behaves slightly differently, based on the
+ context where it is used:
+
+ For allocation of anonymous pages and shared memory pages,
+ Interleave mode indexes the set of nodes specified by the
+ policy using the page offset of the faulting address into the
+ segment [VMA] containing the address modulo the number of
+ nodes specified by the policy. It then attempts to allocate a
+ page, starting at the selected node, as if the node had been
+ specified by a Preferred policy or had been selected by a
+ local allocation. That is, allocation will follow the per
+ node zonelist.
+
+ For allocation of page cache pages, Interleave mode indexes
+ the set of nodes specified by the policy using a node counter
+ maintained per task. This counter wraps around to the lowest
+ specified node after it reaches the highest specified node.
+ This will tend to spread the pages out over the nodes
+ specified by the policy based on the order in which they are
+ allocated, rather than based on any page offset into an
+ address range or file. During system boot up, the temporary
+ interleaved system default policy works in this mode.
+
+NUMA memory policy supports the following optional mode flags:
+
+MPOL_F_STATIC_NODES
+ This flag specifies that the nodemask passed by
+ the user should not be remapped if the task or VMA's set of allowed
+ nodes changes after the memory policy has been defined.
+
+ Without this flag, any time a mempolicy is rebound because of a
+ change in the set of allowed nodes, the node (Preferred) or
+ nodemask (Bind, Interleave) is remapped to the new set of
+ allowed nodes. This may result in nodes being used that were
+ previously undesired.
+
+ With this flag, if the user-specified nodes overlap with the
+ nodes allowed by the task's cpuset, then the memory policy is
+ applied to their intersection. If the two sets of nodes do not
+ overlap, the Default policy is used.
+
+ For example, consider a task that is attached to a cpuset with
+ mems 1-3 that sets an Interleave policy over the same set. If
+ the cpuset's mems change to 3-5, the Interleave will now occur
+ over nodes 3, 4, and 5. With this flag, however, since only node
+ 3 is allowed from the user's nodemask, the "interleave" only
+ occurs over that node. If no nodes from the user's nodemask are
+ now allowed, the Default behavior is used.
+
+ MPOL_F_STATIC_NODES cannot be combined with the
+ MPOL_F_RELATIVE_NODES flag. It also cannot be used for
+ MPOL_PREFERRED policies that were created with an empty nodemask
+ (local allocation).
+
+MPOL_F_RELATIVE_NODES
+ This flag specifies that the nodemask passed
+ by the user will be mapped relative to the set of the task or VMA's
+ set of allowed nodes. The kernel stores the user-passed nodemask,
+ and if the allowed nodes changes, then that original nodemask will
+ be remapped relative to the new set of allowed nodes.
+
+ Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+ mempolicy is rebound because of a change in the set of allowed
+ nodes, the node (Preferred) or nodemask (Bind, Interleave) is
+ remapped to the new set of allowed nodes. That remap may not
+ preserve the relative nature of the user's passed nodemask to its
+ set of allowed nodes upon successive rebinds: a nodemask of
+ 1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
+ allowed nodes is restored to its original state.
+
+ With this flag, the remap is done so that the node numbers from
+ the user's passed nodemask are relative to the set of allowed
+ nodes. In other words, if nodes 0, 2, and 4 are set in the user's
+ nodemask, the policy will be effected over the first (and in the
+ Bind or Interleave case, the third and fifth) nodes in the set of
+ allowed nodes. The nodemask passed by the user represents nodes
+ relative to task or VMA's set of allowed nodes.
+
+ If the user's nodemask includes nodes that are outside the range
+ of the new set of allowed nodes (for example, node 5 is set in
+ the user's nodemask when the set of allowed nodes is only 0-3),
+ then the remap wraps around to the beginning of the nodemask and,
+ if not already set, sets the node in the mempolicy nodemask.
+
+ For example, consider a task that is attached to a cpuset with
+ mems 2-5 that sets an Interleave policy over the same set with
+ MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the
+ interleave now occurs over nodes 3,5-7. If the cpuset's mems
+ then change to 0,2-3,5, then the interleave occurs over nodes
+ 0,2-3,5.
+
+ Thanks to the consistent remapping, applications preparing
+ nodemasks to specify memory policies using this flag should
+ disregard their current, actual cpuset imposed memory placement
+ and prepare the nodemask as if they were always located on
+ memory nodes 0 to N-1, where N is the number of memory nodes the
+ policy is intended to manage. Let the kernel then remap to the
+ set of memory nodes allowed by the task's cpuset, as that may
+ change over time.
+
+ MPOL_F_RELATIVE_NODES cannot be combined with the
+ MPOL_F_STATIC_NODES flag. It also cannot be used for
+ MPOL_PREFERRED policies that were created with an empty nodemask
+ (local allocation).
+
+Memory Policy Reference Counting
+================================
+
+To resolve use/free races, struct mempolicy contains an atomic reference
+count field. Internal interfaces, mpol_get()/mpol_put() increment and
+decrement this reference count, respectively. mpol_put() will only free
+the structure back to the mempolicy kmem cache when the reference count
+goes to zero.
+
+When a new memory policy is allocated, its reference count is initialized
+to '1', representing the reference held by the task that is installing the
+new policy. When a pointer to a memory policy structure is stored in another
+structure, another reference is added, as the task's reference will be dropped
+on completion of the policy installation.
+
+During run-time "usage" of the policy, we attempt to minimize atomic operations
+on the reference count, as this can lead to cache lines bouncing between cpus
+and NUMA nodes. "Usage" here means one of the following:
+
+1) querying of the policy, either by the task itself [using the get_mempolicy()
+ API discussed below] or by another task using the /proc/<pid>/numa_maps
+ interface.
+
+2) examination of the policy to determine the policy mode and associated node
+ or node lists, if any, for page allocation. This is considered a "hot
+ path". Note that for MPOL_BIND, the "usage" extends across the entire
+ allocation process, which may sleep during page reclaimation, because the
+ BIND policy nodemask is used, by reference, to filter ineligible nodes.
+
+We can avoid taking an extra reference during the usages listed above as
+follows:
+
+1) we never need to get/free the system default policy as this is never
+ changed nor freed, once the system is up and running.
+
+2) for querying the policy, we do not need to take an extra reference on the
+ target task's task policy nor vma policies because we always acquire the
+ task's mm's mmap_sem for read during the query. The set_mempolicy() and
+ mbind() APIs [see below] always acquire the mmap_sem for write when
+ installing or replacing task or vma policies. Thus, there is no possibility
+ of a task or thread freeing a policy while another task or thread is
+ querying it.
+
+3) Page allocation usage of task or vma policy occurs in the fault path where
+ we hold them mmap_sem for read. Again, because replacing the task or vma
+ policy requires that the mmap_sem be held for write, the policy can't be
+ freed out from under us while we're using it for page allocation.
+
+4) Shared policies require special consideration. One task can replace a
+ shared memory policy while another task, with a distinct mmap_sem, is
+ querying or allocating a page based on the policy. To resolve this
+ potential race, the shared policy infrastructure adds an extra reference
+ to the shared policy during lookup while holding a spin lock on the shared
+ policy management structure. This requires that we drop this extra
+ reference when we're finished "using" the policy. We must drop the
+ extra reference on shared policies in the same query/allocation paths
+ used for non-shared policies. For this reason, shared policies are marked
+ as such, and the extra reference is dropped "conditionally"--i.e., only
+ for shared policies.
+
+ Because of this extra reference counting, and because we must lookup
+ shared policies in a tree structure under spinlock, shared policies are
+ more expensive to use in the page allocation path. This is especially
+ true for shared policies on shared memory regions shared by tasks running
+ on different NUMA nodes. This extra overhead can be avoided by always
+ falling back to task or system default policy for shared memory regions,
+ or by prefaulting the entire shared memory region into memory and locking
+ it down. However, this might not be appropriate for all applications.
+
+.. _memory_policy_apis:
+
+Memory Policy APIs
+==================
+
+Linux supports 3 system calls for controlling memory policy. These APIS
+always affect only the calling task, the calling task's address space, or
+some shared object mapped into the calling task's address space.
+
+.. note::
+ the headers that define these APIs and the parameter data types for
+ user space applications reside in a package that is not part of the
+ Linux kernel. The kernel system call interfaces, with the 'sys\_'
+ prefix, are defined in <linux/syscalls.h>; the mode and flag
+ definitions are defined in <linux/mempolicy.h>.
+
+Set [Task] Memory Policy::
+
+ long set_mempolicy(int mode, const unsigned long *nmask,
+ unsigned long maxnode);
+
+Set's the calling task's "task/process memory policy" to mode
+specified by the 'mode' argument and the set of nodes defined by
+'nmask'. 'nmask' points to a bit mask of node ids containing at least
+'maxnode' ids. Optional mode flags may be passed by combining the
+'mode' argument with the flag (for example: MPOL_INTERLEAVE |
+MPOL_F_STATIC_NODES).
+
+See the set_mempolicy(2) man page for more details
+
+
+Get [Task] Memory Policy or Related Information::
+
+ long get_mempolicy(int *mode,
+ const unsigned long *nmask, unsigned long maxnode,
+ void *addr, int flags);
+
+Queries the "task/process memory policy" of the calling task, or the
+policy or location of a specified virtual address, depending on the
+'flags' argument.
+
+See the get_mempolicy(2) man page for more details
+
+
+Install VMA/Shared Policy for a Range of Task's Address Space::
+
+ long mbind(void *start, unsigned long len, int mode,
+ const unsigned long *nmask, unsigned long maxnode,
+ unsigned flags);
+
+mbind() installs the policy specified by (mode, nmask, maxnodes) as a
+VMA policy for the range of the calling task's address space specified
+by the 'start' and 'len' arguments. Additional actions may be
+requested via the 'flags' argument.
+
+See the mbind(2) man page for more details.
+
+Memory Policy Command Line Interface
+====================================
+
+Although not strictly part of the Linux implementation of memory policy,
+a command line tool, numactl(8), exists that allows one to:
+
++ set the task policy for a specified program via set_mempolicy(2), fork(2) and
+ exec(2)
+
++ set the shared policy for a shared memory segment via mbind(2)
+
+The numactl(8) tool is packaged with the run-time version of the library
+containing the memory policy system call wrappers. Some distributions
+package the headers and compile-time libraries in a separate development
+package.
+
+.. _mem_pol_and_cpusets:
+
+Memory Policies and cpusets
+===========================
+
+Memory policies work within cpusets as described above. For memory policies
+that require a node or set of nodes, the nodes are restricted to the set of
+nodes whose memories are allowed by the cpuset constraints. If the nodemask
+specified for the policy contains nodes that are not allowed by the cpuset and
+MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes
+specified for the policy and the set of nodes with memory is used. If the
+result is the empty set, the policy is considered invalid and cannot be
+installed. If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped
+onto and folded into the task's set of allowed nodes as previously described.
+
+The interaction of memory policies and cpusets can be problematic when tasks
+in two cpusets share access to a memory region, such as shared memory segments
+created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and
+any of the tasks install shared policy on the region, only nodes whose
+memories are allowed in both cpusets may be used in the policies. Obtaining
+this information requires "stepping outside" the memory policy APIs to use the
+cpuset information and requires that one know in what cpusets other task might
+be attaching to the shared region. Furthermore, if the cpusets' allowed
+memory sets are disjoint, "local" allocation is the only valid policy.
diff --git a/Documentation/vm/pagemap.txt b/Documentation/admin-guide/mm/pagemap.rst
index eafcefa15261..577af85beb41 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -1,21 +1,25 @@
-pagemap, from the userspace perspective
----------------------------------------
+.. _pagemap:
+
+=============================
+Examining Process Page Tables
+=============================
pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
userspace programs to examine the page tables and related information by
-reading files in /proc.
+reading files in ``/proc``.
There are four components to pagemap:
- * /proc/pid/pagemap. This file lets a userspace process find out which
+ * ``/proc/pid/pagemap``. This file lets a userspace process find out which
physical frame each virtual page is mapped to. It contains one 64-bit
value for each virtual page, containing the following data (from
- fs/proc/task_mmu.c, above pagemap_read):
+ ``fs/proc/task_mmu.c``, above pagemap_read):
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 55 pte is soft-dirty (see
+ :ref:`Documentation/admin-guide/mm/soft-dirty.rst <soft_dirty>`)
* Bit 56 page exclusively mapped (since 4.2)
* Bits 57-60 zero
* Bit 61 page is file-page or shared-anon (since 3.5)
@@ -33,28 +37,28 @@ There are four components to pagemap:
precisely which pages are mapped (or in swap) and comparing mapped
pages between processes.
- Efficient users of this interface will use /proc/pid/maps to
+ Efficient users of this interface will use ``/proc/pid/maps`` to
determine which areas of memory are actually mapped and llseek to
skip over unmapped regions.
- * /proc/kpagecount. This file contains a 64-bit count of the number of
+ * ``/proc/kpagecount``. This file contains a 64-bit count of the number of
times each page is mapped, indexed by PFN.
- * /proc/kpageflags. This file contains a 64-bit set of flags for each
+ * ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
page, indexed by PFN.
- The flags are (from fs/proc/page.c, above kpageflags_read):
-
- 0. LOCKED
- 1. ERROR
- 2. REFERENCED
- 3. UPTODATE
- 4. DIRTY
- 5. LRU
- 6. ACTIVE
- 7. SLAB
- 8. WRITEBACK
- 9. RECLAIM
+ The flags are (from ``fs/proc/page.c``, above kpageflags_read):
+
+ 0. LOCKED
+ 1. ERROR
+ 2. REFERENCED
+ 3. UPTODATE
+ 4. DIRTY
+ 5. LRU
+ 6. ACTIVE
+ 7. SLAB
+ 8. WRITEBACK
+ 9. RECLAIM
10. BUDDY
11. MMAP
12. ANON
@@ -72,98 +76,111 @@ There are four components to pagemap:
24. ZERO_PAGE
25. IDLE
- * /proc/kpagecgroup. This file contains a 64-bit inode number of the
+ * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
memory cgroup each page is charged to, indexed by PFN. Only available when
CONFIG_MEMCG is set.
-Short descriptions to the page flags:
-
- 0. LOCKED
- page is being locked for exclusive access, eg. by undergoing read/write IO
-
- 7. SLAB
- page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
- When compound page is used, SLUB/SLQB will only set this flag on the head
- page; SLOB will not flag it at all.
+Short descriptions to the page flags
+====================================
-10. BUDDY
+0 - LOCKED
+ page is being locked for exclusive access, e.g. by undergoing read/write IO
+7 - SLAB
+ page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
+ When compound page is used, SLUB/SLQB will only set this flag on the head
+ page; SLOB will not flag it at all.
+10 - BUDDY
a free memory block managed by the buddy system allocator
The buddy system organizes free memory in blocks of various orders.
An order N block has 2^N physically contiguous pages, with the BUDDY flag
set for and _only_ for the first page.
-
-15. COMPOUND_HEAD
-16. COMPOUND_TAIL
+15 - COMPOUND_HEAD
A compound page with order N consists of 2^N physically contiguous pages.
A compound page with order 2 takes the form of "HTTT", where H donates its
head page and T donates its tail page(s). The major consumers of compound
- pages are hugeTLB pages (Documentation/vm/hugetlbpage.txt), the SLUB etc.
- memory allocators and various device drivers. However in this interface,
- only huge/giga pages are made visible to end users.
-17. HUGE
+ pages are hugeTLB pages
+ (:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`),
+ the SLUB etc. memory allocators and various device drivers.
+ However in this interface, only huge/giga pages are made visible
+ to end users.
+16 - COMPOUND_TAIL
+ A compound page tail (see description above).
+17 - HUGE
this is an integral part of a HugeTLB page
-
-19. HWPOISON
+19 - HWPOISON
hardware detected memory corruption on this page: don't touch the data!
-
-20. NOPAGE
+20 - NOPAGE
no page frame exists at the requested address
-
-21. KSM
+21 - KSM
identical memory pages dynamically shared between one or more processes
-
-22. THP
+22 - THP
contiguous pages which construct transparent hugepages
-
-23. BALLOON
+23 - BALLOON
balloon compaction page
-
-24. ZERO_PAGE
+24 - ZERO_PAGE
zero page for pfn_zero or huge_zero page
-
-25. IDLE
+25 - IDLE
page has not been accessed since it was marked idle (see
- Documentation/vm/idle_page_tracking.txt). Note that this flag may be
- stale in case the page was accessed via a PTE. To make sure the flag
- is up-to-date one has to read /sys/kernel/mm/page_idle/bitmap first.
-
- [IO related page flags]
- 1. ERROR IO error occurred
- 3. UPTODATE page has up-to-date data
- ie. for file backed page: (in-memory data revision >= on-disk one)
- 4. DIRTY page has been written to, hence contains new data
- ie. for file backed page: (in-memory data revision > on-disk one)
- 8. WRITEBACK page is being synced to disk
-
- [LRU related page flags]
- 5. LRU page is in one of the LRU lists
- 6. ACTIVE page is in the active LRU list
-18. UNEVICTABLE page is in the unevictable (non-)LRU list
- It is somehow pinned and not a candidate for LRU page reclaims,
- eg. ramfs pages, shmctl(SHM_LOCK) and mlock() memory segments
- 2. REFERENCED page has been referenced since last LRU list enqueue/requeue
- 9. RECLAIM page will be reclaimed soon after its pageout IO completed
-11. MMAP a memory mapped page
-12. ANON a memory mapped page that is not part of a file
-13. SWAPCACHE page is mapped to swap space, ie. has an associated swap entry
-14. SWAPBACKED page is backed by swap/RAM
+ :ref:`Documentation/admin-guide/mm/idle_page_tracking.rst <idle_page_tracking>`).
+ Note that this flag may be stale in case the page was accessed via
+ a PTE. To make sure the flag is up-to-date one has to read
+ ``/sys/kernel/mm/page_idle/bitmap`` first.
+
+IO related page flags
+---------------------
+
+1 - ERROR
+ IO error occurred
+3 - UPTODATE
+ page has up-to-date data
+ ie. for file backed page: (in-memory data revision >= on-disk one)
+4 - DIRTY
+ page has been written to, hence contains new data
+ i.e. for file backed page: (in-memory data revision > on-disk one)
+8 - WRITEBACK
+ page is being synced to disk
+
+LRU related page flags
+----------------------
+
+5 - LRU
+ page is in one of the LRU lists
+6 - ACTIVE
+ page is in the active LRU list
+18 - UNEVICTABLE
+ page is in the unevictable (non-)LRU list It is somehow pinned and
+ not a candidate for LRU page reclaims, e.g. ramfs pages,
+ shmctl(SHM_LOCK) and mlock() memory segments
+2 - REFERENCED
+ page has been referenced since last LRU list enqueue/requeue
+9 - RECLAIM
+ page will be reclaimed soon after its pageout IO completed
+11 - MMAP
+ a memory mapped page
+12 - ANON
+ a memory mapped page that is not part of a file
+13 - SWAPCACHE
+ page is mapped to swap space, i.e. has an associated swap entry
+14 - SWAPBACKED
+ page is backed by swap/RAM
The page-types tool in the tools/vm directory can be used to query the
above flags.
-Using pagemap to do something useful:
+Using pagemap to do something useful
+====================================
The general procedure for using pagemap to find out about a process' memory
usage goes like this:
- 1. Read /proc/pid/maps to determine which parts of the memory space are
+ 1. Read ``/proc/pid/maps`` to determine which parts of the memory space are
mapped to what.
2. Select the maps you are interested in -- all of them, or a particular
library, or the stack or the heap, etc.
- 3. Open /proc/pid/pagemap and seek to the pages you would like to examine.
+ 3. Open ``/proc/pid/pagemap`` and seek to the pages you would like to examine.
4. Read a u64 for each page from pagemap.
- 5. Open /proc/kpagecount and/or /proc/kpageflags. For each PFN you just
- read, seek to that entry in the file, and read the data you want.
+ 5. Open ``/proc/kpagecount`` and/or ``/proc/kpageflags``. For each PFN you
+ just read, seek to that entry in the file, and read the data you want.
For example, to find the "unique set size" (USS), which is the amount of
memory that a process is using that is not shared with any other process,
@@ -171,7 +188,8 @@ you can go through every map in the process, find the PFNs, look those up
in kpagecount, and tally up the number of pages that are only referenced
once.
-Other notes:
+Other notes
+===========
Reading from any of the files will return -EINVAL if you are not starting
the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/admin-guide/mm/soft-dirty.rst
index 55684d11a1e8..cb0cfd6672fa 100644
--- a/Documentation/vm/soft-dirty.txt
+++ b/Documentation/admin-guide/mm/soft-dirty.rst
@@ -1,34 +1,38 @@
- SOFT-DIRTY PTEs
+.. _soft_dirty:
- The soft-dirty is a bit on a PTE which helps to track which pages a task
+===============
+Soft-Dirty PTEs
+===============
+
+The soft-dirty is a bit on a PTE which helps to track which pages a task
writes to. In order to do this tracking one should
1. Clear soft-dirty bits from the task's PTEs.
- This is done by writing "4" into the /proc/PID/clear_refs file of the
+ This is done by writing "4" into the ``/proc/PID/clear_refs`` file of the
task in question.
2. Wait some time.
3. Read soft-dirty bits from the PTEs.
- This is done by reading from the /proc/PID/pagemap. The bit 55 of the
+ This is done by reading from the ``/proc/PID/pagemap``. The bit 55 of the
64-bit qword is the soft-dirty one. If set, the respective PTE was
written to since step 1.
- Internally, to do this tracking, the writable bit is cleared from PTEs
+Internally, to do this tracking, the writable bit is cleared from PTEs
when the soft-dirty bit is cleared. So, after this, when the task tries to
modify a page at some virtual address the #PF occurs and the kernel sets
the soft-dirty bit on the respective PTE.
- Note, that although all the task's address space is marked as r/o after the
+Note, that although all the task's address space is marked as r/o after the
soft-dirty bits clear, the #PF-s that occur after that are processed fast.
This is so, since the pages are still mapped to physical memory, and thus all
the kernel does is finds this fact out and puts both writable and soft-dirty
bits on the PTE.
- While in most cases tracking memory changes by #PF-s is more than enough
+While in most cases tracking memory changes by #PF-s is more than enough
there is still a scenario when we can lose soft dirty bits -- a task
unmaps a previously mapped memory region and then maps a new one at exactly
the same place. When unmap is called, the kernel internally clears PTE values
@@ -36,7 +40,7 @@ including soft dirty bits. To notify user space application about such
memory region renewal the kernel always marks new memory regions (and
expanded regions) as soft dirty.
- This feature is actively used by the checkpoint-restore project. You
+This feature is actively used by the checkpoint-restore project. You
can find more details about it on http://criu.org
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
new file mode 100644
index 000000000000..7ab93a8404b9
--- /dev/null
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -0,0 +1,418 @@
+.. _admin_guide_transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+Objective
+=========
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent HugePage Support (THP) is an alternative mean of
+using huge pages for the backing of virtual memory with huge pages
+that supports the automatic promotion and demotion of page sizes and
+without the shortcomings of hugetlbfs.
+
+Currently THP only works for anonymous memory mappings and tmpfs/shmem.
+But in the future it can expand to other filesystems.
+
+.. note::
+ in the examples below we presume that the basic page size is 4K and
+ the huge page size is 2M, although the actual numbers may vary
+ depending on the CPU architecture.
+
+The reason applications are running faster is because of two
+factors. The first factor is almost completely irrelevant and it's not
+of significant interest because it'll also have the downside of
+requiring larger clear-page copy-page in page faults which is a
+potentially negative effect. The first factor consists in taking a
+single page fault for each 2M virtual region touched by userland (so
+reducing the enter/exit kernel frequency by a 512 times factor). This
+only matters the first time the memory is accessed for the lifetime of
+a memory mapping. The second long lasting and much more important
+factor will affect all subsequent accesses to the memory for the whole
+runtime of the application. The second factor consist of two
+components:
+
+1) the TLB miss will run faster (especially with virtualization using
+ nested pagetables but almost always also on bare metal without
+ virtualization)
+
+2) a single TLB entry will be mapping a much larger amount of virtual
+ memory in turn reducing the number of TLB misses. With
+ virtualization and nested pagetables the TLB can be mapped of
+ larger size only if both KVM and the Linux guest are using
+ hugepages but a significant speedup already happens if only one of
+ the two is using hugepages just because of the fact the TLB miss is
+ going to run faster.
+
+THP can be enabled system wide or restricted to certain tasks or even
+memory ranges inside task's address space. Unless THP is completely
+disabled, there is ``khugepaged`` daemon that scans memory and
+collapses sequences of basic pages into huge pages.
+
+The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
+interface and using madivse(2) and prctl(2) system calls.
+
+Transparent Hugepage Support maximizes the usefulness of free memory
+if compared to the reservation approach of hugetlbfs by allowing all
+unused memory to be used as cache or other movable (or even unmovable
+entities). It doesn't require reservation to prevent hugepage
+allocation failures to be noticeable from userland. It allows paging
+and all other advanced VM features to be available on the
+hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications however can be further optimized to take advantage of
+this feature, like for example they've been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing userland
+is by far not mandatory and khugepaged already can take care of long
+lived page allocations even for hugepage unaware applications that
+deals with large amounts of memory.
+
+In certain cases when hugepages are enabled system wide, application
+may end up allocating more memory resources. An application may mmap a
+large region but only touch 1 byte of it, in that case a 2M page might
+be allocated instead of a 4k page for no good. This is why it's
+possible to disable hugepages system-wide and to only have them inside
+MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that gets a lot of benefit from hugepages and that don't
+risk to lose memory by using hugepages, should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
+
+.. _thp_sysfs:
+
+sysfs
+=====
+
+Global THP controls
+-------------------
+
+Transparent Hugepage Support for anonymous memory can be entirely disabled
+(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
+regions (to avoid the risk of consuming more memory resources) or enabled
+system wide. This can be achieved with one of::
+
+ echo always >/sys/kernel/mm/transparent_hugepage/enabled
+ echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+ echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit defrag efforts in the VM to generate
+anonymous hugepages in case they're not immediately free to madvise
+regions or to never try to defrag memory and simply fallback to regular
+pages unless hugepages are immediately available. Clearly if we spend CPU
+time to defrag memory, we would expect to gain even more by the fact we
+use hugepages later instead of regular pages. This isn't always
+guaranteed, but it may be more likely in case the allocation is for a
+MADV_HUGEPAGE region.
+
+::
+
+ echo always >/sys/kernel/mm/transparent_hugepage/defrag
+ echo defer >/sys/kernel/mm/transparent_hugepage/defrag
+ echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
+ echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+ echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+always
+ means that an application requesting THP will stall on
+ allocation failure and directly reclaim pages and compact
+ memory in an effort to allocate a THP immediately. This may be
+ desirable for virtual machines that benefit heavily from THP
+ use and are willing to delay the VM start to utilise them.
+
+defer
+ means that an application will wake kswapd in the background
+ to reclaim pages and wake kcompactd to compact memory so that
+ THP is available in the near future. It's the responsibility
+ of khugepaged to then install the THP pages later.
+
+defer+madvise
+ will enter direct reclaim and compaction like ``always``, but
+ only for regions that have used madvise(MADV_HUGEPAGE); all
+ other regions will wake kswapd in the background to reclaim
+ pages and wake kcompactd to compact memory so that THP is
+ available in the near future.
+
+madvise
+ will enter direct reclaim like ``always`` but only for regions
+ that are have used madvise(MADV_HUGEPAGE). This is the default
+ behaviour.
+
+never
+ should be self-explanatory.
+
+By default kernel tries to use huge zero page on read page fault to
+anonymous mapping. It's possible to disable huge zero page by writing 0
+or enable it back by writing 1::
+
+ echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+ echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+
+Some userspace (such as a test program, or an optimized memory allocation
+library) may want to know the size (in bytes) of a transparent hugepage::
+
+ cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
+
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise, and it'll
+be automatically shutdown if it's set to "never".
+
+Khugepaged controls
+-------------------
+
+khugepaged runs usually at low frequency so while one may not want to
+invoke defrag algorithms synchronously during the page faults, it
+should be worth invoking defrag at least in khugepaged. However it's
+also possible to disable defrag in khugepaged by writing 0 or enable
+defrag in khugepaged by writing 1::
+
+ echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+ echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass::
+
+ /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core)::
+
+ /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's an hugepage
+allocation failure to throttle the next allocation attempt::
+
+ /sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed::
+
+ /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+for each pass::
+
+ /sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+``max_ptes_none`` specifies how many extra small pages (that are
+not already mapped) can be allocated when collapsing a group
+of small pages into one large page::
+
+ /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
+
+A higher value leads to use additional memory for programs.
+A lower value leads to gain less thp performance. Value of
+max_ptes_none can waste cpu time very little, you can
+ignore it.
+
+``max_ptes_swap`` specifies how many pages can be brought in from
+swap when collapsing a group of pages into a transparent huge page::
+
+ /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
+
+A higher value can cause excessive swap IO and waste
+memory. A lower value can prevent THPs from being
+collapsed, resulting fewer pages being collapsed into
+THPs, and lower memory access performance.
+
+Boot parameter
+==============
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter ``transparent_hugepage=always`` or
+``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
+to the kernel command line.
+
+Hugepages in tmpfs/shmem
+========================
+
+You can control hugepage allocation policy in tmpfs with mount option
+``huge=``. It can have following values:
+
+always
+ Attempt to allocate huge pages every time we need a new page;
+
+never
+ Do not allocate huge pages;
+
+within_size
+ Only allocate huge page if it will be fully within i_size.
+ Also respect fadvise()/madvise() hints;
+
+advise
+ Only allocate huge pages if requested with fadvise()/madvise();
+
+The default policy is ``never``.
+
+``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
+``huge=never`` will not attempt to break up huge pages at all, just stop more
+from being allocated.
+
+There's also sysfs knob to control hugepage allocation policy for internal
+shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
+is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
+MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
+
+In addition to policies listed above, shmem_enabled allows two further
+values:
+
+deny
+ For use in emergencies, to force the huge option off from
+ all mounts;
+force
+ Force the huge option on for all - very useful for testing;
+
+Need of application restart
+===========================
+
+The transparent_hugepage/enabled values and tmpfs mount option only affect
+future behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies to the
+regions registered in khugepaged.
+
+Monitoring usage
+================
+
+The number of anonymous transparent huge pages currently used by the
+system is available by reading the AnonHugePages field in ``/proc/meminfo``.
+To identify what applications are using anonymous transparent huge pages,
+it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields
+for each mapping.
+
+The number of file transparent huge pages mapped to userspace is available
+by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
+To identify what applications are mapping file transparent huge pages, it
+is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
+for each mapping.
+
+Note that reading the smaps file is expensive and reading it
+frequently will incur overhead.
+
+There are a number of counters in ``/proc/vmstat`` that may be used to
+monitor how successfully the system is providing huge pages for use.
+
+thp_fault_alloc
+ is incremented every time a huge page is successfully
+ allocated to handle a page fault. This applies to both the
+ first time a page is faulted and for COW faults.
+
+thp_collapse_alloc
+ is incremented by khugepaged when it has found
+ a range of pages to collapse into one huge page and has
+ successfully allocated a new huge page to store the data.
+
+thp_fault_fallback
+ is incremented if a page fault fails to allocate
+ a huge page and instead falls back to using small pages.
+
+thp_collapse_alloc_failed
+ is incremented if khugepaged found a range
+ of pages that should be collapsed into one huge page but failed
+ the allocation.
+
+thp_file_alloc
+ is incremented every time a file huge page is successfully
+ allocated.
+
+thp_file_mapped
+ is incremented every time a file huge page is mapped into
+ user address space.
+
+thp_split_page
+ is incremented every time a huge page is split into base
+ pages. This can happen for a variety of reasons but a common
+ reason is that a huge page is old and is being reclaimed.
+ This action implies splitting all PMD the page mapped with.
+
+thp_split_page_failed
+ is incremented if kernel fails to split huge
+ page. This can happen if the page was pinned by somebody.
+
+thp_deferred_split_page
+ is incremented when a huge page is put onto split
+ queue. This happens when a huge page is partially unmapped and
+ splitting it would free up some memory. Pages on split queue are
+ going to be split under memory pressure.
+
+thp_split_pmd
+ is incremented every time a PMD split into table of PTEs.
+ This can happen, for instance, when application calls mprotect() or
+ munmap() on part of huge page. It doesn't split huge page, only
+ page table entry.
+
+thp_zero_page_alloc
+ is incremented every time a huge zero page is
+ successfully allocated. It includes allocations which where
+ dropped due race with other allocation. Note, it doesn't count
+ every map of the huge zero page, only its allocation.
+
+thp_zero_page_alloc_failed
+ is incremented if kernel fails to allocate
+ huge zero page and falls back to using small pages.
+
+thp_swpout
+ is incremented every time a huge page is swapout in one
+ piece without splitting.
+
+thp_swpout_fallback
+ is incremented if a huge page has to be split before swapout.
+ Usually because failed to allocate some continuous swap space
+ for the huge page.
+
+As the system ages, allocating huge pages may be expensive as the
+system uses memory compaction to copy data around memory to free a
+huge page for use. There are some counters in ``/proc/vmstat`` to help
+monitor this overhead.
+
+compact_stall
+ is incremented every time a process stalls to run
+ memory compaction so that a huge page is free for use.
+
+compact_success
+ is incremented if the system compacted memory and
+ freed a huge page for use.
+
+compact_fail
+ is incremented if the system tries to compact memory
+ but failed.
+
+compact_pages_moved
+ is incremented each time a page is moved. If
+ this value is increasing rapidly, it implies that the system
+ is copying a lot of data to satisfy the huge page allocation.
+ It is possible that the cost of copying exceeds any savings
+ from reduced TLB misses.
+
+compact_pagemigrate_failed
+ is incremented when the underlying mechanism
+ for moving a page failed.
+
+compact_blocks_moved
+ is incremented each time memory compaction examines
+ a huge page aligned range of pages.
+
+It is possible to establish how long the stalls were using the function
+tracer to record how long was spent in __alloc_pages_nodemask and
+using the mm_page_alloc tracepoint to identify which allocations were
+for huge pages.
+
+Optimizing the applications
+===========================
+
+To be guaranteed that the kernel will map a 2M page immediately in any
+memory region, the mmap region has to be hugepage naturally
+aligned. posix_memalign() can provide that guarantee.
+
+Hugetlbfs
+=========
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/admin-guide/mm/userfaultfd.rst
index bb2f945f87ab..5048cf661a8a 100644
--- a/Documentation/vm/userfaultfd.txt
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -1,6 +1,11 @@
-= Userfaultfd =
+.. _userfaultfd:
-== Objective ==
+===========
+Userfaultfd
+===========
+
+Objective
+=========
Userfaults allow the implementation of on-demand paging from userland
and more generally they allow userland to take control of various
@@ -9,7 +14,8 @@ memory page faults, something otherwise only the kernel code could do.
For example userfaults allows a proper and more optimal implementation
of the PROT_NONE+SIGSEGV trick.
-== Design ==
+Design
+======
Userfaults are delivered and resolved through the userfaultfd syscall.
@@ -41,7 +47,8 @@ different processes without them being aware about what is going on
themselves on the same region the manager is already tracking, which
is a corner case that would currently return -EBUSY).
-== API ==
+API
+===
When first opened the userfaultfd must be enabled invoking the
UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
@@ -101,7 +108,8 @@ UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
half copied page since it'll keep userfaulting until the copy has
finished.
-== QEMU/KVM ==
+QEMU/KVM
+========
QEMU/KVM is using the userfaultfd syscall to implement postcopy live
migration. Postcopy live migration is one form of memory
@@ -163,7 +171,8 @@ sending the same page twice (in case the userfault is read by the
postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
thread).
-== Non-cooperative userfaultfd ==
+Non-cooperative userfaultfd
+===========================
When the userfaultfd is monitored by an external manager, the manager
must be able to track changes in the process virtual memory
@@ -172,27 +181,30 @@ the same read(2) protocol as for the page fault notifications. The
manager has to explicitly enable these events by setting appropriate
bits in uffdio_api.features passed to UFFDIO_API ioctl:
-UFFD_FEATURE_EVENT_FORK - enable userfaultfd hooks for fork(). When
-this feature is enabled, the userfaultfd context of the parent process
-is duplicated into the newly created process. The manager receives
-UFFD_EVENT_FORK with file descriptor of the new userfaultfd context in
-the uffd_msg.fork.
-
-UFFD_FEATURE_EVENT_REMAP - enable notifications about mremap()
-calls. When the non-cooperative process moves a virtual memory area to
-a different location, the manager will receive UFFD_EVENT_REMAP. The
-uffd_msg.remap will contain the old and new addresses of the area and
-its original length.
-
-UFFD_FEATURE_EVENT_REMOVE - enable notifications about
-madvise(MADV_REMOVE) and madvise(MADV_DONTNEED) calls. The event
-UFFD_EVENT_REMOVE will be generated upon these calls to madvise. The
-uffd_msg.remove will contain start and end addresses of the removed
-area.
-
-UFFD_FEATURE_EVENT_UNMAP - enable notifications about memory
-unmapping. The manager will get UFFD_EVENT_UNMAP with uffd_msg.remove
-containing start and end addresses of the unmapped area.
+UFFD_FEATURE_EVENT_FORK
+ enable userfaultfd hooks for fork(). When this feature is
+ enabled, the userfaultfd context of the parent process is
+ duplicated into the newly created process. The manager
+ receives UFFD_EVENT_FORK with file descriptor of the new
+ userfaultfd context in the uffd_msg.fork.
+
+UFFD_FEATURE_EVENT_REMAP
+ enable notifications about mremap() calls. When the
+ non-cooperative process moves a virtual memory area to a
+ different location, the manager will receive
+ UFFD_EVENT_REMAP. The uffd_msg.remap will contain the old and
+ new addresses of the area and its original length.
+
+UFFD_FEATURE_EVENT_REMOVE
+ enable notifications about madvise(MADV_REMOVE) and
+ madvise(MADV_DONTNEED) calls. The event UFFD_EVENT_REMOVE will
+ be generated upon these calls to madvise. The uffd_msg.remove
+ will contain start and end addresses of the removed area.
+
+UFFD_FEATURE_EVENT_UNMAP
+ enable notifications about memory unmapping. The manager will
+ get UFFD_EVENT_UNMAP with uffd_msg.remove containing start and
+ end addresses of the unmapped area.
Although the UFFD_FEATURE_EVENT_REMOVE and UFFD_FEATURE_EVENT_UNMAP
are pretty similar, they quite differ in the action expected from the
diff --git a/Documentation/admin-guide/ramoops.rst b/Documentation/admin-guide/ramoops.rst
index 4efd7ce77565..6dbcc5481000 100644
--- a/Documentation/admin-guide/ramoops.rst
+++ b/Documentation/admin-guide/ramoops.rst
@@ -61,7 +61,7 @@ Setting the ramoops parameters can be done in several different manners:
mem=128M ramoops.mem_address=0x8000000 ramoops.ecc=1
B. Use Device Tree bindings, as described in
- ``Documentation/device-tree/bindings/reserved-memory/admin-guide/ramoops.rst``.
+ ``Documentation/devicetree/bindings/reserved-memory/ramoops.txt``.
For example::
reserved-memory {
diff --git a/Documentation/arm/Marvell/README b/Documentation/arm/Marvell/README
index b5bb7f518840..56ada27c53be 100644
--- a/Documentation/arm/Marvell/README
+++ b/Documentation/arm/Marvell/README
@@ -302,19 +302,15 @@ Berlin family (Multimedia Solutions)
88DE3010, Armada 1000 (no Linux support)
Core: Marvell PJ1 (ARMv5TE), Dual-core
Product Brief: http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf
- 88DE3005, Armada 1500-mini
88DE3005, Armada 1500 Mini
Design name: BG2CD
Core: ARM Cortex-A9, PL310 L2CC
- Homepage: http://www.marvell.com/multimedia-solutions/armada-1500-mini/
- 88DE3006, Armada 1500 Mini Plus
- Design name: BG2CDP
- Core: Dual Core ARM Cortex-A7
- Homepage: http://www.marvell.com/multimedia-solutions/armada-1500-mini-plus/
+ 88DE3006, Armada 1500 Mini Plus
+ Design name: BG2CDP
+ Core: Dual Core ARM Cortex-A7
88DE3100, Armada 1500
Design name: BG2
Core: Marvell PJ4B-MP (ARMv7), Tauros3 L2CC
- Product Brief: http://www.marvell.com/digital-entertainment/armada-1500/assets/Marvell-ARMADA-1500-Product-Brief.pdf
88DE3114, Armada 1500 Pro
Design name: BG2Q
Core: Quad Core ARM Cortex-A9, PL310 L2CC
@@ -324,13 +320,16 @@ Berlin family (Multimedia Solutions)
88DE3218, ARMADA 1500 Ultra
Core: ARM Cortex-A53
- Homepage: http://www.marvell.com/multimedia-solutions/
+ Homepage: https://www.synaptics.com/products/multimedia-solutions
Directory: arch/arm/mach-berlin
Comments:
+
* This line of SoCs is based on Marvell Sheeva or ARM Cortex CPUs
with Synopsys DesignWare (IRQ, GPIO, Timers, ...) and PXA IP (SDHCI, USB, ETH, ...).
+ * The Berlin family was acquired by Synaptics from Marvell in 2017.
+
CPU Cores
---------
diff --git a/Documentation/block/cmdline-partition.txt b/Documentation/block/cmdline-partition.txt
index 525b9f6d7fb4..760a3f7c3ed4 100644
--- a/Documentation/block/cmdline-partition.txt
+++ b/Documentation/block/cmdline-partition.txt
@@ -1,7 +1,9 @@
Embedded device command line partition parsing
=====================================================================
-Support for reading the block device partition table from the command line.
+The "blkdevparts" command line option adds support for reading the
+block device partition table from the kernel command line.
+
It is typically used for fixed block (eMMC) embedded devices.
It has no MBR, so saves storage space. Bootloader can be easily accessed
by absolute address of data on the block device.
@@ -14,22 +16,27 @@ blkdevparts=<blkdev-def>[;<blkdev-def>]
<partdef> := <size>[@<offset>](part-name)
<blkdev-id>
- block device disk name, embedded device used fixed block device,
- it's disk name also fixed. such as: mmcblk0, mmcblk1, mmcblk0boot0.
+ block device disk name. Embedded device uses fixed block device.
+ Its disk name is also fixed, such as: mmcblk0, mmcblk1, mmcblk0boot0.
<size>
partition size, in bytes, such as: 512, 1m, 1G.
+ size may contain an optional suffix of (upper or lower case):
+ K, M, G, T, P, E.
+ "-" is used to denote all remaining space.
<offset>
partition start address, in bytes.
+ offset may contain an optional suffix of (upper or lower case):
+ K, M, G, T, P, E.
(part-name)
- partition name, kernel send uevent with "PARTNAME". application can create
- a link to block device partition with the name "PARTNAME".
- user space application can access partition by partition name.
+ partition name. Kernel sends uevent with "PARTNAME". Application can
+ create a link to block device partition with the name "PARTNAME".
+ User space application can access partition by partition name.
Example:
- eMMC disk name is "mmcblk0" and "mmcblk0boot0"
+ eMMC disk names are "mmcblk0" and "mmcblk0boot0".
bootargs:
'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
diff --git a/Documentation/core-api/atomic_ops.rst b/Documentation/core-api/atomic_ops.rst
index fce929144ccd..2e7165f86f55 100644
--- a/Documentation/core-api/atomic_ops.rst
+++ b/Documentation/core-api/atomic_ops.rst
@@ -111,7 +111,6 @@ If the compiler can prove that do_something() does not store to the
variable a, then the compiler is within its rights transforming this to
the following::
- tmp = a;
if (a > 0)
for (;;)
do_something();
@@ -119,7 +118,7 @@ the following::
If you don't want the compiler to do this (and you probably don't), then
you should use something like the following::
- while (READ_ONCE(a) < 0)
+ while (READ_ONCE(a) > 0)
do_something();
Alternatively, you could place a barrier() call in the loop.
@@ -467,10 +466,12 @@ Like the above, except that these routines return a boolean which
indicates whether the changed bit was set _BEFORE_ the atomic bit
operation.
-WARNING! It is incredibly important that the value be a boolean,
-ie. "0" or "1". Do not try to be fancy and save a few instructions by
-declaring the above to return "long" and just returning something like
-"old_val & mask" because that will not work.
+
+.. warning::
+ It is incredibly important that the value be a boolean, ie. "0" or "1".
+ Do not try to be fancy and save a few instructions by declaring the
+ above to return "long" and just returning something like "old_val &
+ mask" because that will not work.
For one thing, this return value gets truncated to int in many code
paths using these interfaces, so on 64-bit if the bit is set in the
diff --git a/Documentation/cachetlb.txt b/Documentation/core-api/cachetlb.rst
index 6eb9d3f090cd..6eb9d3f090cd 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/core-api/cachetlb.rst
diff --git a/Documentation/circular-buffers.txt b/Documentation/core-api/circular-buffers.rst
index 53e51caa3347..53e51caa3347 100644
--- a/Documentation/circular-buffers.txt
+++ b/Documentation/core-api/circular-buffers.rst
diff --git a/Documentation/core-api/gfp_mask-from-fs-io.rst b/Documentation/core-api/gfp_mask-from-fs-io.rst
new file mode 100644
index 000000000000..e0df8f416582
--- /dev/null
+++ b/Documentation/core-api/gfp_mask-from-fs-io.rst
@@ -0,0 +1,66 @@
+=================================
+GFP masks used from FS/IO context
+=================================
+
+:Date: May, 2018
+:Author: Michal Hocko <mhocko@kernel.org>
+
+Introduction
+============
+
+Code paths in the filesystem and IO stacks must be careful when
+allocating memory to prevent recursion deadlocks caused by direct
+memory reclaim calling back into the FS or IO paths and blocking on
+already held resources (e.g. locks - most commonly those used for the
+transaction context).
+
+The traditional way to avoid this deadlock problem is to clear __GFP_FS
+respectively __GFP_IO (note the latter implies clearing the first as well) in
+the gfp mask when calling an allocator. GFP_NOFS respectively GFP_NOIO can be
+used as shortcut. It turned out though that above approach has led to
+abuses when the restricted gfp mask is used "just in case" without a
+deeper consideration which leads to problems because an excessive use
+of GFP_NOFS/GFP_NOIO can lead to memory over-reclaim or other memory
+reclaim issues.
+
+New API
+========
+
+Since 4.12 we do have a generic scope API for both NOFS and NOIO context
+``memalloc_nofs_save``, ``memalloc_nofs_restore`` respectively ``memalloc_noio_save``,
+``memalloc_noio_restore`` which allow to mark a scope to be a critical
+section from a filesystem or I/O point of view. Any allocation from that
+scope will inherently drop __GFP_FS respectively __GFP_IO from the given
+mask so no memory allocation can recurse back in the FS/IO.
+
+.. kernel-doc:: include/linux/sched/mm.h
+ :functions: memalloc_nofs_save memalloc_nofs_restore
+.. kernel-doc:: include/linux/sched/mm.h
+ :functions: memalloc_noio_save memalloc_noio_restore
+
+FS/IO code then simply calls the appropriate save function before
+any critical section with respect to the reclaim is started - e.g.
+lock shared with the reclaim context or when a transaction context
+nesting would be possible via reclaim. The restore function should be
+called when the critical section ends. All that ideally along with an
+explanation what is the reclaim context for easier maintenance.
+
+Please note that the proper pairing of save/restore functions
+allows nesting so it is safe to call ``memalloc_noio_save`` or
+``memalloc_noio_restore`` respectively from an existing NOIO or NOFS
+scope.
+
+What about __vmalloc(GFP_NOFS)
+==============================
+
+vmalloc doesn't support GFP_NOFS semantic because there are hardcoded
+GFP_KERNEL allocations deep inside the allocator which are quite non-trivial
+to fix up. That means that calling ``vmalloc`` with GFP_NOFS/GFP_NOIO is
+almost always a bug. The good news is that the NOFS/NOIO semantic can be
+achieved by the scope API.
+
+In the ideal world, upper layers should already mark dangerous contexts
+and so no special care is required and vmalloc should be called without
+any problems. Sometimes if the context is not really clear or there are
+layering violations then the recommended way around that is to wrap ``vmalloc``
+by the scope API with a comment explaining the problem.
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index c670a8031786..f5a66b72f984 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -14,6 +14,7 @@ Core utilities
kernel-api
assoc_array
atomic_ops
+ cachetlb
refcount-vs-atomic
cpu_hotplug
idr
@@ -25,6 +26,8 @@ Core utilities
genalloc
errseq
printk-formats
+ circular-buffers
+ gfp_mask-from-fs-io
Interfaces for kernel debugging
===============================
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 92f30006adae..8e44aea366c2 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -39,17 +39,17 @@ String Manipulation
.. kernel-doc:: lib/string.c
:export:
+Basic Kernel Library Functions
+==============================
+
+The Linux kernel provides more basic utility functions.
+
Bit Operations
--------------
.. kernel-doc:: arch/x86/include/asm/bitops.h
:internal:
-Basic Kernel Library Functions
-==============================
-
-The Linux kernel provides more basic utility functions.
-
Bitmap Operations
-----------------
@@ -80,6 +80,31 @@ Command-line Parsing
.. kernel-doc:: lib/cmdline.c
:export:
+Sorting
+-------
+
+.. kernel-doc:: lib/sort.c
+ :export:
+
+.. kernel-doc:: lib/list_sort.c
+ :export:
+
+Text Searching
+--------------
+
+.. kernel-doc:: lib/textsearch.c
+ :doc: ts_intro
+
+.. kernel-doc:: lib/textsearch.c
+ :export:
+
+.. kernel-doc:: include/linux/textsearch.h
+ :functions: textsearch_find textsearch_next \
+ textsearch_get_pattern textsearch_get_pattern_len
+
+CRC and Math Functions in Linux
+===============================
+
CRC Functions
-------------
@@ -103,9 +128,6 @@ CRC Functions
.. kernel-doc:: lib/crc-itu-t.c
:export:
-Math Functions in Linux
-=======================
-
Base 2 log and power Functions
------------------------------
@@ -127,28 +149,6 @@ Division Functions
.. kernel-doc:: lib/gcd.c
:export:
-Sorting
--------
-
-.. kernel-doc:: lib/sort.c
- :export:
-
-.. kernel-doc:: lib/list_sort.c
- :export:
-
-Text Searching
---------------
-
-.. kernel-doc:: lib/textsearch.c
- :doc: ts_intro
-
-.. kernel-doc:: lib/textsearch.c
- :export:
-
-.. kernel-doc:: include/linux/textsearch.h
- :functions: textsearch_find textsearch_next \
- textsearch_get_pattern textsearch_get_pattern_len
-
UUID/GUID
---------
diff --git a/Documentation/core-api/refcount-vs-atomic.rst b/Documentation/core-api/refcount-vs-atomic.rst
index 83351c258cdb..322851bada16 100644
--- a/Documentation/core-api/refcount-vs-atomic.rst
+++ b/Documentation/core-api/refcount-vs-atomic.rst
@@ -17,7 +17,7 @@ in order to help maintainers validate their code against the change in
these memory ordering guarantees.
The terms used through this document try to follow the formal LKMM defined in
-github.com/aparri/memory-model/blob/master/Documentation/explanation.txt
+tools/memory-model/Documentation/explanation.txt.
memory-barriers.txt and atomic_t.txt provide more background to the
memory ordering in general and for atomic operations specifically.
diff --git a/Documentation/crypto/index.rst b/Documentation/crypto/index.rst
index 94c4786f2573..c4ff5d791233 100644
--- a/Documentation/crypto/index.rst
+++ b/Documentation/crypto/index.rst
@@ -20,5 +20,6 @@ for cryptographic use cases, as well as programming examples.
architecture
devel-algos
userspace-if
+ crypto_engine
api
api-samples
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index f7a18f274357..aabc8738b3d8 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -120,7 +120,7 @@ A typical out of bounds access report looks like this::
The header of the report discribe what kind of bug happened and what kind of
access caused it. It's followed by the description of the accessed slub object
-(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and
+(see 'SLUB Debug output' section in Documentation/vm/slub.rst for details) and
the description of the accessed memory page.
In the last section the report shows memory state around the accessed address.
diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst
index e80850eefe13..3bf371a938d0 100644
--- a/Documentation/dev-tools/kselftest.rst
+++ b/Documentation/dev-tools/kselftest.rst
@@ -151,6 +151,11 @@ Contributing new tests (details)
TEST_FILES, TEST_GEN_FILES mean it is the file which is used by
test.
+ * First use the headers inside the kernel source and/or git repo, and then the
+ system headers. Headers for the kernel release as opposed to headers
+ installed by the distro on the system should be the primary focus to be able
+ to find regressions.
+
Test Harness
============
diff --git a/Documentation/devicetree/bindings/hwmon/gpio-fan.txt b/Documentation/devicetree/bindings/hwmon/gpio-fan.txt
index 439a7430fc68..2becdcfdc840 100644
--- a/Documentation/devicetree/bindings/hwmon/gpio-fan.txt
+++ b/Documentation/devicetree/bindings/hwmon/gpio-fan.txt
@@ -11,7 +11,7 @@ Optional properties:
must have the RPM values in ascending order.
- alarm-gpios: This pin going active indicates something is wrong with
the fan, and a udev event will be fired.
-- cooling-cells: If used as a cooling device, must be <2>
+- #cooling-cells: If used as a cooling device, must be <2>
Also see: Documentation/devicetree/bindings/thermal/thermal.txt
min and max states are derived from the speed-map of the fan.
diff --git a/Documentation/devicetree/bindings/hwmon/ltc2990.txt b/Documentation/devicetree/bindings/hwmon/ltc2990.txt
new file mode 100644
index 000000000000..f92f54029e84
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ltc2990.txt
@@ -0,0 +1,36 @@
+ltc2990: Linear Technology LTC2990 power monitor
+
+Required properties:
+- compatible: Must be "lltc,ltc2990"
+- reg: I2C slave address
+- lltc,meas-mode:
+ An array of two integers for configuring the chip measurement mode.
+
+ The first integer defines the bits 2..0 in the control register. In all
+ cases the internal temperature and supply voltage are measured. In
+ addition the following input measurements are enabled per mode:
+
+ 0: V1, V2, TR2
+ 1: V1-V2, TR2
+ 2: V1-V2, V3, V4
+ 3: TR1, V3, V4
+ 4: TR1, V3-V4
+ 5: TR1, TR2
+ 6: V1-V2, V3-V4
+ 7: V1, V2, V3, V4
+
+ The second integer defines the bits 4..3 in the control register. This
+ allows a subset of the measurements to be enabled:
+
+ 0: Internal temperature and supply voltage only
+ 1: TR1, V1 or V1-V2 only per mode
+ 2: TR2, V3 or V3-V4 only per mode
+ 3: All measurements per mode
+
+Example:
+
+ltc2990@4c {
+ compatible = "lltc,ltc2990";
+ reg = <0x4c>;
+ lltc,meas-mode = <7 3>; /* V1, V2, V3, V4 */
+};
diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt
index a83f9a5734ca..89674ad8a097 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt
@@ -9,11 +9,12 @@ number of interrupt exposed depends on the SoC.
Required properties:
-- compatible : must have "amlogic,meson8-gpio-intc†and either
- “amlogic,meson8-gpio-intc†for meson8 SoCs (S802) or
- “amlogic,meson8b-gpio-intc†for meson8b SoCs (S805) or
- “amlogic,meson-gxbb-gpio-intc†for GXBB SoCs (S905) or
- “amlogic,meson-gxl-gpio-intc†for GXL SoCs (S905X, S912)
+- compatible : must have "amlogic,meson8-gpio-intc" and either
+ "amlogic,meson8-gpio-intc" for meson8 SoCs (S802) or
+ "amlogic,meson8b-gpio-intc" for meson8b SoCs (S805) or
+ "amlogic,meson-gxbb-gpio-intc" for GXBB SoCs (S905) or
+ "amlogic,meson-gxl-gpio-intc" for GXL SoCs (S905X, S912)
+ "amlogic,meson-axg-gpio-intc" for AXG SoCs (A113D, A113X)
- interrupt-parent : a phandle to the GIC the interrupts are routed to.
Usually this is provided at the root level of the device tree as it is
common to most of the SoC.
diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt
index 0a57f2f4167d..3ea78c4ef887 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt
@@ -57,6 +57,20 @@ Optional
occupied by the redistributors. Required if more than one such
region is present.
+- msi-controller: Boolean property. Identifies the node as an MSI
+ controller. Only present if the Message Based Interrupt
+ functionnality is being exposed by the HW, and the mbi-ranges
+ property present.
+
+- mbi-ranges: A list of pairs <intid span>, where "intid" is the first
+ SPI of a range that can be used an MBI, and "span" the size of that
+ range. Multiple ranges can be provided. Requires "msi-controller" to
+ be set.
+
+- mbi-alias: Address property. Base address of an alias of the GICD
+ region containing only the {SET,CLR}SPI registers to be used if
+ isolation is required, and if supported by the HW.
+
Sub-nodes:
PPI affinity can be expressed as a single "ppi-partitions" node,
@@ -99,6 +113,9 @@ Examples:
<0x0 0x2c020000 0 0x2000>; // GICV
interrupts = <1 9 4>;
+ msi-controller;
+ mbi-ranges = <256 128>;
+
gic-its@2c200000 {
compatible = "arm,gic-v3-its";
msi-controller;
diff --git a/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt
index edf03f09244b..136bd612bd83 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt
@@ -5,11 +5,14 @@ Required properties:
- compatible: Should be:
"st,stm32-exti"
"st,stm32h7-exti"
+ "st,stm32mp1-exti"
- reg: Specifies base physical address and size of the registers
- interrupt-controller: Indentifies the node as an interrupt controller
- #interrupt-cells: Specifies the number of cells to encode an interrupt
specifier, shall be 2
- interrupts: interrupts references to primary interrupt controller
+ (only needed for exti controller with multiple exti under
+ same parent interrupt: st,stm32-exti and st,stm32h7-exti")
Example:
diff --git a/Documentation/clk.txt b/Documentation/driver-api/clk.rst
index 511628bb3d3a..511628bb3d3a 100644
--- a/Documentation/clk.txt
+++ b/Documentation/driver-api/clk.rst
diff --git a/Documentation/driver-api/device_connection.rst b/Documentation/driver-api/device_connection.rst
index affbc5566ab0..ba364224c349 100644
--- a/Documentation/driver-api/device_connection.rst
+++ b/Documentation/driver-api/device_connection.rst
@@ -40,4 +40,4 @@ API
---
.. kernel-doc:: drivers/base/devcon.c
- : functions: device_connection_find_match device_connection_find device_connection_add device_connection_remove
+ :functions: device_connection_find_match device_connection_find device_connection_add device_connection_remove
diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst
index 505ee906d7d9..cbe0242842d1 100644
--- a/Documentation/driver-api/gpio/driver.rst
+++ b/Documentation/driver-api/gpio/driver.rst
@@ -44,7 +44,7 @@ common to each controller of that type:
- methods to establish GPIO line direction
- methods used to access GPIO line values
- - method to set electrical configuration to a a given GPIO line
+ - method to set electrical configuration for a given GPIO line
- method to return the IRQ number associated to a given GPIO line
- flag saying whether calls to its methods may sleep
- optional line names array to identify lines
@@ -143,7 +143,7 @@ resistor will make the line tend to high level unless one of the transistors on
the rail actively pulls it down.
The level on the line will go as high as the VDD on the pull-up resistor, which
-may be higher than the level supported by the transistor, achieveing a
+may be higher than the level supported by the transistor, achieving a
level-shift to the higher VDD.
Integrated electronics often have an output driver stage in the form of a CMOS
@@ -382,7 +382,7 @@ Real-Time compliance for GPIO IRQ chips
Any provider of irqchips needs to be carefully tailored to support Real Time
preemption. It is desirable that all irqchips in the GPIO subsystem keep this
-in mind and does the proper testing to assure they are real time-enabled.
+in mind and do the proper testing to assure they are real time-enabled.
So, pay attention on above " RT_FULL:" notes, please.
The following is a checklist to follow when preparing a driver for real
time-compliance:
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 6d8352c0f354..5d04296f5ce0 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -17,7 +17,9 @@ available subsections can be seen below.
basics
infrastructure
pm/index
+ clk
device-io
+ device_connection
dma-buf
device_link
message-based
diff --git a/Documentation/driver-api/uio-howto.rst b/Documentation/driver-api/uio-howto.rst
index 92056c20e070..fb2eb73be4a3 100644
--- a/Documentation/driver-api/uio-howto.rst
+++ b/Documentation/driver-api/uio-howto.rst
@@ -711,7 +711,8 @@ The vmbus device regions are mapped into uio device resources:
If a subchannel is created by a request to host, then the uio_hv_generic
device driver will create a sysfs binary file for the per-channel ring buffer.
-For example:
+For example::
+
/sys/bus/vmbus/devices/3811fe4d-0fa0-4b62-981a-74fc1084c757/channels/21/ring
Further information
diff --git a/Documentation/features/lib/strncasecmp/arch-support.txt b/Documentation/features/core/cBPF-JIT/arch-support.txt
index 4f3a6a0e4e68..90459cdde314 100644
--- a/Documentation/features/lib/strncasecmp/arch-support.txt
+++ b/Documentation/features/core/cBPF-JIT/arch-support.txt
@@ -1,7 +1,7 @@
#
-# Feature name: strncasecmp
-# Kconfig: __HAVE_ARCH_STRNCASECMP
-# description: arch provides an optimized strncasecmp() function
+# Feature name: cBPF-JIT
+# Kconfig: HAVE_CBPF_JIT
+# description: arch supports cBPF JIT optimizations
#
-----------------------
| arch |status|
@@ -16,14 +16,16 @@
| ia64: | TODO |
| m68k: | TODO |
| microblaze: | TODO |
- | mips: | TODO |
+ | mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
- | powerpc: | TODO |
+ | powerpc: | ok |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
- | sparc: | TODO |
+ | sparc: | ok |
| um: | TODO |
| unicore32: | TODO |
| x86: | TODO |
diff --git a/Documentation/features/core/BPF-JIT/arch-support.txt b/Documentation/features/core/eBPF-JIT/arch-support.txt
index 0b96b4e1e7d4..c90a0382fe66 100644
--- a/Documentation/features/core/BPF-JIT/arch-support.txt
+++ b/Documentation/features/core/eBPF-JIT/arch-support.txt
@@ -1,7 +1,7 @@
#
-# Feature name: BPF-JIT
-# Kconfig: HAVE_BPF_JIT
-# description: arch supports BPF JIT optimizations
+# Feature name: eBPF-JIT
+# Kconfig: HAVE_EBPF_JIT
+# description: arch supports eBPF JIT optimizations
#
-----------------------
| arch |status|
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | TODO |
| sparc: | ok |
diff --git a/Documentation/features/core/generic-idle-thread/arch-support.txt b/Documentation/features/core/generic-idle-thread/arch-support.txt
index 372a2b18a617..0ef6acdb991c 100644
--- a/Documentation/features/core/generic-idle-thread/arch-support.txt
+++ b/Documentation/features/core/generic-idle-thread/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
- | openrisc: | TODO |
+ | openrisc: | ok |
| parisc: | ok |
| powerpc: | ok |
+ | riscv: | ok |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/features/core/jump-labels/arch-support.txt b/Documentation/features/core/jump-labels/arch-support.txt
index ad97217b003b..27cbd63abfd2 100644
--- a/Documentation/features/core/jump-labels/arch-support.txt
+++ b/Documentation/features/core/jump-labels/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | TODO |
| sparc: | ok |
diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt
index 36ee7bef5d18..f44c274e40ed 100644
--- a/Documentation/features/core/tracehook/arch-support.txt
+++ b/Documentation/features/core/tracehook/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | ok |
| nios2: | ok |
| openrisc: | ok |
| parisc: | ok |
| powerpc: | ok |
+ | riscv: | ok |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt
index f5c99fa576d3..282ecc8ea1da 100644
--- a/Documentation/features/debug/KASAN/arch-support.txt
+++ b/Documentation/features/debug/KASAN/arch-support.txt
@@ -17,15 +17,17 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
| sparc: | TODO |
| um: | TODO |
| unicore32: | TODO |
- | x86: | ok | 64-bit only
+ | x86: | ok |
| xtensa: | ok |
-----------------------
diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt
index 5170a9934843..01b2b3004e0a 100644
--- a/Documentation/features/debug/gcov-profile-all/arch-support.txt
+++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | ok |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | ok |
| sparc: | TODO |
diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt
index 13b6e994ae1f..3b4dff22329f 100644
--- a/Documentation/features/debug/kgdb/arch-support.txt
+++ b/Documentation/features/debug/kgdb/arch-support.txt
@@ -11,16 +11,18 @@
| arm: | ok |
| arm64: | ok |
| c6x: | TODO |
- | h8300: | TODO |
+ | h8300: | ok |
| hexagon: | ok |
| ia64: | TODO |
| m68k: | TODO |
| microblaze: | ok |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | ok |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
index 419bb38820e7..7e963d0ae646 100644
--- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
+++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/debug/kprobes/arch-support.txt b/Documentation/features/debug/kprobes/arch-support.txt
index 52b3ace0a030..4ada027faf16 100644
--- a/Documentation/features/debug/kprobes/arch-support.txt
+++ b/Documentation/features/debug/kprobes/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | ok |
| arm: | ok |
- | arm64: | TODO |
+ | arm64: | ok |
| c6x: | TODO |
| h8300: | TODO |
| hexagon: | TODO |
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | ok |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/features/debug/kretprobes/arch-support.txt b/Documentation/features/debug/kretprobes/arch-support.txt
index 180d24419518..044e13fcca5d 100644
--- a/Documentation/features/debug/kretprobes/arch-support.txt
+++ b/Documentation/features/debug/kretprobes/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | ok |
| arm: | ok |
- | arm64: | TODO |
+ | arm64: | ok |
| c6x: | TODO |
| h8300: | TODO |
| hexagon: | TODO |
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/features/debug/optprobes/arch-support.txt b/Documentation/features/debug/optprobes/arch-support.txt
index 0a1241f45e41..dce7669c918f 100644
--- a/Documentation/features/debug/optprobes/arch-support.txt
+++ b/Documentation/features/debug/optprobes/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
- | powerpc: | TODO |
+ | powerpc: | ok |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt
index 570019572383..74b89a9c8b3a 100644
--- a/Documentation/features/debug/stackprotector/arch-support.txt
+++ b/Documentation/features/debug/stackprotector/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | ok |
| sparc: | TODO |
diff --git a/Documentation/features/debug/uprobes/arch-support.txt b/Documentation/features/debug/uprobes/arch-support.txt
index 0b8d922eb799..1a3f9d3229bf 100644
--- a/Documentation/features/debug/uprobes/arch-support.txt
+++ b/Documentation/features/debug/uprobes/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | TODO |
| arm: | ok |
- | arm64: | TODO |
+ | arm64: | ok |
| c6x: | TODO |
| h8300: | TODO |
| hexagon: | TODO |
@@ -17,13 +17,15 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | TODO |
- | sparc: | TODO |
+ | sparc: | ok |
| um: | TODO |
| unicore32: | TODO |
| x86: | ok |
diff --git a/Documentation/features/debug/user-ret-profiler/arch-support.txt b/Documentation/features/debug/user-ret-profiler/arch-support.txt
index 13852ae62e9e..1d78d1069a5f 100644
--- a/Documentation/features/debug/user-ret-profiler/arch-support.txt
+++ b/Documentation/features/debug/user-ret-profiler/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/io/dma-api-debug/arch-support.txt b/Documentation/features/io/dma-api-debug/arch-support.txt
deleted file mode 100644
index e438ed675623..000000000000
--- a/Documentation/features/io/dma-api-debug/arch-support.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# Feature name: dma-api-debug
-# Kconfig: HAVE_DMA_API_DEBUG
-# description: arch supports DMA debug facilities
-#
- -----------------------
- | arch |status|
- -----------------------
- | alpha: | TODO |
- | arc: | TODO |
- | arm: | ok |
- | arm64: | ok |
- | c6x: | ok |
- | h8300: | TODO |
- | hexagon: | TODO |
- | ia64: | ok |
- | m68k: | TODO |
- | microblaze: | ok |
- | mips: | ok |
- | nios2: | TODO |
- | openrisc: | TODO |
- | parisc: | TODO |
- | powerpc: | ok |
- | s390: | ok |
- | sh: | ok |
- | sparc: | ok |
- | um: | TODO |
- | unicore32: | TODO |
- | x86: | ok |
- | xtensa: | ok |
- -----------------------
diff --git a/Documentation/features/io/dma-contiguous/arch-support.txt b/Documentation/features/io/dma-contiguous/arch-support.txt
index 47f64a433df0..30c072d2b67c 100644
--- a/Documentation/features/io/dma-contiguous/arch-support.txt
+++ b/Documentation/features/io/dma-contiguous/arch-support.txt
@@ -17,11 +17,13 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | TODO |
- | s390: | TODO |
+ | riscv: | ok |
+ | s390: | ok |
| sh: | TODO |
| sparc: | TODO |
| um: | TODO |
diff --git a/Documentation/features/io/sg-chain/arch-support.txt b/Documentation/features/io/sg-chain/arch-support.txt
index 07f357fadbff..6554f0372c3f 100644
--- a/Documentation/features/io/sg-chain/arch-support.txt
+++ b/Documentation/features/io/sg-chain/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | TODO |
| sparc: | ok |
diff --git a/Documentation/features/locking/cmpxchg-local/arch-support.txt b/Documentation/features/locking/cmpxchg-local/arch-support.txt
index 482a0b09d1f8..51704a2dc8d1 100644
--- a/Documentation/features/locking/cmpxchg-local/arch-support.txt
+++ b/Documentation/features/locking/cmpxchg-local/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | TODO |
| arm: | TODO |
- | arm64: | TODO |
+ | arm64: | ok |
| c6x: | TODO |
| h8300: | TODO |
| hexagon: | TODO |
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | ok |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/locking/lockdep/arch-support.txt b/Documentation/features/locking/lockdep/arch-support.txt
index bb35c5ba6286..bd39c5edd460 100644
--- a/Documentation/features/locking/lockdep/arch-support.txt
+++ b/Documentation/features/locking/lockdep/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | ok |
| mips: | ok |
+ | nds32: | ok |
| nios2: | TODO |
- | openrisc: | TODO |
+ | openrisc: | ok |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/features/locking/queued-rwlocks/arch-support.txt b/Documentation/features/locking/queued-rwlocks/arch-support.txt
index 627e9a6b2db9..da7aff3bee0b 100644
--- a/Documentation/features/locking/queued-rwlocks/arch-support.txt
+++ b/Documentation/features/locking/queued-rwlocks/arch-support.txt
@@ -9,21 +9,23 @@
| alpha: | TODO |
| arc: | TODO |
| arm: | TODO |
- | arm64: | TODO |
+ | arm64: | ok |
| c6x: | TODO |
| h8300: | TODO |
| hexagon: | TODO |
| ia64: | TODO |
| m68k: | TODO |
| microblaze: | TODO |
- | mips: | TODO |
+ | mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
- | openrisc: | TODO |
+ | openrisc: | ok |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
- | sparc: | TODO |
+ | sparc: | ok |
| um: | TODO |
| unicore32: | TODO |
| x86: | ok |
diff --git a/Documentation/features/locking/queued-spinlocks/arch-support.txt b/Documentation/features/locking/queued-spinlocks/arch-support.txt
index 9edda216cdfb..478e9101322c 100644
--- a/Documentation/features/locking/queued-spinlocks/arch-support.txt
+++ b/Documentation/features/locking/queued-spinlocks/arch-support.txt
@@ -16,14 +16,16 @@
| ia64: | TODO |
| m68k: | TODO |
| microblaze: | TODO |
- | mips: | TODO |
+ | mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
- | openrisc: | TODO |
+ | openrisc: | ok |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
- | sparc: | TODO |
+ | sparc: | ok |
| um: | TODO |
| unicore32: | TODO |
| x86: | ok |
diff --git a/Documentation/features/locking/rwsem-optimized/arch-support.txt b/Documentation/features/locking/rwsem-optimized/arch-support.txt
index 8d9afb10b16e..e54b1f1a8091 100644
--- a/Documentation/features/locking/rwsem-optimized/arch-support.txt
+++ b/Documentation/features/locking/rwsem-optimized/arch-support.txt
@@ -1,6 +1,6 @@
#
# Feature name: rwsem-optimized
-# Kconfig: Optimized asm/rwsem.h
+# Kconfig: !RWSEM_GENERIC_SPINLOCK
# description: arch provides optimized rwsem APIs
#
-----------------------
@@ -8,8 +8,8 @@
-----------------------
| alpha: | ok |
| arc: | TODO |
- | arm: | TODO |
- | arm64: | TODO |
+ | arm: | ok |
+ | arm64: | ok |
| c6x: | TODO |
| h8300: | TODO |
| hexagon: | TODO |
@@ -17,14 +17,16 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
- | um: | TODO |
+ | um: | ok |
| unicore32: | TODO |
| x86: | ok |
| xtensa: | ok |
diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt b/Documentation/features/perf/kprobes-event/arch-support.txt
index d01239ee34b3..7331402d1887 100644
--- a/Documentation/features/perf/kprobes-event/arch-support.txt
+++ b/Documentation/features/perf/kprobes-event/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | TODO |
| arm: | ok |
- | arm64: | TODO |
+ | arm64: | ok |
| c6x: | TODO |
| h8300: | TODO |
| hexagon: | ok |
@@ -17,13 +17,15 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | ok |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | ok |
- | sparc: | TODO |
+ | sparc: | ok |
| um: | TODO |
| unicore32: | TODO |
| x86: | ok |
diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt
index 458faba5311a..53feeee6cdad 100644
--- a/Documentation/features/perf/perf-regs/arch-support.txt
+++ b/Documentation/features/perf/perf-regs/arch-support.txt
@@ -17,11 +17,13 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
- | s390: | TODO |
+ | riscv: | TODO |
+ | s390: | ok |
| sh: | TODO |
| sparc: | TODO |
| um: | TODO |
diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt
index 545d01c69c88..16164348e0ea 100644
--- a/Documentation/features/perf/perf-stackdump/arch-support.txt
+++ b/Documentation/features/perf/perf-stackdump/arch-support.txt
@@ -17,11 +17,13 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
- | s390: | TODO |
+ | riscv: | TODO |
+ | s390: | ok |
| sh: | TODO |
| sparc: | TODO |
| um: | TODO |
diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index 85a6c9d4571c..dbdf62907703 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -40,10 +40,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/sched/numa-balancing/arch-support.txt b/Documentation/features/sched/numa-balancing/arch-support.txt
index 347508863872..c68bb2c2cb62 100644
--- a/Documentation/features/sched/numa-balancing/arch-support.txt
+++ b/Documentation/features/sched/numa-balancing/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | .. |
| arm: | .. |
- | arm64: | .. |
+ | arm64: | ok |
| c6x: | .. |
| h8300: | .. |
| hexagon: | .. |
@@ -17,11 +17,13 @@
| m68k: | .. |
| microblaze: | .. |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | .. |
| openrisc: | .. |
| parisc: | .. |
| powerpc: | ok |
- | s390: | .. |
+ | riscv: | TODO |
+ | s390: | ok |
| sh: | .. |
| sparc: | TODO |
| um: | .. |
diff --git a/Documentation/features/scripts/features-refresh.sh b/Documentation/features/scripts/features-refresh.sh
new file mode 100755
index 000000000000..9e72d38a0720
--- /dev/null
+++ b/Documentation/features/scripts/features-refresh.sh
@@ -0,0 +1,98 @@
+#
+# Small script that refreshes the kernel feature support status in place.
+#
+
+for F_FILE in Documentation/features/*/*/arch-support.txt; do
+ F=$(grep "^# Kconfig:" "$F_FILE" | cut -c26-)
+
+ #
+ # Each feature F is identified by a pair (O, K), where 'O' can
+ # be either the empty string (for 'nop') or "not" (the logical
+ # negation operator '!'); other operators are not supported.
+ #
+ O=""
+ K=$F
+ if [[ "$F" == !* ]]; then
+ O="not"
+ K=$(echo $F | sed -e 's/^!//g')
+ fi
+
+ #
+ # F := (O, K) is 'valid' iff there is a Kconfig file (for some
+ # arch) which contains K.
+ #
+ # Notice that this definition entails an 'asymmetry' between
+ # the case 'O = ""' and the case 'O = "not"'. E.g., F may be
+ # _invalid_ if:
+ #
+ # [case 'O = ""']
+ # 1) no arch provides support for F,
+ # 2) K does not exist (e.g., it was renamed/mis-typed);
+ #
+ # [case 'O = "not"']
+ # 3) all archs provide support for F,
+ # 4) as in (2).
+ #
+ # The rationale for adopting this definition (and, thus, for
+ # keeping the asymmetry) is:
+ #
+ # We want to be able to 'detect' (2) (or (4)).
+ #
+ # (1) and (3) may further warn the developers about the fact
+ # that K can be removed.
+ #
+ F_VALID="false"
+ for ARCH_DIR in arch/*/; do
+ K_FILES=$(find $ARCH_DIR -name "Kconfig*")
+ K_GREP=$(grep "$K" $K_FILES)
+ if [ ! -z "$K_GREP" ]; then
+ F_VALID="true"
+ break
+ fi
+ done
+ if [ "$F_VALID" = "false" ]; then
+ printf "WARNING: '%s' is not a valid Kconfig\n" "$F"
+ fi
+
+ T_FILE="$F_FILE.tmp"
+ grep "^#" $F_FILE > $T_FILE
+ echo " -----------------------" >> $T_FILE
+ echo " | arch |status|" >> $T_FILE
+ echo " -----------------------" >> $T_FILE
+ for ARCH_DIR in arch/*/; do
+ ARCH=$(echo $ARCH_DIR | sed -e 's/arch//g' | sed -e 's/\///g')
+ K_FILES=$(find $ARCH_DIR -name "Kconfig*")
+ K_GREP=$(grep "$K" $K_FILES)
+ #
+ # Arch support status values for (O, K) are updated according
+ # to the following rules.
+ #
+ # - ("", K) is 'supported by a given arch', if there is a
+ # Kconfig file for that arch which contains K;
+ #
+ # - ("not", K) is 'supported by a given arch', if there is
+ # no Kconfig file for that arch which contains K;
+ #
+ # - otherwise: preserve the previous status value (if any),
+ # default to 'not yet supported'.
+ #
+ # Notice that, according these rules, invalid features may be
+ # updated/modified.
+ #
+ if [ "$O" = "" ] && [ ! -z "$K_GREP" ]; then
+ printf " |%12s: | ok |\n" "$ARCH" >> $T_FILE
+ elif [ "$O" = "not" ] && [ -z "$K_GREP" ]; then
+ printf " |%12s: | ok |\n" "$ARCH" >> $T_FILE
+ else
+ S=$(grep -v "^#" "$F_FILE" | grep " $ARCH:")
+ if [ ! -z "$S" ]; then
+ echo "$S" >> $T_FILE
+ else
+ printf " |%12s: | TODO |\n" "$ARCH" \
+ >> $T_FILE
+ fi
+ fi
+ done
+ echo " -----------------------" >> $T_FILE
+ mv $T_FILE $F_FILE
+done
diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt
index e4fad58a05e5..d4271b493b41 100644
--- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt
+++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
- | parisc: | TODO |
- | powerpc: | TODO |
+ | parisc: | ok |
+ | powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/time/arch-tick-broadcast/arch-support.txt b/Documentation/features/time/arch-tick-broadcast/arch-support.txt
index 8052904b25fc..83d9e68462bb 100644
--- a/Documentation/features/time/arch-tick-broadcast/arch-support.txt
+++ b/Documentation/features/time/arch-tick-broadcast/arch-support.txt
@@ -17,12 +17,14 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | TODO |
- | sh: | TODO |
+ | sh: | ok |
| sparc: | TODO |
| um: | TODO |
| unicore32: | TODO |
diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt
index 7c76b946297e..3d4908fce6da 100644
--- a/Documentation/features/time/clockevents/arch-support.txt
+++ b/Documentation/features/time/clockevents/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | ok |
| microblaze: | ok |
| mips: | ok |
+ | nds32: | ok |
| nios2: | ok |
| openrisc: | ok |
- | parisc: | TODO |
+ | parisc: | ok |
| powerpc: | ok |
+ | riscv: | ok |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt
index 9433b3e523b3..c29974afffaa 100644
--- a/Documentation/features/time/context-tracking/arch-support.txt
+++ b/Documentation/features/time/context-tracking/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
| sparc: | ok |
diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt
index 212dde0b578c..8d73c463ec27 100644
--- a/Documentation/features/time/irq-time-acct/arch-support.txt
+++ b/Documentation/features/time/irq-time-acct/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | .. |
- | powerpc: | .. |
+ | powerpc: | ok |
+ | riscv: | TODO |
| s390: | .. |
| sh: | TODO |
| sparc: | .. |
diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt
index 4074028f72f7..e7c6ea6b8fb3 100644
--- a/Documentation/features/time/modern-timekeeping/arch-support.txt
+++ b/Documentation/features/time/modern-timekeeping/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | ok |
| mips: | ok |
+ | nds32: | ok |
| nios2: | ok |
| openrisc: | ok |
| parisc: | ok |
| powerpc: | ok |
+ | riscv: | ok |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/features/time/virt-cpuacct/arch-support.txt b/Documentation/features/time/virt-cpuacct/arch-support.txt
index a394d8820517..4646457461cf 100644
--- a/Documentation/features/time/virt-cpuacct/arch-support.txt
+++ b/Documentation/features/time/virt-cpuacct/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | ok |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | TODO |
| sparc: | ok |
diff --git a/Documentation/features/vm/ELF-ASLR/arch-support.txt b/Documentation/features/vm/ELF-ASLR/arch-support.txt
index 082f93d5b40e..1f71d090ff2c 100644
--- a/Documentation/features/vm/ELF-ASLR/arch-support.txt
+++ b/Documentation/features/vm/ELF-ASLR/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
- | parisc: | TODO |
+ | parisc: | ok |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt
index 605e0abb756d..fbd5aa463b0a 100644
--- a/Documentation/features/vm/PG_uncached/arch-support.txt
+++ b/Documentation/features/vm/PG_uncached/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/vm/THP/arch-support.txt b/Documentation/features/vm/THP/arch-support.txt
index 7a8eb0bd5ca8..5d7ecc378f29 100644
--- a/Documentation/features/vm/THP/arch-support.txt
+++ b/Documentation/features/vm/THP/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | .. |
| microblaze: | .. |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | .. |
| openrisc: | .. |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | .. |
| sparc: | ok |
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
index 35fb99b2b3ea..f7af9678eb66 100644
--- a/Documentation/features/vm/TLB/arch-support.txt
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | .. |
| microblaze: | .. |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | .. |
| openrisc: | .. |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt
index ed8b943ad8fc..d0713ccc7117 100644
--- a/Documentation/features/vm/huge-vmap/arch-support.txt
+++ b/Documentation/features/vm/huge-vmap/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | TODO |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | TODO |
| sparc: | TODO |
diff --git a/Documentation/features/vm/ioremap_prot/arch-support.txt b/Documentation/features/vm/ioremap_prot/arch-support.txt
index 589947bdf0a8..8527601a3739 100644
--- a/Documentation/features/vm/ioremap_prot/arch-support.txt
+++ b/Documentation/features/vm/ioremap_prot/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | TODO |
| sh: | ok |
| sparc: | TODO |
diff --git a/Documentation/features/vm/numa-memblock/arch-support.txt b/Documentation/features/vm/numa-memblock/arch-support.txt
index 8b8bea0318a0..1a988052cd24 100644
--- a/Documentation/features/vm/numa-memblock/arch-support.txt
+++ b/Documentation/features/vm/numa-memblock/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | .. |
| arm: | .. |
- | arm64: | .. |
+ | arm64: | ok |
| c6x: | .. |
| h8300: | .. |
| hexagon: | .. |
@@ -17,10 +17,12 @@
| m68k: | .. |
| microblaze: | ok |
| mips: | ok |
+ | nds32: | TODO |
| nios2: | .. |
| openrisc: | .. |
| parisc: | .. |
| powerpc: | ok |
+ | riscv: | ok |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt
index 055004f467d2..6a608a6dcf71 100644
--- a/Documentation/features/vm/pte_special/arch-support.txt
+++ b/Documentation/features/vm/pte_special/arch-support.txt
@@ -17,10 +17,12 @@
| m68k: | TODO |
| microblaze: | TODO |
| mips: | TODO |
+ | nds32: | TODO |
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
+ | riscv: | TODO |
| s390: | ok |
| sh: | ok |
| sparc: | ok |
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 15853d522941..2c391338c675 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -440,7 +440,9 @@ prototypes:
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
- unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ __poll_t (*poll) (struct file *, struct poll_table_struct *);
+ struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+ __poll_t (*poll_mask) (struct file *, __poll_t);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
@@ -471,7 +473,7 @@ prototypes:
};
locking rules:
- All may block.
+ All except for ->poll_mask may block.
->llseek() locking has moved from llseek to the individual llseek
implementations. If your fs is not using generic_file_llseek, you
@@ -503,6 +505,9 @@ in sys_read() and friends.
the lease within the individual filesystem to record the result of the
operation
+->poll_mask can be called with or without the waitqueue lock for the waitqueue
+returned from ->get_poll_head.
+
--------------------------- dquot_operations -------------------------------
prototypes:
int (*write_dquot) (struct dquot *);
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2a84bb334894..520f6a84cf50 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -515,7 +515,8 @@ guarantees:
The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
bits on both physical and virtual pages associated with a process, and the
-soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
+soft-dirty bit on pte (see Documentation/admin-guide/mm/soft-dirty.rst
+for details).
To clear the bits for all the pages associated with the process
> echo 1 > /proc/PID/clear_refs
@@ -536,7 +537,8 @@ Any other value written to /proc/PID/clear_refs will have no effect.
The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
using /proc/kpageflags and number of times a page is mapped using
-/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt.
+/proc/kpagecount. For detailed explanation, see
+Documentation/admin-guide/mm/pagemap.rst.
The /proc/pid/numa_maps is an extension based on maps, showing the memory
locality and binding policy, as well as the memory usage (in pages) of
@@ -564,7 +566,7 @@ address policy mapping details
Where:
"address" is the starting address for the mapping;
-"policy" reports the NUMA memory policy set for the mapping (see vm/numa_memory_policy.txt);
+"policy" reports the NUMA memory policy set for the mapping (see Documentation/admin-guide/mm/numa_memory_policy.rst);
"mapping details" summarizes mapping data such as mapping type, page usage counters,
node locality page counters (N0 == node0, N1 == node1, ...) and the kernel page
size, in KB, that is backing the mapping up.
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index a85355cf85f4..d06e9a59a9f4 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -105,8 +105,9 @@ policy for the file will revert to "default" policy.
NUMA memory allocation policies have optional flags that can be used in
conjunction with their modes. These optional flags can be specified
when tmpfs is mounted by appending them to the mode before the NodeList.
-See Documentation/vm/numa_memory_policy.txt for a list of all available
-memory allocation policy mode flags and their effect on memory policy.
+See Documentation/admin-guide/mm/numa_memory_policy.rst for a list of
+all available memory allocation policy mode flags and their effect on
+memory policy.
=static is equivalent to MPOL_F_STATIC_NODES
=relative is equivalent to MPOL_F_RELATIVE_NODES
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5fd325df59e2..829a7b7857a4 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -856,7 +856,9 @@ struct file_operations {
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
- unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ __poll_t (*poll) (struct file *, struct poll_table_struct *);
+ struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+ __poll_t (*poll_mask) (struct file *, __poll_t);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
@@ -901,6 +903,17 @@ otherwise noted.
activity on this file and (optionally) go to sleep until there
is activity. Called by the select(2) and poll(2) system calls
+ get_poll_head: Returns the struct wait_queue_head that callers can
+ wait on. Callers need to check the returned events using ->poll_mask
+ once woken. Can return NULL to indicate polling is not supported,
+ or any error code using the ERR_PTR convention to indicate that a
+ grave error occured and ->poll_mask shall not be called.
+
+ poll_mask: return the mask of EPOLL* values describing the file descriptor
+ state. Called either before going to sleep on the waitqueue returned by
+ get_poll_head, or after it has been woken. If ->get_poll_head and
+ ->poll_mask are implemented ->poll does not need to be implement.
+
unlocked_ioctl: called by the ioctl(2) system call.
compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
diff --git a/Documentation/hwmon/hwmon-kernel-api.txt b/Documentation/hwmon/hwmon-kernel-api.txt
index 53a806696c64..eb7a78aebb38 100644
--- a/Documentation/hwmon/hwmon-kernel-api.txt
+++ b/Documentation/hwmon/hwmon-kernel-api.txt
@@ -71,7 +71,8 @@ hwmon_device_register_with_info is the most comprehensive and preferred means
to register a hardware monitoring device. It creates the standard sysfs
attributes in the hardware monitoring core, letting the driver focus on reading
from and writing to the chip instead of having to bother with sysfs attributes.
-Its parameters are described in more detail below.
+The parent device parameter cannot be NULL with non-NULL chip info. Its
+parameters are described in more detail below.
devm_hwmon_device_register_with_info is similar to
hwmon_device_register_with_info. However, it is device managed, meaning the
diff --git a/Documentation/hwmon/ltc2990 b/Documentation/hwmon/ltc2990
index c25211e90bdc..3ed68f676c0f 100644
--- a/Documentation/hwmon/ltc2990
+++ b/Documentation/hwmon/ltc2990
@@ -8,6 +8,7 @@ Supported chips:
Datasheet: http://www.linear.com/product/ltc2990
Author: Mike Looijmans <mike.looijmans@topic.nl>
+ Tom Levens <tom.levens@cern.ch>
Description
@@ -16,10 +17,8 @@ Description
LTC2990 is a Quad I2C Voltage, Current and Temperature Monitor.
The chip's inputs can measure 4 voltages, or two inputs together (1+2 and 3+4)
can be combined to measure a differential voltage, which is typically used to
-measure current through a series resistor, or a temperature.
-
-This driver currently uses the 2x differential mode only. In order to support
-other modes, the driver will need to be expanded.
+measure current through a series resistor, or a temperature with an external
+diode.
Usage Notes
@@ -32,12 +31,19 @@ devices explicitly.
Sysfs attributes
----------------
+in0_input Voltage at Vcc pin in millivolt (range 2.5V to 5V)
+temp1_input Internal chip temperature in millidegrees Celcius
+
+A subset of the following attributes are visible, depending on the measurement
+mode of the chip.
+
+in[1-4]_input Voltage at V[1-4] pin in millivolt
+temp2_input External temperature sensor TR1 in millidegrees Celcius
+temp3_input External temperature sensor TR2 in millidegrees Celcius
+curr1_input Current in mA across V1-V2 assuming a 1mOhm sense resistor
+curr2_input Current in mA across V3-V4 assuming a 1mOhm sense resistor
+
The "curr*_input" measurements actually report the voltage drop across the
input pins in microvolts. This is equivalent to the current through a 1mOhm
sense resistor. Divide the reported value by the actual sense resistor value
in mOhm to get the actual value.
-
-in0_input Voltage at Vcc pin in millivolt (range 2.5V to 5V)
-temp1_input Internal chip temperature in millidegrees Celcius
-curr1_input Current in mA across v1-v2 assuming a 1mOhm sense resistor.
-curr2_input Current in mA across v3-v4 assuming a 1mOhm sense resistor.
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 3b99ab931d41..fdc585703498 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -45,7 +45,7 @@ the kernel interface as seen by application developers.
.. toctree::
:maxdepth: 2
- userspace-api/index
+ userspace-api/index
Introduction to kernel development
@@ -89,6 +89,7 @@ needed).
sound/index
crypto/index
filesystems/index
+ vm/index
Architecture-specific documentation
-----------------------------------
diff --git a/Documentation/ioctl/botching-up-ioctls.txt b/Documentation/ioctl/botching-up-ioctls.txt
index d02cfb48901c..883fb034bd04 100644
--- a/Documentation/ioctl/botching-up-ioctls.txt
+++ b/Documentation/ioctl/botching-up-ioctls.txt
@@ -73,7 +73,9 @@ will have a second iteration or at least an extension for any given interface.
future extensions is going right down the gutters since someone will submit
an ioctl struct with random stack garbage in the yet unused parts. Which
then bakes in the ABI that those fields can never be used for anything else
- but garbage.
+ but garbage. This is also the reason why you must explicitly pad all
+ structures, even if you never use them in an array - the padding the compiler
+ might insert could contain garbage.
* Have simple testcases for all of the above.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 6dafc8085acc..a02d6bbfc9d0 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1920,9 +1920,6 @@ There are some more advanced barrier functions:
/* assign ownership */
desc->status = DEVICE_OWN;
- /* force memory to sync before notifying device via MMIO */
- wmb();
-
/* notify device of new descriptors */
writel(DESC_NOTIFY, doorbell);
}
@@ -1930,11 +1927,15 @@ There are some more advanced barrier functions:
The dma_rmb() allows us guarantee the device has released ownership
before we read the data from the descriptor, and the dma_wmb() allows
us to guarantee the data is written to the descriptor before the device
- can see it now has ownership. The wmb() is needed to guarantee that the
- cache coherent memory writes have completed before attempting a write to
- the cache incoherent MMIO region.
+ can see it now has ownership. Note that, when using writel(), a prior
+ wmb() is not needed to guarantee that the cache coherent memory writes
+ have completed before writing to the MMIO region. The cheaper
+ writel_relaxed() does not provide this guarantee and must not be used
+ here.
- See Documentation/DMA-API.txt for more information on consistent memory.
+ See the subsection "Kernel I/O barrier effects" for more information on
+ relaxed I/O accessors and the Documentation/DMA-API.txt file for more
+ information on consistent memory.
MMIO WRITE BARRIER
@@ -2903,7 +2904,7 @@ is discarded from the CPU's cache and reloaded. To deal with this, the
appropriate part of the kernel must invalidate the overlapping bits of the
cache on each CPU.
-See Documentation/cachetlb.txt for more information on cache management.
+See Documentation/core-api/cachetlb.rst for more information on cache management.
CACHE COHERENCY VS MMIO
@@ -3083,7 +3084,7 @@ CIRCULAR BUFFERS
Memory barriers can be used to implement circular buffering without the need
of a lock to serialise the producer with the consumer. See:
- Documentation/circular-buffers.txt
+ Documentation/core-api/circular-buffers.rst
for details.
diff --git a/Documentation/process/2.Process.rst b/Documentation/process/2.Process.rst
index ce5561bb3f8e..a9c46dd0706b 100644
--- a/Documentation/process/2.Process.rst
+++ b/Documentation/process/2.Process.rst
@@ -18,17 +18,17 @@ major kernel release happening every two or three months. The recent
release history looks like this:
====== =================
- 2.6.38 March 14, 2011
- 2.6.37 January 4, 2011
- 2.6.36 October 20, 2010
- 2.6.35 August 1, 2010
- 2.6.34 May 15, 2010
- 2.6.33 February 24, 2010
+ 4.11 April 30, 2017
+ 4.12 July 2, 2017
+ 4.13 September 3, 2017
+ 4.14 November 12, 2017
+ 4.15 January 28, 2018
+ 4.16 April 1, 2018
====== =================
-Every 2.6.x release is a major kernel release with new features, internal
-API changes, and more. A typical 2.6 release can contain nearly 10,000
-changesets with changes to several hundred thousand lines of code. 2.6 is
+Every 4.x release is a major kernel release with new features, internal
+API changes, and more. A typical 4.x release contain about 13,000
+changesets with changes to several hundred thousand lines of code. 4.x is
thus the leading edge of Linux kernel development; the kernel uses a
rolling development model which is continually integrating major changes.
@@ -70,20 +70,19 @@ will get up to somewhere between -rc6 and -rc9 before the kernel is
considered to be sufficiently stable and the final 2.6.x release is made.
At that point the whole process starts over again.
-As an example, here is how the 2.6.38 development cycle went (all dates in
-2011):
+As an example, here is how the 4.16 development cycle went (all dates in
+2018):
============== ===============================
- January 4 2.6.37 stable release
- January 18 2.6.38-rc1, merge window closes
- January 21 2.6.38-rc2
- February 1 2.6.38-rc3
- February 7 2.6.38-rc4
- February 15 2.6.38-rc5
- February 21 2.6.38-rc6
- March 1 2.6.38-rc7
- March 7 2.6.38-rc8
- March 14 2.6.38 stable release
+ January 28 4.15 stable release
+ February 11 4.16-rc1, merge window closes
+ February 18 4.16-rc2
+ February 25 4.16-rc3
+ March 4 4.16-rc4
+ March 11 4.16-rc5
+ March 18 4.16-rc6
+ March 25 4.16-rc7
+ April 1 4.17 stable release
============== ===============================
How do the developers decide when to close the development cycle and create
@@ -99,37 +98,42 @@ release is made. In the real world, this kind of perfection is hard to
achieve; there are just too many variables in a project of this size.
There comes a point where delaying the final release just makes the problem
worse; the pile of changes waiting for the next merge window will grow
-larger, creating even more regressions the next time around. So most 2.6.x
+larger, creating even more regressions the next time around. So most 4.x
kernels go out with a handful of known regressions though, hopefully, none
of them are serious.
Once a stable release is made, its ongoing maintenance is passed off to the
"stable team," currently consisting of Greg Kroah-Hartman. The stable team
-will release occasional updates to the stable release using the 2.6.x.y
+will release occasional updates to the stable release using the 4.x.y
numbering scheme. To be considered for an update release, a patch must (1)
fix a significant bug, and (2) already be merged into the mainline for the
next development kernel. Kernels will typically receive stable updates for
a little more than one development cycle past their initial release. So,
-for example, the 2.6.36 kernel's history looked like:
+for example, the 4.13 kernel's history looked like:
============== ===============================
- October 10 2.6.36 stable release
- November 22 2.6.36.1
- December 9 2.6.36.2
- January 7 2.6.36.3
- February 17 2.6.36.4
+ September 3 4.13 stable release
+ September 13 4.13.1
+ September 20 4.13.2
+ September 27 4.13.3
+ October 5 4.13.4
+ October 12 4.13.5
+ ... ...
+ November 24 4.13.16
============== ===============================
-2.6.36.4 was the final stable update for the 2.6.36 release.
+4.13.16 was the final stable update of the 4.13 release.
Some kernels are designated "long term" kernels; they will receive support
for a longer period. As of this writing, the current long term kernels
and their maintainers are:
- ====== ====================== ===========================
- 2.6.27 Willy Tarreau (Deep-frozen stable kernel)
- 2.6.32 Greg Kroah-Hartman
- 2.6.35 Andi Kleen (Embedded flag kernel)
+ ====== ====================== ==============================
+ 3.16 Ben Hutchings (very long-term stable kernel)
+ 4.1 Sasha Levin
+ 4.4 Greg Kroah-Hartman (very long-term stable kernel)
+ 4.9 Greg Kroah-Hartman
+ 4.14 Greg Kroah-Hartman
====== ====================== ===========================
The selection of a kernel for long-term support is purely a matter of a
diff --git a/Documentation/process/5.Posting.rst b/Documentation/process/5.Posting.rst
index c209d70da66f..c418c5d6cae4 100644
--- a/Documentation/process/5.Posting.rst
+++ b/Documentation/process/5.Posting.rst
@@ -10,8 +10,8 @@ of conventions and procedures which are used in the posting of patches;
following them will make life much easier for everybody involved. This
document will attempt to cover these expectations in reasonable detail;
more information can also be found in the files process/submitting-patches.rst,
-process/submitting-drivers.rst, and process/submit-checklist.rst in the kernel documentation
-directory.
+process/submitting-drivers.rst, and process/submit-checklist.rst in the kernel
+documentation directory.
When to post
@@ -198,8 +198,8 @@ pass it to diff with the "-X" option.
The tags mentioned above are used to describe how various developers have
been associated with the development of this patch. They are described in
-detail in the process/submitting-patches.rst document; what follows here is a brief
-summary. Each of these lines has the format:
+detail in the process/submitting-patches.rst document; what follows here is a
+brief summary. Each of these lines has the format:
::
@@ -210,8 +210,8 @@ The tags in common use are:
- Signed-off-by: this is a developer's certification that he or she has
the right to submit the patch for inclusion into the kernel. It is an
agreement to the Developer's Certificate of Origin, the full text of
- which can be found in Documentation/process/submitting-patches.rst. Code without a
- proper signoff cannot be merged into the mainline.
+ which can be found in Documentation/process/submitting-patches.rst. Code
+ without a proper signoff cannot be merged into the mainline.
- Co-developed-by: states that the patch was also created by another developer
along with the original author. This is useful at times when multiple
@@ -226,8 +226,8 @@ The tags in common use are:
it to work.
- Reviewed-by: the named developer has reviewed the patch for correctness;
- see the reviewer's statement in Documentation/process/submitting-patches.rst for more
- detail.
+ see the reviewer's statement in Documentation/process/submitting-patches.rst
+ for more detail.
- Reported-by: names a user who reported a problem which is fixed by this
patch; this tag is used to give credit to the (often underappreciated)
diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst
index 1c9fe657ed01..37bd0628b6ee 100644
--- a/Documentation/process/index.rst
+++ b/Documentation/process/index.rst
@@ -52,6 +52,7 @@ lack of a better place.
adding-syscalls
magic-number
volatile-considered-harmful
+ clang-format
.. only:: subproject and html
diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst
index b453561a7148..aff9b1a4d77b 100644
--- a/Documentation/process/maintainer-pgp-guide.rst
+++ b/Documentation/process/maintainer-pgp-guide.rst
@@ -219,7 +219,7 @@ Our goal is to protect your master key by moving it to offline media, so
if you only have a combined **[SC]** key, then you should create a separate
signing subkey::
- $ gpg --quick-add-key [fpr] ed25519 sign
+ $ gpg --quick-addkey [fpr] ed25519 sign
Remember to tell the keyservers about this change, so others can pull down
your new subkey::
@@ -450,11 +450,18 @@ functionality. There are several options available:
others. If you want to use ECC keys, your best bet among commercially
available devices is the Nitrokey Start.
+.. note::
+
+ If you are listed in MAINTAINERS or have an account at kernel.org,
+ you `qualify for a free Nitrokey Start`_ courtesy of The Linux
+ Foundation.
+
.. _`Nitrokey Start`: https://shop.nitrokey.com/shop/product/nitrokey-start-6
.. _`Nitrokey Pro`: https://shop.nitrokey.com/shop/product/nitrokey-pro-3
.. _`Yubikey 4`: https://www.yubico.com/product/yubikey-4-series/
.. _Gnuk: http://www.fsij.org/doc-gnuk/
.. _`LWN has a good review`: https://lwn.net/Articles/736231/
+.. _`qualify for a free Nitrokey Start`: https://www.kernel.org/nitrokey-digital-tokens-for-kernel-developers.html
Configure your smartcard device
-------------------------------
@@ -482,7 +489,7 @@ there are no convenient command-line switches::
You should set the user PIN (1), Admin PIN (3), and the Reset Code (4).
Please make sure to record and store these in a safe place -- especially
the Admin PIN and the Reset Code (which allows you to completely wipe
-the smartcard). You so rarely need to use the Admin PIN, that you will
+the smartcard). You so rarely need to use the Admin PIN, that you will
inevitably forget what it is if you do not record it.
Getting back to the main card menu, you can also set other values (such
@@ -494,6 +501,12 @@ additionally leak information about your smartcard should you lose it.
Despite having the name "PIN", neither the user PIN nor the admin
PIN on the card need to be numbers.
+.. warning::
+
+ Some devices may require that you move the subkeys onto the device
+ before you can change the passphrase. Please check the documentation
+ provided by the device manufacturer.
+
Move the subkeys to your smartcard
----------------------------------
@@ -655,6 +668,20 @@ want to import these changes back into your regular working directory::
$ gpg --export | gpg --homedir ~/.gnupg --import
$ unset GNUPGHOME
+Using gpg-agent over ssh
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can forward your gpg-agent over ssh if you need to sign tags or
+commits on a remote system. Please refer to the instructions provided
+on the GnuPG wiki:
+
+- `Agent Forwarding over SSH`_
+
+It works more smoothly if you can modify the sshd server settings on the
+remote end.
+
+.. _`Agent Forwarding over SSH`: https://wiki.gnupg.org/AgentForwarding
+
Using PGP with Git
==================
@@ -692,6 +719,7 @@ should be used (``[fpr]`` is the fingerprint of your key)::
tell git to always use it instead of the legacy ``gpg`` from version 1::
$ git config --global gpg.program gpg2
+ $ git config --global gpgv.program gpgv2
How to work with signed tags
----------------------------
@@ -731,6 +759,13 @@ If you are verifying someone else's git tag, then you will need to
import their PGP key. Please refer to the
":ref:`verify_identities`" section below.
+.. note::
+
+ If you get "``gpg: Can't check signature: unknown pubkey
+ algorithm``" error, you need to tell git to use gpgv2 for
+ verification, so it properly processes signatures made by ECC keys.
+ See instructions at the start of this section.
+
Configure git to always sign annotated tags
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst
index f7152ed565e5..908bb55be407 100644
--- a/Documentation/process/submitting-patches.rst
+++ b/Documentation/process/submitting-patches.rst
@@ -761,7 +761,7 @@ requests, especially from new, unknown developers. If in doubt you can use
the pull request as the cover letter for a normal posting of the patch
series, giving the maintainer the option of using either.
-A pull request should have [GIT] or [PULL] in the subject line. The
+A pull request should have [GIT PULL] in the subject line. The
request itself should include the repository name and the branch of
interest on a single line; it should look something like::
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index 8ce78f82ae23..b14e03ff3528 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -49,7 +49,7 @@ CONTENTS
2.1 Main algorithm
------------------
- SCHED_DEADLINE uses three parameters, named "runtime", "period", and
+ SCHED_DEADLINE [18] uses three parameters, named "runtime", "period", and
"deadline", to schedule tasks. A SCHED_DEADLINE task should receive
"runtime" microseconds of execution time every "period" microseconds, and
these "runtime" microseconds are available within "deadline" microseconds
@@ -117,6 +117,10 @@ CONTENTS
scheduling deadline = scheduling deadline + period
remaining runtime = remaining runtime + runtime
+ The SCHED_FLAG_DL_OVERRUN flag in sched_attr's sched_flags field allows a task
+ to get informed about runtime overruns through the delivery of SIGXCPU
+ signals.
+
2.2 Bandwidth reclaiming
------------------------
@@ -279,6 +283,19 @@ CONTENTS
running_bw is incremented.
+2.3 Energy-aware scheduling
+------------------------
+
+ When cpufreq's schedutil governor is selected, SCHED_DEADLINE implements the
+ GRUB-PA [19] algorithm, reducing the CPU operating frequency to the minimum
+ value that still allows to meet the deadlines. This behavior is currently
+ implemented only for ARM architectures.
+
+ A particular care must be taken in case the time needed for changing frequency
+ is of the same order of magnitude of the reservation period. In such cases,
+ setting a fixed CPU frequency results in a lower amount of deadline misses.
+
+
3. Scheduling Real-Time Tasks
=============================
@@ -505,6 +522,12 @@ CONTENTS
17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
Computing, 2016.
+ 18 - J. Lelli, C. Scordino, L. Abeni, D. Faggioli, Deadline scheduling in the
+ Linux kernel, Software: Practice and Experience, 46(6): 821-839, June
+ 2016.
+ 19 - C. Scordino, L. Abeni, J. Lelli, Energy-Aware Real-Time Scheduling in
+ the Linux Kernel, 33rd ACM/SIGAPP Symposium On Applied Computing (SAC
+ 2018), Pau, France, April 2018.
4. Bandwidth management
diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst
index 298a94a33f05..85492bfca530 100644
--- a/Documentation/security/index.rst
+++ b/Documentation/security/index.rst
@@ -9,5 +9,7 @@ Security Documentation
IMA-templates
keys/index
LSM
+ LSM-sctp
+ SELinux-sctp
self-protection
tpm/index
diff --git a/Documentation/sound/alsa-configuration.rst b/Documentation/sound/alsa-configuration.rst
index aed6b4fb8e46..ab5761148163 100644
--- a/Documentation/sound/alsa-configuration.rst
+++ b/Documentation/sound/alsa-configuration.rst
@@ -1062,7 +1062,7 @@ output (with ``--no-upload`` option) to kernel bugzilla or alsa-devel
ML (see the section `Links and Addresses`_).
``power_save`` and ``power_save_controller`` options are for power-saving
-mode. See powersave.txt for details.
+mode. See powersave.rst for details.
Note 2: If you get click noises on output, try the module option
``position_fix=1`` or ``2``. ``position_fix=1`` will use the SD_LPIB
@@ -1133,7 +1133,7 @@ line_outs_monitor
enable_monitor
Enable Analog Out on Channel 63/64 by default.
-See hdspm.txt for details.
+See hdspm.rst for details.
Module snd-ice1712
------------------
diff --git a/Documentation/sound/soc/codec.rst b/Documentation/sound/soc/codec.rst
index f87612b94812..240770ea761e 100644
--- a/Documentation/sound/soc/codec.rst
+++ b/Documentation/sound/soc/codec.rst
@@ -139,7 +139,7 @@ DAPM description
----------------
The Dynamic Audio Power Management description describes the codec power
components and their relationships and registers to the ASoC core.
-Please read dapm.txt for details of building the description.
+Please read dapm.rst for details of building the description.
Please also see the examples in other codec drivers.
diff --git a/Documentation/sound/soc/platform.rst b/Documentation/sound/soc/platform.rst
index d5574904d981..02c93a8b9c3b 100644
--- a/Documentation/sound/soc/platform.rst
+++ b/Documentation/sound/soc/platform.rst
@@ -66,7 +66,7 @@ Each SoC DAI driver must provide the following features:-
4. SYSCLK configuration
5. Suspend and resume (optional)
-Please see codec.txt for a description of items 1 - 4.
+Please see codec.rst for a description of items 1 - 4.
SoC DSP Drivers
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 17256f2ad919..697ef8c225df 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -515,7 +515,7 @@ nr_hugepages
Change the minimum size of the hugepage pool.
-See Documentation/vm/hugetlbpage.txt
+See Documentation/admin-guide/mm/hugetlbpage.rst
==============================================================
@@ -524,7 +524,7 @@ nr_overcommit_hugepages
Change the maximum size of the hugepage pool. The maximum is
nr_hugepages + nr_overcommit_hugepages.
-See Documentation/vm/hugetlbpage.txt
+See Documentation/admin-guide/mm/hugetlbpage.rst
==============================================================
@@ -667,7 +667,7 @@ and don't use much of it.
The default value is 0.
-See Documentation/vm/overcommit-accounting and
+See Documentation/vm/overcommit-accounting.rst and
mm/mmap.c::__vm_enough_memory() for more information.
==============================================================
diff --git a/Documentation/trace/coresight.txt b/Documentation/trace/coresight.txt
index 6f0120c3a4f1..1d74ad0202b6 100644
--- a/Documentation/trace/coresight.txt
+++ b/Documentation/trace/coresight.txt
@@ -187,13 +187,19 @@ that can be performed on them (see "struct coresight_ops"). The
specific to that component only. "Implementation defined" customisations are
expected to be accessed and controlled using those entries.
-Last but not least, "struct module *owner" is expected to be set to reflect
-the information carried in "THIS_MODULE".
How to use the tracer modules
-----------------------------
-Before trace collection can start, a coresight sink needs to be identify.
+There are two ways to use the Coresight framework: 1) using the perf cmd line
+tools and 2) interacting directly with the Coresight devices using the sysFS
+interface. Preference is given to the former as using the sysFS interface
+requires a deep understanding of the Coresight HW. The following sections
+provide details on using both methods.
+
+1) Using the sysFS interface:
+
+Before trace collection can start, a coresight sink needs to be identified.
There is no limit on the amount of sinks (nor sources) that can be enabled at
any given moment. As a generic operation, all device pertaining to the sink
class will have an "active" entry in sysfs:
@@ -298,42 +304,48 @@ Instruction 13570831 0x8026B584 E28DD00C false ADD
Instruction 0 0x8026B588 E8BD8000 true LDM sp!,{pc}
Timestamp Timestamp: 17107041535
-How to use the STM module
--------------------------
+2) Using perf framework:
-Using the System Trace Macrocell module is the same as the tracers - the only
-difference is that clients are driving the trace capture rather
-than the program flow through the code.
+Coresight tracers are represented using the Perf framework's Performance
+Monitoring Unit (PMU) abstraction. As such the perf framework takes charge of
+controlling when tracing gets enabled based on when the process of interest is
+scheduled. When configured in a system, Coresight PMUs will be listed when
+queried by the perf command line tool:
-As with any other CoreSight component, specifics about the STM tracer can be
-found in sysfs with more information on each entry being found in [1]:
+ linaro@linaro-nano:~$ ./perf list pmu
-root@genericarmv8:~# ls /sys/bus/coresight/devices/20100000.stm
-enable_source hwevent_select port_enable subsystem uevent
-hwevent_enable mgmt port_select traceid
-root@genericarmv8:~#
+ List of pre-defined events (to be used in -e):
-Like any other source a sink needs to be identified and the STM enabled before
-being used:
+ cs_etm// [Kernel PMU event]
-root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20010000.etf/enable_sink
-root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20100000.stm/enable_source
+ linaro@linaro-nano:~$
-From there user space applications can request and use channels using the devfs
-interface provided for that purpose by the generic STM API:
+Regardless of the number of tracers available in a system (usually equal to the
+amount of processor cores), the "cs_etm" PMU will be listed only once.
-root@genericarmv8:~# ls -l /dev/20100000.stm
-crw------- 1 root root 10, 61 Jan 3 18:11 /dev/20100000.stm
-root@genericarmv8:~#
+A Coresight PMU works the same way as any other PMU, i.e the name of the PMU is
+listed along with configuration options within forward slashes '/'. Since a
+Coresight system will typically have more than one sink, the name of the sink to
+work with needs to be specified as an event option. Names for sink to choose
+from are listed in sysFS under ($SYSFS)/bus/coresight/devices:
-Details on how to use the generic STM API can be found here [2].
+ root@linaro-nano:~# ls /sys/bus/coresight/devices/
+ 20010000.etf 20040000.funnel 20100000.stm 22040000.etm
+ 22140000.etm 230c0000.funnel 23240000.etm 20030000.tpiu
+ 20070000.etr 20120000.replicator 220c0000.funnel
+ 23040000.etm 23140000.etm 23340000.etm
-[1]. Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
-[2]. Documentation/trace/stm.txt
+ root@linaro-nano:~# perf record -e cs_etm/@20070000.etr/u --per-thread program
+The syntax within the forward slashes '/' is important. The '@' character
+tells the parser that a sink is about to be specified and that this is the sink
+to use for the trace session.
-Using perf tools
-----------------
+More information on the above and other example on how to use Coresight with
+the perf tools can be found in the "HOWTO.md" file of the openCSD gitHub
+repository [3].
+
+2.1) AutoFDO analysis using the perf tools:
perf can be used to record and analyze trace of programs.
@@ -381,3 +393,38 @@ sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tuto
$ taskset -c 2 ./sort_autofdo
Bubble sorting array of 30000 elements
5806 ms
+
+
+How to use the STM module
+-------------------------
+
+Using the System Trace Macrocell module is the same as the tracers - the only
+difference is that clients are driving the trace capture rather
+than the program flow through the code.
+
+As with any other CoreSight component, specifics about the STM tracer can be
+found in sysfs with more information on each entry being found in [1]:
+
+root@genericarmv8:~# ls /sys/bus/coresight/devices/20100000.stm
+enable_source hwevent_select port_enable subsystem uevent
+hwevent_enable mgmt port_select traceid
+root@genericarmv8:~#
+
+Like any other source a sink needs to be identified and the STM enabled before
+being used:
+
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20010000.etf/enable_sink
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20100000.stm/enable_source
+
+From there user space applications can request and use channels using the devfs
+interface provided for that purpose by the generic STM API:
+
+root@genericarmv8:~# ls -l /dev/20100000.stm
+crw------- 1 root root 10, 61 Jan 3 18:11 /dev/20100000.stm
+root@genericarmv8:~#
+
+Details on how to use the generic STM API can be found here [2].
+
+[1]. Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
+[2]. Documentation/trace/stm.txt
+[3]. https://github.com/Linaro/perf-opencsd
diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst
index 998a60a93015..00283b6dd101 100644
--- a/Documentation/trace/ftrace-uses.rst
+++ b/Documentation/trace/ftrace-uses.rst
@@ -12,7 +12,7 @@ Written for: 4.14
Introduction
============
-The ftrace infrastructure was originially created to attach callbacks to the
+The ftrace infrastructure was originally created to attach callbacks to the
beginning of functions in order to record and trace the flow of the kernel.
But callbacks to the start of a function can have other use cases. Either
for live kernel patching, or for security monitoring. This document describes
@@ -30,7 +30,7 @@ The ftrace context
This requires extra care to what can be done inside a callback. A callback
can be called outside the protective scope of RCU.
-The ftrace infrastructure has some protections agains recursions and RCU
+The ftrace infrastructure has some protections against recursions and RCU
but one must still be very careful how they use the callbacks.
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 67d9c38e95eb..6b80ac4bbaae 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -224,6 +224,8 @@ of ftrace. Here is a list of some of the key files:
has a side effect of enabling or disabling specific functions
to be traced. Echoing names of functions into this file
will limit the trace to only those functions.
+ This influences the tracers "function" and "function_graph"
+ and thus also function profiling (see "function_profile_enabled").
The functions listed in "available_filter_functions" are what
can be written into this file.
@@ -265,6 +267,8 @@ of ftrace. Here is a list of some of the key files:
Functions listed in this file will cause the function graph
tracer to only trace these functions and the functions that
they call. (See the section "dynamic ftrace" for more details).
+ Note, set_ftrace_filter and set_ftrace_notrace still affects
+ what functions are being traced.
set_graph_notrace:
@@ -277,7 +281,8 @@ of ftrace. Here is a list of some of the key files:
This lists the functions that ftrace has processed and can trace.
These are the function names that you can pass to
- "set_ftrace_filter" or "set_ftrace_notrace".
+ "set_ftrace_filter", "set_ftrace_notrace",
+ "set_graph_function", or "set_graph_notrace".
(See the section "dynamic ftrace" below for more details.)
dyn_ftrace_total_info:
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 0a0930ab4156..921739d00f69 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -36,6 +36,9 @@ Documentation/memory-barriers.txt
ë¶€ë¶„ë„ ìžˆê³ , ì˜ë„하진 않았지만 ì‚¬ëžŒì— ì˜í•´ 쓰였다보니 불완전한 ë¶€ë¶„ë„ ìžˆìŠµë‹ˆë‹¤.
ì´ ë¬¸ì„œëŠ” 리눅스ì—ì„œ 제공하는 다양한 메모리 ë°°ë¦¬ì–´ë“¤ì„ ì‚¬ìš©í•˜ê¸° 위한
안내서입니다만, 뭔가 ì´ìƒí•˜ë‹¤ 싶으면 (그런게 ë§Žì„ ê²ë‹ˆë‹¤) ì§ˆë¬¸ì„ ë¶€íƒë“œë¦½ë‹ˆë‹¤.
+ì¼ë¶€ ì´ìƒí•œ ì ë“¤ì€ ê³µì‹ì ì¸ 메모리 ì¼ê´€ì„± 모ë¸ê³¼ tools/memory-model/ ì— ìžˆëŠ”
+관련 문서를 참고해서 í•´ê²°ë  ìˆ˜ ìžˆì„ ê²ë‹ˆë‹¤. 그러나, ì´ ë©”ëª¨ë¦¬ 모ë¸ì¡°ì°¨ë„ ê·¸
+관리ìžë“¤ì˜ ì˜ê²¬ì˜ 집합으로 ë´ì•¼ì§€, 절대 ì˜³ì€ ì˜ˆì–¸ìžë¡œ 신봉해선 ì•ˆë  ê²ë‹ˆë‹¤.
다시 ë§í•˜ì§€ë§Œ, ì´ ë¬¸ì„œëŠ” 리눅스가 í•˜ë“œì›¨ì–´ì— ê¸°ëŒ€í•˜ëŠ” ì‚¬í•­ì— ëŒ€í•œ 명세서가
아닙니다.
@@ -77,7 +80,7 @@ Documentation/memory-barriers.txt
- 메모리 ë°°ë¦¬ì–´ì˜ ì¢…ë¥˜.
- 메모리 ë°°ë¦¬ì–´ì— ëŒ€í•´ 가정해선 ì•ˆë  ê²ƒ.
- - ë°ì´í„° ì˜ì¡´ì„± 배리어.
+ - ë°ì´í„° ì˜ì¡´ì„± 배리어 (역사ì ).
- 컨트롤 ì˜ì¡´ì„±.
- SMP 배리어 ì§ë§žì¶”기.
- 메모리 배리어 ì‹œí€€ìŠ¤ì˜ ì˜ˆ.
@@ -255,17 +258,20 @@ CPU ì—게 기대할 수 있는 ìµœì†Œí•œì˜ ë³´ìž¥ì‚¬í•­ 몇가지가 있습니
(*) ì–´ë–¤ CPU ë“ , ì˜ì¡´ì„±ì´ 존재하는 메모리 ì•¡ì„¸ìŠ¤ë“¤ì€ í•´ë‹¹ CPU ìžì‹ ì—게
있어서는 순서대로 메모리 ì‹œìŠ¤í…œì— ìˆ˜í–‰ 요청ë©ë‹ˆë‹¤. 즉, 다ìŒì— 대해서:
- Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q);
+ Q = READ_ONCE(P); D = READ_ONCE(*Q);
CPU 는 다ìŒê³¼ ê°™ì€ ë©”ëª¨ë¦¬ 오í¼ë ˆì´ì…˜ 시퀀스를 수행 요청합니다:
Q = LOAD P, D = LOAD *Q
- 그리고 ê·¸ 시퀀스 ë‚´ì—ì„œì˜ ìˆœì„œëŠ” í•­ìƒ ì§€ì¼œì§‘ë‹ˆë‹¤. ëŒ€ë¶€ë¶„ì˜ ì‹œìŠ¤í…œì—ì„œ
- smp_read_barrier_depends() 는 아무ì¼ë„ 안하지만 DEC Alpha ì—서는
- 명시ì ìœ¼ë¡œ 사용ë˜ì–´ì•¼ 합니다. ë³´í†µì˜ ê²½ìš°ì—는 smp_read_barrier_depends()
- 를 ì§ì ‘ 사용하는 대신 rcu_dereference() ê°™ì€ ê²ƒë“¤ì„ ì‚¬ìš©í•´ì•¼ 함ì„
- 알아ë‘세요.
+ 그리고 ê·¸ 시퀀스 ë‚´ì—ì„œì˜ ìˆœì„œëŠ” í•­ìƒ ì§€ì¼œì§‘ë‹ˆë‹¤. 하지만, DEC Alpha ì—ì„œ
+ READ_ONCE() 는 메모리 배리어 ëª…ë ¹ë„ ë‚´ê²Œ ë˜ì–´ 있어서, DEC Alpha CPU 는
+ 다ìŒê³¼ ê°™ì€ ë©”ëª¨ë¦¬ 오í¼ë ˆì´ì…˜ë“¤ì„ 내놓게 ë©ë‹ˆë‹¤:
+
+ Q = LOAD P, MEMORY_BARRIER, D = LOAD *Q, MEMORY_BARRIER
+
+ DEC Alpha ì—ì„œ 수행ë˜ë“  아니든, READ_ONCE() 는 컴파ì¼ëŸ¬ë¡œë¶€í„°ì˜ ì•…ì˜í–¥
+ ë˜í•œ 제거합니다.
(*) 특정 CPU ë‚´ì—ì„œ 겹치는 ì˜ì—­ì˜ ë©”ëª¨ë¦¬ì— í–‰í•´ì§€ëŠ” 로드와 스토어 ë“¤ì€ í•´ë‹¹
CPU 안ì—서는 순서가 바뀌지 ì•Šì€ ê²ƒìœ¼ë¡œ 보여집니다. 즉, 다ìŒì— 대해서:
@@ -421,8 +427,8 @@ CPU ì—게 기대할 수 있는 ìµœì†Œí•œì˜ ë³´ìž¥ì‚¬í•­ 몇가지가 있습니
ë°ì´í„° ì˜ì¡´ì„± 배리어는 ì½ê¸° ë°°ë¦¬ì–´ì˜ ë³´ë‹¤ ì™„í™”ëœ í˜•íƒœìž…ë‹ˆë‹¤. ë‘ê°œì˜ ë¡œë“œ
오í¼ë ˆì´ì…˜ì´ 있고 ë‘번째 ê²ƒì´ ì²«ë²ˆì§¸ ê²ƒì˜ ê²°ê³¼ì— ì˜ì¡´í•˜ê³  ìžˆì„ ë•Œ(예:
ë‘번째 로드가 참조할 주소를 첫번째 로드가 ì½ëŠ” 경우), ë‘번째 로드가 ì½ì–´ì˜¬
- ë°ì´í„°ëŠ” 첫번째 ë¡œë“œì— ì˜í•´ ê·¸ 주소가 얻어지기 ì „ì— ì—…ë°ì´íŠ¸ ë˜ì–´ 있ìŒì„
- 보장하기 위해서 ë°ì´í„° ì˜ì¡´ì„± 배리어가 필요할 수 있습니다.
+ ë°ì´í„°ëŠ” 첫번째 ë¡œë“œì— ì˜í•´ ê·¸ 주소가 얻어진 ë’¤ì— ì—…ë°ì´íŠ¸ ë¨ì„ 보장하기
+ 위해서 ë°ì´í„° ì˜ì¡´ì„± 배리어가 필요할 수 있습니다.
ë°ì´í„° ì˜ì¡´ì„± 배리어는 ìƒí˜¸ ì˜ì¡´ì ì¸ 로드 오í¼ë ˆì´ì…˜ë“¤ 사ì´ì˜ ë¶€ë¶„ì  ìˆœì„œ
세우기입니다; 스토어 오í¼ë ˆì´ì…˜ë“¤ì´ë‚˜ ë…립ì ì¸ 로드들, ë˜ëŠ” 중복ë˜ëŠ”
@@ -570,8 +576,14 @@ ACQUIRE 는 해당 오í¼ë ˆì´ì…˜ì˜ 로드 부분ì—만 ì ìš©ë˜ê³  RELEASE ë
Documentation/DMA-API.txt
-ë°ì´í„° ì˜ì¡´ì„± 배리어
---------------------
+ë°ì´í„° ì˜ì¡´ì„± 배리어 (역사ì )
+-----------------------------
+
+리눅스 ì»¤ë„ v4.15 기준으로, smp_read_barrier_depends() ê°€ READ_ONCE() ì—
+추가ë˜ì—ˆëŠ”ë°, ì´ëŠ” ì´ ì„¹ì…˜ì— ì£¼ì˜ë¥¼ 기울여야 하는 ì‚¬ëžŒë“¤ì€ DEC Alpha 아키í…ì³
+ì „ìš© 코드를 만드는 사람들과 READ_ONCE() ìžì²´ë¥¼ 만드는 사람들 ë¿ìž„ì„ ì˜ë¯¸í•©ë‹ˆë‹¤.
+그런 ë¶„ë“¤ì„ ìœ„í•´, 그리고 ì—­ì‚¬ì— ê´€ì‹¬ 있는 ë¶„ë“¤ì„ ìœ„í•´, 여기 ë°ì´í„° ì˜ì¡´ì„±
+ë°°ë¦¬ì–´ì— ëŒ€í•œ ì´ì•¼ê¸°ë¥¼ ì ìŠµë‹ˆë‹¤.
ë°ì´í„° ì˜ì¡´ì„± ë°°ë¦¬ì–´ì˜ ì‚¬ìš©ì— ìžˆì–´ 지켜야 하는 ì‚¬í•­ë“¤ì€ ì•½ê°„ 미묘하고, ë°ì´í„°
ì˜ì¡´ì„± 배리어가 사용ë˜ì–´ì•¼ 하는 ìƒí™©ë„ í•­ìƒ ëª…ë°±í•˜ì§€ëŠ” 않습니다. ì„¤ëª…ì„ ìœ„í•´
@@ -1787,7 +1799,7 @@ CPU 메모리 배리어
범용 mb() smp_mb()
쓰기 wmb() smp_wmb()
ì½ê¸° rmb() smp_rmb()
- ë°ì´í„° ì˜ì¡´ì„± read_barrier_depends() smp_read_barrier_depends()
+ ë°ì´í„° ì˜ì¡´ì„± READ_ONCE()
ë°ì´í„° ì˜ì¡´ì„± 배리어를 제외한 모든 메모리 배리어는 컴파ì¼ëŸ¬ 배리어를
@@ -2796,8 +2808,9 @@ CPU 2 는 C/D 를 갖습니다)ê°€ 병렬로 ì—°ê²°ë˜ì–´ 있는 ì‹œìŠ¤í…œì„ ë‹
ì—¬ê¸°ì— ê°œìž…í•˜ê¸° 위해선, ë°ì´í„° ì˜ì¡´ì„± 배리어나 ì½ê¸° 배리어를 로드 오í¼ë ˆì´ì…˜ë“¤
-사ì´ì— 넣어야 합니다. ì´ë ‡ê²Œ í•¨ìœ¼ë¡œì¨ ìºì‹œê°€ ë‹¤ìŒ ìš”ì²­ì„ ì²˜ë¦¬í•˜ê¸° ì „ì— ì¼ê´€ì„±
-í를 처리하ë„ë¡ ê°•ì œí•˜ê²Œ ë©ë‹ˆë‹¤.
+사ì´ì— 넣어야 합니다 (v4.15 부터는 READ_ONCE() 매í¬ë¡œì— ì˜í•´ 무조건ì ìœ¼ë¡œ
+그렇게 ë©ë‹ˆë‹¤). ì´ë ‡ê²Œ í•¨ìœ¼ë¡œì¨ ìºì‹œê°€ ë‹¤ìŒ ìš”ì²­ì„ ì²˜ë¦¬í•˜ê¸° ì „ì— ì¼ê´€ì„± í를
+처리하ë„ë¡ ê°•ì œí•˜ê²Œ ë©ë‹ˆë‹¤.
CPU 1 CPU 2 COMMENT
=============== =============== =======================================
@@ -2826,7 +2839,10 @@ CPU 2 는 C/D 를 갖습니다)ê°€ 병렬로 ì—°ê²°ë˜ì–´ 있는 ì‹œìŠ¤í…œì„ ë‹
다른 CPU ë“¤ë„ ë¶„í• ëœ ìºì‹œë¥¼ 가지고 ìžˆì„ ìˆ˜ 있지만, 그런 CPU ë“¤ì€ í‰ë²”í•œ 메모리
액세스를 ìœ„í•´ì„œë„ ì´ ë¶„í• ëœ ìºì‹œë“¤ 사ì´ì˜ ì¡°ì •ì„ í•´ì•¼ë§Œ 합니다. Alpha 는 가장
약한 메모리 순서 시맨틱 (semantic) ì„ ì„ íƒí•¨ìœ¼ë¡œì¨ 메모리 배리어가 명시ì ìœ¼ë¡œ
-사용ë˜ì§€ ì•Šì•˜ì„ ë•Œì—는 그런 ì¡°ì •ì´ í•„ìš”í•˜ì§€ 않게 했습니다.
+사용ë˜ì§€ ì•Šì•˜ì„ ë•Œì—는 그런 ì¡°ì •ì´ í•„ìš”í•˜ì§€ 않게 했으며, ì´ëŠ” Alpha ê°€ 당시ì—
+ë” ë†’ì€ CPU í´ë½ ì†ë„를 가질 수 있게 했습니다. 하지만, (다시 ë§í•˜ê±´ëŒ€, v4.15
+ì´í›„부터는) Alpha 아키í…ì³ ì „ìš© 코드와 READ_ONCE() 매í¬ë¡œ 내부ì—서를 제외하고는
+smp_read_barrier_depends() ê°€ 사용ë˜ì§€ 않아야 í•¨ì„ ì•Œì•„ë‘시기 ë°”ëžë‹ˆë‹¤.
ìºì‹œ ì¼ê´€ì„± VS DMA
@@ -2846,7 +2862,7 @@ CPU ì˜ ìºì‹œì—ì„œ RAM 으로 쓰여지는 ë”í‹° ìºì‹œ ë¼ì¸ì— ì˜í•´ ë®ì
문제를 해결하기 위해선, 커ë„ì˜ ì ì ˆí•œ 부분ì—ì„œ ê° CPU ì˜ ìºì‹œ ì•ˆì˜ ë¬¸ì œê°€ ë˜ëŠ”
ë¹„íŠ¸ë“¤ì„ ë¬´íš¨í™” 시켜야 합니다.
-ìºì‹œ ê´€ë¦¬ì— ëŒ€í•œ ë” ë§Žì€ ì •ë³´ë¥¼ 위해선 Documentation/cachetlb.txt 를
+ìºì‹œ ê´€ë¦¬ì— ëŒ€í•œ ë” ë§Žì€ ì •ë³´ë¥¼ 위해선 Documentation/core-api/cachetlb.rst 를
참고하세요.
@@ -2988,7 +3004,9 @@ Alpha CPU ì˜ ì¼ë¶€ ë²„ì „ì€ ë¶„í• ëœ ë°ì´í„° ìºì‹œë¥¼ 가지고 있어서
메모리 ì¼ê´€ì„± 시스템과 함께 ë‘ê°œì˜ ìºì‹œë¥¼ ë™ê¸°í™” 시켜서, í¬ì¸í„° 변경과 새로운
ë°ì´í„°ì˜ ë°œê²¬ì„ ì˜¬ë°”ë¥¸ 순서로 ì¼ì–´ë‚˜ê²Œ 하기 때문입니다.
-리눅스 커ë„ì˜ ë©”ëª¨ë¦¬ 배리어 모ë¸ì€ Alpha ì— ê¸°ì´ˆí•´ì„œ ì •ì˜ë˜ì—ˆìŠµë‹ˆë‹¤.
+리눅스 커ë„ì˜ ë©”ëª¨ë¦¬ 배리어 모ë¸ì€ Alpha ì— ê¸°ì´ˆí•´ì„œ ì •ì˜ë˜ì—ˆìŠµë‹ˆë‹¤ë§Œ, v4.15
+부터는 리눅스 커ë„ì´ READ_ONCE() ë‚´ì— smp_read_barrier_depends() 를 추가해서
+Alpha ì˜ ë©”ëª¨ë¦¬ 모ë¸ë¡œì˜ ì˜í–¥ë ¥ì´ í¬ê²Œ 줄어들긴 했습니다.
ìœ„ì˜ "ìºì‹œ ì¼ê´€ì„±" ì„œë¸Œì„¹ì…˜ì„ ì°¸ê³ í•˜ì„¸ìš”.
@@ -3023,7 +3041,7 @@ smp_mb() ê°€ ì•„ë‹ˆë¼ virt_mb() 를 사용해야 합니다.
ë™ê¸°í™”ì— ë½ì„ 사용하지 ì•Šê³  구현하는ë°ì— ì‚¬ìš©ë  ìˆ˜ 있습니다. ë” ìžì„¸í•œ ë‚´ìš©ì„
위해선 다ìŒì„ 참고하세요:
- Documentation/circular-buffers.txt
+ Documentation/core-api/circular-buffers.rst
=========
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index ef6a5111eaa1..f1a4d3c3ba0b 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -252,15 +252,14 @@ into VFIO core. When devices are bound and unbound to the driver,
the driver should call vfio_add_group_dev() and vfio_del_group_dev()
respectively::
- extern int vfio_add_group_dev(struct iommu_group *iommu_group,
- struct device *dev,
+ extern int vfio_add_group_dev(struct device *dev,
const struct vfio_device_ops *ops,
void *device_data);
extern void *vfio_del_group_dev(struct device *dev);
vfio_add_group_dev() indicates to the core to begin tracking the
-specified iommu_group and register the specified dev as owned by
+iommu_group of the specified dev and register the dev as owned by
a VFIO bus driver. The driver provides an ops structure for callbacks
similar to a file operations structure::
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 0278f2c85efb..f4a4f3e884cf 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -1,62 +1,50 @@
00-INDEX
- this file.
-active_mm.txt
+active_mm.rst
- An explanation from Linus about tsk->active_mm vs tsk->mm.
-balance
+balance.rst
- various information on memory balancing.
-cleancache.txt
+cleancache.rst
- Intro to cleancache and page-granularity victim cache.
-frontswap.txt
+frontswap.rst
- Outline frontswap, part of the transcendent memory frontend.
-highmem.txt
+highmem.rst
- Outline of highmem and common issues.
-hmm.txt
+hmm.rst
- Documentation of heterogeneous memory management
-hugetlbpage.txt
- - a brief summary of hugetlbpage support in the Linux kernel.
-hugetlbfs_reserv.txt
+hugetlbfs_reserv.rst
- A brief overview of hugetlbfs reservation design/implementation.
-hwpoison.txt
+hwpoison.rst
- explains what hwpoison is
-idle_page_tracking.txt
- - description of the idle page tracking feature.
-ksm.txt
+ksm.rst
- how to use the Kernel Samepage Merging feature.
-mmu_notifier.txt
+mmu_notifier.rst
- a note about clearing pte/pmd and mmu notifications
-numa
+numa.rst
- information about NUMA specific code in the Linux vm.
-numa_memory_policy.txt
- - documentation of concepts and APIs of the 2.6 memory policy support.
-overcommit-accounting
+overcommit-accounting.rst
- description of the Linux kernels overcommit handling modes.
-page_frags
+page_frags.rst
- description of page fragments allocator
-page_migration
+page_migration.rst
- description of page migration in NUMA systems.
-pagemap.txt
- - pagemap, from the userspace perspective
-page_owner.txt
+page_owner.rst
- tracking about who allocated each page
-remap_file_pages.txt
+remap_file_pages.rst
- a note about remap_file_pages() system call
-slub.txt
+slub.rst
- a short users guide for SLUB.
-soft-dirty.txt
- - short explanation for soft-dirty PTEs
-split_page_table_lock
+split_page_table_lock.rst
- Separate per-table lock to improve scalability of the old page_table_lock.
-swap_numa.txt
+swap_numa.rst
- automatic binding of swap device to numa node
-transhuge.txt
+transhuge.rst
- Transparent Hugepage Support, alternative way of using hugepages.
-unevictable-lru.txt
+unevictable-lru.rst
- Unevictable LRU infrastructure
-userfaultfd.txt
- - description of userfaultfd system call
z3fold.txt
- outline of z3fold allocator for storing compressed pages
-zsmalloc.txt
+zsmalloc.rst
- outline of zsmalloc allocator for storing compressed pages
-zswap.txt
+zswap.rst
- Intro to compressed cache for swap pages
diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst
new file mode 100644
index 000000000000..c84471b180f8
--- /dev/null
+++ b/Documentation/vm/active_mm.rst
@@ -0,0 +1,91 @@
+.. _active_mm:
+
+=========
+Active MM
+=========
+
+::
+
+ List: linux-kernel
+ Subject: Re: active_mm
+ From: Linus Torvalds <torvalds () transmeta ! com>
+ Date: 1999-07-30 21:36:24
+
+ Cc'd to linux-kernel, because I don't write explanations all that often,
+ and when I do I feel better about more people reading them.
+
+ On Fri, 30 Jul 1999, David Mosberger wrote:
+ >
+ > Is there a brief description someplace on how "mm" vs. "active_mm" in
+ > the task_struct are supposed to be used? (My apologies if this was
+ > discussed on the mailing lists---I just returned from vacation and
+ > wasn't able to follow linux-kernel for a while).
+
+ Basically, the new setup is:
+
+ - we have "real address spaces" and "anonymous address spaces". The
+ difference is that an anonymous address space doesn't care about the
+ user-level page tables at all, so when we do a context switch into an
+ anonymous address space we just leave the previous address space
+ active.
+
+ The obvious use for a "anonymous address space" is any thread that
+ doesn't need any user mappings - all kernel threads basically fall into
+ this category, but even "real" threads can temporarily say that for
+ some amount of time they are not going to be interested in user space,
+ and that the scheduler might as well try to avoid wasting time on
+ switching the VM state around. Currently only the old-style bdflush
+ sync does that.
+
+ - "tsk->mm" points to the "real address space". For an anonymous process,
+ tsk->mm will be NULL, for the logical reason that an anonymous process
+ really doesn't _have_ a real address space at all.
+
+ - however, we obviously need to keep track of which address space we
+ "stole" for such an anonymous user. For that, we have "tsk->active_mm",
+ which shows what the currently active address space is.
+
+ The rule is that for a process with a real address space (ie tsk->mm is
+ non-NULL) the active_mm obviously always has to be the same as the real
+ one.
+
+ For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
+ "borrowed" mm while the anonymous process is running. When the
+ anonymous process gets scheduled away, the borrowed address space is
+ returned and cleared.
+
+ To support all that, the "struct mm_struct" now has two counters: a
+ "mm_users" counter that is how many "real address space users" there are,
+ and a "mm_count" counter that is the number of "lazy" users (ie anonymous
+ users) plus one if there are any real users.
+
+ Usually there is at least one real user, but it could be that the real
+ user exited on another CPU while a lazy user was still active, so you do
+ actually get cases where you have a address space that is _only_ used by
+ lazy users. That is often a short-lived state, because once that thread
+ gets scheduled away in favour of a real thread, the "zombie" mm gets
+ released because "mm_users" becomes zero.
+
+ Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
+ more. "init_mm" should be considered just a "lazy context when no other
+ context is available", and in fact it is mainly used just at bootup when
+ no real VM has yet been created. So code that used to check
+
+ if (current->mm == &init_mm)
+
+ should generally just do
+
+ if (!current->mm)
+
+ instead (which makes more sense anyway - the test is basically one of "do
+ we have a user context", and is generally done by the page fault handler
+ and things like that).
+
+ Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
+ because it slightly changes the interfaces to accommodate the alpha (who
+ would have thought it, but the alpha actually ends up having one of the
+ ugliest context switch codes - unlike the other architectures where the MM
+ and register state is separate, the alpha PALcode joins the two, and you
+ need to switch both together).
+
+ (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt
deleted file mode 100644
index dbf45817405f..000000000000
--- a/Documentation/vm/active_mm.txt
+++ /dev/null
@@ -1,83 +0,0 @@
-List: linux-kernel
-Subject: Re: active_mm
-From: Linus Torvalds <torvalds () transmeta ! com>
-Date: 1999-07-30 21:36:24
-
-Cc'd to linux-kernel, because I don't write explanations all that often,
-and when I do I feel better about more people reading them.
-
-On Fri, 30 Jul 1999, David Mosberger wrote:
->
-> Is there a brief description someplace on how "mm" vs. "active_mm" in
-> the task_struct are supposed to be used? (My apologies if this was
-> discussed on the mailing lists---I just returned from vacation and
-> wasn't able to follow linux-kernel for a while).
-
-Basically, the new setup is:
-
- - we have "real address spaces" and "anonymous address spaces". The
- difference is that an anonymous address space doesn't care about the
- user-level page tables at all, so when we do a context switch into an
- anonymous address space we just leave the previous address space
- active.
-
- The obvious use for a "anonymous address space" is any thread that
- doesn't need any user mappings - all kernel threads basically fall into
- this category, but even "real" threads can temporarily say that for
- some amount of time they are not going to be interested in user space,
- and that the scheduler might as well try to avoid wasting time on
- switching the VM state around. Currently only the old-style bdflush
- sync does that.
-
- - "tsk->mm" points to the "real address space". For an anonymous process,
- tsk->mm will be NULL, for the logical reason that an anonymous process
- really doesn't _have_ a real address space at all.
-
- - however, we obviously need to keep track of which address space we
- "stole" for such an anonymous user. For that, we have "tsk->active_mm",
- which shows what the currently active address space is.
-
- The rule is that for a process with a real address space (ie tsk->mm is
- non-NULL) the active_mm obviously always has to be the same as the real
- one.
-
- For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
- "borrowed" mm while the anonymous process is running. When the
- anonymous process gets scheduled away, the borrowed address space is
- returned and cleared.
-
-To support all that, the "struct mm_struct" now has two counters: a
-"mm_users" counter that is how many "real address space users" there are,
-and a "mm_count" counter that is the number of "lazy" users (ie anonymous
-users) plus one if there are any real users.
-
-Usually there is at least one real user, but it could be that the real
-user exited on another CPU while a lazy user was still active, so you do
-actually get cases where you have a address space that is _only_ used by
-lazy users. That is often a short-lived state, because once that thread
-gets scheduled away in favour of a real thread, the "zombie" mm gets
-released because "mm_users" becomes zero.
-
-Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
-more. "init_mm" should be considered just a "lazy context when no other
-context is available", and in fact it is mainly used just at bootup when
-no real VM has yet been created. So code that used to check
-
- if (current->mm == &init_mm)
-
-should generally just do
-
- if (!current->mm)
-
-instead (which makes more sense anyway - the test is basically one of "do
-we have a user context", and is generally done by the page fault handler
-and things like that).
-
-Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
-because it slightly changes the interfaces to accommodate the alpha (who
-would have thought it, but the alpha actually ends up having one of the
-ugliest context switch codes - unlike the other architectures where the MM
-and register state is separate, the alpha PALcode joins the two, and you
-need to switch both together).
-
-(From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/vm/balance b/Documentation/vm/balance.rst
index 964595481af6..6a1fadf3e173 100644
--- a/Documentation/vm/balance
+++ b/Documentation/vm/balance.rst
@@ -1,3 +1,9 @@
+.. _balance:
+
+================
+Memory Balancing
+================
+
Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
@@ -62,11 +68,11 @@ for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
so as to give a fighting chance for replace_with_highmem() to get a
HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
fall back into regular zone. This also makes sure that HIGHMEM pages
-are not leaked (for example, in situations where a HIGHMEM page is in
+are not leaked (for example, in situations where a HIGHMEM page is in
the swapcache but is not being used by anyone)
kswapd also needs to know about the zones it should balance. kswapd is
-primarily needed in a situation where balancing can not be done,
+primarily needed in a situation where balancing can not be done,
probably because all allocation requests are coming from intr context
and all process contexts are sleeping. For 2.3, kswapd does not really
need to balance the highmem zone, since intr context does not request
@@ -89,7 +95,8 @@ pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
(Good) Ideas that I have heard:
+
1. Dynamic experience should influence balancing: number of failed requests
-for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
+ for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
-dma pages. (lkd@tantalophile.demon.co.uk)
+ dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.rst
index e4b49df7a048..68cba9131c31 100644
--- a/Documentation/vm/cleancache.txt
+++ b/Documentation/vm/cleancache.rst
@@ -1,4 +1,11 @@
-MOTIVATION
+.. _cleancache:
+
+==========
+Cleancache
+==========
+
+Motivation
+==========
Cleancache is a new optional feature provided by the VFS layer that
potentially dramatically increases page cache effectiveness for
@@ -21,9 +28,10 @@ Transcendent memory "drivers" for cleancache are currently implemented
in Xen (using hypervisor memory) and zcache (using in-kernel compressed
memory) and other implementations are in development.
-FAQs are included below.
+:ref:`FAQs <faq>` are included below.
-IMPLEMENTATION OVERVIEW
+Implementation Overview
+=======================
A cleancache "backend" that provides transcendent memory registers itself
to the kernel's cleancache "frontend" by calling cleancache_register_ops,
@@ -80,22 +88,33 @@ different Linux threads are simultaneously putting and invalidating a page
with the same handle, the results are indeterminate. Callers must
lock the page to ensure serial behavior.
-CLEANCACHE PERFORMANCE METRICS
+Cleancache Performance Metrics
+==============================
If properly configured, monitoring of cleancache is done via debugfs in
-the /sys/kernel/debug/cleancache directory. The effectiveness of cleancache
+the `/sys/kernel/debug/cleancache` directory. The effectiveness of cleancache
can be measured (across all filesystems) with:
-succ_gets - number of gets that were successful
-failed_gets - number of gets that failed
-puts - number of puts attempted (all "succeed")
-invalidates - number of invalidates attempted
+``succ_gets``
+ number of gets that were successful
+
+``failed_gets``
+ number of gets that failed
+
+``puts``
+ number of puts attempted (all "succeed")
+
+``invalidates``
+ number of invalidates attempted
A backend implementation may provide additional metrics.
+.. _faq:
+
FAQ
+===
-1) Where's the value? (Andrew Morton)
+* Where's the value? (Andrew Morton)
Cleancache provides a significant performance benefit to many workloads
in many environments with negligible overhead by improving the
@@ -137,8 +156,8 @@ device that stores pages of data in a compressed state. And
the proposed "RAMster" driver shares RAM across multiple physical
systems.
-2) Why does cleancache have its sticky fingers so deep inside the
- filesystems and VFS? (Andrew Morton and Christoph Hellwig)
+* Why does cleancache have its sticky fingers so deep inside the
+ filesystems and VFS? (Andrew Morton and Christoph Hellwig)
The core hooks for cleancache in VFS are in most cases a single line
and the minimum set are placed precisely where needed to maintain
@@ -168,9 +187,9 @@ filesystems in the future.
The total impact of the hooks to existing fs and mm files is only
about 40 lines added (not counting comments and blank lines).
-3) Why not make cleancache asynchronous and batched so it can
- more easily interface with real devices with DMA instead
- of copying each individual page? (Minchan Kim)
+* Why not make cleancache asynchronous and batched so it can more
+ easily interface with real devices with DMA instead of copying each
+ individual page? (Minchan Kim)
The one-page-at-a-time copy semantics simplifies the implementation
on both the frontend and backend and also allows the backend to
@@ -182,8 +201,8 @@ are avoided. While the interface seems odd for a "real device"
or for real kernel-addressable RAM, it makes perfect sense for
transcendent memory.
-4) Why is non-shared cleancache "exclusive"? And where is the
- page "invalidated" after a "get"? (Minchan Kim)
+* Why is non-shared cleancache "exclusive"? And where is the
+ page "invalidated" after a "get"? (Minchan Kim)
The main reason is to free up space in transcendent memory and
to avoid unnecessary cleancache_invalidate calls. If you want inclusive,
@@ -193,7 +212,7 @@ be easily extended to add a "get_no_invalidate" call.
The invalidate is done by the cleancache backend implementation.
-5) What's the performance impact?
+* What's the performance impact?
Performance analysis has been presented at OLS'09 and LCA'10.
Briefly, performance gains can be significant on most workloads,
@@ -206,7 +225,7 @@ single-core systems with slow memory-copy speeds, cleancache
has little value, but in newer multicore machines, especially
consolidated/virtualized machines, it has great value.
-6) How do I add cleancache support for filesystem X? (Boaz Harrash)
+* How do I add cleancache support for filesystem X? (Boaz Harrash)
Filesystems that are well-behaved and conform to certain
restrictions can utilize cleancache simply by making a call to
@@ -217,26 +236,26 @@ not enable the optional cleancache.
Some points for a filesystem to consider:
-- The FS should be block-device-based (e.g. a ram-based FS such
- as tmpfs should not enable cleancache)
-- To ensure coherency/correctness, the FS must ensure that all
- file removal or truncation operations either go through VFS or
- add hooks to do the equivalent cleancache "invalidate" operations
-- To ensure coherency/correctness, either inode numbers must
- be unique across the lifetime of the on-disk file OR the
- FS must provide an "encode_fh" function.
-- The FS must call the VFS superblock alloc and deactivate routines
- or add hooks to do the equivalent cleancache calls done there.
-- To maximize performance, all pages fetched from the FS should
- go through the do_mpag_readpage routine or the FS should add
- hooks to do the equivalent (cf. btrfs)
-- Currently, the FS blocksize must be the same as PAGESIZE. This
- is not an architectural restriction, but no backends currently
- support anything different.
-- A clustered FS should invoke the "shared_init_fs" cleancache
- hook to get best performance for some backends.
-
-7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
+ - The FS should be block-device-based (e.g. a ram-based FS such
+ as tmpfs should not enable cleancache)
+ - To ensure coherency/correctness, the FS must ensure that all
+ file removal or truncation operations either go through VFS or
+ add hooks to do the equivalent cleancache "invalidate" operations
+ - To ensure coherency/correctness, either inode numbers must
+ be unique across the lifetime of the on-disk file OR the
+ FS must provide an "encode_fh" function.
+ - The FS must call the VFS superblock alloc and deactivate routines
+ or add hooks to do the equivalent cleancache calls done there.
+ - To maximize performance, all pages fetched from the FS should
+ go through the do_mpag_readpage routine or the FS should add
+ hooks to do the equivalent (cf. btrfs)
+ - Currently, the FS blocksize must be the same as PAGESIZE. This
+ is not an architectural restriction, but no backends currently
+ support anything different.
+ - A clustered FS should invoke the "shared_init_fs" cleancache
+ hook to get best performance for some backends.
+
+* Why not use the KVA of the inode as the key? (Christoph Hellwig)
If cleancache would use the inode virtual address instead of
inode/filehandle, the pool id could be eliminated. But, this
@@ -251,7 +270,7 @@ of cleancache would be lost because the cache of pages in cleanache
is potentially much larger than the kernel pagecache and is most
useful if the pages survive inode cache removal.
-8) Why is a global variable required?
+* Why is a global variable required?
The cleancache_enabled flag is checked in all of the frequently-used
cleancache hooks. The alternative is a function call to check a static
@@ -262,14 +281,14 @@ global variable allows cleancache to be enabled by default at compile
time, but have insignificant performance impact when cleancache remains
disabled at runtime.
-9) Does cleanache work with KVM?
+* Does cleanache work with KVM?
The memory model of KVM is sufficiently different that a cleancache
backend may have less value for KVM. This remains to be tested,
especially in an overcommitted system.
-10) Does cleancache work in userspace? It sounds useful for
- memory hungry caches like web browsers. (Jamie Lokier)
+* Does cleancache work in userspace? It sounds useful for
+ memory hungry caches like web browsers. (Jamie Lokier)
No plans yet, though we agree it sounds useful, at least for
apps that bypass the page cache (e.g. O_DIRECT).
diff --git a/Documentation/vm/conf.py b/Documentation/vm/conf.py
new file mode 100644
index 000000000000..3b0b601af558
--- /dev/null
+++ b/Documentation/vm/conf.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8; mode: python -*-
+
+project = "Linux Memory Management Documentation"
+
+tags.add("subproject")
+
+latex_documents = [
+ ('index', 'memory-management.tex', project,
+ 'The kernel development community', 'manual'),
+]
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.rst
index c71a019be600..1979f430c1c5 100644
--- a/Documentation/vm/frontswap.txt
+++ b/Documentation/vm/frontswap.rst
@@ -1,13 +1,20 @@
+.. _frontswap:
+
+=========
+Frontswap
+=========
+
Frontswap provides a "transcendent memory" interface for swap pages.
In some environments, dramatic performance savings may be obtained because
swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
-(Note, frontswap -- and cleancache (merged at 3.0) -- are the "frontends"
+(Note, frontswap -- and :ref:`cleancache` (merged at 3.0) -- are the "frontends"
and the only necessary changes to the core kernel for transcendent memory;
all other supporting code -- the "backends" -- is implemented as drivers.
-See the LWN.net article "Transcendent memory in a nutshell" for a detailed
-overview of frontswap and related kernel parts:
-https://lwn.net/Articles/454795/ )
+See the LWN.net article `Transcendent memory in a nutshell`_
+for a detailed overview of frontswap and related kernel parts)
+
+.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
Frontswap is so named because it can be thought of as the opposite of
a "backing" store for a swap device. The storage is assumed to be
@@ -50,19 +57,27 @@ or the store fails AND the page is invalidated. This ensures stale data may
never be obtained from frontswap.
If properly configured, monitoring of frontswap is done via debugfs in
-the /sys/kernel/debug/frontswap directory. The effectiveness of
+the `/sys/kernel/debug/frontswap` directory. The effectiveness of
frontswap can be measured (across all swap devices) with:
-failed_stores - how many store attempts have failed
-loads - how many loads were attempted (all should succeed)
-succ_stores - how many store attempts have succeeded
-invalidates - how many invalidates were attempted
+``failed_stores``
+ how many store attempts have failed
+
+``loads``
+ how many loads were attempted (all should succeed)
+
+``succ_stores``
+ how many store attempts have succeeded
+
+``invalidates``
+ how many invalidates were attempted
A backend implementation may provide additional metrics.
FAQ
+===
-1) Where's the value?
+* Where's the value?
When a workload starts swapping, performance falls through the floor.
Frontswap significantly increases performance in many such workloads by
@@ -117,8 +132,8 @@ A KVM implementation is underway and has been RFC'ed to lkml. And,
using frontswap, investigation is also underway on the use of NVM as
a memory extension technology.
-2) Sure there may be performance advantages in some situations, but
- what's the space/time overhead of frontswap?
+* Sure there may be performance advantages in some situations, but
+ what's the space/time overhead of frontswap?
If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
nothingness and the only overhead is a few extra bytes per swapon'ed
@@ -148,8 +163,8 @@ pressure that can potentially outweigh the other advantages. A
backend, such as zcache, must implement policies to carefully (but
dynamically) manage memory limits to ensure this doesn't happen.
-3) OK, how about a quick overview of what this frontswap patch does
- in terms that a kernel hacker can grok?
+* OK, how about a quick overview of what this frontswap patch does
+ in terms that a kernel hacker can grok?
Let's assume that a frontswap "backend" has registered during
kernel initialization; this registration indicates that this
@@ -188,9 +203,9 @@ and (potentially) a swap device write are replaced by a "frontswap backend
store" and (possibly) a "frontswap backend loads", which are presumably much
faster.
-4) Can't frontswap be configured as a "special" swap device that is
- just higher priority than any real swap device (e.g. like zswap,
- or maybe swap-over-nbd/NFS)?
+* Can't frontswap be configured as a "special" swap device that is
+ just higher priority than any real swap device (e.g. like zswap,
+ or maybe swap-over-nbd/NFS)?
No. First, the existing swap subsystem doesn't allow for any kind of
swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy,
@@ -240,9 +255,9 @@ installation, frontswap is useless. Swapless portable devices
can still use frontswap but a backend for such devices must configure
some kind of "ghost" swap device and ensure that it is never used.
-5) Why this weird definition about "duplicate stores"? If a page
- has been previously successfully stored, can't it always be
- successfully overwritten?
+* Why this weird definition about "duplicate stores"? If a page
+ has been previously successfully stored, can't it always be
+ successfully overwritten?
Nearly always it can, but no, sometimes it cannot. Consider an example
where data is compressed and the original 4K page has been compressed
@@ -254,7 +269,7 @@ the old data and ensure that it is no longer accessible. Since the
swap subsystem then writes the new data to the read swap device,
this is the correct course of action to ensure coherency.
-6) What is frontswap_shrink for?
+* What is frontswap_shrink for?
When the (non-frontswap) swap subsystem swaps out a page to a real
swap device, that page is only taking up low-value pre-allocated disk
@@ -267,7 +282,7 @@ to "repatriate" pages sent to a remote machine back to the local machine;
this is driven using the frontswap_shrink mechanism when memory pressure
subsides.
-7) Why does the frontswap patch create the new include file swapfile.h?
+* Why does the frontswap patch create the new include file swapfile.h?
The frontswap code depends on some swap-subsystem-internal data
structures that have, over the years, moved back and forth between
diff --git a/Documentation/vm/highmem.txt b/Documentation/vm/highmem.rst
index 4324d24ffacd..0f69a9fec34d 100644
--- a/Documentation/vm/highmem.txt
+++ b/Documentation/vm/highmem.rst
@@ -1,25 +1,14 @@
+.. _highmem:
- ====================
- HIGH MEMORY HANDLING
- ====================
+====================
+High Memory Handling
+====================
By: Peter Zijlstra <a.p.zijlstra@chello.nl>
-Contents:
-
- (*) What is high memory?
-
- (*) Temporary virtual mappings.
-
- (*) Using kmap_atomic.
-
- (*) Cost of temporary mappings.
-
- (*) i386 PAE.
+.. contents:: :local:
-
-====================
-WHAT IS HIGH MEMORY?
+What Is High Memory?
====================
High memory (highmem) is used when the size of physical memory approaches or
@@ -38,7 +27,7 @@ kernel entry/exit. This means the available virtual memory space (4GiB on
i386) has to be divided between user and kernel space.
The traditional split for architectures using this approach is 3:1, 3GiB for
-userspace and the top 1GiB for kernel space:
+userspace and the top 1GiB for kernel space::
+--------+ 0xffffffff
| Kernel |
@@ -58,40 +47,38 @@ and user maps. Some hardware (like some ARMs), however, have limited virtual
space when they use mm context tags.
-==========================
-TEMPORARY VIRTUAL MAPPINGS
+Temporary Virtual Mappings
==========================
The kernel contains several ways of creating temporary mappings:
- (*) vmap(). This can be used to make a long duration mapping of multiple
- physical pages into a contiguous virtual space. It needs global
- synchronization to unmap.
+* vmap(). This can be used to make a long duration mapping of multiple
+ physical pages into a contiguous virtual space. It needs global
+ synchronization to unmap.
- (*) kmap(). This permits a short duration mapping of a single page. It needs
- global synchronization, but is amortized somewhat. It is also prone to
- deadlocks when using in a nested fashion, and so it is not recommended for
- new code.
+* kmap(). This permits a short duration mapping of a single page. It needs
+ global synchronization, but is amortized somewhat. It is also prone to
+ deadlocks when using in a nested fashion, and so it is not recommended for
+ new code.
- (*) kmap_atomic(). This permits a very short duration mapping of a single
- page. Since the mapping is restricted to the CPU that issued it, it
- performs well, but the issuing task is therefore required to stay on that
- CPU until it has finished, lest some other task displace its mappings.
+* kmap_atomic(). This permits a very short duration mapping of a single
+ page. Since the mapping is restricted to the CPU that issued it, it
+ performs well, but the issuing task is therefore required to stay on that
+ CPU until it has finished, lest some other task displace its mappings.
- kmap_atomic() may also be used by interrupt contexts, since it is does not
- sleep and the caller may not sleep until after kunmap_atomic() is called.
+ kmap_atomic() may also be used by interrupt contexts, since it is does not
+ sleep and the caller may not sleep until after kunmap_atomic() is called.
- It may be assumed that k[un]map_atomic() won't fail.
+ It may be assumed that k[un]map_atomic() won't fail.
-=================
-USING KMAP_ATOMIC
+Using kmap_atomic
=================
When and where to use kmap_atomic() is straightforward. It is used when code
wants to access the contents of a page that might be allocated from high memory
(see __GFP_HIGHMEM), for example a page in the pagecache. The API has two
-functions, and they can be used in a manner similar to the following:
+functions, and they can be used in a manner similar to the following::
/* Find the page of interest. */
struct page *page = find_get_page(mapping, offset);
@@ -109,7 +96,7 @@ Note that the kunmap_atomic() call takes the result of the kmap_atomic() call
not the argument.
If you need to map two pages because you want to copy from one page to
-another you need to keep the kmap_atomic calls strictly nested, like:
+another you need to keep the kmap_atomic calls strictly nested, like::
vaddr1 = kmap_atomic(page1);
vaddr2 = kmap_atomic(page2);
@@ -120,8 +107,7 @@ another you need to keep the kmap_atomic calls strictly nested, like:
kunmap_atomic(vaddr1);
-==========================
-COST OF TEMPORARY MAPPINGS
+Cost of Temporary Mappings
==========================
The cost of creating temporary mappings can be quite high. The arch has to
@@ -136,25 +122,24 @@ If CONFIG_MMU is not set, then there can be no temporary mappings and no
highmem. In such a case, the arithmetic approach will also be used.
-========
i386 PAE
========
The i386 arch, under some circumstances, will permit you to stick up to 64GiB
of RAM into your 32-bit machine. This has a number of consequences:
- (*) Linux needs a page-frame structure for each page in the system and the
- pageframes need to live in the permanent mapping, which means:
+* Linux needs a page-frame structure for each page in the system and the
+ pageframes need to live in the permanent mapping, which means:
- (*) you can have 896M/sizeof(struct page) page-frames at most; with struct
- page being 32-bytes that would end up being something in the order of 112G
- worth of pages; the kernel, however, needs to store more than just
- page-frames in that memory...
+* you can have 896M/sizeof(struct page) page-frames at most; with struct
+ page being 32-bytes that would end up being something in the order of 112G
+ worth of pages; the kernel, however, needs to store more than just
+ page-frames in that memory...
- (*) PAE makes your page tables larger - which slows the system down as more
- data has to be accessed to traverse in TLB fills and the like. One
- advantage is that PAE has more PTE bits and can provide advanced features
- like NX and PAT.
+* PAE makes your page tables larger - which slows the system down as more
+ data has to be accessed to traverse in TLB fills and the like. One
+ advantage is that PAE has more PTE bits and can provide advanced features
+ like NX and PAT.
The general recommendation is that you don't use more than 8GiB on a 32-bit
machine - although more might work for you and your workload, you're pretty
diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.rst
index 2d1d6f69e91b..cdf3911582c8 100644
--- a/Documentation/vm/hmm.txt
+++ b/Documentation/vm/hmm.rst
@@ -1,4 +1,8 @@
+.. hmm:
+
+=====================================
Heterogeneous Memory Management (HMM)
+=====================================
Provide infrastructure and helpers to integrate non-conventional memory (device
memory like GPU on board memory) into regular kernel path, with the cornerstone
@@ -6,10 +10,10 @@ of this being specialized struct page for such memory (see sections 5 to 7 of
this document).
HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
-allowing a device to transparently access program address coherently with the
-CPU meaning that any valid pointer on the CPU is also a valid pointer for the
-device. This is becoming mandatory to simplify the use of advanced hetero-
-geneous computing where GPU, DSP, or FPGA are used to perform various
+allowing a device to transparently access program address coherently with
+the CPU meaning that any valid pointer on the CPU is also a valid pointer
+for the device. This is becoming mandatory to simplify the use of advanced
+heterogeneous computing where GPU, DSP, or FPGA are used to perform various
computations on behalf of a process.
This document is divided as follows: in the first section I expose the problems
@@ -21,19 +25,10 @@ fifth section deals with how device memory is represented inside the kernel.
Finally, the last section presents a new migration helper that allows lever-
aging the device DMA engine.
+.. contents:: :local:
-1) Problems of using a device specific memory allocator:
-2) I/O bus, device memory characteristics
-3) Shared address space and migration
-4) Address space mirroring implementation and API
-5) Represent and manage device memory from core kernel point of view
-6) Migration to and from device memory
-7) Memory cgroup (memcg) and rss accounting
-
-
--------------------------------------------------------------------------------
-
-1) Problems of using a device specific memory allocator:
+Problems of using a device specific memory allocator
+====================================================
Devices with a large amount of on board memory (several gigabytes) like GPUs
have historically managed their memory through dedicated driver specific APIs.
@@ -77,9 +72,8 @@ are only do-able with a shared address space. It is also more reasonable to use
a shared address space for all other patterns.
--------------------------------------------------------------------------------
-
-2) I/O bus, device memory characteristics
+I/O bus, device memory characteristics
+======================================
I/O buses cripple shared address spaces due to a few limitations. Most I/O
buses only allow basic memory access from device to main memory; even cache
@@ -109,9 +103,8 @@ access any memory but we must also permit any memory to be migrated to device
memory while device is using it (blocking CPU access while it happens).
--------------------------------------------------------------------------------
-
-3) Shared address space and migration
+Shared address space and migration
+==================================
HMM intends to provide two main features. First one is to share the address
space by duplicating the CPU page table in the device page table so the same
@@ -148,23 +141,23 @@ ages device memory by migrating the part of the data set that is actively being
used by the device.
--------------------------------------------------------------------------------
-
-4) Address space mirroring implementation and API
+Address space mirroring implementation and API
+==============================================
Address space mirroring's main objective is to allow duplication of a range of
CPU page table into a device page table; HMM helps keep both synchronized. A
device driver that wants to mirror a process address space must start with the
-registration of an hmm_mirror struct:
+registration of an hmm_mirror struct::
int hmm_mirror_register(struct hmm_mirror *mirror,
struct mm_struct *mm);
int hmm_mirror_register_locked(struct hmm_mirror *mirror,
struct mm_struct *mm);
+
The locked variant is to be used when the driver is already holding mmap_sem
of the mm in write mode. The mirror struct has a set of callbacks that are used
-to propagate CPU page tables:
+to propagate CPU page tables::
struct hmm_mirror_ops {
/* sync_cpu_device_pagetables() - synchronize page tables
@@ -193,10 +186,10 @@ The device driver must perform the update action to the range (mark range
read only, or fully unmap, ...). The device must be done with the update before
the driver callback returns.
-
When the device driver wants to populate a range of virtual addresses, it can
-use either:
- int hmm_vma_get_pfns(struct vm_area_struct *vma,
+use either::
+
+ int hmm_vma_get_pfns(struct vm_area_struct *vma,
struct hmm_range *range,
unsigned long start,
unsigned long end,
@@ -221,7 +214,7 @@ provides a set of flags to help the driver identify special CPU page table
entries.
Locking with the update() callback is the most important aspect the driver must
-respect in order to keep things properly synchronized. The usage pattern is:
+respect in order to keep things properly synchronized. The usage pattern is::
int driver_populate_range(...)
{
@@ -262,9 +255,8 @@ report commands as executed is serialized (there is no point in doing this
concurrently).
--------------------------------------------------------------------------------
-
-5) Represent and manage device memory from core kernel point of view
+Represent and manage device memory from core kernel point of view
+=================================================================
Several different designs were tried to support device memory. First one used
a device specific data structure to keep information about migrated memory and
@@ -280,14 +272,14 @@ unaware of the difference. We only need to make sure that no one ever tries to
map those pages from the CPU side.
HMM provides a set of helpers to register and hotplug device memory as a new
-region needing a struct page. This is offered through a very simple API:
+region needing a struct page. This is offered through a very simple API::
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
struct device *device,
unsigned long size);
void hmm_devmem_remove(struct hmm_devmem *devmem);
-The hmm_devmem_ops is where most of the important things are:
+The hmm_devmem_ops is where most of the important things are::
struct hmm_devmem_ops {
void (*free)(struct hmm_devmem *devmem, struct page *page);
@@ -306,13 +298,12 @@ which it cannot do. This second callback must trigger a migration back to
system memory.
--------------------------------------------------------------------------------
-
-6) Migration to and from device memory
+Migration to and from device memory
+===================================
Because the CPU cannot access device memory, migration must use the device DMA
engine to perform copy from and to device memory. For this we need a new
-migration helper:
+migration helper::
int migrate_vma(const struct migrate_vma_ops *ops,
struct vm_area_struct *vma,
@@ -331,7 +322,7 @@ migration might be for a range of addresses the device is actively accessing.
The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy())
controls destination memory allocation and copy operation. Second one is there
-to allow the device driver to perform cleanup operations after migration.
+to allow the device driver to perform cleanup operations after migration::
struct migrate_vma_ops {
void (*alloc_and_copy)(struct vm_area_struct *vma,
@@ -365,9 +356,8 @@ bandwidth but this is considered as a rare event and a price that we are
willing to pay to keep all the code simpler.
--------------------------------------------------------------------------------
-
-7) Memory cgroup (memcg) and rss accounting
+Memory cgroup (memcg) and rss accounting
+========================================
For now device memory is accounted as any regular page in rss counters (either
anonymous if device page is used for anonymous, file if device page is used for
diff --git a/Documentation/vm/hugetlbfs_reserv.txt b/Documentation/vm/hugetlbfs_reserv.rst
index 9aca09a76bed..9d200762114f 100644
--- a/Documentation/vm/hugetlbfs_reserv.txt
+++ b/Documentation/vm/hugetlbfs_reserv.rst
@@ -1,6 +1,13 @@
-Hugetlbfs Reservation Overview
-------------------------------
-Huge pages as described at 'Documentation/vm/hugetlbpage.txt' are typically
+.. _hugetlbfs_reserve:
+
+=====================
+Hugetlbfs Reservation
+=====================
+
+Overview
+========
+
+Huge pages as described at :ref:`hugetlbpage` are typically
preallocated for application use. These huge pages are instantiated in a
task's address space at page fault time if the VMA indicates huge pages are
to be used. If no huge page exists at page fault time, the task is sent
@@ -17,47 +24,55 @@ describe how huge page reserve processing is done in the v4.10 kernel.
Audience
---------
+========
This description is primarily targeted at kernel developers who are modifying
hugetlbfs code.
The Data Structures
--------------------
+===================
+
resv_huge_pages
This is a global (per-hstate) count of reserved huge pages. Reserved
huge pages are only available to the task which reserved them.
Therefore, the number of huge pages generally available is computed
- as (free_huge_pages - resv_huge_pages).
+ as (``free_huge_pages - resv_huge_pages``).
Reserve Map
- A reserve map is described by the structure:
- struct resv_map {
- struct kref refs;
- spinlock_t lock;
- struct list_head regions;
- long adds_in_progress;
- struct list_head region_cache;
- long region_cache_count;
- };
+ A reserve map is described by the structure::
+
+ struct resv_map {
+ struct kref refs;
+ spinlock_t lock;
+ struct list_head regions;
+ long adds_in_progress;
+ struct list_head region_cache;
+ long region_cache_count;
+ };
+
There is one reserve map for each huge page mapping in the system.
The regions list within the resv_map describes the regions within
- the mapping. A region is described as:
- struct file_region {
- struct list_head link;
- long from;
- long to;
- };
+ the mapping. A region is described as::
+
+ struct file_region {
+ struct list_head link;
+ long from;
+ long to;
+ };
+
The 'from' and 'to' fields of the file region structure are huge page
indices into the mapping. Depending on the type of mapping, a
region in the reserv_map may indicate reservations exist for the
range, or reservations do not exist.
Flags for MAP_PRIVATE Reservations
These are stored in the bottom bits of the reservation map pointer.
- #define HPAGE_RESV_OWNER (1UL << 0) Indicates this task is the
- owner of the reservations associated with the mapping.
- #define HPAGE_RESV_UNMAPPED (1UL << 1) Indicates task originally
- mapping this range (and creating reserves) has unmapped a
- page from this task (the child) due to a failed COW.
+
+ ``#define HPAGE_RESV_OWNER (1UL << 0)``
+ Indicates this task is the owner of the reservations
+ associated with the mapping.
+ ``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
+ Indicates task originally mapping this range (and creating
+ reserves) has unmapped a page from this task (the child)
+ due to a failed COW.
Page Flags
The PagePrivate page flag is used to indicate that a huge page
reservation must be restored when the huge page is freed. More
@@ -65,12 +80,14 @@ Page Flags
Reservation Map Location (Private or Shared)
---------------------------------------------
+============================================
+
A huge page mapping or segment is either private or shared. If private,
it is typically only available to a single address space (task). If shared,
it can be mapped into multiple address spaces (tasks). The location and
semantics of the reservation map is significantly different for two types
of mappings. Location differences are:
+
- For private mappings, the reservation map hangs off the the VMA structure.
Specifically, vma->vm_private_data. This reserve map is created at the
time the mapping (mmap(MAP_PRIVATE)) is created.
@@ -82,15 +99,15 @@ of mappings. Location differences are:
Creating Reservations
----------------------
+=====================
Reservations are created when a huge page backed shared memory segment is
created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
-These operations result in a call to the routine hugetlb_reserve_pages()
+These operations result in a call to the routine hugetlb_reserve_pages()::
-int hugetlb_reserve_pages(struct inode *inode,
- long from, long to,
- struct vm_area_struct *vma,
- vm_flags_t vm_flags)
+ int hugetlb_reserve_pages(struct inode *inode,
+ long from, long to,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
The first thing hugetlb_reserve_pages() does is check for the NORESERVE
flag was specified in either the shmget() or mmap() call. If NORESERVE
@@ -105,6 +122,7 @@ the 'from' and 'to' arguments have been adjusted by this offset.
One of the big differences between PRIVATE and SHARED mappings is the way
in which reservations are represented in the reservation map.
+
- For shared mappings, an entry in the reservation map indicates a reservation
exists or did exist for the corresponding page. As reservations are
consumed, the reservation map is not modified.
@@ -121,12 +139,13 @@ to indicate this VMA owns the reservations.
The reservation map is consulted to determine how many huge page reservations
are needed for the current mapping/segment. For private mappings, this is
always the value (to - from). However, for shared mappings it is possible that some reservations may already exist within the range (to - from). See the
-section "Reservation Map Modifications" for details on how this is accomplished.
+section :ref:`Reservation Map Modifications <resv_map_modifications>`
+for details on how this is accomplished.
The mapping may be associated with a subpool. If so, the subpool is consulted
to ensure there is sufficient space for the mapping. It is possible that the
subpool has set aside reservations that can be used for the mapping. See the
-section "Subpool Reservations" for more details.
+section :ref:`Subpool Reservations <sub_pool_resv>` for more details.
After consulting the reservation map and subpool, the number of needed new
reservations is known. The routine hugetlb_acct_memory() is called to check
@@ -135,9 +154,11 @@ calls into routines that potentially allocate and adjust surplus page counts.
However, within those routines the code is simply checking to ensure there
are enough free huge pages to accommodate the reservation. If there are,
the global reservation count resv_huge_pages is adjusted something like the
-following.
+following::
+
if (resv_needed <= (resv_huge_pages - free_huge_pages))
resv_huge_pages += resv_needed;
+
Note that the global lock hugetlb_lock is held when checking and adjusting
these counters.
@@ -152,14 +173,18 @@ If hugetlb_reserve_pages() was successful, the global reservation count and
reservation map associated with the mapping will be modified as required to
ensure reservations exist for the range 'from' - 'to'.
+.. _consume_resv:
Consuming Reservations/Allocating a Huge Page
----------------------------------------------
+=============================================
+
Reservations are consumed when huge pages associated with the reservations
are allocated and instantiated in the corresponding mapping. The allocation
-is performed within the routine alloc_huge_page().
-struct page *alloc_huge_page(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
+is performed within the routine alloc_huge_page()::
+
+ struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve)
+
alloc_huge_page is passed a VMA pointer and a virtual address, so it can
consult the reservation map to determine if a reservation exists. In addition,
alloc_huge_page takes the argument avoid_reserve which indicates reserves
@@ -170,8 +195,9 @@ page are being allocated.
The helper routine vma_needs_reservation() is called to determine if a
reservation exists for the address within the mapping(vma). See the section
-"Reservation Map Helper Routines" for detailed information on what this
-routine does. The value returned from vma_needs_reservation() is generally
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
+information on what this routine does.
+The value returned from vma_needs_reservation() is generally
0 or 1. 0 if a reservation exists for the address, 1 if no reservation exists.
If a reservation does not exist, and there is a subpool associated with the
mapping the subpool is consulted to determine if it contains reservations.
@@ -180,21 +206,25 @@ However, in every case the avoid_reserve argument overrides the use of
a reservation for the allocation. After determining whether a reservation
exists and can be used for the allocation, the routine dequeue_huge_page_vma()
is called. This routine takes two arguments related to reservations:
+
- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
- chg, even though this argument is of type long only the values 0 or 1 are
passed to dequeue_huge_page_vma. If the value is 0, it indicates a
reservation exists (see the section "Memory Policy and Reservations" for
possible issues). If the value is 1, it indicates a reservation does not
exist and the page must be taken from the global free pool if possible.
+
The free lists associated with the memory policy of the VMA are searched for
a free page. If a page is found, the value free_huge_pages is decremented
when the page is removed from the free list. If there was a reservation
-associated with the page, the following adjustments are made:
+associated with the page, the following adjustments are made::
+
SetPagePrivate(page); /* Indicates allocating this page consumed
* a reservation, and if an error is
* encountered such that the page must be
* freed, the reservation will be restored. */
resv_huge_pages--; /* Decrement the global reservation count */
+
Note, if no huge page can be found that satisfies the VMA's memory policy
an attempt will be made to allocate one using the buddy allocator. This
brings up the issue of surplus huge pages and overcommit which is beyond
@@ -222,12 +252,14 @@ mapping. In such cases, the reservation count and subpool free page count
will be off by one. This rare condition can be identified by comparing the
return value from vma_needs_reservation and vma_commit_reservation. If such
a race is detected, the subpool and global reserve counts are adjusted to
-compensate. See the section "Reservation Map Helper Routines" for more
+compensate. See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more
information on these routines.
Instantiate Huge Pages
-----------------------
+======================
+
After huge page allocation, the page is typically added to the page tables
of the allocating task. Before this, pages in a shared mapping are added
to the page cache and pages in private mappings are added to an anonymous
@@ -237,7 +269,8 @@ to the global reservation count (resv_huge_pages).
Freeing Huge Pages
-------------------
+==================
+
Huge page freeing is performed by the routine free_huge_page(). This routine
is the destructor for hugetlbfs compound pages. As a result, it is only
passed a pointer to the page struct. When a huge page is freed, reservation
@@ -247,7 +280,8 @@ on an error path where a global reserve count must be restored.
The page->private field points to any subpool associated with the page.
If the PagePrivate flag is set, it indicates the global reserve count should
-be adjusted (see the section "Consuming Reservations/Allocating a Huge Page"
+be adjusted (see the section
+:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
for information on how these are set).
The routine first calls hugepage_subpool_put_pages() for the page. If this
@@ -259,9 +293,11 @@ Therefore, the global resv_huge_pages counter is incremented in this case.
If the PagePrivate flag was set in the page, the global resv_huge_pages counter
will always be incremented.
+.. _sub_pool_resv:
Subpool Reservations
---------------------
+====================
+
There is a struct hstate associated with each huge page size. The hstate
tracks all huge pages of the specified size. A subpool represents a subset
of pages within a hstate that is associated with a mounted hugetlbfs
@@ -295,7 +331,8 @@ the global pools.
COW and Reservations
---------------------
+====================
+
Since shared mappings all point to and use the same underlying pages, the
biggest reservation concern for COW is private mappings. In this case,
two tasks can be pointing at the same previously allocated page. One task
@@ -326,30 +363,36 @@ faults on a non-present page. But, the original owner of the
mapping/reservation will behave as expected.
+.. _resv_map_modifications:
+
Reservation Map Modifications
------------------------------
+=============================
+
The following low level routines are used to make modifications to a
reservation map. Typically, these routines are not called directly. Rather,
a reservation map helper routine is called which calls one of these low level
routines. These low level routines are fairly well documented in the source
-code (mm/hugetlb.c). These routines are:
-long region_chg(struct resv_map *resv, long f, long t);
-long region_add(struct resv_map *resv, long f, long t);
-void region_abort(struct resv_map *resv, long f, long t);
-long region_count(struct resv_map *resv, long f, long t);
+code (mm/hugetlb.c). These routines are::
+
+ long region_chg(struct resv_map *resv, long f, long t);
+ long region_add(struct resv_map *resv, long f, long t);
+ void region_abort(struct resv_map *resv, long f, long t);
+ long region_count(struct resv_map *resv, long f, long t);
Operations on the reservation map typically involve two operations:
+
1) region_chg() is called to examine the reserve map and determine how
many pages in the specified range [f, t) are NOT currently represented.
The calling code performs global checks and allocations to determine if
there are enough huge pages for the operation to succeed.
-2a) If the operation can succeed, region_add() is called to actually modify
- the reservation map for the same range [f, t) previously passed to
- region_chg().
-2b) If the operation can not succeed, region_abort is called for the same range
- [f, t) to abort the operation.
+2)
+ a) If the operation can succeed, region_add() is called to actually modify
+ the reservation map for the same range [f, t) previously passed to
+ region_chg().
+ b) If the operation can not succeed, region_abort is called for the same
+ range [f, t) to abort the operation.
Note that this is a two step process where region_add() and region_abort()
are guaranteed to succeed after a prior call to region_chg() for the same
@@ -371,6 +414,7 @@ and make the appropriate adjustments.
The routine region_del() is called to remove regions from a reservation map.
It is typically called in the following situations:
+
- When a file in the hugetlbfs filesystem is being removed, the inode will
be released and the reservation map freed. Before freeing the reservation
map, all the individual file_region structures must be freed. In this case
@@ -384,6 +428,7 @@ It is typically called in the following situations:
removed, region_del() is called to remove the corresponding entry from the
reservation map. In this case, region_del is passed the range
[page_idx, page_idx + 1).
+
In every case, region_del() will return the number of pages removed from the
reservation map. In VERY rare cases, region_del() can fail. This can only
happen in the hole punch case where it has to split an existing file_region
@@ -403,9 +448,11 @@ outstanding (outstanding = (end - start) - region_count(resv, start, end)).
Since the mapping is going away, the subpool and global reservation counts
are decremented by the number of outstanding reservations.
+.. _resv_map_helpers:
Reservation Map Helper Routines
--------------------------------
+===============================
+
Several helper routines exist to query and modify the reservation maps.
These routines are only interested with reservations for a specific huge
page, so they just pass in an address instead of a range. In addition,
@@ -414,32 +461,40 @@ or shared) and the location of the reservation map (inode or VMA) can be
determined. These routines simply call the underlying routines described
in the section "Reservation Map Modifications". However, they do take into
account the 'opposite' meaning of reservation map entries for private and
-shared mappings and hide this detail from the caller.
+shared mappings and hide this detail from the caller::
+
+ long vma_needs_reservation(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long addr)
-long vma_needs_reservation(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr)
This routine calls region_chg() for the specified page. If no reservation
-exists, 1 is returned. If a reservation exists, 0 is returned.
+exists, 1 is returned. If a reservation exists, 0 is returned::
+
+ long vma_commit_reservation(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long addr)
-long vma_commit_reservation(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr)
This calls region_add() for the specified page. As in the case of region_chg
and region_add, this routine is to be called after a previous call to
vma_needs_reservation. It will add a reservation entry for the page. It
returns 1 if the reservation was added and 0 if not. The return value should
be compared with the return value of the previous call to
vma_needs_reservation. An unexpected difference indicates the reservation
-map was modified between calls.
+map was modified between calls::
+
+ void vma_end_reservation(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long addr)
-void vma_end_reservation(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr)
This calls region_abort() for the specified page. As in the case of region_chg
and region_abort, this routine is to be called after a previous call to
vma_needs_reservation. It will abort/end the in progress reservation add
-operation.
+operation::
+
+ long vma_add_reservation(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long addr)
-long vma_add_reservation(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr)
This is a special wrapper routine to help facilitate reservation cleanup
on error paths. It is only called from the routine restore_reserve_on_error().
This routine is used in conjunction with vma_needs_reservation in an attempt
@@ -453,8 +508,10 @@ be done on error paths.
Reservation Cleanup in Error Paths
-----------------------------------
-As mentioned in the section "Reservation Map Helper Routines", reservation
+==================================
+
+As mentioned in the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
map modifications are performed in two steps. First vma_needs_reservation
is called before a page is allocated. If the allocation is successful,
then vma_commit_reservation is called. If not, vma_end_reservation is called.
@@ -494,13 +551,14 @@ so that a reservation will not be leaked when the huge page is freed.
Reservations and Memory Policy
-------------------------------
+==============================
Per-node huge page lists existed in struct hstate when git was first used
to manage Linux code. The concept of reservations was added some time later.
When reservations were added, no attempt was made to take memory policy
into account. While cpusets are not exactly the same as memory policy, this
comment in hugetlb_acct_memory sums up the interaction between reservations
-and cpusets/memory policy.
+and cpusets/memory policy::
+
/*
* When cpuset is configured, it breaks the strict hugetlb page
* reservation as the accounting is done on a global variable. Such
@@ -525,5 +583,13 @@ of cpusets or memory policy there is no guarantee that huge pages will be
available on the required nodes. This is true even if there are a sufficient
number of global reservations.
+Hugetlbfs regression testing
+============================
+
+The most complete set of hugetlb tests are in the libhugetlbfs repository.
+If you modify any hugetlb related code, use the libhugetlbfs test suite
+to check for regressions. In addition, if you add any new hugetlb
+functionality, please add appropriate tests to libhugetlbfs.
+--
Mike Kravetz, 7 April 2017
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.rst
index e912d7eee769..09bd24a92784 100644
--- a/Documentation/vm/hwpoison.txt
+++ b/Documentation/vm/hwpoison.rst
@@ -1,7 +1,14 @@
+.. hwpoison:
+
+========
+hwpoison
+========
+
What is hwpoison?
+=================
Upcoming Intel CPUs have support for recovering from some memory errors
-(``MCA recovery''). This requires the OS to declare a page "poisoned",
+(``MCA recovery``). This requires the OS to declare a page "poisoned",
kill the processes associated with it and avoid using it in the future.
This patchkit implements the necessary infrastructure in the VM.
@@ -46,9 +53,10 @@ address. This in theory allows other applications to handle
memory failures too. The expection is that near all applications
won't do that, but some very specialized ones might.
----
+Failure recovery modes
+======================
-There are two (actually three) modi memory failure recovery can be in:
+There are two (actually three) modes memory failure recovery can be in:
vm.memory_failure_recovery sysctl set to zero:
All memory failures cause a panic. Do not attempt recovery.
@@ -67,9 +75,8 @@ late kill
This is best for memory error unaware applications and default
Note some pages are always handled as late kill.
----
-
-User control:
+User control
+============
vm.memory_failure_recovery
See sysctl.txt
@@ -79,11 +86,19 @@ vm.memory_failure_early_kill
PR_MCE_KILL
Set early/late kill mode/revert to system default
- arg1: PR_MCE_KILL_CLEAR: Revert to system default
- arg1: PR_MCE_KILL_SET: arg2 defines thread specific mode
- PR_MCE_KILL_EARLY: Early kill
- PR_MCE_KILL_LATE: Late kill
- PR_MCE_KILL_DEFAULT: Use system global default
+
+ arg1: PR_MCE_KILL_CLEAR:
+ Revert to system default
+ arg1: PR_MCE_KILL_SET:
+ arg2 defines thread specific mode
+
+ PR_MCE_KILL_EARLY:
+ Early kill
+ PR_MCE_KILL_LATE:
+ Late kill
+ PR_MCE_KILL_DEFAULT
+ Use system global default
+
Note that if you want to have a dedicated thread which handles
the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
@@ -92,77 +107,64 @@ PR_MCE_KILL
PR_MCE_KILL_GET
return current mode
+Testing
+=======
----
-
-Testing:
-
-madvise(MADV_HWPOISON, ....)
- (as root)
- Poison a page in the process for testing
-
+* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the
+ process for testing
-hwpoison-inject module through debugfs
+* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/``
-/sys/kernel/debug/hwpoison/
+ corrupt-pfn
+ Inject hwpoison fault at PFN echoed into this file. This does
+ some early filtering to avoid corrupted unintended pages in test suites.
-corrupt-pfn
+ unpoison-pfn
+ Software-unpoison page at PFN echoed into this file. This way
+ a page can be reused again. This only works for Linux
+ injected failures, not for real memory failures.
-Inject hwpoison fault at PFN echoed into this file. This does
-some early filtering to avoid corrupted unintended pages in test suites.
+ Note these injection interfaces are not stable and might change between
+ kernel versions
-unpoison-pfn
+ corrupt-filter-dev-major, corrupt-filter-dev-minor
+ Only handle memory failures to pages associated with the file
+ system defined by block device major/minor. -1U is the
+ wildcard value. This should be only used for testing with
+ artificial injection.
-Software-unpoison page at PFN echoed into this file. This
-way a page can be reused again.
-This only works for Linux injected failures, not for real
-memory failures.
+ corrupt-filter-memcg
+ Limit injection to pages owned by memgroup. Specified by inode
+ number of the memcg.
-Note these injection interfaces are not stable and might change between
-kernel versions
+ Example::
-corrupt-filter-dev-major
-corrupt-filter-dev-minor
+ mkdir /sys/fs/cgroup/mem/hwpoison
-Only handle memory failures to pages associated with the file system defined
-by block device major/minor. -1U is the wildcard value.
-This should be only used for testing with artificial injection.
+ usemem -m 100 -s 1000 &
+ echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
-corrupt-filter-memcg
+ memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+ echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
-Limit injection to pages owned by memgroup. Specified by inode number
-of the memcg.
+ page-types -p `pidof init` --hwpoison # shall do nothing
+ page-types -p `pidof usemem` --hwpoison # poison its pages
-Example:
- mkdir /sys/fs/cgroup/mem/hwpoison
+ corrupt-filter-flags-mask, corrupt-filter-flags-value
+ When specified, only poison pages if ((page_flags & mask) ==
+ value). This allows stress testing of many kinds of
+ pages. The page_flags are the same as in /proc/kpageflags. The
+ flag bits are defined in include/linux/kernel-page-flags.h and
+ documented in Documentation/admin-guide/mm/pagemap.rst
- usemem -m 100 -s 1000 &
- echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+* Architecture specific MCE injector
- memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
- echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+ x86 has mce-inject, mce-test
- page-types -p `pidof init` --hwpoison # shall do nothing
- page-types -p `pidof usemem` --hwpoison # poison its pages
+ Some portable hwpoison test programs in mce-test, see below.
-corrupt-filter-flags-mask
-corrupt-filter-flags-value
-
-When specified, only poison pages if ((page_flags & mask) == value).
-This allows stress testing of many kinds of pages. The page_flags
-are the same as in /proc/kpageflags. The flag bits are defined in
-include/linux/kernel-page-flags.h and documented in
-Documentation/vm/pagemap.txt
-
-Architecture specific MCE injector
-
-x86 has mce-inject, mce-test
-
-Some portable hwpoison test programs in mce-test, see blow.
-
----
-
-References:
+References
+==========
http://halobates.de/mce-lc09-2.pdf
Overview presentation from LinuxCon 09
@@ -174,14 +176,11 @@ git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
x86 specific injector
----
-
-Limitations:
-
+Limitations
+===========
- Not all page types are supported and never will. Most kernel internal
-objects cannot be recovered, only LRU pages for now.
+ objects cannot be recovered, only LRU pages for now.
- Right now hugepage support is missing.
---
Andi Kleen, Oct 2009
-
diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
new file mode 100644
index 000000000000..c4ded22197ca
--- /dev/null
+++ b/Documentation/vm/index.rst
@@ -0,0 +1,50 @@
+=====================================
+Linux Memory Management Documentation
+=====================================
+
+This is a collection of documents about Linux memory management (mm) subsystem.
+
+User guides for MM features
+===========================
+
+The following documents provide guides for controlling and tuning
+various features of the Linux memory management
+
+.. toctree::
+ :maxdepth: 1
+
+ swap_numa
+ zswap
+
+Kernel developers MM documentation
+==================================
+
+The below documents describe MM internals with different level of
+details ranging from notes and mailing list responses to elaborate
+descriptions of data structures and algorithms.
+
+.. toctree::
+ :maxdepth: 1
+
+ active_mm
+ balance
+ cleancache
+ frontswap
+ highmem
+ hmm
+ hwpoison
+ hugetlbfs_reserv
+ ksm
+ mmu_notifier
+ numa
+ overcommit-accounting
+ page_migration
+ page_frags
+ page_owner
+ remap_file_pages
+ slub
+ split_page_table_lock
+ transhuge
+ unevictable-lru
+ z3fold
+ zsmalloc
diff --git a/Documentation/vm/ksm.rst b/Documentation/vm/ksm.rst
new file mode 100644
index 000000000000..d32016d9be2c
--- /dev/null
+++ b/Documentation/vm/ksm.rst
@@ -0,0 +1,87 @@
+.. _ksm:
+
+=======================
+Kernel Samepage Merging
+=======================
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation,
+and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
+
+The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst <admin_guide_ksm>`
+
+Design
+======
+
+Overview
+--------
+
+.. kernel-doc:: mm/ksm.c
+ :DOC: Overview
+
+Reverse mapping
+---------------
+KSM maintains reverse mapping information for KSM pages in the stable
+tree.
+
+If a KSM page is shared between less than ``max_page_sharing`` VMAs,
+the node of the stable tree that represents such KSM page points to a
+list of :c:type:`struct rmap_item` and the ``page->mapping`` of the
+KSM page points to the stable tree node.
+
+When the sharing passes this threshold, KSM adds a second dimension to
+the stable tree. The tree node becomes a "chain" that links one or
+more "dups". Each "dup" keeps reverse mapping information for a KSM
+page with ``page->mapping`` pointing to that "dup".
+
+Every "chain" and all "dups" linked into a "chain" enforce the
+invariant that they represent the same write protected memory content,
+even if each "dup" will be pointed by a different KSM page copy of
+that content.
+
+This way the stable tree lookup computational complexity is unaffected
+if compared to an unlimited list of reverse mappings. It is still
+enforced that there cannot be KSM page content duplicates in the
+stable tree itself.
+
+The deduplication limit enforced by ``max_page_sharing`` is required
+to avoid the virtual memory rmap lists to grow too large. The rmap
+walk has O(N) complexity where N is the number of rmap_items
+(i.e. virtual mappings) that are sharing the page, which is in turn
+capped by ``max_page_sharing``. So this effectively spreads the linear
+O(N) computational complexity from rmap walk context over different
+KSM pages. The ksmd walk over the stable_node "chains" is also O(N),
+but N is the number of stable_node "dups", not the number of
+rmap_items, so it has not a significant impact on ksmd performance. In
+practice the best stable_node "dup" candidate will be kept and found
+at the head of the "dups" list.
+
+High values of ``max_page_sharing`` result in faster memory merging
+(because there will be fewer stable_node dups queued into the
+stable_node chain->hlist to check for pruning) and higher
+deduplication factor at the expense of slower worst case for rmap
+walks for any KSM page which can happen during swapping, compaction,
+NUMA balancing and page migration.
+
+The ``stable_node_dups/stable_node_chains`` ratio is also affected by the
+``max_page_sharing`` tunable, and an high ratio may indicate fragmentation
+in the stable_node dups, which could be solved by introducing
+fragmentation algorithms in ksmd which would refile rmap_items from
+one stable_node dup to another stable_node dup, in order to free up
+stable_node "dups" with few rmap_items in them, but that may increase
+the ksmd CPU usage and possibly slowdown the readonly computations on
+the KSM pages of the applications.
+
+The whole list of stable_node "dups" linked in the stable_node
+"chains" is scanned periodically in order to prune stale stable_nodes.
+The frequency of such scans is defined by
+``stable_node_chains_prune_millisecs`` sysfs tunable.
+
+Reference
+---------
+.. kernel-doc:: mm/ksm.c
+ :functions: mm_slot ksm_scan stable_node rmap_item
+
+--
+Izik Eidus,
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
deleted file mode 100644
index 6686bd267dc9..000000000000
--- a/Documentation/vm/ksm.txt
+++ /dev/null
@@ -1,178 +0,0 @@
-How to use the Kernel Samepage Merging feature
-----------------------------------------------
-
-KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
-added to the Linux kernel in 2.6.32. See mm/ksm.c for its implementation,
-and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
-
-The KSM daemon ksmd periodically scans those areas of user memory which
-have been registered with it, looking for pages of identical content which
-can be replaced by a single write-protected page (which is automatically
-copied if a process later wants to update its content).
-
-KSM was originally developed for use with KVM (where it was known as
-Kernel Shared Memory), to fit more virtual machines into physical memory,
-by sharing the data common between them. But it can be useful to any
-application which generates many instances of the same data.
-
-KSM only merges anonymous (private) pages, never pagecache (file) pages.
-KSM's merged pages were originally locked into kernel memory, but can now
-be swapped out just like other user pages (but sharing is broken when they
-are swapped back in: ksmd must rediscover their identity and merge again).
-
-KSM only operates on those areas of address space which an application
-has advised to be likely candidates for merging, by using the madvise(2)
-system call: int madvise(addr, length, MADV_MERGEABLE).
-
-The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel
-that advice and restore unshared pages: whereupon KSM unmerges whatever
-it merged in that range. Note: this unmerging call may suddenly require
-more memory than is available - possibly failing with EAGAIN, but more
-probably arousing the Out-Of-Memory killer.
-
-If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
-and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was
-built with CONFIG_KSM=y, those calls will normally succeed: even if the
-the KSM daemon is not currently running, MADV_MERGEABLE still registers
-the range for whenever the KSM daemon is started; even if the range
-cannot contain any pages which KSM could actually merge; even if
-MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
-
-If a region of memory must be split into at least one new MADV_MERGEABLE
-or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
-will exceed vm.max_map_count (see Documentation/sysctl/vm.txt).
-
-Like other madvise calls, they are intended for use on mapped areas of
-the user address space: they will report ENOMEM if the specified range
-includes unmapped gaps (though working on the intervening mapped areas),
-and might fail with EAGAIN if not enough memory for internal structures.
-
-Applications should be considerate in their use of MADV_MERGEABLE,
-restricting its use to areas likely to benefit. KSM's scans may use a lot
-of processing power: some installations will disable KSM for that reason.
-
-The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/,
-readable by all but writable only by root:
-
-pages_to_scan - how many present pages to scan before ksmd goes to sleep
- e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan"
- Default: 100 (chosen for demonstration purposes)
-
-sleep_millisecs - how many milliseconds ksmd should sleep before next scan
- e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
- Default: 20 (chosen for demonstration purposes)
-
-merge_across_nodes - specifies if pages from different numa nodes can be merged.
- When set to 0, ksm merges only pages which physically
- reside in the memory area of same NUMA node. That brings
- lower latency to access of shared pages. Systems with more
- nodes, at significant NUMA distances, are likely to benefit
- from the lower latency of setting 0. Smaller systems, which
- need to minimize memory usage, are likely to benefit from
- the greater sharing of setting 1 (default). You may wish to
- compare how your system performs under each setting, before
- deciding on which to use. merge_across_nodes setting can be
- changed only when there are no ksm shared pages in system:
- set run 2 to unmerge pages first, then to 1 after changing
- merge_across_nodes, to remerge according to the new setting.
- Default: 1 (merging across nodes as in earlier releases)
-
-run - set 0 to stop ksmd from running but keep merged pages,
- set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
- set 2 to stop ksmd and unmerge all pages currently merged,
- but leave mergeable areas registered for next run
- Default: 0 (must be changed to 1 to activate KSM,
- except if CONFIG_SYSFS is disabled)
-
-use_zero_pages - specifies whether empty pages (i.e. allocated pages
- that only contain zeroes) should be treated specially.
- When set to 1, empty pages are merged with the kernel
- zero page(s) instead of with each other as it would
- happen normally. This can improve the performance on
- architectures with coloured zero pages, depending on
- the workload. Care should be taken when enabling this
- setting, as it can potentially degrade the performance
- of KSM for some workloads, for example if the checksums
- of pages candidate for merging match the checksum of
- an empty page. This setting can be changed at any time,
- it is only effective for pages merged after the change.
- Default: 0 (normal KSM behaviour as in earlier releases)
-
-max_page_sharing - Maximum sharing allowed for each KSM page. This
- enforces a deduplication limit to avoid the virtual
- memory rmap lists to grow too large. The minimum
- value is 2 as a newly created KSM page will have at
- least two sharers. The rmap walk has O(N)
- complexity where N is the number of rmap_items
- (i.e. virtual mappings) that are sharing the page,
- which is in turn capped by max_page_sharing. So
- this effectively spread the the linear O(N)
- computational complexity from rmap walk context
- over different KSM pages. The ksmd walk over the
- stable_node "chains" is also O(N), but N is the
- number of stable_node "dups", not the number of
- rmap_items, so it has not a significant impact on
- ksmd performance. In practice the best stable_node
- "dup" candidate will be kept and found at the head
- of the "dups" list. The higher this value the
- faster KSM will merge the memory (because there
- will be fewer stable_node dups queued into the
- stable_node chain->hlist to check for pruning) and
- the higher the deduplication factor will be, but
- the slowest the worst case rmap walk could be for
- any given KSM page. Slowing down the rmap_walk
- means there will be higher latency for certain
- virtual memory operations happening during
- swapping, compaction, NUMA balancing and page
- migration, in turn decreasing responsiveness for
- the caller of those virtual memory operations. The
- scheduler latency of other tasks not involved with
- the VM operations doing the rmap walk is not
- affected by this parameter as the rmap walks are
- always schedule friendly themselves.
-
-stable_node_chains_prune_millisecs - How frequently to walk the whole
- list of stable_node "dups" linked in the
- stable_node "chains" in order to prune stale
- stable_nodes. Smaller milllisecs values will free
- up the KSM metadata with lower latency, but they
- will make ksmd use more CPU during the scan. This
- only applies to the stable_node chains so it's a
- noop if not a single KSM page hit the
- max_page_sharing yet (there would be no stable_node
- chains in such case).
-
-The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/:
-
-pages_shared - how many shared pages are being used
-pages_sharing - how many more sites are sharing them i.e. how much saved
-pages_unshared - how many pages unique but repeatedly checked for merging
-pages_volatile - how many pages changing too fast to be placed in a tree
-full_scans - how many times all mergeable areas have been scanned
-
-stable_node_chains - number of stable node chains allocated, this is
- effectively the number of KSM pages that hit the
- max_page_sharing limit
-stable_node_dups - number of stable node dups queued into the
- stable_node chains
-
-A high ratio of pages_sharing to pages_shared indicates good sharing, but
-a high ratio of pages_unshared to pages_sharing indicates wasted effort.
-pages_volatile embraces several different kinds of activity, but a high
-proportion there would also indicate poor use of madvise MADV_MERGEABLE.
-
-The maximum possible page_sharing/page_shared ratio is limited by the
-max_page_sharing tunable. To increase the ratio max_page_sharing must
-be increased accordingly.
-
-The stable_node_dups/stable_node_chains ratio is also affected by the
-max_page_sharing tunable, and an high ratio may indicate fragmentation
-in the stable_node dups, which could be solved by introducing
-fragmentation algorithms in ksmd which would refile rmap_items from
-one stable_node dup to another stable_node dup, in order to freeup
-stable_node "dups" with few rmap_items in them, but that may increase
-the ksmd CPU usage and possibly slowdown the readonly computations on
-the KSM pages of the applications.
-
-Izik Eidus,
-Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst
new file mode 100644
index 000000000000..47baa1cf28c5
--- /dev/null
+++ b/Documentation/vm/mmu_notifier.rst
@@ -0,0 +1,99 @@
+.. _mmu_notifier:
+
+When do you need to notify inside page table lock ?
+===================================================
+
+When clearing a pte/pmd we are given a choice to notify the event through
+(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under
+the page table lock. But that notification is not necessary in all cases.
+
+For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
+thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
+process virtual address space). There is only 2 cases when you need to notify
+those secondary TLB while holding page table lock when clearing a pte/pmd:
+
+ A) page backing address is free before mmu_notifier_invalidate_range_end()
+ B) a page table entry is updated to point to a new page (COW, write fault
+ on zero page, __replace_page(), ...)
+
+Case A is obvious you do not want to take the risk for the device to write to
+a page that might now be used by some completely different task.
+
+Case B is more subtle. For correctness it requires the following sequence to
+happen:
+
+ - take page table lock
+ - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
+ - set page table entry to point to new page
+
+If clearing the page table entry is not followed by a notify before setting
+the new pte/pmd value then you can break memory model like C11 or C++11 for
+the device.
+
+Consider the following scenario (device use a feature similar to ATS/PASID):
+
+Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume
+they are write protected for COW (other case of B apply too).
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0 {try to write to addrA}
+ CPU-thread-1 {try to write to addrB}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {read addrA and populate device TLB}
+ DEV-thread-2 {read addrB and populate device TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}}
+ CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0 {preempted}
+ CPU-thread-1 {preempted}
+ CPU-thread-2 {write to addrA which is a write to new page}
+ CPU-thread-3 {}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0 {preempted}
+ CPU-thread-1 {preempted}
+ CPU-thread-2 {}
+ CPU-thread-3 {write to addrB which is a write to new page}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0 {preempted}
+ CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0 {preempted}
+ CPU-thread-1 {}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {read addrA from old page}
+ DEV-thread-2 {read addrB from new page}
+
+So here because at time N+2 the clear page table entry was not pair with a
+notification to invalidate the secondary TLB, the device see the new value for
+addrB before seing the new value for addrA. This break total memory ordering
+for the device.
+
+When changing a pte to write protect or to point to a new write protected page
+with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range
+call to mmu_notifier_invalidate_range_end() outside the page table lock. This
+is true even if the thread doing the page table update is preempted right after
+releasing page table lock but before call mmu_notifier_invalidate_range_end().
diff --git a/Documentation/vm/mmu_notifier.txt b/Documentation/vm/mmu_notifier.txt
deleted file mode 100644
index 23b462566bb7..000000000000
--- a/Documentation/vm/mmu_notifier.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-When do you need to notify inside page table lock ?
-
-When clearing a pte/pmd we are given a choice to notify the event through
-(notify version of *_clear_flush call mmu_notifier_invalidate_range) under
-the page table lock. But that notification is not necessary in all cases.
-
-For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
-thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
-process virtual address space). There is only 2 cases when you need to notify
-those secondary TLB while holding page table lock when clearing a pte/pmd:
-
- A) page backing address is free before mmu_notifier_invalidate_range_end()
- B) a page table entry is updated to point to a new page (COW, write fault
- on zero page, __replace_page(), ...)
-
-Case A is obvious you do not want to take the risk for the device to write to
-a page that might now be used by some completely different task.
-
-Case B is more subtle. For correctness it requires the following sequence to
-happen:
- - take page table lock
- - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
- - set page table entry to point to new page
-
-If clearing the page table entry is not followed by a notify before setting
-the new pte/pmd value then you can break memory model like C11 or C++11 for
-the device.
-
-Consider the following scenario (device use a feature similar to ATS/PASID):
-
-Two address addrA and addrB such that |addrA - addrB| >= PAGE_SIZE we assume
-they are write protected for COW (other case of B apply too).
-
-[Time N] --------------------------------------------------------------------
-CPU-thread-0 {try to write to addrA}
-CPU-thread-1 {try to write to addrB}
-CPU-thread-2 {}
-CPU-thread-3 {}
-DEV-thread-0 {read addrA and populate device TLB}
-DEV-thread-2 {read addrB and populate device TLB}
-[Time N+1] ------------------------------------------------------------------
-CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
-CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
-CPU-thread-2 {}
-CPU-thread-3 {}
-DEV-thread-0 {}
-DEV-thread-2 {}
-[Time N+2] ------------------------------------------------------------------
-CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}}
-CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}}
-CPU-thread-2 {}
-CPU-thread-3 {}
-DEV-thread-0 {}
-DEV-thread-2 {}
-[Time N+3] ------------------------------------------------------------------
-CPU-thread-0 {preempted}
-CPU-thread-1 {preempted}
-CPU-thread-2 {write to addrA which is a write to new page}
-CPU-thread-3 {}
-DEV-thread-0 {}
-DEV-thread-2 {}
-[Time N+3] ------------------------------------------------------------------
-CPU-thread-0 {preempted}
-CPU-thread-1 {preempted}
-CPU-thread-2 {}
-CPU-thread-3 {write to addrB which is a write to new page}
-DEV-thread-0 {}
-DEV-thread-2 {}
-[Time N+4] ------------------------------------------------------------------
-CPU-thread-0 {preempted}
-CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
-CPU-thread-2 {}
-CPU-thread-3 {}
-DEV-thread-0 {}
-DEV-thread-2 {}
-[Time N+5] ------------------------------------------------------------------
-CPU-thread-0 {preempted}
-CPU-thread-1 {}
-CPU-thread-2 {}
-CPU-thread-3 {}
-DEV-thread-0 {read addrA from old page}
-DEV-thread-2 {read addrB from new page}
-
-So here because at time N+2 the clear page table entry was not pair with a
-notification to invalidate the secondary TLB, the device see the new value for
-addrB before seing the new value for addrA. This break total memory ordering
-for the device.
-
-When changing a pte to write protect or to point to a new write protected page
-with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range
-call to mmu_notifier_invalidate_range_end() outside the page table lock. This
-is true even if the thread doing the page table update is preempted right after
-releasing page table lock but before call mmu_notifier_invalidate_range_end().
diff --git a/Documentation/vm/numa b/Documentation/vm/numa.rst
index a31b85b9bb88..185d8a568168 100644
--- a/Documentation/vm/numa
+++ b/Documentation/vm/numa.rst
@@ -1,6 +1,10 @@
+.. _numa:
+
Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
+=============
What is NUMA?
+=============
This question can be answered from a couple of perspectives: the
hardware view and the Linux software view.
@@ -106,7 +110,7 @@ to improve NUMA locality using various CPU affinity command line interfaces,
such as taskset(1) and numactl(1), and program interfaces such as
sched_setaffinity(2). Further, one can modify the kernel's default local
allocation behavior using Linux NUMA memory policy.
-[see Documentation/vm/numa_memory_policy.txt.]
+[see Documentation/admin-guide/mm/numa_memory_policy.rst.]
System administrators can restrict the CPUs and nodes' memories that a non-
privileged user can specify in the scheduling or NUMA commands and functions
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
deleted file mode 100644
index 622b927816e7..000000000000
--- a/Documentation/vm/numa_memory_policy.txt
+++ /dev/null
@@ -1,452 +0,0 @@
-
-What is Linux Memory Policy?
-
-In the Linux kernel, "memory policy" determines from which node the kernel will
-allocate memory in a NUMA system or in an emulated NUMA system. Linux has
-supported platforms with Non-Uniform Memory Access architectures since 2.4.?.
-The current memory policy support was added to Linux 2.6 around May 2004. This
-document attempts to describe the concepts and APIs of the 2.6 memory policy
-support.
-
-Memory policies should not be confused with cpusets
-(Documentation/cgroup-v1/cpusets.txt)
-which is an administrative mechanism for restricting the nodes from which
-memory may be allocated by a set of processes. Memory policies are a
-programming interface that a NUMA-aware application can take advantage of. When
-both cpusets and policies are applied to a task, the restrictions of the cpuset
-takes priority. See "MEMORY POLICIES AND CPUSETS" below for more details.
-
-MEMORY POLICY CONCEPTS
-
-Scope of Memory Policies
-
-The Linux kernel supports _scopes_ of memory policy, described here from
-most general to most specific:
-
- System Default Policy: this policy is "hard coded" into the kernel. It
- is the policy that governs all page allocations that aren't controlled
- by one of the more specific policy scopes discussed below. When the
- system is "up and running", the system default policy will use "local
- allocation" described below. However, during boot up, the system
- default policy will be set to interleave allocations across all nodes
- with "sufficient" memory, so as not to overload the initial boot node
- with boot-time allocations.
-
- Task/Process Policy: this is an optional, per-task policy. When defined
- for a specific task, this policy controls all page allocations made by or
- on behalf of the task that aren't controlled by a more specific scope.
- If a task does not define a task policy, then all page allocations that
- would have been controlled by the task policy "fall back" to the System
- Default Policy.
-
- The task policy applies to the entire address space of a task. Thus,
- it is inheritable, and indeed is inherited, across both fork()
- [clone() w/o the CLONE_VM flag] and exec*(). This allows a parent task
- to establish the task policy for a child task exec()'d from an
- executable image that has no awareness of memory policy. See the
- MEMORY POLICY APIS section, below, for an overview of the system call
- that a task may use to set/change its task/process policy.
-
- In a multi-threaded task, task policies apply only to the thread
- [Linux kernel task] that installs the policy and any threads
- subsequently created by that thread. Any sibling threads existing
- at the time a new task policy is installed retain their current
- policy.
-
- A task policy applies only to pages allocated after the policy is
- installed. Any pages already faulted in by the task when the task
- changes its task policy remain where they were allocated based on
- the policy at the time they were allocated.
-
- VMA Policy: A "VMA" or "Virtual Memory Area" refers to a range of a task's
- virtual address space. A task may define a specific policy for a range
- of its virtual address space. See the MEMORY POLICIES APIS section,
- below, for an overview of the mbind() system call used to set a VMA
- policy.
-
- A VMA policy will govern the allocation of pages that back this region of
- the address space. Any regions of the task's address space that don't
- have an explicit VMA policy will fall back to the task policy, which may
- itself fall back to the System Default Policy.
-
- VMA policies have a few complicating details:
-
- VMA policy applies ONLY to anonymous pages. These include pages
- allocated for anonymous segments, such as the task stack and heap, and
- any regions of the address space mmap()ed with the MAP_ANONYMOUS flag.
- If a VMA policy is applied to a file mapping, it will be ignored if
- the mapping used the MAP_SHARED flag. If the file mapping used the
- MAP_PRIVATE flag, the VMA policy will only be applied when an
- anonymous page is allocated on an attempt to write to the mapping--
- i.e., at Copy-On-Write.
-
- VMA policies are shared between all tasks that share a virtual address
- space--a.k.a. threads--independent of when the policy is installed; and
- they are inherited across fork(). However, because VMA policies refer
- to a specific region of a task's address space, and because the address
- space is discarded and recreated on exec*(), VMA policies are NOT
- inheritable across exec(). Thus, only NUMA-aware applications may
- use VMA policies.
-
- A task may install a new VMA policy on a sub-range of a previously
- mmap()ed region. When this happens, Linux splits the existing virtual
- memory area into 2 or 3 VMAs, each with it's own policy.
-
- By default, VMA policy applies only to pages allocated after the policy
- is installed. Any pages already faulted into the VMA range remain
- where they were allocated based on the policy at the time they were
- allocated. However, since 2.6.16, Linux supports page migration via
- the mbind() system call, so that page contents can be moved to match
- a newly installed policy.
-
- Shared Policy: Conceptually, shared policies apply to "memory objects"
- mapped shared into one or more tasks' distinct address spaces. An
- application installs a shared policies the same way as VMA policies--using
- the mbind() system call specifying a range of virtual addresses that map
- the shared object. However, unlike VMA policies, which can be considered
- to be an attribute of a range of a task's address space, shared policies
- apply directly to the shared object. Thus, all tasks that attach to the
- object share the policy, and all pages allocated for the shared object,
- by any task, will obey the shared policy.
-
- As of 2.6.22, only shared memory segments, created by shmget() or
- mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. When shared
- policy support was added to Linux, the associated data structures were
- added to hugetlbfs shmem segments. At the time, hugetlbfs did not
- support allocation at fault time--a.k.a lazy allocation--so hugetlbfs
- shmem segments were never "hooked up" to the shared policy support.
- Although hugetlbfs segments now support lazy allocation, their support
- for shared policy has not been completed.
-
- As mentioned above [re: VMA policies], allocations of page cache
- pages for regular files mmap()ed with MAP_SHARED ignore any VMA
- policy installed on the virtual address range backed by the shared
- file mapping. Rather, shared page cache pages, including pages backing
- private mappings that have not yet been written by the task, follow
- task policy, if any, else System Default Policy.
-
- The shared policy infrastructure supports different policies on subset
- ranges of the shared object. However, Linux still splits the VMA of
- the task that installs the policy for each range of distinct policy.
- Thus, different tasks that attach to a shared memory segment can have
- different VMA configurations mapping that one shared object. This
- can be seen by examining the /proc/<pid>/numa_maps of tasks sharing
- a shared memory region, when one task has installed shared policy on
- one or more ranges of the region.
-
-Components of Memory Policies
-
- A Linux memory policy consists of a "mode", optional mode flags, and an
- optional set of nodes. The mode determines the behavior of the policy,
- the optional mode flags determine the behavior of the mode, and the
- optional set of nodes can be viewed as the arguments to the policy
- behavior.
-
- Internally, memory policies are implemented by a reference counted
- structure, struct mempolicy. Details of this structure will be discussed
- in context, below, as required to explain the behavior.
-
- Linux memory policy supports the following 4 behavioral modes:
-
- Default Mode--MPOL_DEFAULT: This mode is only used in the memory
- policy APIs. Internally, MPOL_DEFAULT is converted to the NULL
- memory policy in all policy scopes. Any existing non-default policy
- will simply be removed when MPOL_DEFAULT is specified. As a result,
- MPOL_DEFAULT means "fall back to the next most specific policy scope."
-
- For example, a NULL or default task policy will fall back to the
- system default policy. A NULL or default vma policy will fall
- back to the task policy.
-
- When specified in one of the memory policy APIs, the Default mode
- does not use the optional set of nodes.
-
- It is an error for the set of nodes specified for this policy to
- be non-empty.
-
- MPOL_BIND: This mode specifies that memory must come from the
- set of nodes specified by the policy. Memory will be allocated from
- the node in the set with sufficient free memory that is closest to
- the node where the allocation takes place.
-
- MPOL_PREFERRED: This mode specifies that the allocation should be
- attempted from the single node specified in the policy. If that
- allocation fails, the kernel will search other nodes, in order of
- increasing distance from the preferred node based on information
- provided by the platform firmware.
-
- Internally, the Preferred policy uses a single node--the
- preferred_node member of struct mempolicy. When the internal
- mode flag MPOL_F_LOCAL is set, the preferred_node is ignored and
- the policy is interpreted as local allocation. "Local" allocation
- policy can be viewed as a Preferred policy that starts at the node
- containing the cpu where the allocation takes place.
-
- It is possible for the user to specify that local allocation is
- always preferred by passing an empty nodemask with this mode.
- If an empty nodemask is passed, the policy cannot use the
- MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags described
- below.
-
- MPOL_INTERLEAVED: This mode specifies that page allocations be
- interleaved, on a page granularity, across the nodes specified in
- the policy. This mode also behaves slightly differently, based on
- the context where it is used:
-
- For allocation of anonymous pages and shared memory pages,
- Interleave mode indexes the set of nodes specified by the policy
- using the page offset of the faulting address into the segment
- [VMA] containing the address modulo the number of nodes specified
- by the policy. It then attempts to allocate a page, starting at
- the selected node, as if the node had been specified by a Preferred
- policy or had been selected by a local allocation. That is,
- allocation will follow the per node zonelist.
-
- For allocation of page cache pages, Interleave mode indexes the set
- of nodes specified by the policy using a node counter maintained
- per task. This counter wraps around to the lowest specified node
- after it reaches the highest specified node. This will tend to
- spread the pages out over the nodes specified by the policy based
- on the order in which they are allocated, rather than based on any
- page offset into an address range or file. During system boot up,
- the temporary interleaved system default policy works in this
- mode.
-
- Linux memory policy supports the following optional mode flags:
-
- MPOL_F_STATIC_NODES: This flag specifies that the nodemask passed by
- the user should not be remapped if the task or VMA's set of allowed
- nodes changes after the memory policy has been defined.
-
- Without this flag, anytime a mempolicy is rebound because of a
- change in the set of allowed nodes, the node (Preferred) or
- nodemask (Bind, Interleave) is remapped to the new set of
- allowed nodes. This may result in nodes being used that were
- previously undesired.
-
- With this flag, if the user-specified nodes overlap with the
- nodes allowed by the task's cpuset, then the memory policy is
- applied to their intersection. If the two sets of nodes do not
- overlap, the Default policy is used.
-
- For example, consider a task that is attached to a cpuset with
- mems 1-3 that sets an Interleave policy over the same set. If
- the cpuset's mems change to 3-5, the Interleave will now occur
- over nodes 3, 4, and 5. With this flag, however, since only node
- 3 is allowed from the user's nodemask, the "interleave" only
- occurs over that node. If no nodes from the user's nodemask are
- now allowed, the Default behavior is used.
-
- MPOL_F_STATIC_NODES cannot be combined with the
- MPOL_F_RELATIVE_NODES flag. It also cannot be used for
- MPOL_PREFERRED policies that were created with an empty nodemask
- (local allocation).
-
- MPOL_F_RELATIVE_NODES: This flag specifies that the nodemask passed
- by the user will be mapped relative to the set of the task or VMA's
- set of allowed nodes. The kernel stores the user-passed nodemask,
- and if the allowed nodes changes, then that original nodemask will
- be remapped relative to the new set of allowed nodes.
-
- Without this flag (and without MPOL_F_STATIC_NODES), anytime a
- mempolicy is rebound because of a change in the set of allowed
- nodes, the node (Preferred) or nodemask (Bind, Interleave) is
- remapped to the new set of allowed nodes. That remap may not
- preserve the relative nature of the user's passed nodemask to its
- set of allowed nodes upon successive rebinds: a nodemask of
- 1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
- allowed nodes is restored to its original state.
-
- With this flag, the remap is done so that the node numbers from
- the user's passed nodemask are relative to the set of allowed
- nodes. In other words, if nodes 0, 2, and 4 are set in the user's
- nodemask, the policy will be effected over the first (and in the
- Bind or Interleave case, the third and fifth) nodes in the set of
- allowed nodes. The nodemask passed by the user represents nodes
- relative to task or VMA's set of allowed nodes.
-
- If the user's nodemask includes nodes that are outside the range
- of the new set of allowed nodes (for example, node 5 is set in
- the user's nodemask when the set of allowed nodes is only 0-3),
- then the remap wraps around to the beginning of the nodemask and,
- if not already set, sets the node in the mempolicy nodemask.
-
- For example, consider a task that is attached to a cpuset with
- mems 2-5 that sets an Interleave policy over the same set with
- MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the
- interleave now occurs over nodes 3,5-7. If the cpuset's mems
- then change to 0,2-3,5, then the interleave occurs over nodes
- 0,2-3,5.
-
- Thanks to the consistent remapping, applications preparing
- nodemasks to specify memory policies using this flag should
- disregard their current, actual cpuset imposed memory placement
- and prepare the nodemask as if they were always located on
- memory nodes 0 to N-1, where N is the number of memory nodes the
- policy is intended to manage. Let the kernel then remap to the
- set of memory nodes allowed by the task's cpuset, as that may
- change over time.
-
- MPOL_F_RELATIVE_NODES cannot be combined with the
- MPOL_F_STATIC_NODES flag. It also cannot be used for
- MPOL_PREFERRED policies that were created with an empty nodemask
- (local allocation).
-
-MEMORY POLICY REFERENCE COUNTING
-
-To resolve use/free races, struct mempolicy contains an atomic reference
-count field. Internal interfaces, mpol_get()/mpol_put() increment and
-decrement this reference count, respectively. mpol_put() will only free
-the structure back to the mempolicy kmem cache when the reference count
-goes to zero.
-
-When a new memory policy is allocated, its reference count is initialized
-to '1', representing the reference held by the task that is installing the
-new policy. When a pointer to a memory policy structure is stored in another
-structure, another reference is added, as the task's reference will be dropped
-on completion of the policy installation.
-
-During run-time "usage" of the policy, we attempt to minimize atomic operations
-on the reference count, as this can lead to cache lines bouncing between cpus
-and NUMA nodes. "Usage" here means one of the following:
-
-1) querying of the policy, either by the task itself [using the get_mempolicy()
- API discussed below] or by another task using the /proc/<pid>/numa_maps
- interface.
-
-2) examination of the policy to determine the policy mode and associated node
- or node lists, if any, for page allocation. This is considered a "hot
- path". Note that for MPOL_BIND, the "usage" extends across the entire
- allocation process, which may sleep during page reclaimation, because the
- BIND policy nodemask is used, by reference, to filter ineligible nodes.
-
-We can avoid taking an extra reference during the usages listed above as
-follows:
-
-1) we never need to get/free the system default policy as this is never
- changed nor freed, once the system is up and running.
-
-2) for querying the policy, we do not need to take an extra reference on the
- target task's task policy nor vma policies because we always acquire the
- task's mm's mmap_sem for read during the query. The set_mempolicy() and
- mbind() APIs [see below] always acquire the mmap_sem for write when
- installing or replacing task or vma policies. Thus, there is no possibility
- of a task or thread freeing a policy while another task or thread is
- querying it.
-
-3) Page allocation usage of task or vma policy occurs in the fault path where
- we hold them mmap_sem for read. Again, because replacing the task or vma
- policy requires that the mmap_sem be held for write, the policy can't be
- freed out from under us while we're using it for page allocation.
-
-4) Shared policies require special consideration. One task can replace a
- shared memory policy while another task, with a distinct mmap_sem, is
- querying or allocating a page based on the policy. To resolve this
- potential race, the shared policy infrastructure adds an extra reference
- to the shared policy during lookup while holding a spin lock on the shared
- policy management structure. This requires that we drop this extra
- reference when we're finished "using" the policy. We must drop the
- extra reference on shared policies in the same query/allocation paths
- used for non-shared policies. For this reason, shared policies are marked
- as such, and the extra reference is dropped "conditionally"--i.e., only
- for shared policies.
-
- Because of this extra reference counting, and because we must lookup
- shared policies in a tree structure under spinlock, shared policies are
- more expensive to use in the page allocation path. This is especially
- true for shared policies on shared memory regions shared by tasks running
- on different NUMA nodes. This extra overhead can be avoided by always
- falling back to task or system default policy for shared memory regions,
- or by prefaulting the entire shared memory region into memory and locking
- it down. However, this might not be appropriate for all applications.
-
-MEMORY POLICY APIs
-
-Linux supports 3 system calls for controlling memory policy. These APIS
-always affect only the calling task, the calling task's address space, or
-some shared object mapped into the calling task's address space.
-
- Note: the headers that define these APIs and the parameter data types
- for user space applications reside in a package that is not part of
- the Linux kernel. The kernel system call interfaces, with the 'sys_'
- prefix, are defined in <linux/syscalls.h>; the mode and flag
- definitions are defined in <linux/mempolicy.h>.
-
-Set [Task] Memory Policy:
-
- long set_mempolicy(int mode, const unsigned long *nmask,
- unsigned long maxnode);
-
- Set's the calling task's "task/process memory policy" to mode
- specified by the 'mode' argument and the set of nodes defined
- by 'nmask'. 'nmask' points to a bit mask of node ids containing
- at least 'maxnode' ids. Optional mode flags may be passed by
- combining the 'mode' argument with the flag (for example:
- MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).
-
- See the set_mempolicy(2) man page for more details
-
-
-Get [Task] Memory Policy or Related Information
-
- long get_mempolicy(int *mode,
- const unsigned long *nmask, unsigned long maxnode,
- void *addr, int flags);
-
- Queries the "task/process memory policy" of the calling task, or
- the policy or location of a specified virtual address, depending
- on the 'flags' argument.
-
- See the get_mempolicy(2) man page for more details
-
-
-Install VMA/Shared Policy for a Range of Task's Address Space
-
- long mbind(void *start, unsigned long len, int mode,
- const unsigned long *nmask, unsigned long maxnode,
- unsigned flags);
-
- mbind() installs the policy specified by (mode, nmask, maxnodes) as
- a VMA policy for the range of the calling task's address space
- specified by the 'start' and 'len' arguments. Additional actions
- may be requested via the 'flags' argument.
-
- See the mbind(2) man page for more details.
-
-MEMORY POLICY COMMAND LINE INTERFACE
-
-Although not strictly part of the Linux implementation of memory policy,
-a command line tool, numactl(8), exists that allows one to:
-
-+ set the task policy for a specified program via set_mempolicy(2), fork(2) and
- exec(2)
-
-+ set the shared policy for a shared memory segment via mbind(2)
-
-The numactl(8) tool is packaged with the run-time version of the library
-containing the memory policy system call wrappers. Some distributions
-package the headers and compile-time libraries in a separate development
-package.
-
-
-MEMORY POLICIES AND CPUSETS
-
-Memory policies work within cpusets as described above. For memory policies
-that require a node or set of nodes, the nodes are restricted to the set of
-nodes whose memories are allowed by the cpuset constraints. If the nodemask
-specified for the policy contains nodes that are not allowed by the cpuset and
-MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes
-specified for the policy and the set of nodes with memory is used. If the
-result is the empty set, the policy is considered invalid and cannot be
-installed. If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped
-onto and folded into the task's set of allowed nodes as previously described.
-
-The interaction of memory policies and cpusets can be problematic when tasks
-in two cpusets share access to a memory region, such as shared memory segments
-created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and
-any of the tasks install shared policy on the region, only nodes whose
-memories are allowed in both cpusets may be used in the policies. Obtaining
-this information requires "stepping outside" the memory policy APIs to use the
-cpuset information and requires that one know in what cpusets other task might
-be attaching to the shared region. Furthermore, if the cpusets' allowed
-memory sets are disjoint, "local" allocation is the only valid policy.
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting
deleted file mode 100644
index cbfaaa674118..000000000000
--- a/Documentation/vm/overcommit-accounting
+++ /dev/null
@@ -1,80 +0,0 @@
-The Linux kernel supports the following overcommit handling modes
-
-0 - Heuristic overcommit handling. Obvious overcommits of
- address space are refused. Used for a typical system. It
- ensures a seriously wild allocation fails while allowing
- overcommit to reduce swap usage. root is allowed to
- allocate slightly more memory in this mode. This is the
- default.
-
-1 - Always overcommit. Appropriate for some scientific
- applications. Classic example is code using sparse arrays
- and just relying on the virtual memory consisting almost
- entirely of zero pages.
-
-2 - Don't overcommit. The total address space commit
- for the system is not permitted to exceed swap + a
- configurable amount (default is 50%) of physical RAM.
- Depending on the amount you use, in most situations
- this means a process will not be killed while accessing
- pages but will receive errors on memory allocation as
- appropriate.
-
- Useful for applications that want to guarantee their
- memory allocations will be available in the future
- without having to initialize every page.
-
-The overcommit policy is set via the sysctl `vm.overcommit_memory'.
-
-The overcommit amount can be set via `vm.overcommit_ratio' (percentage)
-or `vm.overcommit_kbytes' (absolute value).
-
-The current overcommit limit and amount committed are viewable in
-/proc/meminfo as CommitLimit and Committed_AS respectively.
-
-Gotchas
--------
-
-The C language stack growth does an implicit mremap. If you want absolute
-guarantees and run close to the edge you MUST mmap your stack for the
-largest size you think you will need. For typical stack usage this does
-not matter much but it's a corner case if you really really care
-
-In mode 2 the MAP_NORESERVE flag is ignored.
-
-
-How It Works
-------------
-
-The overcommit is based on the following rules
-
-For a file backed map
- SHARED or READ-only - 0 cost (the file is the map not swap)
- PRIVATE WRITABLE - size of mapping per instance
-
-For an anonymous or /dev/zero map
- SHARED - size of mapping
- PRIVATE READ-only - 0 cost (but of little use)
- PRIVATE WRITABLE - size of mapping per instance
-
-Additional accounting
- Pages made writable copies by mmap
- shmfs memory drawn from the same pool
-
-Status
-------
-
-o We account mmap memory mappings
-o We account mprotect changes in commit
-o We account mremap changes in size
-o We account brk
-o We account munmap
-o We report the commit status in /proc
-o Account and check on fork
-o Review stack handling/building on exec
-o SHMfs accounting
-o Implement actual limit enforcement
-
-To Do
------
-o Account ptrace pages (this is hard)
diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/vm/overcommit-accounting.rst
new file mode 100644
index 000000000000..0dd54bbe4afa
--- /dev/null
+++ b/Documentation/vm/overcommit-accounting.rst
@@ -0,0 +1,87 @@
+.. _overcommit_accounting:
+
+=====================
+Overcommit Accounting
+=====================
+
+The Linux kernel supports the following overcommit handling modes
+
+0
+ Heuristic overcommit handling. Obvious overcommits of address
+ space are refused. Used for a typical system. It ensures a
+ seriously wild allocation fails while allowing overcommit to
+ reduce swap usage. root is allowed to allocate slightly more
+ memory in this mode. This is the default.
+
+1
+ Always overcommit. Appropriate for some scientific
+ applications. Classic example is code using sparse arrays and
+ just relying on the virtual memory consisting almost entirely
+ of zero pages.
+
+2
+ Don't overcommit. The total address space commit for the
+ system is not permitted to exceed swap + a configurable amount
+ (default is 50%) of physical RAM. Depending on the amount you
+ use, in most situations this means a process will not be
+ killed while accessing pages but will receive errors on memory
+ allocation as appropriate.
+
+ Useful for applications that want to guarantee their memory
+ allocations will be available in the future without having to
+ initialize every page.
+
+The overcommit policy is set via the sysctl ``vm.overcommit_memory``.
+
+The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage)
+or ``vm.overcommit_kbytes`` (absolute value).
+
+The current overcommit limit and amount committed are viewable in
+``/proc/meminfo`` as CommitLimit and Committed_AS respectively.
+
+Gotchas
+=======
+
+The C language stack growth does an implicit mremap. If you want absolute
+guarantees and run close to the edge you MUST mmap your stack for the
+largest size you think you will need. For typical stack usage this does
+not matter much but it's a corner case if you really really care
+
+In mode 2 the MAP_NORESERVE flag is ignored.
+
+
+How It Works
+============
+
+The overcommit is based on the following rules
+
+For a file backed map
+ | SHARED or READ-only - 0 cost (the file is the map not swap)
+ | PRIVATE WRITABLE - size of mapping per instance
+
+For an anonymous or ``/dev/zero`` map
+ | SHARED - size of mapping
+ | PRIVATE READ-only - 0 cost (but of little use)
+ | PRIVATE WRITABLE - size of mapping per instance
+
+Additional accounting
+ | Pages made writable copies by mmap
+ | shmfs memory drawn from the same pool
+
+Status
+======
+
+* We account mmap memory mappings
+* We account mprotect changes in commit
+* We account mremap changes in size
+* We account brk
+* We account munmap
+* We report the commit status in /proc
+* Account and check on fork
+* Review stack handling/building on exec
+* SHMfs accounting
+* Implement actual limit enforcement
+
+To Do
+=====
+* Account ptrace pages (this is hard)
diff --git a/Documentation/vm/page_frags b/Documentation/vm/page_frags.rst
index a6714565dbf9..637cc49d1b2f 100644
--- a/Documentation/vm/page_frags
+++ b/Documentation/vm/page_frags.rst
@@ -1,5 +1,8 @@
+.. _page_frags:
+
+==============
Page fragments
---------------
+==============
A page fragment is an arbitrary-length arbitrary-offset area of memory
which resides within a 0 or higher order compound page. Multiple
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration.rst
index 496868072e24..f68d61335abb 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration.rst
@@ -1,5 +1,8 @@
+.. _page_migration:
+
+==============
Page migration
---------------
+==============
Page migration allows the moving of the physical location of pages between
nodes in a numa system while the process is running. This means that the
@@ -20,7 +23,7 @@ Page migration functions are provided by the numactl package by Andi Kleen
(a version later than 0.9.3 is required. Get it from
ftp://oss.sgi.com/www/projects/libnuma/download/). numactl provides libnuma
which provides an interface similar to other numa functionality for page
-migration. cat /proc/<pid>/numa_maps allows an easy review of where the
+migration. cat ``/proc/<pid>/numa_maps`` allows an easy review of where the
pages of a process are located. See also the numa_maps documentation in the
proc(5) man page.
@@ -56,8 +59,8 @@ description for those trying to use migrate_pages() from the kernel
(for userspace usage see the Andi Kleen's numactl package mentioned above)
and then a low level description of how the low level details work.
-A. In kernel use of migrate_pages()
------------------------------------
+In kernel use of migrate_pages()
+================================
1. Remove pages from the LRU.
@@ -78,8 +81,8 @@ A. In kernel use of migrate_pages()
the new page for each page that is considered for
moving.
-B. How migrate_pages() works
-----------------------------
+How migrate_pages() works
+=========================
migrate_pages() does several passes over its list of pages. A page is moved
if all references to a page are removable at the time. The page has
@@ -142,8 +145,8 @@ Steps:
20. The new page is moved to the LRU and can be scanned by the swapper
etc again.
-C. Non-LRU page migration
--------------------------
+Non-LRU page migration
+======================
Although original migration aimed for reducing the latency of memory access
for NUMA, compaction who want to create high-order page is also main customer.
@@ -164,89 +167,91 @@ migration path.
If a driver want to make own pages movable, it should define three functions
which are function pointers of struct address_space_operations.
-1. bool (*isolate_page) (struct page *page, isolate_mode_t mode);
+1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
-What VM expects on isolate_page function of driver is to return *true*
-if driver isolates page successfully. On returing true, VM marks the page
-as PG_isolated so concurrent isolation in several CPUs skip the page
-for isolation. If a driver cannot isolate the page, it should return *false*.
+ What VM expects on isolate_page function of driver is to return *true*
+ if driver isolates page successfully. On returing true, VM marks the page
+ as PG_isolated so concurrent isolation in several CPUs skip the page
+ for isolation. If a driver cannot isolate the page, it should return *false*.
-Once page is successfully isolated, VM uses page.lru fields so driver
-shouldn't expect to preserve values in that fields.
+ Once page is successfully isolated, VM uses page.lru fields so driver
+ shouldn't expect to preserve values in that fields.
-2. int (*migratepage) (struct address_space *mapping,
- struct page *newpage, struct page *oldpage, enum migrate_mode);
+2. ``int (*migratepage) (struct address_space *mapping,``
+| ``struct page *newpage, struct page *oldpage, enum migrate_mode);``
-After isolation, VM calls migratepage of driver with isolated page.
-The function of migratepage is to move content of the old page to new page
-and set up fields of struct page newpage. Keep in mind that you should
-indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
-under page_lock if you migrated the oldpage successfully and returns
-MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
-can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
-because VM interprets -EAGAIN as "temporal migration failure". On returning
-any error except -EAGAIN, VM will give up the page migration without retrying
-in this time.
+ After isolation, VM calls migratepage of driver with isolated page.
+ The function of migratepage is to move content of the old page to new page
+ and set up fields of struct page newpage. Keep in mind that you should
+ indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
+ under page_lock if you migrated the oldpage successfully and returns
+ MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
+ can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
+ because VM interprets -EAGAIN as "temporal migration failure". On returning
+ any error except -EAGAIN, VM will give up the page migration without retrying
+ in this time.
-Driver shouldn't touch page.lru field VM using in the functions.
+ Driver shouldn't touch page.lru field VM using in the functions.
-3. void (*putback_page)(struct page *);
+3. ``void (*putback_page)(struct page *);``
-If migration fails on isolated page, VM should return the isolated page
-to the driver so VM calls driver's putback_page with migration failed page.
-In this function, driver should put the isolated page back to the own data
-structure.
+ If migration fails on isolated page, VM should return the isolated page
+ to the driver so VM calls driver's putback_page with migration failed page.
+ In this function, driver should put the isolated page back to the own data
+ structure.
4. non-lru movable page flags
-There are two page flags for supporting non-lru movable page.
+ There are two page flags for supporting non-lru movable page.
-* PG_movable
+ * PG_movable
-Driver should use the below function to make page movable under page_lock.
+ Driver should use the below function to make page movable under page_lock::
void __SetPageMovable(struct page *page, struct address_space *mapping)
-It needs argument of address_space for registering migration family functions
-which will be called by VM. Exactly speaking, PG_movable is not a real flag of
-struct page. Rather than, VM reuses page->mapping's lower bits to represent it.
+ It needs argument of address_space for registering migration
+ family functions which will be called by VM. Exactly speaking,
+ PG_movable is not a real flag of struct page. Rather than, VM
+ reuses page->mapping's lower bits to represent it.
+::
#define PAGE_MAPPING_MOVABLE 0x2
page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
-so driver shouldn't access page->mapping directly. Instead, driver should
-use page_mapping which mask off the low two bits of page->mapping under
-page lock so it can get right struct address_space.
-
-For testing of non-lru movable page, VM supports __PageMovable function.
-However, it doesn't guarantee to identify non-lru movable page because
-page->mapping field is unified with other variables in struct page.
-As well, if driver releases the page after isolation by VM, page->mapping
-doesn't have stable value although it has PAGE_MAPPING_MOVABLE
-(Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
-page is LRU or non-lru movable once the page has been isolated. Because
-LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
-good for just peeking to test non-lru movable pages before more expensive
-checking with lock_page in pfn scanning to select victim.
-
-For guaranteeing non-lru movable page, VM provides PageMovable function.
-Unlike __PageMovable, PageMovable functions validates page->mapping and
-mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
-destroying of page->mapping.
-
-Driver using __SetPageMovable should clear the flag via __ClearMovablePage
-under page_lock before the releasing the page.
-
-* PG_isolated
-
-To prevent concurrent isolation among several CPUs, VM marks isolated page
-as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
-movable page, it can skip it. Driver doesn't need to manipulate the flag
-because VM will set/clear it automatically. Keep in mind that if driver
-sees PG_isolated page, it means the page have been isolated by VM so it
-shouldn't touch page.lru field.
-PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
-for own purpose.
+ so driver shouldn't access page->mapping directly. Instead, driver should
+ use page_mapping which mask off the low two bits of page->mapping under
+ page lock so it can get right struct address_space.
+
+ For testing of non-lru movable page, VM supports __PageMovable function.
+ However, it doesn't guarantee to identify non-lru movable page because
+ page->mapping field is unified with other variables in struct page.
+ As well, if driver releases the page after isolation by VM, page->mapping
+ doesn't have stable value although it has PAGE_MAPPING_MOVABLE
+ (Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
+ page is LRU or non-lru movable once the page has been isolated. Because
+ LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
+ good for just peeking to test non-lru movable pages before more expensive
+ checking with lock_page in pfn scanning to select victim.
+
+ For guaranteeing non-lru movable page, VM provides PageMovable function.
+ Unlike __PageMovable, PageMovable functions validates page->mapping and
+ mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
+ destroying of page->mapping.
+
+ Driver using __SetPageMovable should clear the flag via __ClearMovablePage
+ under page_lock before the releasing the page.
+
+ * PG_isolated
+
+ To prevent concurrent isolation among several CPUs, VM marks isolated page
+ as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
+ movable page, it can skip it. Driver doesn't need to manipulate the flag
+ because VM will set/clear it automatically. Keep in mind that if driver
+ sees PG_isolated page, it means the page have been isolated by VM so it
+ shouldn't touch page.lru field.
+ PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
+ for own purpose.
Christoph Lameter, May 8, 2006.
Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.rst
index ffff1439076a..0ed5ab8c7ab4 100644
--- a/Documentation/vm/page_owner.txt
+++ b/Documentation/vm/page_owner.rst
@@ -1,7 +1,11 @@
+.. _page_owner:
+
+==================================================
page owner: Tracking about who allocated each page
------------------------------------------------------------
+==================================================
-* Introduction
+Introduction
+============
page owner is for the tracking about who allocated each page.
It can be used to debug memory leak or to find a memory hogger.
@@ -34,13 +38,15 @@ not affect to allocation performance, especially if the static keys jump
label patching functionality is available. Following is the kernel's code
size change due to this facility.
-- Without page owner
+- Without page owner::
+
text data bss dec hex filename
- 40662 1493 644 42799 a72f mm/page_alloc.o
+ 40662 1493 644 42799 a72f mm/page_alloc.o
+
+- With page owner::
-- With page owner
text data bss dec hex filename
- 40892 1493 644 43029 a815 mm/page_alloc.o
+ 40892 1493 644 43029 a815 mm/page_alloc.o
1427 24 8 1459 5b3 mm/page_ext.o
2722 50 0 2772 ad4 mm/page_owner.o
@@ -62,21 +68,23 @@ are catched and marked, although they are mostly allocated from struct
page extension feature. Anyway, after that, no page is left in
un-tracking state.
-* Usage
+Usage
+=====
+
+1) Build user-space helper::
-1) Build user-space helper
cd tools/vm
make page_owner_sort
-2) Enable page owner
- Add "page_owner=on" to boot cmdline.
+2) Enable page owner: add "page_owner=on" to boot cmdline.
3) Do the job what you want to debug
-4) Analyze information from page owner
+4) Analyze information from page owner::
+
cat /sys/kernel/debug/page_owner > page_owner_full.txt
grep -v ^PFN page_owner_full.txt > page_owner.txt
./page_owner_sort page_owner.txt sorted_page_owner.txt
- See the result about who allocated each page
- in the sorted_page_owner.txt.
+ See the result about who allocated each page
+ in the ``sorted_page_owner.txt``.
diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.rst
index f609142f406a..7bef6718e3a9 100644
--- a/Documentation/vm/remap_file_pages.txt
+++ b/Documentation/vm/remap_file_pages.rst
@@ -1,3 +1,9 @@
+.. _remap_file_pages:
+
+==============================
+remap_file_pages() system call
+==============================
+
The remap_file_pages() system call is used to create a nonlinear mapping,
that is, a mapping in which the pages of the file are mapped into a
nonsequential order in memory. The advantage of using remap_file_pages()
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
new file mode 100644
index 000000000000..3a775fd64e2d
--- /dev/null
+++ b/Documentation/vm/slub.rst
@@ -0,0 +1,361 @@
+.. _slub:
+
+==========================
+Short users guide for SLUB
+==========================
+
+The basic philosophy of SLUB is very different from SLAB. SLAB
+requires rebuilding the kernel to activate debug options for all
+slab caches. SLUB always includes full debugging but it is off by default.
+SLUB can enable debugging only for selected slabs in order to avoid
+an impact on overall system performance which may make a bug more
+difficult to find.
+
+In order to switch debugging on one can add an option ``slub_debug``
+to the kernel command line. That will enable full debugging for
+all slabs.
+
+Typically one would then use the ``slabinfo`` command to get statistical
+data and perform operation on the slabs. By default ``slabinfo`` only lists
+slabs that have data in them. See "slabinfo -h" for more options when
+running the command. ``slabinfo`` can be compiled with
+::
+
+ gcc -o slabinfo tools/vm/slabinfo.c
+
+Some of the modes of operation of ``slabinfo`` require that slub debugging
+be enabled on the command line. F.e. no tracking information will be
+available without debugging on and validation can only partially
+be performed if debugging was not switched on.
+
+Some more sophisticated uses of slub_debug:
+-------------------------------------------
+
+Parameters may be given to ``slub_debug``. If none is specified then full
+debugging is enabled. Format:
+
+slub_debug=<Debug-Options>
+ Enable options for all slabs
+slub_debug=<Debug-Options>,<slab name>
+ Enable options only for select slabs
+
+
+Possible debug options are::
+
+ F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
+ Sorry SLAB legacy issues)
+ Z Red zoning
+ P Poisoning (object and padding)
+ U User tracking (free and alloc)
+ T Trace (please only use on single slabs)
+ A Toggle failslab filter mark for the cache
+ O Switch debugging off for caches that would have
+ caused higher minimum slab orders
+ - Switch all debugging off (useful if the kernel is
+ configured with CONFIG_SLUB_DEBUG_ON)
+
+F.e. in order to boot just with sanity checks and red zoning one would specify::
+
+ slub_debug=FZ
+
+Trying to find an issue in the dentry cache? Try::
+
+ slub_debug=,dentry
+
+to only enable debugging on the dentry cache.
+
+Red zoning and tracking may realign the slab. We can just apply sanity checks
+to the dentry cache with::
+
+ slub_debug=F,dentry
+
+Debugging options may require the minimum possible slab order to increase as
+a result of storing the metadata (for example, caches with PAGE_SIZE object
+sizes). This has a higher liklihood of resulting in slab allocation errors
+in low memory situations or if there's high fragmentation of memory. To
+switch off debugging for such caches by default, use::
+
+ slub_debug=O
+
+In case you forgot to enable debugging on the kernel command line: It is
+possible to enable debugging manually when the kernel is up. Look at the
+contents of::
+
+ /sys/kernel/slab/<slab name>/
+
+Look at the writable files. Writing 1 to them will enable the
+corresponding debug option. All options can be set on a slab that does
+not contain objects. If the slab already contains objects then sanity checks
+and tracing may only be enabled. The other options may cause the realignment
+of objects.
+
+Careful with tracing: It may spew out lots of information and never stop if
+used on the wrong slab.
+
+Slab merging
+============
+
+If no debug options are specified then SLUB may merge similar slabs together
+in order to reduce overhead and increase cache hotness of objects.
+``slabinfo -a`` displays which slabs were merged together.
+
+Slab validation
+===============
+
+SLUB can validate all object if the kernel was booted with slub_debug. In
+order to do so you must have the ``slabinfo`` tool. Then you can do
+::
+
+ slabinfo -v
+
+which will test all objects. Output will be generated to the syslog.
+
+This also works in a more limited way if boot was without slab debug.
+In that case ``slabinfo -v`` simply tests all reachable objects. Usually
+these are in the cpu slabs and the partial slabs. Full slabs are not
+tracked by SLUB in a non debug situation.
+
+Getting more performance
+========================
+
+To some degree SLUB's performance is limited by the need to take the
+list_lock once in a while to deal with partial slabs. That overhead is
+governed by the order of the allocation for each slab. The allocations
+can be influenced by kernel parameters:
+
+.. slub_min_objects=x (default 4)
+.. slub_min_order=x (default 0)
+.. slub_max_order=x (default 3 (PAGE_ALLOC_COSTLY_ORDER))
+
+``slub_min_objects``
+ allows to specify how many objects must at least fit into one
+ slab in order for the allocation order to be acceptable. In
+ general slub will be able to perform this number of
+ allocations on a slab without consulting centralized resources
+ (list_lock) where contention may occur.
+
+``slub_min_order``
+ specifies a minim order of slabs. A similar effect like
+ ``slub_min_objects``.
+
+``slub_max_order``
+ specified the order at which ``slub_min_objects`` should no
+ longer be checked. This is useful to avoid SLUB trying to
+ generate super large order pages to fit ``slub_min_objects``
+ of a slab cache with large object sizes into one high order
+ page. Setting command line parameter
+ ``debug_guardpage_minorder=N`` (N > 0), forces setting
+ ``slub_max_order`` to 0, what cause minimum possible order of
+ slabs allocation.
+
+SLUB Debug output
+=================
+
+Here is a sample of slub debug output::
+
+ ====================================================================
+ BUG kmalloc-8: Redzone overwritten
+ --------------------------------------------------------------------
+
+ INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
+ INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
+ INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
+ INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
+
+ Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
+ Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005
+ Redzone 0xc90f6d28: 00 cc cc cc .
+ Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ
+
+ [<c010523d>] dump_trace+0x63/0x1eb
+ [<c01053df>] show_trace_log_lvl+0x1a/0x2f
+ [<c010601d>] show_trace+0x12/0x14
+ [<c0106035>] dump_stack+0x16/0x18
+ [<c017e0fa>] object_err+0x143/0x14b
+ [<c017e2cc>] check_object+0x66/0x234
+ [<c017eb43>] __slab_free+0x239/0x384
+ [<c017f446>] kfree+0xa6/0xc6
+ [<c02e2335>] get_modalias+0xb9/0xf5
+ [<c02e23b7>] dmi_dev_uevent+0x27/0x3c
+ [<c027866a>] dev_uevent+0x1ad/0x1da
+ [<c0205024>] kobject_uevent_env+0x20a/0x45b
+ [<c020527f>] kobject_uevent+0xa/0xf
+ [<c02779f1>] store_uevent+0x4f/0x58
+ [<c027758e>] dev_attr_store+0x29/0x2f
+ [<c01bec4f>] sysfs_write_file+0x16e/0x19c
+ [<c0183ba7>] vfs_write+0xd1/0x15a
+ [<c01841d7>] sys_write+0x3d/0x72
+ [<c0104112>] sysenter_past_esp+0x5f/0x99
+ [<b7f7b410>] 0xb7f7b410
+ =======================
+
+ FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
+
+If SLUB encounters a corrupted object (full detection requires the kernel
+to be booted with slub_debug) then the following output will be dumped
+into the syslog:
+
+1. Description of the problem encountered
+
+ This will be a message in the system log starting with::
+
+ ===============================================
+ BUG <slab cache affected>: <What went wrong>
+ -----------------------------------------------
+
+ INFO: <corruption start>-<corruption_end> <more info>
+ INFO: Slab <address> <slab information>
+ INFO: Object <address> <object information>
+ INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by
+ cpu> pid=<pid of the process>
+ INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
+ pid=<pid of the process>
+
+ (Object allocation / free information is only available if SLAB_STORE_USER is
+ set for the slab. slub_debug sets that option)
+
+2. The object contents if an object was involved.
+
+ Various types of lines can follow the BUG SLUB line:
+
+ Bytes b4 <address> : <bytes>
+ Shows a few bytes before the object where the problem was detected.
+ Can be useful if the corruption does not stop with the start of the
+ object.
+
+ Object <address> : <bytes>
+ The bytes of the object. If the object is inactive then the bytes
+ typically contain poison values. Any non-poison value shows a
+ corruption by a write after free.
+
+ Redzone <address> : <bytes>
+ The Redzone following the object. The Redzone is used to detect
+ writes after the object. All bytes should always have the same
+ value. If there is any deviation then it is due to a write after
+ the object boundary.
+
+ (Redzone information is only available if SLAB_RED_ZONE is set.
+ slub_debug sets that option)
+
+ Padding <address> : <bytes>
+ Unused data to fill up the space in order to get the next object
+ properly aligned. In the debug case we make sure that there are
+ at least 4 bytes of padding. This allows the detection of writes
+ before the object.
+
+3. A stackdump
+
+ The stackdump describes the location where the error was detected. The cause
+ of the corruption is may be more likely found by looking at the function that
+ allocated or freed the object.
+
+4. Report on how the problem was dealt with in order to ensure the continued
+ operation of the system.
+
+ These are messages in the system log beginning with::
+
+ FIX <slab cache affected>: <corrective action taken>
+
+ In the above sample SLUB found that the Redzone of an active object has
+ been overwritten. Here a string of 8 characters was written into a slab that
+ has the length of 8 characters. However, a 8 character string needs a
+ terminating 0. That zero has overwritten the first byte of the Redzone field.
+ After reporting the details of the issue encountered the FIX SLUB message
+ tells us that SLUB has restored the Redzone to its proper value and then
+ system operations continue.
+
+Emergency operations
+====================
+
+Minimal debugging (sanity checks alone) can be enabled by booting with::
+
+ slub_debug=F
+
+This will be generally be enough to enable the resiliency features of slub
+which will keep the system running even if a bad kernel component will
+keep corrupting objects. This may be important for production systems.
+Performance will be impacted by the sanity checks and there will be a
+continual stream of error messages to the syslog but no additional memory
+will be used (unlike full debugging).
+
+No guarantees. The kernel component still needs to be fixed. Performance
+may be optimized further by locating the slab that experiences corruption
+and enabling debugging only for that cache
+
+I.e.::
+
+ slub_debug=F,dentry
+
+If the corruption occurs by writing after the end of the object then it
+may be advisable to enable a Redzone to avoid corrupting the beginning
+of other objects::
+
+ slub_debug=FZ,dentry
+
+Extended slabinfo mode and plotting
+===================================
+
+The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes:
+ - Slabcache Totals
+ - Slabs sorted by size (up to -N <num> slabs, default 1)
+ - Slabs sorted by loss (up to -N <num> slabs, default 1)
+
+Additionally, in this mode ``slabinfo`` does not dynamically scale
+sizes (G/M/K) and reports everything in bytes (this functionality is
+also available to other slabinfo modes via '-B' option) which makes
+reporting more precise and accurate. Moreover, in some sense the `-X'
+mode also simplifies the analysis of slabs' behaviour, because its
+output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it
+pushes the analysis from looking through the numbers (tons of numbers)
+to something easier -- visual analysis.
+
+To generate plots:
+
+a) collect slabinfo extended records, for example::
+
+ while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
+
+b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script::
+
+ slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
+
+ The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records
+ and generates 3 png files (and 3 pre-processing cache files) per STATS
+ file:
+ - Slabcache Totals: FOO_STATS-totals.png
+ - Slabs sorted by size: FOO_STATS-slabs-by-size.png
+ - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
+
+Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you
+need to compare slabs' behaviour "prior to" and "after" some code
+modification. To help you out there, ``slabinfo-gnuplot.sh`` script
+can 'merge' the `Slabcache Totals` sections from different
+measurements. To visually compare N plots:
+
+a) Collect as many STATS1, STATS2, .. STATSN files as you need::
+
+ while [ 1 ]; do slabinfo -X >> STATS<X>; sleep 1; done
+
+b) Pre-process those STATS files::
+
+ slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
+
+c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
+ generated pre-processed \*-totals::
+
+ slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
+
+ This will produce a single plot (png file).
+
+ Plots, expectedly, can be large so some fluctuations or small spikes
+ can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
+ options to 'zoom-in'/'zoom-out':
+
+ a) ``-s %d,%d`` -- overwrites the default image width and heigh
+ b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
+ in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
+ 40,60`` range will plot only samples collected between 40th and
+ 60th seconds).
+
+Christoph Lameter, May 30, 2007
+Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
deleted file mode 100644
index 84652419bff2..000000000000
--- a/Documentation/vm/slub.txt
+++ /dev/null
@@ -1,342 +0,0 @@
-Short users guide for SLUB
---------------------------
-
-The basic philosophy of SLUB is very different from SLAB. SLAB
-requires rebuilding the kernel to activate debug options for all
-slab caches. SLUB always includes full debugging but it is off by default.
-SLUB can enable debugging only for selected slabs in order to avoid
-an impact on overall system performance which may make a bug more
-difficult to find.
-
-In order to switch debugging on one can add an option "slub_debug"
-to the kernel command line. That will enable full debugging for
-all slabs.
-
-Typically one would then use the "slabinfo" command to get statistical
-data and perform operation on the slabs. By default slabinfo only lists
-slabs that have data in them. See "slabinfo -h" for more options when
-running the command. slabinfo can be compiled with
-
-gcc -o slabinfo tools/vm/slabinfo.c
-
-Some of the modes of operation of slabinfo require that slub debugging
-be enabled on the command line. F.e. no tracking information will be
-available without debugging on and validation can only partially
-be performed if debugging was not switched on.
-
-Some more sophisticated uses of slub_debug:
--------------------------------------------
-
-Parameters may be given to slub_debug. If none is specified then full
-debugging is enabled. Format:
-
-slub_debug=<Debug-Options> Enable options for all slabs
-slub_debug=<Debug-Options>,<slab name>
- Enable options only for select slabs
-
-Possible debug options are
- F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
- Sorry SLAB legacy issues)
- Z Red zoning
- P Poisoning (object and padding)
- U User tracking (free and alloc)
- T Trace (please only use on single slabs)
- A Toggle failslab filter mark for the cache
- O Switch debugging off for caches that would have
- caused higher minimum slab orders
- - Switch all debugging off (useful if the kernel is
- configured with CONFIG_SLUB_DEBUG_ON)
-
-F.e. in order to boot just with sanity checks and red zoning one would specify:
-
- slub_debug=FZ
-
-Trying to find an issue in the dentry cache? Try
-
- slub_debug=,dentry
-
-to only enable debugging on the dentry cache.
-
-Red zoning and tracking may realign the slab. We can just apply sanity checks
-to the dentry cache with
-
- slub_debug=F,dentry
-
-Debugging options may require the minimum possible slab order to increase as
-a result of storing the metadata (for example, caches with PAGE_SIZE object
-sizes). This has a higher liklihood of resulting in slab allocation errors
-in low memory situations or if there's high fragmentation of memory. To
-switch off debugging for such caches by default, use
-
- slub_debug=O
-
-In case you forgot to enable debugging on the kernel command line: It is
-possible to enable debugging manually when the kernel is up. Look at the
-contents of:
-
-/sys/kernel/slab/<slab name>/
-
-Look at the writable files. Writing 1 to them will enable the
-corresponding debug option. All options can be set on a slab that does
-not contain objects. If the slab already contains objects then sanity checks
-and tracing may only be enabled. The other options may cause the realignment
-of objects.
-
-Careful with tracing: It may spew out lots of information and never stop if
-used on the wrong slab.
-
-Slab merging
-------------
-
-If no debug options are specified then SLUB may merge similar slabs together
-in order to reduce overhead and increase cache hotness of objects.
-slabinfo -a displays which slabs were merged together.
-
-Slab validation
----------------
-
-SLUB can validate all object if the kernel was booted with slub_debug. In
-order to do so you must have the slabinfo tool. Then you can do
-
-slabinfo -v
-
-which will test all objects. Output will be generated to the syslog.
-
-This also works in a more limited way if boot was without slab debug.
-In that case slabinfo -v simply tests all reachable objects. Usually
-these are in the cpu slabs and the partial slabs. Full slabs are not
-tracked by SLUB in a non debug situation.
-
-Getting more performance
-------------------------
-
-To some degree SLUB's performance is limited by the need to take the
-list_lock once in a while to deal with partial slabs. That overhead is
-governed by the order of the allocation for each slab. The allocations
-can be influenced by kernel parameters:
-
-slub_min_objects=x (default 4)
-slub_min_order=x (default 0)
-slub_max_order=x (default 3 (PAGE_ALLOC_COSTLY_ORDER))
-
-slub_min_objects allows to specify how many objects must at least fit
-into one slab in order for the allocation order to be acceptable.
-In general slub will be able to perform this number of allocations
-on a slab without consulting centralized resources (list_lock) where
-contention may occur.
-
-slub_min_order specifies a minim order of slabs. A similar effect like
-slub_min_objects.
-
-slub_max_order specified the order at which slub_min_objects should no
-longer be checked. This is useful to avoid SLUB trying to generate
-super large order pages to fit slub_min_objects of a slab cache with
-large object sizes into one high order page. Setting command line
-parameter debug_guardpage_minorder=N (N > 0), forces setting
-slub_max_order to 0, what cause minimum possible order of slabs
-allocation.
-
-SLUB Debug output
------------------
-
-Here is a sample of slub debug output:
-
-====================================================================
-BUG kmalloc-8: Redzone overwritten
---------------------------------------------------------------------
-
-INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
-INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
-INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
-INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
-
-Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
- Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005
- Redzone 0xc90f6d28: 00 cc cc cc .
- Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ
-
- [<c010523d>] dump_trace+0x63/0x1eb
- [<c01053df>] show_trace_log_lvl+0x1a/0x2f
- [<c010601d>] show_trace+0x12/0x14
- [<c0106035>] dump_stack+0x16/0x18
- [<c017e0fa>] object_err+0x143/0x14b
- [<c017e2cc>] check_object+0x66/0x234
- [<c017eb43>] __slab_free+0x239/0x384
- [<c017f446>] kfree+0xa6/0xc6
- [<c02e2335>] get_modalias+0xb9/0xf5
- [<c02e23b7>] dmi_dev_uevent+0x27/0x3c
- [<c027866a>] dev_uevent+0x1ad/0x1da
- [<c0205024>] kobject_uevent_env+0x20a/0x45b
- [<c020527f>] kobject_uevent+0xa/0xf
- [<c02779f1>] store_uevent+0x4f/0x58
- [<c027758e>] dev_attr_store+0x29/0x2f
- [<c01bec4f>] sysfs_write_file+0x16e/0x19c
- [<c0183ba7>] vfs_write+0xd1/0x15a
- [<c01841d7>] sys_write+0x3d/0x72
- [<c0104112>] sysenter_past_esp+0x5f/0x99
- [<b7f7b410>] 0xb7f7b410
- =======================
-
-FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
-
-If SLUB encounters a corrupted object (full detection requires the kernel
-to be booted with slub_debug) then the following output will be dumped
-into the syslog:
-
-1. Description of the problem encountered
-
-This will be a message in the system log starting with
-
-===============================================
-BUG <slab cache affected>: <What went wrong>
------------------------------------------------
-
-INFO: <corruption start>-<corruption_end> <more info>
-INFO: Slab <address> <slab information>
-INFO: Object <address> <object information>
-INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by
- cpu> pid=<pid of the process>
-INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
- pid=<pid of the process>
-
-(Object allocation / free information is only available if SLAB_STORE_USER is
-set for the slab. slub_debug sets that option)
-
-2. The object contents if an object was involved.
-
-Various types of lines can follow the BUG SLUB line:
-
-Bytes b4 <address> : <bytes>
- Shows a few bytes before the object where the problem was detected.
- Can be useful if the corruption does not stop with the start of the
- object.
-
-Object <address> : <bytes>
- The bytes of the object. If the object is inactive then the bytes
- typically contain poison values. Any non-poison value shows a
- corruption by a write after free.
-
-Redzone <address> : <bytes>
- The Redzone following the object. The Redzone is used to detect
- writes after the object. All bytes should always have the same
- value. If there is any deviation then it is due to a write after
- the object boundary.
-
- (Redzone information is only available if SLAB_RED_ZONE is set.
- slub_debug sets that option)
-
-Padding <address> : <bytes>
- Unused data to fill up the space in order to get the next object
- properly aligned. In the debug case we make sure that there are
- at least 4 bytes of padding. This allows the detection of writes
- before the object.
-
-3. A stackdump
-
-The stackdump describes the location where the error was detected. The cause
-of the corruption is may be more likely found by looking at the function that
-allocated or freed the object.
-
-4. Report on how the problem was dealt with in order to ensure the continued
-operation of the system.
-
-These are messages in the system log beginning with
-
-FIX <slab cache affected>: <corrective action taken>
-
-In the above sample SLUB found that the Redzone of an active object has
-been overwritten. Here a string of 8 characters was written into a slab that
-has the length of 8 characters. However, a 8 character string needs a
-terminating 0. That zero has overwritten the first byte of the Redzone field.
-After reporting the details of the issue encountered the FIX SLUB message
-tells us that SLUB has restored the Redzone to its proper value and then
-system operations continue.
-
-Emergency operations:
----------------------
-
-Minimal debugging (sanity checks alone) can be enabled by booting with
-
- slub_debug=F
-
-This will be generally be enough to enable the resiliency features of slub
-which will keep the system running even if a bad kernel component will
-keep corrupting objects. This may be important for production systems.
-Performance will be impacted by the sanity checks and there will be a
-continual stream of error messages to the syslog but no additional memory
-will be used (unlike full debugging).
-
-No guarantees. The kernel component still needs to be fixed. Performance
-may be optimized further by locating the slab that experiences corruption
-and enabling debugging only for that cache
-
-I.e.
-
- slub_debug=F,dentry
-
-If the corruption occurs by writing after the end of the object then it
-may be advisable to enable a Redzone to avoid corrupting the beginning
-of other objects.
-
- slub_debug=FZ,dentry
-
-Extended slabinfo mode and plotting
------------------------------------
-
-The slabinfo tool has a special 'extended' ('-X') mode that includes:
- - Slabcache Totals
- - Slabs sorted by size (up to -N <num> slabs, default 1)
- - Slabs sorted by loss (up to -N <num> slabs, default 1)
-
-Additionally, in this mode slabinfo does not dynamically scale sizes (G/M/K)
-and reports everything in bytes (this functionality is also available to
-other slabinfo modes via '-B' option) which makes reporting more precise and
-accurate. Moreover, in some sense the `-X' mode also simplifies the analysis
-of slabs' behaviour, because its output can be plotted using the
-slabinfo-gnuplot.sh script. So it pushes the analysis from looking through
-the numbers (tons of numbers) to something easier -- visual analysis.
-
-To generate plots:
-a) collect slabinfo extended records, for example:
-
- while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
-
-b) pass stats file(-s) to slabinfo-gnuplot.sh script:
- slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
-
-The slabinfo-gnuplot.sh script will pre-processes the collected records
-and generates 3 png files (and 3 pre-processing cache files) per STATS
-file:
- - Slabcache Totals: FOO_STATS-totals.png
- - Slabs sorted by size: FOO_STATS-slabs-by-size.png
- - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
-
-Another use case, when slabinfo-gnuplot can be useful, is when you need
-to compare slabs' behaviour "prior to" and "after" some code modification.
-To help you out there, slabinfo-gnuplot.sh script can 'merge' the
-`Slabcache Totals` sections from different measurements. To visually
-compare N plots:
-
-a) Collect as many STATS1, STATS2, .. STATSN files as you need
- while [ 1 ]; do slabinfo -X >> STATS<X>; sleep 1; done
-
-b) Pre-process those STATS files
- slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
-
-c) Execute slabinfo-gnuplot.sh in '-t' mode, passing all of the
-generated pre-processed *-totals
- slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
-
-This will produce a single plot (png file).
-
-Plots, expectedly, can be large so some fluctuations or small spikes
-can go unnoticed. To deal with that, `slabinfo-gnuplot.sh' has two
-options to 'zoom-in'/'zoom-out':
- a) -s %d,%d overwrites the default image width and heigh
- b) -r %d,%d specifies a range of samples to use (for example,
- in `slabinfo -X >> FOO_STATS; sleep 1;' case, using
- a "-r 40,60" range will plot only samples collected
- between 40th and 60th seconds).
-
-Christoph Lameter, May 30, 2007
-Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock.rst
index 62842a857dab..889b00be469f 100644
--- a/Documentation/vm/split_page_table_lock
+++ b/Documentation/vm/split_page_table_lock.rst
@@ -1,3 +1,6 @@
+.. _split_page_table_lock:
+
+=====================
Split page table lock
=====================
@@ -11,6 +14,7 @@ access to the table. At the moment we use split lock for PTE and PMD
tables. Access to higher level tables protected by mm->page_table_lock.
There are helpers to lock/unlock a table and other accessor functions:
+
- pte_offset_map_lock()
maps pte and takes PTE table lock, returns pointer to the taken
lock;
@@ -34,12 +38,13 @@ Split page table lock for PMD tables is enabled, if it's enabled for PTE
tables and the architecture supports it (see below).
Hugetlb and split page table lock
----------------------------------
+=================================
Hugetlb can support several page sizes. We use split lock only for PMD
level, but not for PUD.
Hugetlb-specific helpers:
+
- huge_pte_lock()
takes pmd split lock for PMD_SIZE page, mm->page_table_lock
otherwise;
@@ -47,7 +52,7 @@ Hugetlb-specific helpers:
returns pointer to table lock;
Support of split page table lock by an architecture
----------------------------------------------------
+===================================================
There's no need in special enabling of PTE split page table lock:
everything required is done by pgtable_page_ctor() and pgtable_page_dtor(),
@@ -73,7 +78,7 @@ NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
be handled properly.
page->ptl
----------
+=========
page->ptl is used to access split page table lock, where 'page' is struct
page of page containing the table. It shares storage with page->private
@@ -81,6 +86,7 @@ page of page containing the table. It shares storage with page->private
To avoid increasing size of struct page and have best performance, we use a
trick:
+
- if spinlock_t fits into long, we use page->ptr as spinlock, so we
can avoid indirect access and save a cache line.
- if size of spinlock_t is bigger then size of long, we use page->ptl as
diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.rst
index d5960c9124f5..e0466f2db8fa 100644
--- a/Documentation/vm/swap_numa.txt
+++ b/Documentation/vm/swap_numa.rst
@@ -1,5 +1,8 @@
+.. _swap_numa:
+
+===========================================
Automatically bind swap device to numa node
--------------------------------------------
+===========================================
If the system has more than one swap device and swap device has the node
information, we can make use of this information to decide which swap
@@ -7,15 +10,16 @@ device to use in get_swap_pages() to get better performance.
How to use this feature
------------------------
+=======================
Swap device has priority and that decides the order of it to be used. To make
use of automatically binding, there is no need to manipulate priority settings
for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
swapB, with swapA attached to node 0 and swapB attached to node 1, are going
-to be swapped on. Simply swapping them on by doing:
-# swapon /dev/swapA
-# swapon /dev/swapB
+to be swapped on. Simply swapping them on by doing::
+
+ # swapon /dev/swapA
+ # swapon /dev/swapB
Then node 0 will use the two swap devices in the order of swapA then swapB and
node 1 will use the two swap devices in the order of swapB then swapA. Note
@@ -24,32 +28,39 @@ that the order of them being swapped on doesn't matter.
A more complex example on a 4 node machine. Assume 6 swap devices are going to
be swapped on: swapA and swapB are attached to node 0, swapC is attached to
node 1, swapD and swapE are attached to node 2 and swapF is attached to node3.
-The way to swap them on is the same as above:
-# swapon /dev/swapA
-# swapon /dev/swapB
-# swapon /dev/swapC
-# swapon /dev/swapD
-# swapon /dev/swapE
-# swapon /dev/swapF
-
-Then node 0 will use them in the order of:
-swapA/swapB -> swapC -> swapD -> swapE -> swapF
+The way to swap them on is the same as above::
+
+ # swapon /dev/swapA
+ # swapon /dev/swapB
+ # swapon /dev/swapC
+ # swapon /dev/swapD
+ # swapon /dev/swapE
+ # swapon /dev/swapF
+
+Then node 0 will use them in the order of::
+
+ swapA/swapB -> swapC -> swapD -> swapE -> swapF
+
swapA and swapB will be used in a round robin mode before any other swap device.
-node 1 will use them in the order of:
-swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+node 1 will use them in the order of::
+
+ swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of::
+
+ swapD/swapE -> swapA -> swapB -> swapC -> swapF
-node 2 will use them in the order of:
-swapD/swapE -> swapA -> swapB -> swapC -> swapF
Similaly, swapD and swapE will be used in a round robin mode before any
other swap devices.
-node 3 will use them in the order of:
-swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+node 3 will use them in the order of::
+
+ swapF -> swapA -> swapB -> swapC -> swapD -> swapE
Implementation details
-----------------------
+======================
The current code uses a priority based list, swap_avail_list, to decide
which swap device to use and if multiple swap devices share the same
diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst
new file mode 100644
index 000000000000..a8cf6809e36e
--- /dev/null
+++ b/Documentation/vm/transhuge.rst
@@ -0,0 +1,197 @@
+.. _transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+This document describes design principles Transparent Hugepage (THP)
+Support and its interaction with other parts of the memory management.
+
+Design principles
+=================
+
+- "graceful fallback": mm components which don't have transparent hugepage
+ knowledge fall back to breaking huge pmd mapping into table of ptes and,
+ if necessary, split a transparent hugepage. Therefore these components
+ can continue working on the regular pages or regular pte mappings.
+
+- if a hugepage allocation fails because of memory fragmentation,
+ regular pages should be gracefully allocated instead and mixed in
+ the same vma without any failure or significant delay and without
+ userland noticing
+
+- if some task quits and more hugepages become available (either
+ immediately in the buddy or through the VM), guest physical memory
+ backed by regular pages should be relocated on hugepages
+ automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+ whenever possible (the only possible reservation here is kernelcore=
+ to avoid unmovable pages to fragment all the memory but such a tweak
+ is not specific to transparent hugepage support and it's a generic
+ feature that applies to all dynamic high order allocations in the
+ kernel)
+
+get_user_pages and follow_page
+==============================
+
+get_user_pages and follow_page if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most gup users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact the page is huge. But
+if any driver is going to mangle over the page structure of the tail
+page (like for checking page->mapping or other bits that are relevant
+for the head page and not the tail page), it should be updated to jump
+to check head page instead. Taking reference on any head/tail page would
+prevent page from being split by anyone.
+
+.. note::
+ these aren't new constraints to the GUP API, and they match the
+ same constrains that applies to hugetlbfs too, so any driver capable
+ of handling GUP on hugetlbfs will also work fine on transparent
+ hugepage backed mappings.
+
+In case you can't handle compound pages if they're returned by
+follow_page, the FOLL_SPLIT bit can be specified as parameter to
+follow_page, so that it will split the hugepages before returning
+them. Migration for example passes FOLL_SPLIT as parameter to
+follow_page because it's not hugepage aware and in fact it can't work
+at all on hugetlbfs (but it instead works fine on transparent
+hugepages thanks to FOLL_SPLIT). migration simply can't deal with
+hugepages being returned (as it's not only checking the pfn of the
+page and pinning it during the copy but it pretends to migrate the
+memory in regular page sizes and with regular pte/pmd mappings).
+
+Graceful fallback
+=================
+
+Code walking pagetables but unaware about huge pmds can simply call
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_pmd where
+missing after pmd_offset returns the pmd. Thanks to the graceful
+fallback design, with a one liner change, you can avoid to write
+hundred if not thousand of lines of complex code to make your code
+hugepage aware.
+
+If you're not walking pagetables but you run into a physical hugepage
+but you can't handle it natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
+
+Example to make mremap.c transparent hugepage aware with a one liner
+change::
+
+ diff --git a/mm/mremap.c b/mm/mremap.c
+ --- a/mm/mremap.c
+ +++ b/mm/mremap.c
+ @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+ return NULL;
+
+ pmd = pmd_offset(pud, addr);
+ + split_huge_pmd(vma, pmd, addr);
+ if (pmd_none_or_clear_bad(pmd))
+ return NULL;
+
+Locking in hugepage aware code
+==============================
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_sem in read (or write) mode to be sure an huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_sem in write mode in addition to the anon_vma lock). If
+pmd_trans_huge returns false, you just fallback in the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd to be converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
+pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page table lock and fallback to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
+
+Refcounts and transparent huge pages
+====================================
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+ - get_page()/put_page() and GUP operate in head page's ->_refcount.
+
+ - ->_refcount in tail pages is always zero: get_page_unless_zero() never
+ succeed on tail pages.
+
+ - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
+ on relevant sub-page of the compound page.
+
+ - map/unmap of the whole compound page accounted in compound_mapcount
+ (stored in first tail page). For file huge pages, we also increment
+ ->_mapcount of all sub-pages in order to have race-free detection of
+ last unmap of subpages.
+
+PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
+
+For anonymous pages PageDoubleMap() also indicates ->_mapcount in all
+subpages is offset up by one. This additional reference is required to
+get race-free detection of unmap of subpages when we have them mapped with
+both PMDs and PTEs.
+
+This is optimization required to lower overhead of per-subpage mapcount
+tracking. The alternative is alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+For anonymous pages, we set PG_double_map when a PMD of the page got split
+for the first time, but still have PMD mapping. The additional references
+go away with last compound_mapcount.
+
+File pages get PG_double_map set on first map of the page with PTE and
+goes away when the page gets evicted from page cache.
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+requests to split pinned huge page: it expects page count to be equal to
+sum of mapcount of all sub-pages plus one (split_huge_page caller must
+have reference for head page).
+
+split_huge_page uses migration entries to stabilize page->_refcount and
+page->_mapcount of anonymous pages. File pages just got unmapped.
+
+We safe against physical memory scanners too: the only legitimate way
+scanner can get reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_refcount until atomic_add(). This prevents the
+scanner from getting a reference to the tail page up to that point. After the
+atomic_add() we don't care about the ->_refcount value. We already known how
+many references should be uncharged from the head page.
+
+For head page get_page_unless_zero() will succeed and we don't mind. It's
+clear where reference should go after split: it will stay on head page.
+
+Note that split_huge_pmd() doesn't have any limitation on refcounting:
+pmd can be split at any point and never fails.
+
+Partial unmap and deferred_split_huge_page()
+============================================
+
+Unmapping part of THP (with munmap() or other way) is not going to free
+memory immediately. Instead, we detect that a subpage of THP is not in use
+in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up unused subpages.
+
+Splitting the page right away is not an option due to locking context in
+the place where we can detect partial unmap. It's also might be
+counterproductive since in many cases partial unmap happens during exit(2) if
+a THP crosses a VMA boundary.
+
+Function deferred_split_huge_page() is used to queue page for splitting.
+The splitting itself will happen when we get memory pressure via shrinker
+interface.
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
deleted file mode 100644
index 4dde03b44ad1..000000000000
--- a/Documentation/vm/transhuge.txt
+++ /dev/null
@@ -1,527 +0,0 @@
-= Transparent Hugepage Support =
-
-== Objective ==
-
-Performance critical computing applications dealing with large memory
-working sets are already running on top of libhugetlbfs and in turn
-hugetlbfs. Transparent Hugepage Support is an alternative means of
-using huge pages for the backing of virtual memory with huge pages
-that supports the automatic promotion and demotion of page sizes and
-without the shortcomings of hugetlbfs.
-
-Currently it only works for anonymous memory mappings and tmpfs/shmem.
-But in the future it can expand to other filesystems.
-
-The reason applications are running faster is because of two
-factors. The first factor is almost completely irrelevant and it's not
-of significant interest because it'll also have the downside of
-requiring larger clear-page copy-page in page faults which is a
-potentially negative effect. The first factor consists in taking a
-single page fault for each 2M virtual region touched by userland (so
-reducing the enter/exit kernel frequency by a 512 times factor). This
-only matters the first time the memory is accessed for the lifetime of
-a memory mapping. The second long lasting and much more important
-factor will affect all subsequent accesses to the memory for the whole
-runtime of the application. The second factor consist of two
-components: 1) the TLB miss will run faster (especially with
-virtualization using nested pagetables but almost always also on bare
-metal without virtualization) and 2) a single TLB entry will be
-mapping a much larger amount of virtual memory in turn reducing the
-number of TLB misses. With virtualization and nested pagetables the
-TLB can be mapped of larger size only if both KVM and the Linux guest
-are using hugepages but a significant speedup already happens if only
-one of the two is using hugepages just because of the fact the TLB
-miss is going to run faster.
-
-== Design ==
-
-- "graceful fallback": mm components which don't have transparent hugepage
- knowledge fall back to breaking huge pmd mapping into table of ptes and,
- if necessary, split a transparent hugepage. Therefore these components
- can continue working on the regular pages or regular pte mappings.
-
-- if a hugepage allocation fails because of memory fragmentation,
- regular pages should be gracefully allocated instead and mixed in
- the same vma without any failure or significant delay and without
- userland noticing
-
-- if some task quits and more hugepages become available (either
- immediately in the buddy or through the VM), guest physical memory
- backed by regular pages should be relocated on hugepages
- automatically (with khugepaged)
-
-- it doesn't require memory reservation and in turn it uses hugepages
- whenever possible (the only possible reservation here is kernelcore=
- to avoid unmovable pages to fragment all the memory but such a tweak
- is not specific to transparent hugepage support and it's a generic
- feature that applies to all dynamic high order allocations in the
- kernel)
-
-Transparent Hugepage Support maximizes the usefulness of free memory
-if compared to the reservation approach of hugetlbfs by allowing all
-unused memory to be used as cache or other movable (or even unmovable
-entities). It doesn't require reservation to prevent hugepage
-allocation failures to be noticeable from userland. It allows paging
-and all other advanced VM features to be available on the
-hugepages. It requires no modifications for applications to take
-advantage of it.
-
-Applications however can be further optimized to take advantage of
-this feature, like for example they've been optimized before to avoid
-a flood of mmap system calls for every malloc(4k). Optimizing userland
-is by far not mandatory and khugepaged already can take care of long
-lived page allocations even for hugepage unaware applications that
-deals with large amounts of memory.
-
-In certain cases when hugepages are enabled system wide, application
-may end up allocating more memory resources. An application may mmap a
-large region but only touch 1 byte of it, in that case a 2M page might
-be allocated instead of a 4k page for no good. This is why it's
-possible to disable hugepages system-wide and to only have them inside
-MADV_HUGEPAGE madvise regions.
-
-Embedded systems should enable hugepages only inside madvise regions
-to eliminate any risk of wasting any precious byte of memory and to
-only run faster.
-
-Applications that gets a lot of benefit from hugepages and that don't
-risk to lose memory by using hugepages, should use
-madvise(MADV_HUGEPAGE) on their critical mmapped regions.
-
-== sysfs ==
-
-Transparent Hugepage Support for anonymous memory can be entirely disabled
-(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
-regions (to avoid the risk of consuming more memory resources) or enabled
-system wide. This can be achieved with one of:
-
-echo always >/sys/kernel/mm/transparent_hugepage/enabled
-echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
-echo never >/sys/kernel/mm/transparent_hugepage/enabled
-
-It's also possible to limit defrag efforts in the VM to generate
-anonymous hugepages in case they're not immediately free to madvise
-regions or to never try to defrag memory and simply fallback to regular
-pages unless hugepages are immediately available. Clearly if we spend CPU
-time to defrag memory, we would expect to gain even more by the fact we
-use hugepages later instead of regular pages. This isn't always
-guaranteed, but it may be more likely in case the allocation is for a
-MADV_HUGEPAGE region.
-
-echo always >/sys/kernel/mm/transparent_hugepage/defrag
-echo defer >/sys/kernel/mm/transparent_hugepage/defrag
-echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
-echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
-echo never >/sys/kernel/mm/transparent_hugepage/defrag
-
-"always" means that an application requesting THP will stall on allocation
-failure and directly reclaim pages and compact memory in an effort to
-allocate a THP immediately. This may be desirable for virtual machines
-that benefit heavily from THP use and are willing to delay the VM start
-to utilise them.
-
-"defer" means that an application will wake kswapd in the background
-to reclaim pages and wake kcompactd to compact memory so that THP is
-available in the near future. It's the responsibility of khugepaged
-to then install the THP pages later.
-
-"defer+madvise" will enter direct reclaim and compaction like "always", but
-only for regions that have used madvise(MADV_HUGEPAGE); all other regions
-will wake kswapd in the background to reclaim pages and wake kcompactd to
-compact memory so that THP is available in the near future.
-
-"madvise" will enter direct reclaim like "always" but only for regions
-that are have used madvise(MADV_HUGEPAGE). This is the default behaviour.
-
-"never" should be self-explanatory.
-
-By default kernel tries to use huge zero page on read page fault to
-anonymous mapping. It's possible to disable huge zero page by writing 0
-or enable it back by writing 1:
-
-echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
-echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
-
-Some userspace (such as a test program, or an optimized memory allocation
-library) may want to know the size (in bytes) of a transparent hugepage:
-
-cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
-
-khugepaged will be automatically started when
-transparent_hugepage/enabled is set to "always" or "madvise, and it'll
-be automatically shutdown if it's set to "never".
-
-khugepaged runs usually at low frequency so while one may not want to
-invoke defrag algorithms synchronously during the page faults, it
-should be worth invoking defrag at least in khugepaged. However it's
-also possible to disable defrag in khugepaged by writing 0 or enable
-defrag in khugepaged by writing 1:
-
-echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
-echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
-
-You can also control how many pages khugepaged should scan at each
-pass:
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
-
-and how many milliseconds to wait in khugepaged between each pass (you
-can set this to 0 to run khugepaged at 100% utilization of one core):
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
-
-and how many milliseconds to wait in khugepaged if there's an hugepage
-allocation failure to throttle the next allocation attempt.
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
-
-The khugepaged progress can be seen in the number of pages collapsed:
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
-
-for each pass:
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
-
-max_ptes_none specifies how many extra small pages (that are
-not already mapped) can be allocated when collapsing a group
-of small pages into one large page.
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
-
-A higher value leads to use additional memory for programs.
-A lower value leads to gain less thp performance. Value of
-max_ptes_none can waste cpu time very little, you can
-ignore it.
-
-max_ptes_swap specifies how many pages can be brought in from
-swap when collapsing a group of pages into a transparent huge page.
-
-/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
-
-A higher value can cause excessive swap IO and waste
-memory. A lower value can prevent THPs from being
-collapsed, resulting fewer pages being collapsed into
-THPs, and lower memory access performance.
-
-== Boot parameter ==
-
-You can change the sysfs boot time defaults of Transparent Hugepage
-Support by passing the parameter "transparent_hugepage=always" or
-"transparent_hugepage=madvise" or "transparent_hugepage=never"
-(without "") to the kernel command line.
-
-== Hugepages in tmpfs/shmem ==
-
-You can control hugepage allocation policy in tmpfs with mount option
-"huge=". It can have following values:
-
- - "always":
- Attempt to allocate huge pages every time we need a new page;
-
- - "never":
- Do not allocate huge pages;
-
- - "within_size":
- Only allocate huge page if it will be fully within i_size.
- Also respect fadvise()/madvise() hints;
-
- - "advise:
- Only allocate huge pages if requested with fadvise()/madvise();
-
-The default policy is "never".
-
-"mount -o remount,huge= /mountpoint" works fine after mount: remounting
-huge=never will not attempt to break up huge pages at all, just stop more
-from being allocated.
-
-There's also sysfs knob to control hugepage allocation policy for internal
-shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
-is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
-MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
-
-In addition to policies listed above, shmem_enabled allows two further
-values:
-
- - "deny":
- For use in emergencies, to force the huge option off from
- all mounts;
- - "force":
- Force the huge option on for all - very useful for testing;
-
-== Need of application restart ==
-
-The transparent_hugepage/enabled values and tmpfs mount option only affect
-future behavior. So to make them effective you need to restart any
-application that could have been using hugepages. This also applies to the
-regions registered in khugepaged.
-
-== Monitoring usage ==
-
-The number of anonymous transparent huge pages currently used by the
-system is available by reading the AnonHugePages field in /proc/meminfo.
-To identify what applications are using anonymous transparent huge pages,
-it is necessary to read /proc/PID/smaps and count the AnonHugePages fields
-for each mapping.
-
-The number of file transparent huge pages mapped to userspace is available
-by reading ShmemPmdMapped and ShmemHugePages fields in /proc/meminfo.
-To identify what applications are mapping file transparent huge pages, it
-is necessary to read /proc/PID/smaps and count the FileHugeMapped fields
-for each mapping.
-
-Note that reading the smaps file is expensive and reading it
-frequently will incur overhead.
-
-There are a number of counters in /proc/vmstat that may be used to
-monitor how successfully the system is providing huge pages for use.
-
-thp_fault_alloc is incremented every time a huge page is successfully
- allocated to handle a page fault. This applies to both the
- first time a page is faulted and for COW faults.
-
-thp_collapse_alloc is incremented by khugepaged when it has found
- a range of pages to collapse into one huge page and has
- successfully allocated a new huge page to store the data.
-
-thp_fault_fallback is incremented if a page fault fails to allocate
- a huge page and instead falls back to using small pages.
-
-thp_collapse_alloc_failed is incremented if khugepaged found a range
- of pages that should be collapsed into one huge page but failed
- the allocation.
-
-thp_file_alloc is incremented every time a file huge page is successfully
- allocated.
-
-thp_file_mapped is incremented every time a file huge page is mapped into
- user address space.
-
-thp_split_page is incremented every time a huge page is split into base
- pages. This can happen for a variety of reasons but a common
- reason is that a huge page is old and is being reclaimed.
- This action implies splitting all PMD the page mapped with.
-
-thp_split_page_failed is incremented if kernel fails to split huge
- page. This can happen if the page was pinned by somebody.
-
-thp_deferred_split_page is incremented when a huge page is put onto split
- queue. This happens when a huge page is partially unmapped and
- splitting it would free up some memory. Pages on split queue are
- going to be split under memory pressure.
-
-thp_split_pmd is incremented every time a PMD split into table of PTEs.
- This can happen, for instance, when application calls mprotect() or
- munmap() on part of huge page. It doesn't split huge page, only
- page table entry.
-
-thp_zero_page_alloc is incremented every time a huge zero page is
- successfully allocated. It includes allocations which where
- dropped due race with other allocation. Note, it doesn't count
- every map of the huge zero page, only its allocation.
-
-thp_zero_page_alloc_failed is incremented if kernel fails to allocate
- huge zero page and falls back to using small pages.
-
-As the system ages, allocating huge pages may be expensive as the
-system uses memory compaction to copy data around memory to free a
-huge page for use. There are some counters in /proc/vmstat to help
-monitor this overhead.
-
-compact_stall is incremented every time a process stalls to run
- memory compaction so that a huge page is free for use.
-
-compact_success is incremented if the system compacted memory and
- freed a huge page for use.
-
-compact_fail is incremented if the system tries to compact memory
- but failed.
-
-compact_pages_moved is incremented each time a page is moved. If
- this value is increasing rapidly, it implies that the system
- is copying a lot of data to satisfy the huge page allocation.
- It is possible that the cost of copying exceeds any savings
- from reduced TLB misses.
-
-compact_pagemigrate_failed is incremented when the underlying mechanism
- for moving a page failed.
-
-compact_blocks_moved is incremented each time memory compaction examines
- a huge page aligned range of pages.
-
-It is possible to establish how long the stalls were using the function
-tracer to record how long was spent in __alloc_pages_nodemask and
-using the mm_page_alloc tracepoint to identify which allocations were
-for huge pages.
-
-== get_user_pages and follow_page ==
-
-get_user_pages and follow_page if run on a hugepage, will return the
-head or tail pages as usual (exactly as they would do on
-hugetlbfs). Most gup users will only care about the actual physical
-address of the page and its temporary pinning to release after the I/O
-is complete, so they won't ever notice the fact the page is huge. But
-if any driver is going to mangle over the page structure of the tail
-page (like for checking page->mapping or other bits that are relevant
-for the head page and not the tail page), it should be updated to jump
-to check head page instead. Taking reference on any head/tail page would
-prevent page from being split by anyone.
-
-NOTE: these aren't new constraints to the GUP API, and they match the
-same constrains that applies to hugetlbfs too, so any driver capable
-of handling GUP on hugetlbfs will also work fine on transparent
-hugepage backed mappings.
-
-In case you can't handle compound pages if they're returned by
-follow_page, the FOLL_SPLIT bit can be specified as parameter to
-follow_page, so that it will split the hugepages before returning
-them. Migration for example passes FOLL_SPLIT as parameter to
-follow_page because it's not hugepage aware and in fact it can't work
-at all on hugetlbfs (but it instead works fine on transparent
-hugepages thanks to FOLL_SPLIT). migration simply can't deal with
-hugepages being returned (as it's not only checking the pfn of the
-page and pinning it during the copy but it pretends to migrate the
-memory in regular page sizes and with regular pte/pmd mappings).
-
-== Optimizing the applications ==
-
-To be guaranteed that the kernel will map a 2M page immediately in any
-memory region, the mmap region has to be hugepage naturally
-aligned. posix_memalign() can provide that guarantee.
-
-== Hugetlbfs ==
-
-You can use hugetlbfs on a kernel that has transparent hugepage
-support enabled just fine as always. No difference can be noted in
-hugetlbfs other than there will be less overall fragmentation. All
-usual features belonging to hugetlbfs are preserved and
-unaffected. libhugetlbfs will also work fine as usual.
-
-== Graceful fallback ==
-
-Code walking pagetables but unaware about huge pmds can simply call
-split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
-pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_pmd where
-missing after pmd_offset returns the pmd. Thanks to the graceful
-fallback design, with a one liner change, you can avoid to write
-hundred if not thousand of lines of complex code to make your code
-hugepage aware.
-
-If you're not walking pagetables but you run into a physical hugepage
-but you can't handle it natively in your code, you can split it by
-calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example. split_huge_page() can fail
-if the page is pinned and you must handle this correctly.
-
-Example to make mremap.c transparent hugepage aware with a one liner
-change:
-
-diff --git a/mm/mremap.c b/mm/mremap.c
---- a/mm/mremap.c
-+++ b/mm/mremap.c
-@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
- return NULL;
-
- pmd = pmd_offset(pud, addr);
-+ split_huge_pmd(vma, pmd, addr);
- if (pmd_none_or_clear_bad(pmd))
- return NULL;
-
-== Locking in hugepage aware code ==
-
-We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_pmd() has a cost.
-
-To make pagetable walks huge pmd aware, all you need to do is to call
-pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
-mmap_sem in read (or write) mode to be sure an huge pmd cannot be
-created from under you by khugepaged (khugepaged collapse_huge_page
-takes the mmap_sem in write mode in addition to the anon_vma lock). If
-pmd_trans_huge returns false, you just fallback in the old code
-paths. If instead pmd_trans_huge returns true, you have to take the
-page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
-page table lock will prevent the huge pmd to be converted into a
-regular pmd from under you (split_huge_pmd can run in parallel to the
-pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page table lock and fallback to the old code as
-before. Otherwise you can proceed to process the huge pmd and the
-hugepage natively. Once finished you can drop the page table lock.
-
-== Refcounts and transparent huge pages ==
-
-Refcounting on THP is mostly consistent with refcounting on other compound
-pages:
-
- - get_page()/put_page() and GUP operate in head page's ->_refcount.
-
- - ->_refcount in tail pages is always zero: get_page_unless_zero() never
- succeed on tail pages.
-
- - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
- on relevant sub-page of the compound page.
-
- - map/unmap of the whole compound page accounted in compound_mapcount
- (stored in first tail page). For file huge pages, we also increment
- ->_mapcount of all sub-pages in order to have race-free detection of
- last unmap of subpages.
-
-PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
-
-For anonymous pages PageDoubleMap() also indicates ->_mapcount in all
-subpages is offset up by one. This additional reference is required to
-get race-free detection of unmap of subpages when we have them mapped with
-both PMDs and PTEs.
-
-This is optimization required to lower overhead of per-subpage mapcount
-tracking. The alternative is alter ->_mapcount in all subpages on each
-map/unmap of the whole compound page.
-
-For anonymous pages, we set PG_double_map when a PMD of the page got split
-for the first time, but still have PMD mapping. The additional references
-go away with last compound_mapcount.
-
-File pages get PG_double_map set on first map of the page with PTE and
-goes away when the page gets evicted from page cache.
-
-split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the page
-structures. It can be done easily for refcounts taken by page table
-entries. But we don't have enough information on how to distribute any
-additional pins (i.e. from get_user_pages). split_huge_page() fails any
-requests to split pinned huge page: it expects page count to be equal to
-sum of mapcount of all sub-pages plus one (split_huge_page caller must
-have reference for head page).
-
-split_huge_page uses migration entries to stabilize page->_refcount and
-page->_mapcount of anonymous pages. File pages just got unmapped.
-
-We safe against physical memory scanners too: the only legitimate way
-scanner can get reference to a page is get_page_unless_zero().
-
-All tail pages have zero ->_refcount until atomic_add(). This prevents the
-scanner from getting a reference to the tail page up to that point. After the
-atomic_add() we don't care about the ->_refcount value. We already known how
-many references should be uncharged from the head page.
-
-For head page get_page_unless_zero() will succeed and we don't mind. It's
-clear where reference should go after split: it will stay on head page.
-
-Note that split_huge_pmd() doesn't have any limitation on refcounting:
-pmd can be split at any point and never fails.
-
-== Partial unmap and deferred_split_huge_page() ==
-
-Unmapping part of THP (with munmap() or other way) is not going to free
-memory immediately. Instead, we detect that a subpage of THP is not in use
-in page_remove_rmap() and queue the THP for splitting if memory pressure
-comes. Splitting will free up unused subpages.
-
-Splitting the page right away is not an option due to locking context in
-the place where we can detect partial unmap. It's also might be
-counterproductive since in many cases partial unmap happens during exit(2) if
-a THP crosses a VMA boundary.
-
-Function deferred_split_huge_page() is used to queue page for splitting.
-The splitting itself will happen when we get memory pressure via shrinker
-interface.
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.rst
index e14718572476..fdd84cb8d511 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.rst
@@ -1,37 +1,13 @@
- ==============================
- UNEVICTABLE LRU INFRASTRUCTURE
- ==============================
-
-========
-CONTENTS
-========
-
- (*) The Unevictable LRU
-
- - The unevictable page list.
- - Memory control group interaction.
- - Marking address spaces unevictable.
- - Detecting Unevictable Pages.
- - vmscan's handling of unevictable pages.
-
- (*) mlock()'d pages.
-
- - History.
- - Basic management.
- - mlock()/mlockall() system call handling.
- - Filtering special vmas.
- - munlock()/munlockall() system call handling.
- - Migrating mlocked pages.
- - Compacting mlocked pages.
- - mmap(MAP_LOCKED) system call handling.
- - munmap()/exit()/exec() system call handling.
- - try_to_unmap().
- - try_to_munlock() reverse map scan.
- - Page reclaim in shrink_*_list().
+.. _unevictable_lru:
+==============================
+Unevictable LRU Infrastructure
+==============================
-============
-INTRODUCTION
+.. contents:: :local:
+
+
+Introduction
============
This document describes the Linux memory manager's "Unevictable LRU"
@@ -46,8 +22,8 @@ details - the "what does it do?" - by reading the code. One hopes that the
descriptions below add value by provide the answer to "why does it do that?".
-===================
-THE UNEVICTABLE LRU
+
+The Unevictable LRU
===================
The Unevictable LRU facility adds an additional LRU list to track unevictable
@@ -66,17 +42,17 @@ completely unresponsive.
The unevictable list addresses the following classes of unevictable pages:
- (*) Those owned by ramfs.
+ * Those owned by ramfs.
- (*) Those mapped into SHM_LOCK'd shared memory regions.
+ * Those mapped into SHM_LOCK'd shared memory regions.
- (*) Those mapped into VM_LOCKED [mlock()ed] VMAs.
+ * Those mapped into VM_LOCKED [mlock()ed] VMAs.
The infrastructure may also be able to handle other conditions that make pages
unevictable, either by definition or by circumstance, in the future.
-THE UNEVICTABLE PAGE LIST
+The Unevictable Page List
-------------------------
The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list
@@ -118,7 +94,7 @@ the unevictable list when one task has the page isolated from the LRU and other
tasks are changing the "evictability" state of the page.
-MEMORY CONTROL GROUP INTERACTION
+Memory Control Group Interaction
--------------------------------
The unevictable LRU facility interacts with the memory control group [aka
@@ -144,7 +120,9 @@ effects:
the control group to thrash or to OOM-kill tasks.
-MARKING ADDRESS SPACES UNEVICTABLE
+.. _mark_addr_space_unevict:
+
+Marking Address Spaces Unevictable
----------------------------------
For facilities such as ramfs none of the pages attached to the address space
@@ -152,15 +130,15 @@ may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE
address space flag is provided, and this can be manipulated by a filesystem
using a number of wrapper functions:
- (*) void mapping_set_unevictable(struct address_space *mapping);
+ * ``void mapping_set_unevictable(struct address_space *mapping);``
Mark the address space as being completely unevictable.
- (*) void mapping_clear_unevictable(struct address_space *mapping);
+ * ``void mapping_clear_unevictable(struct address_space *mapping);``
Mark the address space as being evictable.
- (*) int mapping_unevictable(struct address_space *mapping);
+ * ``int mapping_unevictable(struct address_space *mapping);``
Query the address space, and return true if it is completely
unevictable.
@@ -177,12 +155,13 @@ These are currently used in two places in the kernel:
ensure they're in memory.
-DETECTING UNEVICTABLE PAGES
+Detecting Unevictable Pages
---------------------------
The function page_evictable() in vmscan.c determines whether a page is
-evictable or not using the query function outlined above [see section "Marking
-address spaces unevictable"] to check the AS_UNEVICTABLE flag.
+evictable or not using the query function outlined above [see section
+:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
+to check the AS_UNEVICTABLE flag.
For address spaces that are so marked after being populated (as SHM regions
might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate
@@ -202,7 +181,7 @@ flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
faulted into a VM_LOCKED vma, or found in a vma being VM_LOCKED.
-VMSCAN'S HANDLING OF UNEVICTABLE PAGES
+Vmscan's Handling of Unevictable Pages
--------------------------------------
If unevictable pages are culled in the fault path, or moved to the unevictable
@@ -233,8 +212,7 @@ extra evictabilty checks should not occur in the majority of calls to
putback_lru_page().
-=============
-MLOCKED PAGES
+MLOCKED Pages
=============
The unevictable page list is also useful for mlock(), in addition to ramfs and
@@ -242,7 +220,7 @@ SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in
NOMMU situations, all mappings are effectively mlocked.
-HISTORY
+History
-------
The "Unevictable mlocked Pages" infrastructure is based on work originally
@@ -263,7 +241,7 @@ replaced by walking the reverse map to determine whether any VM_LOCKED VMAs
mapped the page. More on this below.
-BASIC MANAGEMENT
+Basic Management
----------------
mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
@@ -304,10 +282,10 @@ mlocked pages become unlocked and rescued from the unevictable list when:
(4) before a page is COW'd in a VM_LOCKED VMA.
-mlock()/mlockall() SYSTEM CALL HANDLING
+mlock()/mlockall() System Call Handling
---------------------------------------
-Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup()
+Both [do\_]mlock() and [do\_]mlockall() system call handlers call mlock_fixup()
for each VMA in the range specified by the call. In the case of mlockall(),
this is the entire active address space of the task. Note that mlock_fixup()
is used for both mlocking and munlocking a range of memory. A call to mlock()
@@ -351,7 +329,7 @@ mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle
it later if and when it attempts to reclaim the page.
-FILTERING SPECIAL VMAS
+Filtering Special VMAs
----------------------
mlock_fixup() filters several classes of "special" VMAs:
@@ -379,8 +357,9 @@ VM_LOCKED flag. Therefore, we won't have to deal with them later during
munlock(), munmap() or task exit. Neither does mlock_fixup() account these
VMAs against the task's "locked_vm".
+.. _munlock_munlockall_handling:
-munlock()/munlockall() SYSTEM CALL HANDLING
+munlock()/munlockall() System Call Handling
-------------------------------------------
The munlock() and munlockall() system calls are handled by the same functions -
@@ -426,7 +405,7 @@ This is fine, because we'll catch it later if and if vmscan tries to reclaim
the page. This should be relatively rare.
-MIGRATING MLOCKED PAGES
+Migrating MLOCKED Pages
-----------------------
A page that is being migrated has been isolated from the LRU lists and is held
@@ -451,7 +430,7 @@ list because of a race between munlock and migration, page migration uses the
putback_lru_page() function to add migrated pages back to the LRU.
-COMPACTING MLOCKED PAGES
+Compacting MLOCKED Pages
------------------------
The unevictable LRU can be scanned for compactable regions and the default
@@ -461,7 +440,7 @@ unevictable LRU is enabled, the work of compaction is mostly handled by
the page migration code and the same work flow as described in MIGRATING
MLOCKED PAGES will apply.
-MLOCKING TRANSPARENT HUGE PAGES
+MLOCKING Transparent Huge Pages
-------------------------------
A transparent huge page is represented by a single entry on an LRU list.
@@ -483,7 +462,7 @@ to unevictable LRU and the rest can be reclaimed.
See also comment in follow_trans_huge_pmd().
-mmap(MAP_LOCKED) SYSTEM CALL HANDLING
+mmap(MAP_LOCKED) System Call Handling
-------------------------------------
In addition the mlock()/mlockall() system calls, an application can request
@@ -514,7 +493,7 @@ memory range accounted as locked_vm, as the protections could be changed later
and pages allocated into that region.
-munmap()/exit()/exec() SYSTEM CALL HANDLING
+munmap()/exit()/exec() System Call Handling
-------------------------------------------
When unmapping an mlocked region of memory, whether by an explicit call to
@@ -568,16 +547,18 @@ munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
holepunching, and truncation of file pages and their anonymous COWed pages.
-try_to_munlock() REVERSE MAP SCAN
+try_to_munlock() Reverse Map Scan
---------------------------------
- [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
- page_referenced() reverse map walker.
+.. warning::
+ [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
+ page_referenced() reverse map walker.
-When munlock_vma_page() [see section "munlock()/munlockall() System Call
-Handling" above] tries to munlock a page, it needs to determine whether or not
-the page is mapped by any VM_LOCKED VMA without actually attempting to unmap
-all PTEs from the page. For this purpose, the unevictable/mlock infrastructure
+When munlock_vma_page() [see section :ref:`munlock()/munlockall() System Call
+Handling <munlock_munlockall_handling>` above] tries to munlock a
+page, it needs to determine whether or not the page is mapped by any
+VM_LOCKED VMA without actually attempting to unmap all PTEs from the
+page. For this purpose, the unevictable/mlock infrastructure
introduced a variant of try_to_unmap() called try_to_munlock().
try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
@@ -595,7 +576,7 @@ large region or tearing down a large address space that has been mlocked via
mlockall(), overall this is a fairly rare event.
-PAGE RECLAIM IN shrink_*_list()
+Page Reclaim in shrink_*_list()
-------------------------------
shrink_active_list() culls any obviously unevictable pages - i.e.
diff --git a/Documentation/vm/z3fold.txt b/Documentation/vm/z3fold.rst
index 38e4dac810b6..224e3c61d686 100644
--- a/Documentation/vm/z3fold.txt
+++ b/Documentation/vm/z3fold.rst
@@ -1,5 +1,8 @@
+.. _z3fold:
+
+======
z3fold
-------
+======
z3fold is a special purpose allocator for storing compressed pages.
It is designed to store up to three compressed pages per physical page.
@@ -7,6 +10,7 @@ It is a zbud derivative which allows for higher compression
ratio keeping the simplicity and determinism of its predecessor.
The main differences between z3fold and zbud are:
+
* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
* z3fold can hold up to 3 compressed pages in its page
* z3fold doesn't export any API itself and is thus intended to be used
diff --git a/Documentation/vm/zsmalloc.txt b/Documentation/vm/zsmalloc.rst
index 64ed63c4f69d..6e79893d6132 100644
--- a/Documentation/vm/zsmalloc.txt
+++ b/Documentation/vm/zsmalloc.rst
@@ -1,5 +1,8 @@
+.. _zsmalloc:
+
+========
zsmalloc
---------
+========
This allocator is designed for use with zram. Thus, the allocator is
supposed to work well under low memory conditions. In particular, it
@@ -31,40 +34,49 @@ be mapped using zs_map_object() to get a usable pointer and subsequently
unmapped using zs_unmap_object().
stat
-----
+====
With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
-/sys/kernel/debug/zsmalloc/<user name>. Here is a sample of stat output:
+``/sys/kernel/debug/zsmalloc/<user name>``. Here is a sample of stat output::
-# cat /sys/kernel/debug/zsmalloc/zram0/classes
+ # cat /sys/kernel/debug/zsmalloc/zram0/classes
class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage
- ..
- ..
+ ...
+ ...
9 176 0 1 186 129 8 4
10 192 1 0 2880 2872 135 3
11 208 0 1 819 795 42 2
12 224 0 1 219 159 12 4
- ..
- ..
+ ...
+ ...
+
+class
+ index
+size
+ object size zspage stores
+almost_empty
+ the number of ZS_ALMOST_EMPTY zspages(see below)
+almost_full
+ the number of ZS_ALMOST_FULL zspages(see below)
+obj_allocated
+ the number of objects allocated
+obj_used
+ the number of objects allocated to the user
+pages_used
+ the number of pages allocated for the class
+pages_per_zspage
+ the number of 0-order pages to make a zspage
-class: index
-size: object size zspage stores
-almost_empty: the number of ZS_ALMOST_EMPTY zspages(see below)
-almost_full: the number of ZS_ALMOST_FULL zspages(see below)
-obj_allocated: the number of objects allocated
-obj_used: the number of objects allocated to the user
-pages_used: the number of pages allocated for the class
-pages_per_zspage: the number of 0-order pages to make a zspage
+We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where
-We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
- n <= N / f, where
-n = number of allocated objects
-N = total number of objects zspage can store
-f = fullness_threshold_frac(ie, 4 at the moment)
+* n = number of allocated objects
+* N = total number of objects zspage can store
+* f = fullness_threshold_frac(ie, 4 at the moment)
Similarly, we assign zspage to:
- ZS_ALMOST_FULL when n > N / f
- ZS_EMPTY when n == 0
- ZS_FULL when n == N
+
+* ZS_ALMOST_FULL when n > N / f
+* ZS_EMPTY when n == 0
+* ZS_FULL when n == N
diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.rst
index 0b3a1148f9f0..1444ecd40911 100644
--- a/Documentation/vm/zswap.txt
+++ b/Documentation/vm/zswap.rst
@@ -1,4 +1,11 @@
-Overview:
+.. _zswap:
+
+=====
+zswap
+=====
+
+Overview
+========
Zswap is a lightweight compressed cache for swap pages. It takes pages that are
in the process of being swapped out and attempts to compress them into a
@@ -7,32 +14,34 @@ for potentially reduced swap I/O.  This trade-off can also result in a
significant performance improvement if reads from the compressed cache are
faster than reads from a swap device.
-NOTE: Zswap is a new feature as of v3.11 and interacts heavily with memory
-reclaim. This interaction has not been fully explored on the large set of
-potential configurations and workloads that exist. For this reason, zswap
-is a work in progress and should be considered experimental.
+.. note::
+ Zswap is a new feature as of v3.11 and interacts heavily with memory
+ reclaim. This interaction has not been fully explored on the large set of
+ potential configurations and workloads that exist. For this reason, zswap
+ is a work in progress and should be considered experimental.
+
+ Some potential benefits:
-Some potential benefits:
* Desktop/laptop users with limited RAM capacities can mitigate the
-    performance impact of swapping.
+ performance impact of swapping.
* Overcommitted guests that share a common I/O resource can
-    dramatically reduce their swap I/O pressure, avoiding heavy handed I/O
- throttling by the hypervisor. This allows more work to get done with less
- impact to the guest workload and guests sharing the I/O subsystem
+ dramatically reduce their swap I/O pressure, avoiding heavy handed I/O
+ throttling by the hypervisor. This allows more work to get done with less
+ impact to the guest workload and guests sharing the I/O subsystem
* Users with SSDs as swap devices can extend the life of the device by
-    drastically reducing life-shortening writes.
+ drastically reducing life-shortening writes.
Zswap evicts pages from compressed cache on an LRU basis to the backing swap
device when the compressed pool reaches its size limit. This requirement had
been identified in prior community discussions.
Zswap is disabled by default but can be enabled at boot time by setting
-the "enabled" attribute to 1 at boot time. ie: zswap.enabled=1. Zswap
+the ``enabled`` attribute to 1 at boot time. ie: ``zswap.enabled=1``. Zswap
can also be enabled and disabled at runtime using the sysfs interface.
An example command to enable zswap at runtime, assuming sysfs is mounted
-at /sys, is:
+at ``/sys``, is::
-echo 1 > /sys/module/zswap/parameters/enabled
+ echo 1 > /sys/module/zswap/parameters/enabled
When zswap is disabled at runtime it will stop storing pages that are
being swapped out. However, it will _not_ immediately write out or fault
@@ -43,7 +52,8 @@ pages out of the compressed pool, a swapoff on the swap device(s) will
fault back into memory all swapped out pages, including those in the
compressed pool.
-Design:
+Design
+======
Zswap receives pages for compression through the Frontswap API and is able to
evict pages from its own compressed pool on an LRU basis and write them back to
@@ -53,12 +63,12 @@ Zswap makes use of zpool for the managing the compressed memory pool. Each
allocation in zpool is not directly accessible by address. Rather, a handle is
returned by the allocation routine and that handle must be mapped before being
accessed. The compressed memory pool grows on demand and shrinks as compressed
-pages are freed. The pool is not preallocated. By default, a zpool of type
-zbud is created, but it can be selected at boot time by setting the "zpool"
-attribute, e.g. zswap.zpool=zbud. It can also be changed at runtime using the
-sysfs "zpool" attribute, e.g.
+pages are freed. The pool is not preallocated. By default, a zpool
+of type zbud is created, but it can be selected at boot time by
+setting the ``zpool`` attribute, e.g. ``zswap.zpool=zbud``. It can
+also be changed at runtime using the sysfs ``zpool`` attribute, e.g.::
-echo zbud > /sys/module/zswap/parameters/zpool
+ echo zbud > /sys/module/zswap/parameters/zpool
The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
means the compression ratio will always be 2:1 or worse (because of half-full
@@ -83,14 +93,16 @@ via frontswap, to free the compressed entry.
Zswap seeks to be simple in its policies. Sysfs attributes allow for one user
controlled policy:
+
* max_pool_percent - The maximum percentage of memory that the compressed
- pool can occupy.
+ pool can occupy.
-The default compressor is lzo, but it can be selected at boot time by setting
-the “compressor†attribute, e.g. zswap.compressor=lzo. It can also be changed
-at runtime using the sysfs "compressor" attribute, e.g.
+The default compressor is lzo, but it can be selected at boot time by
+setting the ``compressor`` attribute, e.g. ``zswap.compressor=lzo``.
+It can also be changed at runtime using the sysfs "compressor"
+attribute, e.g.::
-echo lzo > /sys/module/zswap/parameters/compressor
+ echo lzo > /sys/module/zswap/parameters/compressor
When the zpool and/or compressor parameter is changed at runtime, any existing
compressed pages are not modified; they are left in their own zpool. When a
@@ -106,11 +118,12 @@ compressed length of the page is set to zero and the pattern or same-filled
value is stored.
Same-value filled pages identification feature is enabled by default and can be
-disabled at boot time by setting the "same_filled_pages_enabled" attribute to 0,
-e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled at
-runtime using the sysfs "same_filled_pages_enabled" attribute, e.g.
+disabled at boot time by setting the ``same_filled_pages_enabled`` attribute
+to 0, e.g. ``zswap.same_filled_pages_enabled=0``. It can also be enabled and
+disabled at runtime using the sysfs ``same_filled_pages_enabled``
+attribute, e.g.::
-echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
+ echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
When zswap same-filled page identification is disabled at runtime, it will stop
checking for the same-value filled pages during store operation. However, the
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index 71c30984e94d..a16aa2113840 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -17,12 +17,14 @@ MBA (Memory Bandwidth Allocation) - "mba"
To use the feature mount the file system:
- # mount -t resctrl resctrl [-o cdp[,cdpl2]] /sys/fs/resctrl
+ # mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps]] /sys/fs/resctrl
mount options are:
"cdp": Enable code/data prioritization in L3 cache allocations.
"cdpl2": Enable code/data prioritization in L2 cache allocations.
+"mba_MBps": Enable the MBA Software Controller(mba_sc) to specify MBA
+ bandwidth in MBps
L2 and L3 CDP are controlled seperately.
@@ -270,10 +272,11 @@ and 0xA are not. On a system with a 20-bit mask each bit represents 5%
of the capacity of the cache. You could partition the cache into four
equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000.
-Memory bandwidth(b/w) percentage
---------------------------------
-For Memory b/w resource, user controls the resource by indicating the
-percentage of total memory b/w.
+Memory bandwidth Allocation and monitoring
+------------------------------------------
+
+For Memory bandwidth resource, by default the user controls the resource
+by indicating the percentage of total memory bandwidth.
The minimum bandwidth percentage value for each cpu model is predefined
and can be looked up through "info/MB/min_bandwidth". The bandwidth
@@ -285,7 +288,47 @@ to the next control step available on the hardware.
The bandwidth throttling is a core specific mechanism on some of Intel
SKUs. Using a high bandwidth and a low bandwidth setting on two threads
sharing a core will result in both threads being throttled to use the
-low bandwidth.
+low bandwidth. The fact that Memory bandwidth allocation(MBA) is a core
+specific mechanism where as memory bandwidth monitoring(MBM) is done at
+the package level may lead to confusion when users try to apply control
+via the MBA and then monitor the bandwidth to see if the controls are
+effective. Below are such scenarios:
+
+1. User may *not* see increase in actual bandwidth when percentage
+ values are increased:
+
+This can occur when aggregate L2 external bandwidth is more than L3
+external bandwidth. Consider an SKL SKU with 24 cores on a package and
+where L2 external is 10GBps (hence aggregate L2 external bandwidth is
+240GBps) and L3 external bandwidth is 100GBps. Now a workload with '20
+threads, having 50% bandwidth, each consuming 5GBps' consumes the max L3
+bandwidth of 100GBps although the percentage value specified is only 50%
+<< 100%. Hence increasing the bandwidth percentage will not yeild any
+more bandwidth. This is because although the L2 external bandwidth still
+has capacity, the L3 external bandwidth is fully used. Also note that
+this would be dependent on number of cores the benchmark is run on.
+
+2. Same bandwidth percentage may mean different actual bandwidth
+ depending on # of threads:
+
+For the same SKU in #1, a 'single thread, with 10% bandwidth' and '4
+thread, with 10% bandwidth' can consume upto 10GBps and 40GBps although
+they have same percentage bandwidth of 10%. This is simply because as
+threads start using more cores in an rdtgroup, the actual bandwidth may
+increase or vary although user specified bandwidth percentage is same.
+
+In order to mitigate this and make the interface more user friendly,
+resctrl added support for specifying the bandwidth in MBps as well. The
+kernel underneath would use a software feedback mechanism or a "Software
+Controller(mba_sc)" which reads the actual bandwidth using MBM counters
+and adjust the memowy bandwidth percentages to ensure
+
+ "actual bandwidth < user specified bandwidth".
+
+By default, the schemata would take the bandwidth percentage values
+where as user can switch to the "MBA software controller" mode using
+a mount option 'mba_MBps'. The schemata format is specified in the below
+sections.
L3 schemata file details (code and data prioritization disabled)
----------------------------------------------------------------
@@ -308,13 +351,20 @@ schemata format is always:
L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
-Memory b/w Allocation details
------------------------------
+Memory bandwidth Allocation (default mode)
+------------------------------------------
Memory b/w domain is L3 cache.
MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;...
+Memory bandwidth Allocation specified in MBps
+---------------------------------------------
+
+Memory bandwidth domain is L3 cache.
+
+ MB:<cache_id0>=bw_MBps0;<cache_id1>=bw_MBps1;...
+
Reading/writing the schemata file
---------------------------------
Reading the schemata file will show the state of all resources
@@ -358,6 +408,15 @@ allocations can overlap or not. The allocations specifies the maximum
b/w that the group may be able to use and the system admin can configure
the b/w accordingly.
+If the MBA is specified in MB(megabytes) then user can enter the max b/w in MB
+rather than the percentage values.
+
+# echo "L3:0=3;1=c\nMB:0=1024;1=500" > /sys/fs/resctrl/p0/schemata
+# echo "L3:0=3;1=3\nMB:0=1024;1=500" > /sys/fs/resctrl/p1/schemata
+
+In the above example the tasks in "p1" and "p0" on socket 0 would use a max b/w
+of 1024MB where as on socket 1 they would use 500MB.
+
Example 2
---------
Again two sockets, but this time with a more realistic 20-bit mask.
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index b297c48389b9..8d109ef67ab6 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -187,9 +187,9 @@ PCI
IOMMU (input/output memory management unit)
- Currently four x86-64 PCI-DMA mapping implementations exist:
+ Multiple x86-64 PCI-DMA mapping implementations exist, for example:
- 1. <arch/x86_64/kernel/pci-nommu.c>: use no hardware/software IOMMU at all
+ 1. <lib/dma-direct.c>: use no hardware/software IOMMU at all
(e.g. because you have < 3 GB memory).
Kernel boot message: "PCI-DMA: Disabling IOMMU"
@@ -208,7 +208,7 @@ IOMMU (input/output memory management unit)
Kernel boot message: "PCI-DMA: Using Calgary IOMMU"
iommu=[<size>][,noagp][,off][,force][,noforce][,leak[=<nr_of_leak_pages>]
- [,memaper[=<order>]][,merge][,forcesac][,fullflush][,nomerge]
+ [,memaper[=<order>]][,merge][,fullflush][,nomerge]
[,noaperture][,calgary]
General iommu options:
@@ -235,14 +235,7 @@ IOMMU (input/output memory management unit)
(experimental).
nomerge Don't do scatter-gather (SG) merging.
noaperture Ask the IOMMU not to touch the aperture for AGP.
- forcesac Force single-address cycle (SAC) mode for masks <40bits
- (experimental).
noagp Don't initialize the AGP driver and use full aperture.
- allowdac Allow double-address cycle (DAC) mode, i.e. DMA >4GB.
- DAC is used with 32-bit PCI to push a 64-bit address in
- two cycles. When off all DMA over >4GB is forced through
- an IOMMU or software bounce buffering.
- nodac Forbid DAC mode, i.e. DMA >4GB.
panic Always panic when IOMMU overflows.
calgary Use the Calgary IOMMU if it is available
diff --git a/LICENSES/exceptions/Linux-syscall-note b/LICENSES/exceptions/Linux-syscall-note
index 6b60b61be4e9..9abdad71fafd 100644
--- a/LICENSES/exceptions/Linux-syscall-note
+++ b/LICENSES/exceptions/Linux-syscall-note
@@ -1,6 +1,6 @@
SPDX-Exception-Identifier: Linux-syscall-note
SPDX-URL: https://spdx.org/licenses/Linux-syscall-note.html
-SPDX-Licenses: GPL-2.0, GPL-2.0+, GPL-1.0+, LGPL-2.0, LGPL-2.0+, LGPL-2.1, LGPL-2.1+
+SPDX-Licenses: GPL-2.0, GPL-2.0+, GPL-1.0+, LGPL-2.0, LGPL-2.0+, LGPL-2.1, LGPL-2.1+, GPL-2.0-only, GPL-2.0-or-later
Usage-Guide:
This exception is used together with one of the above SPDX-Licenses
to mark user space API (uapi) header files so they can be included
diff --git a/LICENSES/other/Apache-2.0 b/LICENSES/other/Apache-2.0
new file mode 100644
index 000000000000..7cd903f573e5
--- /dev/null
+++ b/LICENSES/other/Apache-2.0
@@ -0,0 +1,183 @@
+Valid-License-Identifier: Apache-2.0
+SPDX-URL: https://spdx.org/licenses/Apache-2.0.html
+Usage-Guide:
+ To use the Apache License version 2.0 put the following SPDX tag/value
+ pair into a comment according to the placement guidelines in the
+ licensing rules documentation:
+ SPDX-License-Identifier: Apache-2.0
+License-Text:
+
+Apache License
+
+Version 2.0, January 2004
+
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction, and
+distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by the
+copyright owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all other
+entities that control, are controlled by, or are under common control with
+that entity. For the purposes of this definition, "control" means (i) the
+power, direct or indirect, to cause the direction or management of such
+entity, whether by contract or otherwise, or (ii) ownership of fifty
+percent (50%) or more of the outstanding shares, or (iii) beneficial
+ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity exercising
+permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications,
+including but not limited to software source code, documentation source,
+and configuration files.
+
+"Object" form shall mean any form resulting from mechanical transformation
+or translation of a Source form, including but not limited to compiled
+object code, generated documentation, and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or Object form,
+made available under the License, as indicated by a copyright notice that
+is included in or attached to the work (an example is provided in the
+Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object form,
+that is based on (or derived from) the Work and for which the editorial
+revisions, annotations, elaborations, or other modifications represent, as
+a whole, an original work of authorship. For the purposes of this License,
+Derivative Works shall not include works that remain separable from, or
+merely link (or bind by name) to the interfaces of, the Work and Derivative
+Works thereof.
+
+"Contribution" shall mean any work of authorship, including the original
+version of the Work and any modifications or additions to that Work or
+Derivative Works thereof, that is intentionally submitted to Licensor for
+inclusion in the Work by the copyright owner or by an individual or Legal
+Entity authorized to submit on behalf of the copyright owner. For the
+purposes of this definition, "submitted" means any form of electronic,
+verbal, or written communication sent to the Licensor or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that
+are managed by, or on behalf of, the Licensor for the purpose of discussing
+and improving the Work, but excluding communication that is conspicuously
+marked or otherwise designated in writing by the copyright owner as "Not a
+Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity on
+behalf of whom a Contribution has been received by Licensor and
+subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of this
+ License, each Contributor hereby grants to You a perpetual, worldwide,
+ non-exclusive, no-charge, royalty-free, irrevocable copyright license to
+ reproduce, prepare Derivative Works of, publicly display, publicly
+ perform, sublicense, and distribute the Work and such Derivative Works
+ in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of this
+ License, each Contributor hereby grants to You a perpetual, worldwide,
+ non-exclusive, no-charge, royalty-free, irrevocable (except as stated in
+ this section) patent license to make, have made, use, offer to sell,
+ sell, import, and otherwise transfer the Work, where such license
+ applies only to those patent claims licensable by such Contributor that
+ are necessarily infringed by their Contribution(s) alone or by
+ combination of their Contribution(s) with the Work to which such
+ Contribution(s) was submitted. If You institute patent litigation
+ against any entity (including a cross-claim or counterclaim in a
+ lawsuit) alleging that the Work or a Contribution incorporated within
+ the Work constitutes direct or contributory patent infringement, then
+ any patent licenses granted to You under this License for that Work
+ shall terminate as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the Work or
+ Derivative Works thereof in any medium, with or without modifications,
+ and in Source or Object form, provided that You meet the following
+ conditions:
+
+ a. You must give any other recipients of the Work or Derivative Works a
+ copy of this License; and
+
+ b. You must cause any modified files to carry prominent notices stating
+ that You changed the files; and
+
+ c. You must retain, in the Source form of any Derivative Works that You
+ distribute, all copyright, patent, trademark, and attribution notices
+ from the Source form of the Work, excluding those notices that do not
+ pertain to any part of the Derivative Works; and
+
+ d. If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained within
+ such NOTICE file, excluding those notices that do not pertain to any
+ part of the Derivative Works, in at least one of the following
+ places: within a NOTICE text file distributed as part of the
+ Derivative Works; within the Source form or documentation, if
+ provided along with the Derivative Works; or, within a display
+ generated by the Derivative Works, if and wherever such third-party
+ notices normally appear. The contents of the NOTICE file are for
+ informational purposes only and do not modify the License. You may
+ add Your own attribution notices within Derivative Works that You
+ distribute, alongside or as an addendum to the NOTICE text from the
+ Work, provided that such additional attribution notices cannot be
+ construed as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and may
+ provide additional or different license terms and conditions for use,
+ reproduction, or distribution of Your modifications, or for any such
+ Derivative Works as a whole, provided Your use, reproduction, and
+ distribution of the Work otherwise complies with the conditions stated
+ in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise, any
+ Contribution intentionally submitted for inclusion in the Work by You to
+ the Licensor shall be under the terms and conditions of this License,
+ without any additional terms or conditions. Notwithstanding the above,
+ nothing herein shall supersede or modify the terms of any separate
+ license agreement you may have executed with Licensor regarding such
+ Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or agreed to
+ in writing, Licensor provides the Work (and each Contributor provides
+ its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied, including, without limitation,
+ any warranties or conditions of TITLE, NON-INFRINGEMENT,
+ MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely
+ responsible for determining the appropriateness of using or
+ redistributing the Work and assume any risks associated with Your
+ exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory, whether
+ in tort (including negligence), contract, or otherwise, unless required
+ by applicable law (such as deliberate and grossly negligent acts) or
+ agreed to in writing, shall any Contributor be liable to You for
+ damages, including any direct, indirect, special, incidental, or
+ consequential damages of any character arising as a result of this
+ License or out of the use or inability to use the Work (including but
+ not limited to damages for loss of goodwill, work stoppage, computer
+ failure or malfunction, or any and all other commercial damages or
+ losses), even if such Contributor has been advised of the possibility of
+ such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing the
+ Work or Derivative Works thereof, You may choose to offer, and charge a
+ fee for, acceptance of support, warranty, indemnity, or other liability
+ obligations and/or rights consistent with this License. However, in
+ accepting such obligations, You may act only on Your own behalf and on
+ Your sole responsibility, not on behalf of any other Contributor, and
+ only if You agree to indemnify, defend, and hold each Contributor
+ harmless for any liability incurred by, or claims asserted against, such
+ Contributor by reason of your accepting any such warranty or additional
+ liability.
+
+END OF TERMS AND CONDITIONS
diff --git a/LICENSES/other/CC-BY-SA-4.0 b/LICENSES/other/CC-BY-SA-4.0
new file mode 100644
index 000000000000..f9158e831e79
--- /dev/null
+++ b/LICENSES/other/CC-BY-SA-4.0
@@ -0,0 +1,397 @@
+Valid-License-Identifier: CC-BY-SA-4.0
+SPDX-URL: https://spdx.org/licenses/CC-BY-SA-4.0
+Usage-Guide:
+ To use the Creative Commons Attribution Share Alike 4.0 International
+ license put the following SPDX tag/value pair into a comment according to
+ the placement guidelines in the licensing rules documentation:
+ SPDX-License-Identifier: CC-BY-SA-4.0
+License-Text:
+
+Creative Commons Attribution-ShareAlike 4.0 International
+
+Creative Commons Corporation ("Creative Commons") is not a law firm and
+does not provide legal services or legal advice. Distribution of Creative
+Commons public licenses does not create a lawyer-client or other
+relationship. Creative Commons makes its licenses and related information
+available on an "as-is" basis. Creative Commons gives no warranties
+regarding its licenses, any material licensed under their terms and
+conditions, or any related information. Creative Commons disclaims all
+liability for damages resulting from their use to the fullest extent
+possible.
+
+Using Creative Commons Public Licenses
+
+Creative Commons public licenses provide a standard set of terms and
+conditions that creators and other rights holders may use to share original
+works of authorship and other material subject to copyright and certain
+other rights specified in the public license below. The following
+considerations are for informational purposes only, are not exhaustive, and
+do not form part of our licenses.
+
+Considerations for licensors: Our public licenses are intended for use by
+those authorized to give the public permission to use material in ways
+otherwise restricted by copyright and certain other rights. Our licenses
+are irrevocable. Licensors should read and understand the terms and
+conditions of the license they choose before applying it. Licensors should
+also secure all rights necessary before applying our licenses so that the
+public can reuse the material as expected. Licensors should clearly mark
+any material not subject to the license. This includes other CC-licensed
+material, or material used under an exception or limitation to
+copyright. More considerations for licensors :
+wiki.creativecommons.org/Considerations_for_licensors
+
+Considerations for the public: By using one of our public licenses, a
+licensor grants the public permission to use the licensed material under
+specified terms and conditions. If the licensor's permission is not
+necessary for any reason - for example, because of any applicable exception
+or limitation to copyright - then that use is not regulated by the
+license. Our licenses grant only permissions under copyright and certain
+other rights that a licensor has authority to grant. Use of the licensed
+material may still be restricted for other reasons, including because
+others have copyright or other rights in the material. A licensor may make
+special requests, such as asking that all changes be marked or described.
+
+Although not required by our licenses, you are encouraged to respect those
+requests where reasonable. More considerations for the public :
+wiki.creativecommons.org/Considerations_for_licensees
+
+Creative Commons Attribution-ShareAlike 4.0 International Public License
+
+By exercising the Licensed Rights (defined below), You accept and agree to
+be bound by the terms and conditions of this Creative Commons
+Attribution-ShareAlike 4.0 International Public License ("Public
+License"). To the extent this Public License may be interpreted as a
+contract, You are granted the Licensed Rights in consideration of Your
+acceptance of these terms and conditions, and the Licensor grants You such
+rights in consideration of benefits the Licensor receives from making the
+Licensed Material available under these terms and conditions.
+
+Section 1 - Definitions.
+
+ a. Adapted Material means material subject to Copyright and Similar
+ Rights that is derived from or based upon the Licensed Material and
+ in which the Licensed Material is translated, altered, arranged,
+ transformed, or otherwise modified in a manner requiring permission
+ under the Copyright and Similar Rights held by the Licensor. For
+ purposes of this Public License, where the Licensed Material is a
+ musical work, performance, or sound recording, Adapted Material is
+ always produced where the Licensed Material is synched in timed
+ relation with a moving image.
+
+ b. Adapter's License means the license You apply to Your Copyright and
+ Similar Rights in Your contributions to Adapted Material in
+ accordance with the terms and conditions of this Public License.
+
+ c. BY-SA Compatible License means a license listed at
+ creativecommons.org/compatiblelicenses, approved by Creative Commons
+ as essentially the equivalent of this Public License.
+
+ d. Copyright and Similar Rights means copyright and/or similar rights
+ closely related to copyright including, without limitation,
+ performance, broadcast, sound recording, and Sui Generis Database
+ Rights, without regard to how the rights are labeled or
+ categorized. For purposes of this Public License, the rights
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
+ Rights.
+
+ e. Effective Technological Measures means those measures that, in the
+ absence of proper authority, may not be circumvented under laws
+ fulfilling obligations under Article 11 of the WIPO Copyright Treaty
+ adopted on December 20, 1996, and/or similar international
+ agreements.
+
+ f. Exceptions and Limitations means fair use, fair dealing, and/or any
+ other exception or limitation to Copyright and Similar Rights that
+ applies to Your use of the Licensed Material.
+
+ g. License Elements means the license attributes listed in the name of
+ a Creative Commons Public License. The License Elements of this
+ Public License are Attribution and ShareAlike.
+
+ h. Licensed Material means the artistic or literary work, database, or
+ other material to which the Licensor applied this Public License.
+
+ i. Licensed Rights means the rights granted to You subject to the terms
+ and conditions of this Public License, which are limited to all
+ Copyright and Similar Rights that apply to Your use of the Licensed
+ Material and that the Licensor has authority to license.
+
+ j. Licensor means the individual(s) or entity(ies) granting rights
+ under this Public License.
+
+ k. Share means to provide material to the public by any means or
+ process that requires permission under the Licensed Rights, such as
+ reproduction, public display, public performance, distribution,
+ dissemination, communication, or importation, and to make material
+ available to the public including in ways that members of the public
+ may access the material from a place and at a time individually
+ chosen by them.
+
+ l. Sui Generis Database Rights means rights other than copyright
+ resulting from Directive 96/9/EC of the European Parliament and of
+ the Council of 11 March 1996 on the legal protection of databases,
+ as amended and/or succeeded, as well as other essentially equivalent
+ rights anywhere in the world. m. You means the individual or entity
+ exercising the Licensed Rights under this Public License. Your has a
+ corresponding meaning.
+
+Section 2 - Scope.
+
+ a. License grant.
+
+ 1. Subject to the terms and conditions of this Public License, the
+ Licensor hereby grants You a worldwide, royalty-free,
+ non-sublicensable, non-exclusive, irrevocable license to
+ exercise the Licensed Rights in the Licensed Material to:
+
+ A. reproduce and Share the Licensed Material, in whole or in part; and
+
+ B. produce, reproduce, and Share Adapted Material.
+
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
+ Exceptions and Limitations apply to Your use, this Public
+ License does not apply, and You do not need to comply with its
+ terms and conditions.
+
+ 3. Term. The term of this Public License is specified in Section 6(a).
+
+ 4. Media and formats; technical modifications allowed. The Licensor
+ authorizes You to exercise the Licensed Rights in all media and
+ formats whether now known or hereafter created, and to make
+ technical modifications necessary to do so. The Licensor waives
+ and/or agrees not to assert any right or authority to forbid You
+ from making technical modifications necessary to exercise the
+ Licensed Rights, including technical modifications necessary to
+ circumvent Effective Technological Measures. For purposes of
+ this Public License, simply making modifications authorized by
+ this Section 2(a)(4) never produces Adapted Material.
+
+ 5. Downstream recipients.
+
+ A. Offer from the Licensor - Licensed Material. Every recipient
+ of the Licensed Material automatically receives an offer
+ from the Licensor to exercise the Licensed Rights under the
+ terms and conditions of this Public License.
+
+ B. Additional offer from the Licensor - Adapted Material. Every
+ recipient of Adapted Material from You automatically
+ receives an offer from the Licensor to exercise the Licensed
+ Rights in the Adapted Material under the conditions of the
+ Adapter's License You apply.
+
+ C. No downstream restrictions. You may not offer or impose any
+ additional or different terms or conditions on, or apply any
+ Effective Technological Measures to, the Licensed Material
+ if doing so restricts exercise of the Licensed Rights by any
+ recipient of the Licensed Material.
+
+ 6. No endorsement. Nothing in this Public License constitutes or
+ may be construed as permission to assert or imply that You are,
+ or that Your use of the Licensed Material is, connected with, or
+ sponsored, endorsed, or granted official status by, the Licensor
+ or others designated to receive attribution as provided in
+ Section 3(a)(1)(A)(i).
+
+ b. Other rights.
+
+ 1. Moral rights, such as the right of integrity, are not licensed
+ under this Public License, nor are publicity, privacy, and/or
+ other similar personality rights; however, to the extent
+ possible, the Licensor waives and/or agrees not to assert any
+ such rights held by the Licensor to the limited extent necessary
+ to allow You to exercise the Licensed Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this Public
+ License.
+
+ 3. To the extent possible, the Licensor waives any right to collect
+ royalties from You for the exercise of the Licensed Rights,
+ whether directly or through a collecting society under any
+ voluntary or waivable statutory or compulsory licensing
+ scheme. In all other cases the Licensor expressly reserves any
+ right to collect such royalties.
+
+Section 3 - License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+
+ a. Attribution.
+
+ 1. If You Share the Licensed Material (including in modified form),
+ You must:
+
+ A. retain the following if it is supplied by the Licensor with
+ the Licensed Material:
+
+ i. identification of the creator(s) of the Licensed
+ Material and any others designated to receive
+ attribution, in any reasonable manner requested by the
+ Licensor (including by pseudonym if designated);
+
+ ii. a copyright notice;
+
+ iii. a notice that refers to this Public License;
+
+ iv. a notice that refers to the disclaimer of warranties;
+
+ v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
+
+ B. indicate if You modified the Licensed Material and retain an
+ indication of any previous modifications; and
+
+ C. indicate the Licensed Material is licensed under this Public
+ License, and include the text of, or the URI or hyperlink to,
+ this Public License.
+
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
+ reasonable manner based on the medium, means, and context in
+ which You Share the Licensed Material. For example, it may be
+ reasonable to satisfy the conditions by providing a URI or
+ hyperlink to a resource that includes the required information.
+
+ 3. If requested by the Licensor, You must remove any of the
+ information required by Section 3(a)(1)(A) to the extent
+ reasonably practicable. b. ShareAlike.In addition to the
+ conditions in Section 3(a), if You Share Adapted Material You
+ produce, the following conditions also apply.
+
+ 1. The Adapter's License You apply must be a Creative Commons
+ license with the same License Elements, this version or
+ later, or a BY-SA Compatible License.
+
+ 2. You must include the text of, or the URI or hyperlink to, the
+ Adapter's License You apply. You may satisfy this condition
+ in any reasonable manner based on the medium, means, and
+ context in which You Share Adapted Material.
+
+ 3. You may not offer or impose any additional or different terms
+ or conditions on, or apply any Effective Technological
+ Measures to, Adapted Material that restrict exercise of the
+ rights granted under the Adapter's License You apply.
+
+Section 4 - Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that apply to
+Your use of the Licensed Material:
+
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right to
+ extract, reuse, reproduce, and Share all or a substantial portion of
+ the contents of the database;
+
+ b. if You include all or a substantial portion of the database contents
+ in a database in which You have Sui Generis Database Rights, then
+ the database in which You have Sui Generis Database Rights (but not
+ its individual contents) is Adapted Material, including for purposes
+ of Section 3(b); and
+
+ c. You must comply with the conditions in Section 3(a) if You Share all
+ or a substantial portion of the contents of the database.
+
+ For the avoidance of doubt, this Section 4 supplements and does not
+ replace Your obligations under this Public License where the Licensed
+ Rights include other Copyright and Similar Rights.
+
+Section 5 - Disclaimer of Warranties and Limitation of Liability.
+
+ a. Unless otherwise separately undertaken by the Licensor, to the
+ extent possible, the Licensor offers the Licensed Material as-is and
+ as-available, and makes no representations or warranties of any kind
+ concerning the Licensed Material, whether express, implied,
+ statutory, or other. This includes, without limitation, warranties
+ of title, merchantability, fitness for a particular purpose,
+ non-infringement, absence of latent or other defects, accuracy, or
+ the presence or absence of errors, whether or not known or
+ discoverable. Where disclaimers of warranties are not allowed in
+ full or in part, this disclaimer may not apply to You.
+
+ b. To the extent possible, in no event will the Licensor be liable to
+ You on any legal theory (including, without limitation, negligence)
+ or otherwise for any direct, special, indirect, incidental,
+ consequential, punitive, exemplary, or other losses, costs,
+ expenses, or damages arising out of this Public License or use of
+ the Licensed Material, even if the Licensor has been advised of the
+ possibility of such losses, costs, expenses, or damages. Where a
+ limitation of liability is not allowed in full or in part, this
+ limitation may not apply to You.
+
+ c. The disclaimer of warranties and limitation of liability provided
+ above shall be interpreted in a manner that, to the extent possible,
+ most closely approximates an absolute disclaimer and waiver of all
+ liability.
+
+Section 6 - Term and Termination.
+
+ a. This Public License applies for the term of the Copyright and
+ Similar Rights licensed here. However, if You fail to comply with
+ this Public License, then Your rights under this Public License
+ terminate automatically.
+
+ b. Where Your right to use the Licensed Material has terminated under
+ Section 6(a), it reinstates:
+
+ 1. automatically as of the date the violation is cured, provided it
+ is cured within 30 days of Your discovery of the violation; or
+
+ 2. upon express reinstatement by the Licensor.
+
+ c. For the avoidance of doubt, this Section 6(b) does not affect any
+ right the Licensor may have to seek remedies for Your violations of
+ this Public License.
+
+ d. For the avoidance of doubt, the Licensor may also offer the Licensed
+ Material under separate terms or conditions or stop distributing the
+ Licensed Material at any time; however, doing so will not terminate
+ this Public License.
+
+ e. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
+
+Section 7 - Other Terms and Conditions.
+
+ a. The Licensor shall not be bound by any additional or different terms
+ or conditions communicated by You unless expressly agreed.
+
+ b. Any arrangements, understandings, or agreements regarding the
+ Licensed Material not stated herein are separate from and
+ independent of the terms and conditions of this Public License.
+
+Section 8 - Interpretation.
+
+ a. For the avoidance of doubt, this Public License does not, and shall
+ not be interpreted to, reduce, limit, restrict, or impose conditions
+ on any use of the Licensed Material that could lawfully be made
+ without permission under this Public License.
+
+ b. To the extent possible, if any provision of this Public License is
+ deemed unenforceable, it shall be automatically reformed to the
+ minimum extent necessary to make it enforceable. If the provision
+ cannot be reformed, it shall be severed from this Public License
+ without affecting the enforceability of the remaining terms and
+ conditions.
+
+ c. No term or condition of this Public License will be waived and no
+ failure to comply consented to unless expressly agreed to by the
+ Licensor.
+
+ d. Nothing in this Public License constitutes or may be interpreted as
+ a limitation upon, or waiver of, any privileges and immunities that
+ apply to the Licensor or You, including from the legal processes of
+ any jurisdiction or authority.
+
+Creative Commons is not a party to its public licenses. Notwithstanding,
+Creative Commons may elect to apply one of its public licenses to material
+it publishes and in those instances will be considered the "Licensor." The
+text of the Creative Commons public licenses is dedicated to the public
+domain under the CC0 Public Domain Dedication. Except for the limited
+purpose of indicating that material is shared under a Creative Commons
+public license or as otherwise permitted by the Creative Commons policies
+published at creativecommons.org/policies, Creative Commons does not
+authorize the use of the trademark "Creative Commons" or any other
+trademark or logo of Creative Commons without its prior written consent
+including, without limitation, in connection with any unauthorized
+modifications to any of its public licenses or any other arrangements,
+understandings, or agreements concerning use of licensed material. For the
+avoidance of doubt, this paragraph does not form part of the public
+licenses.
+
+Creative Commons may be contacted at creativecommons.org.
diff --git a/LICENSES/other/CDDL-1.0 b/LICENSES/other/CDDL-1.0
new file mode 100644
index 000000000000..195a1687930a
--- /dev/null
+++ b/LICENSES/other/CDDL-1.0
@@ -0,0 +1,364 @@
+Valid-License-Identifier: CDDL-1.0
+SPDX-URL: https://spdx.org/licenses/CDDL-1.0.html
+Usage-Guide:
+ To use the Common Development and Distribution License 1.0 put the
+ following SPDX tag/value pair into a comment according to the placement
+ guidelines in the licensing rules documentation:
+ SPDX-License-Identifier: CDDL-1.0
+
+License-Text:
+
+COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL)
+Version 1.0
+
+ 1. Definitions.
+
+ 1.1. "Contributor" means each individual or entity that creates or
+ contributes to the creation of Modifications.
+
+ 1.2. "Contributor Version" means the combination of the Original
+ Software, prior Modifications used by a Contributor (if any),
+ and the Modifications made by that particular Contributor.
+
+ 1.3. "Covered Software" means (a) the Original Software, or (b)
+ Modifications, or (c) the combination of files containing
+ Original Software with files containing Modifications, in each
+ case including portions thereof.
+
+ 1.4. "Executable" means the Covered Software in any form other than
+ Source Code.
+
+ 1.5. "Initial Developer" means the individual or entity that first
+ makes Original Software available under this License.
+
+ 1.6. "Larger Work" means a work which combines Covered Software or
+ portions thereof with code not governed by the terms of this
+ License.
+
+ 1.7. "License" means this document.
+
+ 1.8. "Licensable" means having the right to grant, to the maximum
+ extent possible, whether at the time of the initial grant or
+ subsequently acquired, any and all of the rights conveyed herein.
+
+ 1.9. "Modifications" means the Source Code and Executable form of
+ any of the following:
+
+ A. Any file that results from an addition to, deletion from or
+ modification of the contents of a file containing Original
+ Software or previous Modifications;
+
+ B. Any new file that contains any part of the Original Software
+ or previous Modification; or
+
+ C. Any new file that is contributed or otherwise made available
+ under the terms of this License.
+
+ 1.10. "Original Software" means the Source Code and Executable form
+ of computer software code that is originally released under
+ this License.
+
+ 1.11. "Patent Claims" means any patent claim(s), now owned or
+ hereafter acquired, including without limitation, method,
+ process, and apparatus claims, in any patent Licensable by
+ grantor.
+
+ 1.12. "Source Code" means (a) the common form of computer software
+ code in which modifications are made and (b) associated
+ documentation included in or with such code.
+
+ 1.13. "You" (or "Your") means an individual or a legal entity
+ exercising rights under, and complying with all of the terms
+ of, this License. For legal entities, "You" includes any
+ entity which controls, is controlled by, or is under common
+ control with You. For purposes of this definition, "control"
+ means (a) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract
+ or otherwise, or (b) ownership of more than fifty percent
+ (50%) of the outstanding shares or beneficial ownership of
+ such entity.
+
+ 2. License Grants.
+ 2.1. The Initial Developer Grant.
+
+ Conditioned upon Your compliance with Section 3.1 below and subject
+ to third party intellectual property claims, the Initial Developer
+ hereby grants You a world-wide, royalty-free, non-exclusive
+ license:
+
+ (a) under intellectual property rights (other than patent or
+ trademark) Licensable by Initial Developer, to use,
+ reproduce, modify, display, perform, sublicense and
+ distribute the Original Software (or portions thereof),
+ with or without Modifications, and/or as part of a Larger
+ Work; and
+
+ (b) under Patent Claims infringed by the making, using or
+ selling of Original Software, to make, have made, use,
+ practice, sell, and offer for sale, and/or otherwise
+ dispose of the Original Software (or portions thereof).
+
+ (c) The licenses granted in Sections 2.1(a) and (b) are
+ effective on the date Initial Developer first distributes
+ or otherwise makes the Original Software available to a
+ third party under the terms of this License.
+
+ (d) Notwithstanding Section 2.1(b) above, no patent license is
+ granted: (1) for code that You delete from the Original
+ Software, or (2) for infringements caused by: (i) the
+ modification of the Original Software, or (ii) the
+ combination of the Original Software with other software or
+ devices.
+
+ 2.2. Contributor Grant.
+
+ Conditioned upon Your compliance with Section 3.1 below and subject
+ to third party intellectual property claims, each Contributor
+ hereby grants You a world-wide, royalty-free, non-exclusive
+ license:
+
+ (a) under intellectual property rights (other than patent or
+ trademark) Licensable by Contributor to use, reproduce,
+ modify, display, perform, sublicense and distribute the
+ Modifications created by such Contributor (or portions
+ thereof), either on an unmodified basis, with other
+ Modifications, as Covered Software and/or as part of a
+ Larger Work; and
+
+ (b) under Patent Claims infringed by the making, using, or
+ selling of Modifications made by that Contributor either
+ alone and/or in combination with its Contributor Version
+ (or portions of such combination), to make, use, sell,
+ offer for sale, have made, and/or otherwise dispose of: (1)
+ Modifications made by that Contributor (or portions
+ thereof); and (2) the combination of Modifications made by
+ that Contributor with its Contributor Version (or portions
+ of such combination).
+
+ (c) The licenses granted in Sections 2.2(a) and 2.2(b) are
+ effective on the date Contributor first distributes or
+ otherwise makes the Modifications available to a third
+ party.
+
+ (d) Notwithstanding Section 2.2(b) above, no patent license is
+ granted: (1) for any code that Contributor has deleted from
+ the Contributor Version; (2) for infringements caused by:
+ (i) third party modifications of Contributor Version, or
+ (ii) the combination of Modifications made by that
+ Contributor with other software (except as part of the
+ Contributor Version) or other devices; or (3) under Patent
+ Claims infringed by Covered Software in the absence of
+ Modifications made by that Contributor.
+
+ 3. Distribution Obligations.
+ 3.1. Availability of Source Code.
+
+ Any Covered Software that You distribute or otherwise make
+ available in Executable form must also be made available in Source
+ Code form and that Source Code form must be distributed only under
+ the terms of this License. You must include a copy of this License
+ with every copy of the Source Code form of the Covered Software You
+ distribute or otherwise make available. You must inform recipients
+ of any such Covered Software in Executable form as to how they can
+ obtain such Covered Software in Source Code form in a reasonable
+ manner on or through a medium customarily used for software
+ exchange.
+
+ 3.2. Modifications.
+
+ The Modifications that You create or to which You contribute are
+ governed by the terms of this License. You represent that You
+ believe Your Modifications are Your original creation(s) and/or You
+ have sufficient rights to grant the rights conveyed by this
+ License.
+
+ 3.3. Required Notices.
+
+ You must include a notice in each of Your Modifications that
+ identifies You as the Contributor of the Modification. You may not
+ remove or alter any copyright, patent or trademark notices
+ contained within the Covered Software, or any notices of licensing
+ or any descriptive text giving attribution to any Contributor or
+ the Initial Developer.
+
+ 3.4. Application of Additional Terms.
+
+ You may not offer or impose any terms on any Covered Software in
+ Source Code form that alters or restricts the applicable version of
+ this License or the recipients' rights hereunder. You may choose to
+ offer, and to charge a fee for, warranty, support, indemnity or
+ liability obligations to one or more recipients of Covered
+ Software. However, you may do so only on Your own behalf, and not
+ on behalf of the Initial Developer or any Contributor. You must
+ make it absolutely clear that any such warranty, support, indemnity
+ or liability obligation is offered by You alone, and You hereby
+ agree to indemnify the Initial Developer and every Contributor for
+ any liability incurred by the Initial Developer or such Contributor
+ as a result of warranty, support, indemnity or liability terms You
+ offer.
+
+ 3.5. Distribution of Executable Versions.
+
+ You may distribute the Executable form of the Covered Software
+ under the terms of this License or under the terms of a license of
+ Your choice, which may contain terms different from this License,
+ provided that You are in compliance with the terms of this License
+ and that the license for the Executable form does not attempt to
+ limit or alter the recipient's rights in the Source Code form from
+ the rights set forth in this License. If You distribute the Covered
+ Software in Executable form under a different license, You must
+ make it absolutely clear that any terms which differ from this
+ License are offered by You alone, not by the Initial Developer or
+ Contributor. You hereby agree to indemnify the Initial Developer
+ and every Contributor for any liability incurred by the Initial
+ Developer or such Contributor as a result of any such terms You
+ offer.
+
+ 3.6. Larger Works.
+
+ You may create a Larger Work by combining Covered Software with
+ other code not governed by the terms of this License and distribute
+ the Larger Work as a single product. In such a case, You must make
+ sure the requirements of this License are fulfilled for the Covered
+ Software.
+
+ 4. Versions of the License.
+ 4.1. New Versions.
+
+ Sun Microsystems, Inc. is the initial license steward and may
+ publish revised and/or new versions of this License from time to
+ time. Each version will be given a distinguishing version
+ number. Except as provided in Section 4.3, no one other than the
+ license steward has the right to modify this License.
+
+ 4.2. Effect of New Versions.
+
+ You may always continue to use, distribute or otherwise make the
+ Covered Software available under the terms of the version of the
+ License under which You originally received the Covered
+ Software. If the Initial Developer includes a notice in the
+ Original Software prohibiting it from being distributed or
+ otherwise made available under any subsequent version of the
+ License, You must distribute and make the Covered Software
+ available under the terms of the version of the License under which
+ You originally received the Covered Software. Otherwise, You may
+ also choose to use, distribute or otherwise make the Covered
+ Software available under the terms of any subsequent version of the
+ License published by the license steward.
+
+ 4.3. Modified Versions.
+
+ When You are an Initial Developer and You want to create a new
+ license for Your Original Software, You may create and use a
+ modified version of this License if You: (a) rename the license and
+ remove any references to the name of the license steward (except to
+ note that the license differs from this License); and (b) otherwise
+ make it clear that the license contains terms which differ from
+ this License.
+
+ 5. DISCLAIMER OF WARRANTY.
+
+ COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS,
+ WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+ WITHOUT LIMITATION, WARRANTIES THAT THE COVERED SOFTWARE IS FREE OF
+ DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR
+ NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF
+ THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY COVERED SOFTWARE PROVE
+ DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER
+ CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR
+ CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART
+ OF THIS LICENSE. NO USE OF ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER
+ EXCEPT UNDER THIS DISCLAIMER.
+
+ 6. TERMINATION.
+
+ 6.1. This License and the rights granted hereunder will terminate
+ automatically if You fail to comply with terms herein and fail to
+ cure such breach within 30 days of becoming aware of the
+ breach. Provisions which, by their nature, must remain in effect
+ beyond the termination of this License shall survive.
+
+ 6.2. If You assert a patent infringement claim (excluding
+ declaratory judgment actions) against Initial Developer or a
+ Contributor (the Initial Developer or Contributor against whom You
+ assert such claim is referred to as "Participant") alleging that
+ the Participant Software (meaning the Contributor Version where the
+ Participant is a Contributor or the Original Software where the
+ Participant is the Initial Developer) directly or indirectly
+ infringes any patent, then any and all rights granted directly or
+ indirectly to You by such Participant, the Initial Developer (if
+ the Initial Developer is not the Participant) and all Contributors
+ under Sections 2.1 and/or 2.2 of this License shall, upon 60 days
+ notice from Participant terminate prospectively and automatically
+ at the expiration of such 60 day notice period, unless if within
+ such 60 day period You withdraw Your claim with respect to the
+ Participant Software against such Participant either unilaterally
+ or pursuant to a written agreement with Participant.
+
+ 6.3. In the event of termination under Sections 6.1 or 6.2 above,
+ all end user licenses that have been validly granted by You or any
+ distributor hereunder prior to termination (excluding licenses
+ granted to You by any distributor) shall survive termination.
+
+ 7. LIMITATION OF LIABILITY.
+
+ UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT
+ (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL
+ DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED
+ SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY
+ PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ OF ANY CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOST
+ PROFITS, LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR
+ MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF
+ SUCH PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH
+ DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR
+ DEATH OR PERSONAL INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE
+ EXTENT APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO
+ NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL
+ DAMAGES, SO THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU.
+
+ 8. U.S. GOVERNMENT END USERS.
+
+ The Covered Software is a "commercial item," as that term is defined in
+ 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer
+ software" (as that term is defined at 48 C.F.R. $ 252.227-7014(a)(1))
+ and "commercial computer software documentation" as such terms are used
+ in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. 12.212 and
+ 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all
+ U.S. Government End Users acquire Covered Software with only those
+ rights set forth herein. This U.S. Government Rights clause is in lieu
+ of, and supersedes, any other FAR, DFAR, or other clause or provision
+ that addresses Government rights in computer software under this
+ License.
+
+ 9. MISCELLANEOUS.
+
+ This License represents the complete agreement concerning subject
+ matter hereof. If any provision of this License is held to be
+ unenforceable, such provision shall be reformed only to the extent
+ necessary to make it enforceable. This License shall be governed by the
+ law of the jurisdiction specified in a notice contained within the
+ Original Software (except to the extent applicable law, if any,
+ provides otherwise), excluding such jurisdiction's conflict-of-law
+ provisions. Any litigation relating to this License shall be subject to
+ the jurisdiction of the courts located in the jurisdiction and venue
+ specified in a notice contained within the Original Software, with the
+ losing party responsible for costs, including, without limitation,
+ court costs and reasonable attorneys' fees and expenses. The
+ application of the United Nations Convention on Contracts for the
+ International Sale of Goods is expressly excluded. Any law or
+ regulation which provides that the language of a contract shall be
+ construed against the drafter shall not apply to this License. You
+ agree that You alone are responsible for compliance with the United
+ States export administration regulations (and the export control laws
+ and regulation of any other countries) when You use, distribute or
+ otherwise make available any Covered Software.
+
+ 10. RESPONSIBILITY FOR CLAIMS.
+
+ As between Initial Developer and the Contributors, each party is
+ responsible for claims and damages arising, directly or indirectly, out
+ of its utilization of rights under this License and You agree to work
+ with Initial Developer and Contributors to distribute such
+ responsibility on an equitable basis. Nothing herein is intended or
+ shall be deemed to constitute any admission of liability.
diff --git a/LICENSES/other/Linux-OpenIB b/LICENSES/other/Linux-OpenIB
new file mode 100644
index 000000000000..1ad85f6b3a89
--- /dev/null
+++ b/LICENSES/other/Linux-OpenIB
@@ -0,0 +1,26 @@
+Valid-License-Identifier: Linux-OpenIB
+SPDX-URL: https://spdx.org/licenses/Linux-OpenIB.html
+Usage-Guide:
+ To use the Linux Kernel Variant of OpenIB.org license put the following
+ SPDX tag/value pair into a comment according to the placement guidelines
+ in the licensing rules documentation:
+ SPDX-License-Identifier: Linux-OpenIB
+License-Text:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/LICENSES/other/X11 b/LICENSES/other/X11
new file mode 100644
index 000000000000..fe4353fd0000
--- /dev/null
+++ b/LICENSES/other/X11
@@ -0,0 +1,37 @@
+Valid-License-Identifier: X11
+SPDX-URL: https://spdx.org/licenses/X11.html
+Usage-Guide:
+ To use the X11 put the following SPDX tag/value pair into a comment
+ according to the placement guidelines in the licensing rules
+ documentation:
+ SPDX-License-Identifier: X11
+License-Text:
+
+
+X11 License
+
+Copyright (C) 1996 X Consortium
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of the X Consortium shall not
+be used in advertising or otherwise to promote the sale, use or other
+dealings in this Software without prior written authorization from the X
+Consortium.
+
+X Window System is a trademark of X Consortium, Inc.
diff --git a/LICENSES/preferred/GPL-2.0 b/LICENSES/preferred/GPL-2.0
index b8db91d3a1cb..ff0812fd89cc 100644
--- a/LICENSES/preferred/GPL-2.0
+++ b/LICENSES/preferred/GPL-2.0
@@ -1,5 +1,7 @@
Valid-License-Identifier: GPL-2.0
+Valid-License-Identifier: GPL-2.0-only
Valid-License-Identifier: GPL-2.0+
+Valid-License-Identifier: GPL-2.0-or-later
SPDX-URL: https://spdx.org/licenses/GPL-2.0.html
Usage-Guide:
To use this license in source code, put one of the following SPDX
@@ -7,8 +9,12 @@ Usage-Guide:
guidelines in the licensing rules documentation.
For 'GNU General Public License (GPL) version 2 only' use:
SPDX-License-Identifier: GPL-2.0
+ or
+ SPDX-License-Identifier: GPL-2.0-only
For 'GNU General Public License (GPL) version 2 or any later version' use:
SPDX-License-Identifier: GPL-2.0+
+ or
+ SPDX-License-Identifier: GPL-2.0-or-later
License-Text:
GNU GENERAL PUBLIC LICENSE
diff --git a/MAINTAINERS b/MAINTAINERS
index cbc6bbf4ef9c..c3e2e7076a95 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4330,12 +4330,14 @@ W: http://git.infradead.org/users/hch/dma-mapping.git
S: Supported
F: lib/dma-debug.c
F: lib/dma-direct.c
+F: lib/dma-noncoherent.c
F: lib/dma-virt.c
F: drivers/base/dma-mapping.c
F: drivers/base/dma-coherent.c
F: include/asm-generic/dma-mapping.h
F: include/linux/dma-direct.h
F: include/linux/dma-mapping.h
+F: include/linux/dma-noncoherent.h
DME1737 HARDWARE MONITOR DRIVER
M: Juerg Haefliger <juergh@gmail.com>
@@ -5413,6 +5415,11 @@ S: Maintained
F: Documentation/hwmon/f71805f
F: drivers/hwmon/f71805f.c
+FADDR2LINE
+M: Josh Poimboeuf <jpoimboe@redhat.com>
+S: Maintained
+F: scripts/faddr2line
+
FANOTIFY
M: Jan Kara <jack@suse.cz>
R: Amir Goldstein <amir73il@gmail.com>
@@ -5948,8 +5955,8 @@ S: Maintained
F: scripts/get_maintainer.pl
GFS2 FILE SYSTEM
-M: Steven Whitehouse <swhiteho@redhat.com>
M: Bob Peterson <rpeterso@redhat.com>
+M: Andreas Gruenbacher <agruenba@redhat.com>
L: cluster-devel@redhat.com
W: http://sources.redhat.com/cluster/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git
@@ -6217,6 +6224,7 @@ L: linux-hwmon@vger.kernel.org
W: http://hwmon.wiki.kernel.org/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git
S: Maintained
+F: Documentation/devicetree/bindings/hwmon/
F: Documentation/hwmon/
F: drivers/hwmon/
F: include/linux/hwmon*.h
@@ -8209,7 +8217,7 @@ F: drivers/misc/lkdtm/*
LINUX KERNEL MEMORY CONSISTENCY MODEL (LKMM)
M: Alan Stern <stern@rowland.harvard.edu>
-M: Andrea Parri <parri.andrea@gmail.com>
+M: Andrea Parri <andrea.parri@amarulasolutions.com>
M: Will Deacon <will.deacon@arm.com>
M: Peter Zijlstra <peterz@infradead.org>
M: Boqun Feng <boqun.feng@gmail.com>
@@ -8316,6 +8324,7 @@ F: Documentation/admin-guide/LSM/LoadPin.rst
LOCKING PRIMITIVES
M: Peter Zijlstra <peterz@infradead.org>
M: Ingo Molnar <mingo@redhat.com>
+M: Will Deacon <will.deacon@arm.com>
L: linux-kernel@vger.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
S: Maintained
@@ -15639,7 +15648,7 @@ L: linux-mm@kvack.org
S: Maintained
F: mm/zsmalloc.c
F: include/linux/zsmalloc.h
-F: Documentation/vm/zsmalloc.txt
+F: Documentation/vm/zsmalloc.rst
ZSWAP COMPRESSED SWAP CACHING
M: Seth Jennings <sjenning@redhat.com>
diff --git a/arch/Kconfig b/arch/Kconfig
index 75dd23acf133..b695a3e3e922 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -278,9 +278,6 @@ config HAVE_CLK
The <linux/clk.h> calls support software clock gating and
thus are a key power management tool on many systems.
-config HAVE_DMA_API_DEBUG
- bool
-
config HAVE_HW_BREAKPOINT
bool
depends on PERF_EVENTS
@@ -874,6 +871,21 @@ config OLD_SIGACTION
config COMPAT_OLD_SIGACTION
bool
+config 64BIT_TIME
+ def_bool ARCH_HAS_64BIT_TIME
+ help
+ This should be selected by all architectures that need to support
+ new system calls with a 64-bit time_t. This is relevant on all 32-bit
+ architectures, and 64-bit architectures as part of compat syscall
+ handling.
+
+config COMPAT_32BIT_TIME
+ def_bool (!64BIT && 64BIT_TIME) || COMPAT
+ help
+ This enables 32 bit time_t support in addition to 64 bit time_t support.
+ This is relevant on all 32-bit architectures, and 64-bit architectures
+ as part of compat syscall handling.
+
config ARCH_NO_COHERENT_DMA_MMAP
bool
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index f19dc31288c8..0c4805a572c8 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -10,6 +10,8 @@ config ALPHA
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
select VIRT_TO_BUS
select GENERIC_IRQ_PROBE
select AUTO_IRQ_AFFINITY if SMP
@@ -64,15 +66,6 @@ config ZONE_DMA
bool
default y
-config ARCH_DMA_ADDR_T_64BIT
- def_bool y
-
-config NEED_DMA_MAP_STATE
- def_bool y
-
-config NEED_SG_DMA_LENGTH
- def_bool y
-
config GENERIC_ISA_DMA
bool
default y
@@ -346,9 +339,6 @@ config PCI_DOMAINS
config PCI_SYSCALL
def_bool PCI
-config IOMMU_HELPER
- def_bool PCI
-
config ALPHA_NONAME
bool
depends on ALPHA_BOOK1 || ALPHA_NONAME_CH
@@ -586,7 +576,7 @@ config ARCH_DISCONTIGMEM_ENABLE
Say Y to support efficient handling of discontiguous physical memory,
for architectures which are either NUMA (Non-Uniform Memory Access)
or have huge holes in the physical address space for other reasons.
- See <file:Documentation/vm/numa> for more.
+ See <file:Documentation/vm/numa.rst> for more.
source "mm/Kconfig"
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 9b68790013e2..0580cb8c84b2 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
+generic-y += compat.h
generic-y += exec.h
generic-y += export.h
generic-y += fb.h
diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h
index b9ec55351924..cf6bc1e64d66 100644
--- a/arch/alpha/include/asm/pci.h
+++ b/arch/alpha/include/asm/pci.h
@@ -56,11 +56,6 @@ struct pci_controller {
/* IOMMU controls. */
-/* The PCI address space does not equal the physical memory address space.
- The networking and block device layers use this boolean for bounce buffer
- decisions. */
-#define PCI_DMA_BUS_IS_PHYS 0
-
/* TODO: integrate with include/asm-generic/pci.h ? */
static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
{
diff --git a/arch/alpha/include/uapi/asm/Kbuild b/arch/alpha/include/uapi/asm/Kbuild
index 9afaba5e5503..1a5b75310cf4 100644
--- a/arch/alpha/include/uapi/asm/Kbuild
+++ b/arch/alpha/include/uapi/asm/Kbuild
@@ -2,4 +2,8 @@
include include/uapi/asm-generic/Kbuild.asm
generic-y += bpf_perf_event.h
+generic-y += ipcbuf.h
+generic-y += msgbuf.h
generic-y += poll.h
+generic-y += sembuf.h
+generic-y += shmbuf.h
diff --git a/arch/alpha/include/uapi/asm/ipcbuf.h b/arch/alpha/include/uapi/asm/ipcbuf.h
deleted file mode 100644
index 90d6445a14df..000000000000
--- a/arch/alpha/include/uapi/asm/ipcbuf.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/ipcbuf.h>
diff --git a/arch/alpha/include/uapi/asm/msgbuf.h b/arch/alpha/include/uapi/asm/msgbuf.h
deleted file mode 100644
index 8c5d4d8c1b16..000000000000
--- a/arch/alpha/include/uapi/asm/msgbuf.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ALPHA_MSGBUF_H
-#define _ALPHA_MSGBUF_H
-
-/*
- * The msqid64_ds structure for alpha architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct msqid64_ds {
- struct ipc64_perm msg_perm;
- __kernel_time_t msg_stime; /* last msgsnd time */
- __kernel_time_t msg_rtime; /* last msgrcv time */
- __kernel_time_t msg_ctime; /* last change time */
- unsigned long msg_cbytes; /* current number of bytes on queue */
- unsigned long msg_qnum; /* number of messages in queue */
- unsigned long msg_qbytes; /* max number of bytes on queue */
- __kernel_pid_t msg_lspid; /* pid of last msgsnd */
- __kernel_pid_t msg_lrpid; /* last receive pid */
- unsigned long __unused1;
- unsigned long __unused2;
-};
-
-#endif /* _ALPHA_MSGBUF_H */
diff --git a/arch/alpha/include/uapi/asm/sembuf.h b/arch/alpha/include/uapi/asm/sembuf.h
deleted file mode 100644
index f28ffa668b2f..000000000000
--- a/arch/alpha/include/uapi/asm/sembuf.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ALPHA_SEMBUF_H
-#define _ALPHA_SEMBUF_H
-
-/*
- * The semid64_ds structure for alpha architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct semid64_ds {
- struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
- __kernel_time_t sem_otime; /* last semop time */
- __kernel_time_t sem_ctime; /* last change time */
- unsigned long sem_nsems; /* no. of semaphores in array */
- unsigned long __unused1;
- unsigned long __unused2;
-};
-
-#endif /* _ALPHA_SEMBUF_H */
diff --git a/arch/alpha/include/uapi/asm/shmbuf.h b/arch/alpha/include/uapi/asm/shmbuf.h
deleted file mode 100644
index 7e041ca2eb40..000000000000
--- a/arch/alpha/include/uapi/asm/shmbuf.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ALPHA_SHMBUF_H
-#define _ALPHA_SHMBUF_H
-
-/*
- * The shmid64_ds structure for alpha architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct shmid64_ds {
- struct ipc64_perm shm_perm; /* operation perms */
- size_t shm_segsz; /* size of segment (bytes) */
- __kernel_time_t shm_atime; /* last attach time */
- __kernel_time_t shm_dtime; /* last detach time */
- __kernel_time_t shm_ctime; /* last change time */
- __kernel_pid_t shm_cpid; /* pid of creator */
- __kernel_pid_t shm_lpid; /* pid of last operator */
- unsigned long shm_nattch; /* no. of current attaches */
- unsigned long __unused1;
- unsigned long __unused2;
-};
-
-struct shminfo64 {
- unsigned long shmmax;
- unsigned long shmmin;
- unsigned long shmmni;
- unsigned long shmseg;
- unsigned long shmall;
- unsigned long __unused1;
- unsigned long __unused2;
- unsigned long __unused3;
- unsigned long __unused4;
-};
-
-#endif /* _ALPHA_SHMBUF_H */
diff --git a/arch/alpha/include/uapi/asm/siginfo.h b/arch/alpha/include/uapi/asm/siginfo.h
index 0cf3b527b274..db3f0138536f 100644
--- a/arch/alpha/include/uapi/asm/siginfo.h
+++ b/arch/alpha/include/uapi/asm/siginfo.h
@@ -7,18 +7,4 @@
#include <asm-generic/siginfo.h>
-/*
- * SIGFPE si_codes
- */
-#ifdef __KERNEL__
-#define FPE_FIXME 0 /* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
-/*
- * SIGTRAP si_codes
- */
-#ifdef __KERNEL__
-#define TRAP_FIXME 0 /* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
#endif
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 89faa6f4de47..6e921754c8fc 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -871,8 +871,7 @@ SYSCALL_DEFINE5(osf_setsysinfo, unsigned long, op, void __user *, buffer,
send a signal. Old exceptions are not signaled. */
fex = (exc >> IEEE_STATUS_TO_EXCSUM_SHIFT) & swcr;
if (fex) {
- siginfo_t info;
- int si_code = 0;
+ int si_code = FPE_FLTUNK;
if (fex & IEEE_TRAP_ENABLE_DNO) si_code = FPE_FLTUND;
if (fex & IEEE_TRAP_ENABLE_INE) si_code = FPE_FLTRES;
@@ -881,11 +880,9 @@ SYSCALL_DEFINE5(osf_setsysinfo, unsigned long, op, void __user *, buffer,
if (fex & IEEE_TRAP_ENABLE_DZE) si_code = FPE_FLTDIV;
if (fex & IEEE_TRAP_ENABLE_INV) si_code = FPE_FLTINV;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = NULL; /* FIXME */
- send_sig_info(SIGFPE, &info, current);
+ send_sig_fault(SIGFPE, si_code,
+ (void __user *)NULL, /* FIXME */
+ 0, current);
}
return 0;
}
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 9ebb3bcbc626..8c0c4ee0be6e 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -219,14 +219,8 @@ do_sigreturn(struct sigcontext __user *sc)
/* Send SIGTRAP if we're single-stepping: */
if (ptrace_cancel_bpt (current)) {
- siginfo_t info;
-
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_BRKPT;
- info.si_addr = (void __user *) regs->pc;
- info.si_trapno = 0;
- send_sig_info(SIGTRAP, &info, current);
+ send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *) regs->pc, 0,
+ current);
}
return;
@@ -253,14 +247,8 @@ do_rt_sigreturn(struct rt_sigframe __user *frame)
/* Send SIGTRAP if we're single-stepping: */
if (ptrace_cancel_bpt (current)) {
- siginfo_t info;
-
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_BRKPT;
- info.si_addr = (void __user *) regs->pc;
- info.si_trapno = 0;
- send_sig_info(SIGTRAP, &info, current);
+ send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *) regs->pc, 0,
+ current);
}
return;
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index f43bd05dede2..bc9627698796 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -213,7 +213,6 @@ do_entArith(unsigned long summary, unsigned long write_mask,
struct pt_regs *regs)
{
long si_code = FPE_FLTINV;
- siginfo_t info;
if (summary & 1) {
/* Software-completion summary bit is set, so try to
@@ -228,17 +227,12 @@ do_entArith(unsigned long summary, unsigned long write_mask,
}
die_if_kernel("Arithmetic fault", regs, 0, NULL);
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *) regs->pc;
- send_sig_info(SIGFPE, &info, current);
+ send_sig_fault(SIGFPE, si_code, (void __user *) regs->pc, 0, current);
}
asmlinkage void
do_entIF(unsigned long type, struct pt_regs *regs)
{
- siginfo_t info;
int signo, code;
if ((regs->ps & ~IPL_MAX) == 0) {
@@ -270,31 +264,20 @@ do_entIF(unsigned long type, struct pt_regs *regs)
switch (type) {
case 0: /* breakpoint */
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_BRKPT;
- info.si_trapno = 0;
- info.si_addr = (void __user *) regs->pc;
-
if (ptrace_cancel_bpt(current)) {
regs->pc -= 4; /* make pc point to former bpt */
}
- send_sig_info(SIGTRAP, &info, current);
+ send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0,
+ current);
return;
case 1: /* bugcheck */
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_FIXME;
- info.si_addr = (void __user *) regs->pc;
- info.si_trapno = 0;
- send_sig_info(SIGTRAP, &info, current);
+ send_sig_fault(SIGTRAP, TRAP_UNK, (void __user *) regs->pc, 0,
+ current);
return;
case 2: /* gentrap */
- info.si_addr = (void __user *) regs->pc;
- info.si_trapno = regs->r16;
switch ((long) regs->r16) {
case GEN_INTOVF:
signo = SIGFPE;
@@ -326,7 +309,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
break;
case GEN_ROPRAND:
signo = SIGFPE;
- code = FPE_FIXME;
+ code = FPE_FLTUNK;
break;
case GEN_DECOVF:
@@ -348,15 +331,12 @@ do_entIF(unsigned long type, struct pt_regs *regs)
case GEN_SUBRNG7:
default:
signo = SIGTRAP;
- code = TRAP_FIXME;
+ code = TRAP_UNK;
break;
}
- info.si_signo = signo;
- info.si_errno = 0;
- info.si_code = code;
- info.si_addr = (void __user *) regs->pc;
- send_sig_info(signo, &info, current);
+ send_sig_fault(signo, code, (void __user *) regs->pc, regs->r16,
+ current);
return;
case 4: /* opDEC */
@@ -380,11 +360,9 @@ do_entIF(unsigned long type, struct pt_regs *regs)
if (si_code == 0)
return;
if (si_code > 0) {
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *) regs->pc;
- send_sig_info(SIGFPE, &info, current);
+ send_sig_fault(SIGFPE, si_code,
+ (void __user *) regs->pc, 0,
+ current);
return;
}
}
@@ -409,11 +387,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
;
}
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_ILLOPC;
- info.si_addr = (void __user *) regs->pc;
- send_sig_info(SIGILL, &info, current);
+ send_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0, current);
}
/* There is an ifdef in the PALcode in MILO that enables a
@@ -426,15 +400,9 @@ do_entIF(unsigned long type, struct pt_regs *regs)
asmlinkage void
do_entDbg(struct pt_regs *regs)
{
- siginfo_t info;
-
die_if_kernel("Instruction fault", regs, 0, NULL);
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_ILLOPC;
- info.si_addr = (void __user *) regs->pc;
- force_sig_info(SIGILL, &info, current);
+ force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0, current);
}
@@ -758,7 +726,7 @@ do_entUnaUser(void __user * va, unsigned long opcode,
unsigned long tmp1, tmp2, tmp3, tmp4;
unsigned long fake_reg, *reg_addr = &fake_reg;
- siginfo_t info;
+ int si_code;
long error;
/* Check the UAC bits to decide what the user wants us to do
@@ -981,34 +949,27 @@ do_entUnaUser(void __user * va, unsigned long opcode,
give_sigsegv:
regs->pc -= 4; /* make pc point to faulting insn */
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
/* We need to replicate some of the logic in mm/fault.c,
since we don't have access to the fault code in the
exception handling return path. */
if ((unsigned long)va >= TASK_SIZE)
- info.si_code = SEGV_ACCERR;
+ si_code = SEGV_ACCERR;
else {
struct mm_struct *mm = current->mm;
down_read(&mm->mmap_sem);
if (find_vma(mm, (unsigned long)va))
- info.si_code = SEGV_ACCERR;
+ si_code = SEGV_ACCERR;
else
- info.si_code = SEGV_MAPERR;
+ si_code = SEGV_MAPERR;
up_read(&mm->mmap_sem);
}
- info.si_addr = va;
- send_sig_info(SIGSEGV, &info, current);
+ send_sig_fault(SIGSEGV, si_code, va, 0, current);
return;
give_sigbus:
regs->pc -= 4;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRALN;
- info.si_addr = va;
- send_sig_info(SIGBUS, &info, current);
+ send_sig_fault(SIGBUS, BUS_ADRALN, va, 0, current);
return;
}
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index cd3c572ee912..de2bd217adad 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -88,7 +88,6 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
struct mm_struct *mm = current->mm;
const struct exception_table_entry *fixup;
int fault, si_code = SEGV_MAPERR;
- siginfo_t info;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
/* As of EV6, a load into $31/$f31 is a prefetch, and never faults
@@ -221,21 +220,13 @@ retry:
up_read(&mm->mmap_sem);
/* Send a sigbus, regardless of whether we were in kernel
or user mode. */
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *) address;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0, current);
if (!user_mode(regs))
goto no_context;
return;
do_sigsegv:
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *) address;
- force_sig_info(SIGSEGV, &info, current);
+ force_sig_fault(SIGSEGV, si_code, (void __user *) address, 0, current);
return;
#ifdef CONFIG_ALPHA_LARGE_VMALLOC
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index d76bf4a83740..89d47eac18b2 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -9,11 +9,15 @@
config ARC
def_bool y
select ARC_TIMERS
+ select ARCH_HAS_SYNC_DMA_FOR_CPU
+ select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select ARCH_HAS_SG_CHAIN
select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
select COMMON_CLK
+ select DMA_NONCOHERENT_OPS
+ select DMA_NONCOHERENT_MMAP
select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC)
select GENERIC_CLOCKEVENTS
select GENERIC_FIND_FIRST_BIT
@@ -453,16 +457,11 @@ config ARC_HAS_PAE40
default n
depends on ISA_ARCV2
select HIGHMEM
+ select PHYS_ADDR_T_64BIT
help
Enable access to physical memory beyond 4G, only supported on
ARC cores with 40 bit Physical Addressing support
-config ARCH_PHYS_ADDR_T_64BIT
- def_bool ARC_HAS_PAE40
-
-config ARCH_DMA_ADDR_T_64BIT
- bool
-
config ARC_KVADDR_SIZE
int "Kernel Virtual Address Space size (MB)"
range 0 512
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 4bd5d4369e05..feed50ce89fa 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -1,7 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
generic-y += bugs.h
+generic-y += compat.h
generic-y += device.h
generic-y += div64.h
+generic-y += dma-mapping.h
generic-y += emergency-restart.h
generic-y += extable.h
generic-y += fb.h
diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h
deleted file mode 100644
index 7a16824bfe98..000000000000
--- a/arch/arc/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * DMA Mapping glue for ARC
- *
- * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef ASM_ARC_DMA_MAPPING_H
-#define ASM_ARC_DMA_MAPPING_H
-
-extern const struct dma_map_ops arc_dma_ops;
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
- return &arc_dma_ops;
-}
-
-#endif
diff --git a/arch/arc/include/asm/pci.h b/arch/arc/include/asm/pci.h
index ba56c23c1b20..4ff53c041c64 100644
--- a/arch/arc/include/asm/pci.h
+++ b/arch/arc/include/asm/pci.h
@@ -16,12 +16,6 @@
#define PCIBIOS_MIN_MEM 0x100000
#define pcibios_assign_all_busses() 1
-/*
- * The PCI address space does equal the physical memory address space.
- * The networking and block device layers use this boolean for bounce
- * buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS 1
#endif /* __KERNEL__ */
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 1dcc404b5aec..8c1071840979 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -16,13 +16,12 @@
* The default DMA address == Phy address which is 0x8000_0000 based.
*/
-#include <linux/dma-mapping.h>
+#include <linux/dma-noncoherent.h>
#include <asm/cache.h>
#include <asm/cacheflush.h>
-
-static void *arc_dma_alloc(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp, unsigned long attrs)
{
unsigned long order = get_order(size);
struct page *page;
@@ -89,7 +88,7 @@ static void *arc_dma_alloc(struct device *dev, size_t size,
return kvaddr;
}
-static void arc_dma_free(struct device *dev, size_t size, void *vaddr,
+void arch_dma_free(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs)
{
phys_addr_t paddr = dma_handle;
@@ -105,9 +104,9 @@ static void arc_dma_free(struct device *dev, size_t size, void *vaddr,
__free_pages(page, get_order(size));
}
-static int arc_dma_mmap(struct device *dev, struct vm_area_struct *vma,
- void *cpu_addr, dma_addr_t dma_addr, size_t size,
- unsigned long attrs)
+int arch_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
{
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
@@ -130,149 +129,14 @@ static int arc_dma_mmap(struct device *dev, struct vm_area_struct *vma,
return ret;
}
-/*
- * streaming DMA Mapping API...
- * CPU accesses page via normal paddr, thus needs to explicitly made
- * consistent before each use
- */
-static void _dma_cache_sync(phys_addr_t paddr, size_t size,
- enum dma_data_direction dir)
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir)
{
- switch (dir) {
- case DMA_FROM_DEVICE:
- dma_cache_inv(paddr, size);
- break;
- case DMA_TO_DEVICE:
- dma_cache_wback(paddr, size);
- break;
- case DMA_BIDIRECTIONAL:
- dma_cache_wback_inv(paddr, size);
- break;
- default:
- pr_err("Invalid DMA dir [%d] for OP @ %pa[p]\n", dir, &paddr);
- }
+ dma_cache_wback(paddr, size);
}
-/*
- * arc_dma_map_page - map a portion of a page for streaming DMA
- *
- * Ensure that any data held in the cache is appropriately discarded
- * or written back.
- *
- * The device owns this memory once this call has completed. The CPU
- * can regain ownership by calling dma_unmap_page().
- *
- * Note: while it takes struct page as arg, caller can "abuse" it to pass
- * a region larger than PAGE_SIZE, provided it is physically contiguous
- * and this still works correctly
- */
-static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir)
{
- phys_addr_t paddr = page_to_phys(page) + offset;
-
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- _dma_cache_sync(paddr, size, dir);
-
- return paddr;
-}
-
-/*
- * arc_dma_unmap_page - unmap a buffer previously mapped through dma_map_page()
- *
- * After this call, reads by the CPU to the buffer are guaranteed to see
- * whatever the device wrote there.
- *
- * Note: historically this routine was not implemented for ARC
- */
-static void arc_dma_unmap_page(struct device *dev, dma_addr_t handle,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- phys_addr_t paddr = handle;
-
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- _dma_cache_sync(paddr, size, dir);
+ dma_cache_inv(paddr, size);
}
-
-static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir, unsigned long attrs)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sg, s, nents, i)
- s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
- s->length, dir);
-
- return nents;
-}
-
-static void arc_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sg, s, nents, i)
- arc_dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir,
- attrs);
-}
-
-static void arc_dma_sync_single_for_cpu(struct device *dev,
- dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
-{
- _dma_cache_sync(dma_handle, size, DMA_FROM_DEVICE);
-}
-
-static void arc_dma_sync_single_for_device(struct device *dev,
- dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
-{
- _dma_cache_sync(dma_handle, size, DMA_TO_DEVICE);
-}
-
-static void arc_dma_sync_sg_for_cpu(struct device *dev,
- struct scatterlist *sglist, int nelems,
- enum dma_data_direction dir)
-{
- int i;
- struct scatterlist *sg;
-
- for_each_sg(sglist, sg, nelems, i)
- _dma_cache_sync(sg_phys(sg), sg->length, dir);
-}
-
-static void arc_dma_sync_sg_for_device(struct device *dev,
- struct scatterlist *sglist, int nelems,
- enum dma_data_direction dir)
-{
- int i;
- struct scatterlist *sg;
-
- for_each_sg(sglist, sg, nelems, i)
- _dma_cache_sync(sg_phys(sg), sg->length, dir);
-}
-
-static int arc_dma_supported(struct device *dev, u64 dma_mask)
-{
- /* Support 32 bit DMA mask exclusively */
- return dma_mask == DMA_BIT_MASK(32);
-}
-
-const struct dma_map_ops arc_dma_ops = {
- .alloc = arc_dma_alloc,
- .free = arc_dma_free,
- .mmap = arc_dma_mmap,
- .map_page = arc_dma_map_page,
- .unmap_page = arc_dma_unmap_page,
- .map_sg = arc_dma_map_sg,
- .unmap_sg = arc_dma_unmap_sg,
- .sync_single_for_device = arc_dma_sync_single_for_device,
- .sync_single_for_cpu = arc_dma_sync_single_for_cpu,
- .sync_sg_for_cpu = arc_dma_sync_sg_for_cpu,
- .sync_sg_for_device = arc_dma_sync_sg_for_device,
- .dma_supported = arc_dma_supported,
-};
-EXPORT_SYMBOL(arc_dma_ops);
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index a0b7bd6d030d..b884bbd6f354 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -70,6 +70,8 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ clear_siginfo(&info);
+
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a7f8e7f4b88f..c43f5bb55ac8 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -60,7 +60,6 @@ config ARM
select HAVE_CONTEXT_TRACKING
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
- select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS if MMU
select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU
select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
@@ -96,6 +95,7 @@ config ARM
select HAVE_VIRT_CPU_ACCOUNTING_GEN
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_REL
+ select NEED_DMA_MAP_STATE
select NO_BOOTMEM
select OF_EARLY_FLATTREE if OF
select OF_RESERVED_MEM if OF
@@ -119,9 +119,6 @@ config ARM_HAS_SG_CHAIN
select ARCH_HAS_SG_CHAIN
bool
-config NEED_SG_DMA_LENGTH
- bool
-
config ARM_DMA_USE_IOMMU
bool
select ARM_HAS_SG_CHAIN
@@ -224,9 +221,6 @@ config ARCH_MAY_HAVE_PC_FDC
config ZONE_DMA
bool
-config NEED_DMA_MAP_STATE
- def_bool y
-
config ARCH_SUPPORTS_UPROBES
def_bool y
@@ -1778,12 +1772,6 @@ config SECCOMP
and the task is only allowed to execute a few safe syscalls
defined by each seccomp mode.
-config SWIOTLB
- def_bool y
-
-config IOMMU_HELPER
- def_bool SWIOTLB
-
config PARAVIRT
bool "Enable paravirtualization code"
help
@@ -1815,6 +1803,7 @@ config XEN
depends on MMU
select ARCH_DMA_ADDR_T_64BIT
select ARM_PSCI
+ select SWIOTLB
select SWIOTLB_XEN
select PARAVIRT
help
diff --git a/arch/arm/boot/dts/stm32mp157-pinctrl.dtsi b/arch/arm/boot/dts/stm32mp157-pinctrl.dtsi
index c0743305f31b..eb96ac3e6c1d 100644
--- a/arch/arm/boot/dts/stm32mp157-pinctrl.dtsi
+++ b/arch/arm/boot/dts/stm32mp157-pinctrl.dtsi
@@ -12,6 +12,8 @@
#size-cells = <1>;
compatible = "st,stm32mp157-pinctrl";
ranges = <0 0x50002000 0xa400>;
+ interrupt-parent = <&exti>;
+ st,syscfg = <&exti 0x60 0xff>;
pins-are-numbered;
gpioa: gpio@50002000 {
@@ -166,6 +168,8 @@
compatible = "st,stm32mp157-z-pinctrl";
ranges = <0 0x54004000 0x400>;
pins-are-numbered;
+ interrupt-parent = <&exti>;
+ st,syscfg = <&exti 0x60 0xff>;
status = "disabled";
gpioz: gpio@54004000 {
diff --git a/arch/arm/boot/dts/stm32mp157c.dtsi b/arch/arm/boot/dts/stm32mp157c.dtsi
index 9e17e42b02b2..4fa0df853c8a 100644
--- a/arch/arm/boot/dts/stm32mp157c.dtsi
+++ b/arch/arm/boot/dts/stm32mp157c.dtsi
@@ -183,6 +183,13 @@
status = "disabled";
};
+ exti: interrupt-controller@5000d000 {
+ compatible = "st,stm32mp1-exti", "syscon";
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ reg = <0x5000d000 0x400>;
+ };
+
usart1: serial@5c000000 {
compatible = "st,stm32h7-uart";
reg = <0x5c000000 0x400>;
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 873e3c189279..1d66db9c9db5 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -1,3 +1,4 @@
+generic-y += compat.h
generic-y += current.h
generic-y += early_ioremap.h
generic-y += emergency-restart.h
diff --git a/arch/arm/include/asm/pci.h b/arch/arm/include/asm/pci.h
index 1f0de808d111..0abd389cf0ec 100644
--- a/arch/arm/include/asm/pci.h
+++ b/arch/arm/include/asm/pci.h
@@ -19,13 +19,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
}
#endif /* CONFIG_PCI_DOMAINS */
-/*
- * The PCI address space does equal the physical memory address space.
- * The networking and block device layers use this boolean for bounce
- * buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS (1)
-
#define HAVE_PCI_MMAP
#define ARCH_GENERIC_PCI_MMAP_RESOURCE
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 7724b0f661b3..36718a424358 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -205,6 +205,7 @@ void ptrace_break(struct task_struct *tsk, struct pt_regs *regs)
{
siginfo_t info;
+ clear_siginfo(&info);
info.si_signo = SIGTRAP;
info.si_errno = 0;
info.si_code = TRAP_BRKPT;
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index fc40a2b40595..35ca494c028c 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -754,7 +754,7 @@ int __init arm_add_memory(u64 start, u64 size)
else
size -= aligned_start - start;
-#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
+#ifndef CONFIG_PHYS_ADDR_T_64BIT
if (aligned_start > ULONG_MAX) {
pr_crit("Ignoring memory at 0x%08llx outside 32-bit physical address space\n",
(long long)start);
diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c
index 6e971e114879..80517f293eb9 100644
--- a/arch/arm/kernel/swp_emulate.c
+++ b/arch/arm/kernel/swp_emulate.c
@@ -100,6 +100,7 @@ static void set_segfault(struct pt_regs *regs, unsigned long addr)
{
siginfo_t info;
+ clear_siginfo(&info);
down_read(&current->mm->mmap_sem);
if (find_vma(current->mm, addr) == NULL)
info.si_code = SEGV_MAPERR;
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 2fe87109ae46..badf02ca3693 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -441,6 +441,7 @@ asmlinkage void do_undefinstr(struct pt_regs *regs)
siginfo_t info;
void __user *pc;
+ clear_siginfo(&info);
pc = (void __user *)instruction_pointer(regs);
if (processor_mode(regs) == SVC_MODE) {
@@ -540,6 +541,7 @@ static int bad_syscall(int n, struct pt_regs *regs)
{
siginfo_t info;
+ clear_siginfo(&info);
if ((current->personality & PER_MASK) != PER_LINUX) {
send_sig(SIGSEGV, current, 1);
return regs->ARM_r0;
@@ -607,6 +609,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
{
siginfo_t info;
+ clear_siginfo(&info);
if ((no >> 16) != (__ARM_NR_BASE>> 16))
return bad_syscall(no, regs);
@@ -743,6 +746,8 @@ baddataabort(int code, unsigned long instr, struct pt_regs *regs)
unsigned long addr = instruction_pointer(regs);
siginfo_t info;
+ clear_siginfo(&info);
+
#ifdef CONFIG_DEBUG_USER
if (user_debug & UDBG_BADABORT) {
pr_err("[%d] %s: bad data abort: code %d instr 0x%08lx\n",
diff --git a/arch/arm/mach-axxia/Kconfig b/arch/arm/mach-axxia/Kconfig
index bb2ce1c63fd9..d3eae6037913 100644
--- a/arch/arm/mach-axxia/Kconfig
+++ b/arch/arm/mach-axxia/Kconfig
@@ -2,7 +2,6 @@
config ARCH_AXXIA
bool "LSI Axxia platforms"
depends on ARCH_MULTI_V7 && ARM_LPAE
- select ARCH_DMA_ADDR_T_64BIT
select ARM_AMBA
select ARM_GIC
select ARM_TIMER_SP804
diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig
index c2f3b0d216a4..c46a728df44e 100644
--- a/arch/arm/mach-bcm/Kconfig
+++ b/arch/arm/mach-bcm/Kconfig
@@ -211,7 +211,6 @@ config ARCH_BRCMSTB
select BRCMSTB_L2_IRQ
select BCM7120_L2_IRQ
select ARCH_HAS_HOLES_MEMORYMODEL
- select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select ZONE_DMA if ARM_LPAE
select SOC_BRCMSTB
select SOC_BUS
diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig
index 647c319f9f5f..2ca405816846 100644
--- a/arch/arm/mach-exynos/Kconfig
+++ b/arch/arm/mach-exynos/Kconfig
@@ -112,7 +112,6 @@ config SOC_EXYNOS5440
bool "SAMSUNG EXYNOS5440"
default y
depends on ARCH_EXYNOS5
- select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select HAVE_ARM_ARCH_TIMER
select AUTO_ZRELADDR
select PINCTRL_EXYNOS5440
diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig
index 81110ec34226..5552968f07f8 100644
--- a/arch/arm/mach-highbank/Kconfig
+++ b/arch/arm/mach-highbank/Kconfig
@@ -1,7 +1,6 @@
config ARCH_HIGHBANK
bool "Calxeda ECX-1000/2000 (Highbank/Midway)"
depends on ARCH_MULTI_V7
- select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select ARCH_HAS_HOLES_MEMORYMODEL
select ARCH_SUPPORTS_BIG_ENDIAN
select ARM_AMBA
diff --git a/arch/arm/mach-rockchip/Kconfig b/arch/arm/mach-rockchip/Kconfig
index a4065966881a..fafd3d7f9f8c 100644
--- a/arch/arm/mach-rockchip/Kconfig
+++ b/arch/arm/mach-rockchip/Kconfig
@@ -3,7 +3,6 @@ config ARCH_ROCKCHIP
depends on ARCH_MULTI_V7
select PINCTRL
select PINCTRL_ROCKCHIP
- select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select ARCH_HAS_RESET_CONTROLLER
select ARM_AMBA
select ARM_GIC
diff --git a/arch/arm/mach-shmobile/Kconfig b/arch/arm/mach-shmobile/Kconfig
index 280e7312a9e1..fe60cd09a5ca 100644
--- a/arch/arm/mach-shmobile/Kconfig
+++ b/arch/arm/mach-shmobile/Kconfig
@@ -29,7 +29,6 @@ config ARCH_RMOBILE
menuconfig ARCH_RENESAS
bool "Renesas ARM SoCs"
depends on ARCH_MULTI_V7 && MMU
- select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select ARCH_SHMOBILE
select ARM_GIC
select GPIOLIB
diff --git a/arch/arm/mach-tegra/Kconfig b/arch/arm/mach-tegra/Kconfig
index 1e0aeb47bac6..7f3b83e0d324 100644
--- a/arch/arm/mach-tegra/Kconfig
+++ b/arch/arm/mach-tegra/Kconfig
@@ -15,6 +15,5 @@ menuconfig ARCH_TEGRA
select RESET_CONTROLLER
select SOC_BUS
select ZONE_DMA if ARM_LPAE
- select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
help
This enables support for NVIDIA Tegra based systems.
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 7f14acf67caf..5a016bc80e26 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -661,6 +661,7 @@ config ARM_LPAE
bool "Support for the Large Physical Address Extension"
depends on MMU && CPU_32v7 && !CPU_32v6 && !CPU_32v5 && \
!CPU_32v4 && !CPU_32v3
+ select PHYS_ADDR_T_64BIT
help
Say Y if you have an ARMv7 processor supporting the LPAE page
table format and you would like to access memory beyond the
@@ -673,12 +674,6 @@ config ARM_PV_FIXUP
def_bool y
depends on ARM_LPAE && ARM_PATCH_PHYS_VIRT && ARCH_KEYSTONE
-config ARCH_PHYS_ADDR_T_64BIT
- def_bool ARM_LPAE
-
-config ARCH_DMA_ADDR_T_64BIT
- bool
-
config ARM_THUMB
bool "Support Thumb user binaries" if !CPU_THUMBONLY && EXPERT
depends on CPU_THUMB_CAPABLE
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index 2c96190e018b..bd2c739d8083 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -950,6 +950,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
if (ai_usermode & UM_SIGNAL) {
siginfo_t si;
+ clear_siginfo(&si);
si.si_signo = SIGBUS;
si.si_errno = 0;
si.si_code = BUS_ADRALN;
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index 619f24a42d09..f448a0663b10 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -241,12 +241,3 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
void arch_teardown_dma_ops(struct device *dev)
{
}
-
-#define PREALLOC_DMA_DEBUG_ENTRIES 4096
-
-static int __init dma_debug_do_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
- return 0;
-}
-core_initcall(dma_debug_do_init);
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index ada8eb206a90..4b6613b5e042 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1151,15 +1151,6 @@ int arm_dma_supported(struct device *dev, u64 mask)
return __dma_supported(dev, mask, false);
}
-#define PREALLOC_DMA_DEBUG_ENTRIES 4096
-
-static int __init dma_debug_do_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
- return 0;
-}
-core_initcall(dma_debug_do_init);
-
#ifdef CONFIG_ARM_DMA_USE_IOMMU
static int __dma_info_to_prot(enum dma_data_direction dir, unsigned long attrs)
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index b75eada23d0a..32034543f49c 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -163,6 +163,8 @@ __do_user_fault(struct task_struct *tsk, unsigned long addr,
{
struct siginfo si;
+ clear_siginfo(&si);
+
#ifdef CONFIG_DEBUG_USER
if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) ||
((user_debug & UDBG_BUS) && (sig == SIGBUS))) {
@@ -557,6 +559,7 @@ do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
inf->name, fsr, addr);
show_pte(current->mm, addr);
+ clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
@@ -589,6 +592,7 @@ do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs)
pr_alert("Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n",
inf->name, ifsr, addr);
+ clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index af4ee2cef2f9..35d0f823e823 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -218,8 +218,7 @@ static void vfp_raise_sigfpe(unsigned int sicode, struct pt_regs *regs)
{
siginfo_t info;
- memset(&info, 0, sizeof(info));
-
+ clear_siginfo(&info);
info.si_signo = SIGFPE;
info.si_code = sicode;
info.si_addr = (void __user *)(instruction_pointer(regs) - 4);
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb2cf4938f6d..b25ed7834f6c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -105,7 +105,6 @@ config ARM64
select HAVE_CONTEXT_TRACKING
select HAVE_DEBUG_BUGVERBOSE
select HAVE_DEBUG_KMEMLEAK
- select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -133,6 +132,8 @@ config ARM64
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_RELA
select MULTI_IRQ_HANDLER
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
select NO_BOOTMEM
select OF
select OF_EARLY_FLATTREE
@@ -142,6 +143,7 @@ config ARM64
select POWER_SUPPLY
select REFCOUNT_FULL
select SPARSE_IRQ
+ select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
help
@@ -150,9 +152,6 @@ config ARM64
config 64BIT
def_bool y
-config ARCH_PHYS_ADDR_T_64BIT
- def_bool y
-
config MMU
def_bool y
@@ -237,24 +236,9 @@ config ZONE_DMA32
config HAVE_GENERIC_GUP
def_bool y
-config ARCH_DMA_ADDR_T_64BIT
- def_bool y
-
-config NEED_DMA_MAP_STATE
- def_bool y
-
-config NEED_SG_DMA_LENGTH
- def_bool y
-
config SMP
def_bool y
-config SWIOTLB
- def_bool y
-
-config IOMMU_HELPER
- def_bool SWIOTLB
-
config KERNEL_MODE_NEON
def_bool y
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index c00c62e1a4a3..1a037b94eba1 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -34,7 +34,6 @@
typedef u32 compat_size_t;
typedef s32 compat_ssize_t;
-typedef s32 compat_time_t;
typedef s32 compat_clock_t;
typedef s32 compat_pid_t;
typedef u16 __compat_uid_t;
@@ -66,16 +65,6 @@ typedef u32 compat_ulong_t;
typedef u64 compat_u64;
typedef u32 compat_uptr_t;
-struct compat_timespec {
- compat_time_t tv_sec;
- s32 tv_nsec;
-};
-
-struct compat_timeval {
- compat_time_t tv_sec;
- s32 tv_usec;
-};
-
struct compat_stat {
#ifdef __AARCH64EB__
short st_dev;
@@ -192,10 +181,10 @@ struct compat_ipc64_perm {
struct compat_semid64_ds {
struct compat_ipc64_perm sem_perm;
- compat_time_t sem_otime;
- compat_ulong_t __unused1;
- compat_time_t sem_ctime;
- compat_ulong_t __unused2;
+ compat_ulong_t sem_otime;
+ compat_ulong_t sem_otime_high;
+ compat_ulong_t sem_ctime;
+ compat_ulong_t sem_ctime_high;
compat_ulong_t sem_nsems;
compat_ulong_t __unused3;
compat_ulong_t __unused4;
@@ -203,12 +192,12 @@ struct compat_semid64_ds {
struct compat_msqid64_ds {
struct compat_ipc64_perm msg_perm;
- compat_time_t msg_stime;
- compat_ulong_t __unused1;
- compat_time_t msg_rtime;
- compat_ulong_t __unused2;
- compat_time_t msg_ctime;
- compat_ulong_t __unused3;
+ compat_ulong_t msg_stime;
+ compat_ulong_t msg_stime_high;
+ compat_ulong_t msg_rtime;
+ compat_ulong_t msg_rtime_high;
+ compat_ulong_t msg_ctime;
+ compat_ulong_t msg_ctime_high;
compat_ulong_t msg_cbytes;
compat_ulong_t msg_qnum;
compat_ulong_t msg_qbytes;
@@ -221,12 +210,12 @@ struct compat_msqid64_ds {
struct compat_shmid64_ds {
struct compat_ipc64_perm shm_perm;
compat_size_t shm_segsz;
- compat_time_t shm_atime;
- compat_ulong_t __unused1;
- compat_time_t shm_dtime;
- compat_ulong_t __unused2;
- compat_time_t shm_ctime;
- compat_ulong_t __unused3;
+ compat_ulong_t shm_atime;
+ compat_ulong_t shm_atime_high;
+ compat_ulong_t shm_dtime;
+ compat_ulong_t shm_dtime_high;
+ compat_ulong_t shm_ctime;
+ compat_ulong_t shm_ctime_high;
compat_pid_t shm_cpid;
compat_pid_t shm_lpid;
compat_ulong_t shm_nattch;
diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h
index 8747f7c5e0e7..9e690686e8aa 100644
--- a/arch/arm64/include/asm/pci.h
+++ b/arch/arm64/include/asm/pci.h
@@ -18,11 +18,6 @@
#define pcibios_assign_all_busses() \
(pci_has_flag(PCI_REASSIGN_ALL_BUS))
-/*
- * PCI address space differs from physical memory address space
- */
-#define PCI_DMA_BUS_IS_PHYS (0)
-
#define ARCH_GENERIC_PCI_MMAP_RESOURCE 1
extern int isa_dma_bridge_buggy;
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index ebdae15d665d..26c5bd7d88d8 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -122,11 +122,6 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
- /*
- * Ensure prior spin_lock operations to other locks have completed
- * on this CPU before we test whether "lock" is locked.
- */
- smp_mb(); /* ^^^ */
return !arch_spin_value_unlocked(READ_ONCE(*lock));
}
diff --git a/arch/arm64/include/asm/stat.h b/arch/arm64/include/asm/stat.h
index 15e35598ac40..eab738019707 100644
--- a/arch/arm64/include/asm/stat.h
+++ b/arch/arm64/include/asm/stat.h
@@ -20,6 +20,7 @@
#ifdef CONFIG_COMPAT
+#include <linux/compat_time.h>
#include <asm/compat.h>
/*
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 87a35364e750..4bcdd0318729 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -882,7 +882,7 @@ asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
si_code = FPE_FLTRES;
}
- memset(&info, 0, sizeof(info));
+ clear_siginfo(&info);
info.si_signo = SIGFPE;
info.si_code = si_code;
info.si_addr = (void __user *)instruction_pointer(regs);
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 74bb56f656ef..413dbe530da8 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -30,7 +30,6 @@
#include <linux/smp.h>
#include <linux/uaccess.h>
-#include <asm/compat.h>
#include <asm/current.h>
#include <asm/debug-monitors.h>
#include <asm/hw_breakpoint.h>
diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
index 1d091d048d04..0bbac612146e 100644
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -1,11 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/perf_event.h>
#include <linux/bug.h>
#include <linux/sched/task_stack.h>
-#include <asm/compat.h>
#include <asm/perf_regs.h>
#include <asm/ptrace.h>
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 93ab57dcfc14..a6109825eeb9 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -112,6 +112,7 @@ long compat_arm_syscall(struct pt_regs *regs)
break;
}
+ clear_siginfo(&info);
info.si_signo = SIGILL;
info.si_errno = 0;
info.si_code = ILL_ILLTRP;
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 8bbdc17e49df..d399d459397b 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -635,6 +635,7 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
siginfo_t info;
void __user *pc = (void __user *)instruction_pointer(regs);
+ clear_siginfo(&info);
info.si_signo = SIGILL;
info.si_errno = 0;
info.si_code = ILL_ILLOPC;
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index a96ec0181818..db01f2709842 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -508,16 +508,6 @@ static int __init arm64_dma_init(void)
}
arch_initcall(arm64_dma_init);
-#define PREALLOC_DMA_DEBUG_ENTRIES 4096
-
-static int __init dma_debug_do_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
- return 0;
-}
-fs_initcall(dma_debug_do_init);
-
-
#ifdef CONFIG_IOMMU_DMA
#include <linux/dma-iommu.h>
#include <linux/platform_device.h>
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 2af3dd89bcdb..576f15153080 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -356,11 +356,12 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
*/
if (user_mode(regs)) {
const struct fault_info *inf = esr_to_fault_info(esr);
- struct siginfo si = {
- .si_signo = inf->sig,
- .si_code = inf->code,
- .si_addr = (void __user *)addr,
- };
+ struct siginfo si;
+
+ clear_siginfo(&si);
+ si.si_signo = inf->sig;
+ si.si_code = inf->code;
+ si.si_addr = (void __user *)addr;
__do_user_fault(&si, esr);
} else {
@@ -634,6 +635,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
nmi_exit();
}
+ clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
@@ -738,6 +740,7 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
show_pte(addr);
}
+ clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
@@ -780,6 +783,7 @@ asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
local_irq_enable();
}
+ clear_siginfo(&info);
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_ADRALN;
@@ -823,7 +827,6 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
struct pt_regs *regs)
{
const struct fault_info *inf = debug_fault_info + DBG_ESR_EVT(esr);
- struct siginfo info;
int rv;
/*
@@ -839,6 +842,9 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
if (!inf->fn(addr, esr, regs)) {
rv = 1;
} else {
+ struct siginfo info;
+
+ clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index c6b4dd1418b4..bf59855628ac 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -6,11 +6,13 @@
config C6X
def_bool y
+ select ARCH_HAS_SYNC_DMA_FOR_CPU
+ select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select CLKDEV_LOOKUP
+ select DMA_NONCOHERENT_OPS
select GENERIC_ATOMIC64
select GENERIC_IRQ_SHOW
select HAVE_ARCH_TRACEHOOK
- select HAVE_DMA_API_DEBUG
select HAVE_MEMBLOCK
select SPARSE_IRQ
select IRQ_DOMAIN
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index fd4c840de837..33a2c94fed0d 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -1,10 +1,12 @@
generic-y += atomic.h
generic-y += barrier.h
generic-y += bugs.h
+generic-y += compat.h
generic-y += current.h
generic-y += device.h
generic-y += div64.h
generic-y += dma.h
+generic-y += dma-mapping.h
generic-y += emergency-restart.h
generic-y += exec.h
generic-y += extable.h
diff --git a/arch/c6x/include/asm/dma-mapping.h b/arch/c6x/include/asm/dma-mapping.h
deleted file mode 100644
index 05daf1038111..000000000000
--- a/arch/c6x/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Port on Texas Instruments TMS320C6x architecture
- *
- * Copyright (C) 2004, 2009, 2010, 2011 Texas Instruments Incorporated
- * Author: Aurelien Jacquiot <aurelien.jacquiot@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-#ifndef _ASM_C6X_DMA_MAPPING_H
-#define _ASM_C6X_DMA_MAPPING_H
-
-extern const struct dma_map_ops c6x_dma_ops;
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
- return &c6x_dma_ops;
-}
-
-extern void coherent_mem_init(u32 start, u32 size);
-void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
- gfp_t gfp, unsigned long attrs);
-void c6x_dma_free(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_handle, unsigned long attrs);
-
-#endif /* _ASM_C6X_DMA_MAPPING_H */
diff --git a/arch/c6x/include/asm/setup.h b/arch/c6x/include/asm/setup.h
index 852afb209afb..350f34debb19 100644
--- a/arch/c6x/include/asm/setup.h
+++ b/arch/c6x/include/asm/setup.h
@@ -28,5 +28,7 @@ extern unsigned char c6x_fuse_mac[6];
extern void machine_init(unsigned long dt_ptr);
extern void time_init(void);
+extern void coherent_mem_init(u32 start, u32 size);
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_C6X_SETUP_H */
diff --git a/arch/c6x/kernel/Makefile b/arch/c6x/kernel/Makefile
index 02f340d7b8fe..fbe74174de87 100644
--- a/arch/c6x/kernel/Makefile
+++ b/arch/c6x/kernel/Makefile
@@ -8,6 +8,6 @@ extra-y := head.o vmlinux.lds
obj-y := process.o traps.o irq.o signal.o ptrace.o
obj-y += setup.o sys_c6x.o time.o devicetree.o
obj-y += switch_to.o entry.o vectors.o c6x_ksyms.o
-obj-y += soc.o dma.o
+obj-y += soc.o
obj-$(CONFIG_MODULES) += module.o
diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c
deleted file mode 100644
index 9fff8be75f58..000000000000
--- a/arch/c6x/kernel/dma.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (C) 2011 Texas Instruments Incorporated
- * Author: Mark Salter <msalter@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/mm_types.h>
-#include <linux/scatterlist.h>
-
-#include <asm/cacheflush.h>
-
-static void c6x_dma_sync(dma_addr_t handle, size_t size,
- enum dma_data_direction dir)
-{
- unsigned long paddr = handle;
-
- BUG_ON(!valid_dma_direction(dir));
-
- switch (dir) {
- case DMA_FROM_DEVICE:
- L2_cache_block_invalidate(paddr, paddr + size);
- break;
- case DMA_TO_DEVICE:
- L2_cache_block_writeback(paddr, paddr + size);
- break;
- case DMA_BIDIRECTIONAL:
- L2_cache_block_writeback_invalidate(paddr, paddr + size);
- break;
- default:
- break;
- }
-}
-
-static dma_addr_t c6x_dma_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- dma_addr_t handle = virt_to_phys(page_address(page) + offset);
-
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- c6x_dma_sync(handle, size, dir);
-
- return handle;
-}
-
-static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- c6x_dma_sync(handle, size, dir);
-}
-
-static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
- int nents, enum dma_data_direction dir, unsigned long attrs)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sglist, sg, nents, i) {
- sg->dma_address = sg_phys(sg);
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- c6x_dma_sync(sg->dma_address, sg->length, dir);
- }
-
- return nents;
-}
-
-static void c6x_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nents, enum dma_data_direction dir, unsigned long attrs)
-{
- struct scatterlist *sg;
- int i;
-
- if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
- return;
-
- for_each_sg(sglist, sg, nents, i)
- c6x_dma_sync(sg_dma_address(sg), sg->length, dir);
-}
-
-static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,
- size_t size, enum dma_data_direction dir)
-{
- c6x_dma_sync(handle, size, dir);
-
-}
-
-static void c6x_dma_sync_single_for_device(struct device *dev,
- dma_addr_t handle, size_t size, enum dma_data_direction dir)
-{
- c6x_dma_sync(handle, size, dir);
-
-}
-
-static void c6x_dma_sync_sg_for_cpu(struct device *dev,
- struct scatterlist *sglist, int nents,
- enum dma_data_direction dir)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sglist, sg, nents, i)
- c6x_dma_sync_single_for_cpu(dev, sg_dma_address(sg),
- sg->length, dir);
-
-}
-
-static void c6x_dma_sync_sg_for_device(struct device *dev,
- struct scatterlist *sglist, int nents,
- enum dma_data_direction dir)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sglist, sg, nents, i)
- c6x_dma_sync_single_for_device(dev, sg_dma_address(sg),
- sg->length, dir);
-
-}
-
-const struct dma_map_ops c6x_dma_ops = {
- .alloc = c6x_dma_alloc,
- .free = c6x_dma_free,
- .map_page = c6x_dma_map_page,
- .unmap_page = c6x_dma_unmap_page,
- .map_sg = c6x_dma_map_sg,
- .unmap_sg = c6x_dma_unmap_sg,
- .sync_single_for_device = c6x_dma_sync_single_for_device,
- .sync_single_for_cpu = c6x_dma_sync_single_for_cpu,
- .sync_sg_for_device = c6x_dma_sync_sg_for_device,
- .sync_sg_for_cpu = c6x_dma_sync_sg_for_cpu,
-};
-EXPORT_SYMBOL(c6x_dma_ops);
-
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init dma_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
- return 0;
-}
-fs_initcall(dma_init);
diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c
index 4c1d4b84dd2b..5c60aea3b75a 100644
--- a/arch/c6x/kernel/traps.c
+++ b/arch/c6x/kernel/traps.c
@@ -244,7 +244,6 @@ static struct exception_info eexcept_table[128] = {
static void do_trap(struct exception_info *except_info, struct pt_regs *regs)
{
unsigned long addr = instruction_pointer(regs);
- siginfo_t info;
if (except_info->code != TRAP_BRKPT)
pr_err("TRAP: %s PC[0x%lx] signo[%d] code[%d]\n",
@@ -253,12 +252,8 @@ static void do_trap(struct exception_info *except_info, struct pt_regs *regs)
die_if_kernel(except_info->kernel_str, regs, addr);
- info.si_signo = except_info->signo;
- info.si_errno = 0;
- info.si_code = except_info->code;
- info.si_addr = (void __user *)addr;
-
- force_sig_info(except_info->signo, &info, current);
+ force_sig_fault(except_info->signo, except_info->code,
+ (void __user *)addr, current);
}
/*
diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c
index 95e38ad27c69..d0a8e0c4b27e 100644
--- a/arch/c6x/mm/dma-coherent.c
+++ b/arch/c6x/mm/dma-coherent.c
@@ -19,10 +19,12 @@
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/interrupt.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-noncoherent.h>
#include <linux/memblock.h>
+#include <asm/cacheflush.h>
#include <asm/page.h>
+#include <asm/setup.h>
/*
* DMA coherent memory management, can be redefined using the memdma=
@@ -73,7 +75,7 @@ static void __free_dma_pages(u32 addr, int order)
* Allocate DMA coherent memory space and return both the kernel
* virtual and DMA address for that space.
*/
-void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
gfp_t gfp, unsigned long attrs)
{
u32 paddr;
@@ -98,7 +100,7 @@ void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
/*
* Free DMA coherent memory as defined by the above mapping.
*/
-void c6x_dma_free(struct device *dev, size_t size, void *vaddr,
+void arch_dma_free(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs)
{
int order;
@@ -139,3 +141,35 @@ void __init coherent_mem_init(phys_addr_t start, u32 size)
dma_bitmap = phys_to_virt(bitmap_phys);
memset(dma_bitmap, 0, dma_pages * PAGE_SIZE);
}
+
+static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
+{
+ BUG_ON(!valid_dma_direction(dir));
+
+ switch (dir) {
+ case DMA_FROM_DEVICE:
+ L2_cache_block_invalidate(paddr, paddr + size);
+ break;
+ case DMA_TO_DEVICE:
+ L2_cache_block_writeback(paddr, paddr + size);
+ break;
+ case DMA_BIDIRECTIONAL:
+ L2_cache_block_writeback_invalidate(paddr, paddr + size);
+ break;
+ default:
+ break;
+ }
+}
+
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir)
+{
+ return c6x_dma_sync(dev, paddr, size, dir);
+}
+
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir)
+{
+ return c6x_dma_sync(dev, paddr, size, dir);
+}
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 14bac06b7116..a5d0b2991f47 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -3,6 +3,7 @@ generic-y += barrier.h
generic-y += bugs.h
generic-y += cacheflush.h
generic-y += checksum.h
+generic-y += compat.h
generic-y += current.h
generic-y += delay.h
generic-y += device.h
diff --git a/arch/h8300/include/asm/pci.h b/arch/h8300/include/asm/pci.h
index 7c9e55d62215..d4d345a52092 100644
--- a/arch/h8300/include/asm/pci.h
+++ b/arch/h8300/include/asm/pci.h
@@ -15,6 +15,4 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
/* We don't do dynamic PCI IRQ allocation */
}
-#define PCI_DMA_BUS_IS_PHYS (1)
-
#endif /* _ASM_H8300_PCI_H */
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 76d2f20d525e..37adb2003033 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -19,6 +19,7 @@ config HEXAGON
select GENERIC_IRQ_SHOW
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
+ select NEED_SG_DMA_LENGTH
select NO_IOPORT_MAP
select GENERIC_IOMAP
select GENERIC_SMP_IDLE_THREAD
@@ -63,9 +64,6 @@ config GENERIC_CSUM
config GENERIC_IRQ_PROBE
def_bool y
-config NEED_SG_DMA_LENGTH
- def_bool y
-
config RWSEM_GENERIC_SPINLOCK
def_bool n
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index e9743f689fb8..dd2fd9c0d292 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -2,6 +2,7 @@
generic-y += barrier.h
generic-y += bug.h
generic-y += bugs.h
+generic-y += compat.h
generic-y += current.h
generic-y += device.h
generic-y += div64.h
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index ad8347c29dcf..77459df34e2e 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -208,7 +208,6 @@ const struct dma_map_ops hexagon_dma_ops = {
.sync_single_for_cpu = hexagon_sync_single_for_cpu,
.sync_single_for_device = hexagon_sync_single_for_device,
.mapping_error = hexagon_mapping_error,
- .is_phys = 1,
};
void __init hexagon_dma_init(void)
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index 2942a9204a9a..91ee04842c22 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -412,10 +412,6 @@ void do_trap0(struct pt_regs *regs)
case TRAP_DEBUG:
/* Trap0 0xdb is debug breakpoint */
if (user_mode(regs)) {
- struct siginfo info;
-
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
/*
* Some architecures add some per-thread state
* to distinguish between breakpoint traps and
@@ -423,9 +419,8 @@ void do_trap0(struct pt_regs *regs)
* set the si_code value appropriately, or we
* may want to use a different trap0 flavor.
*/
- info.si_code = TRAP_BRKPT;
- info.si_addr = (void __user *) pt_elr(regs);
- force_sig_info(SIGTRAP, &info, current);
+ force_sig_fault(SIGTRAP, TRAP_BRKPT,
+ (void __user *) pt_elr(regs), current);
} else {
#ifdef CONFIG_KGDB
kgdb_handle_exception(pt_cause(regs), SIGTRAP,
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 3eec33c5cfd7..933bbcef5363 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -50,7 +50,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- siginfo_t info;
+ int si_signo;
int si_code = SEGV_MAPERR;
int fault;
const struct exception_table_entry *fixup;
@@ -140,28 +140,22 @@ good_area:
* unable to fix up the page fault.
*/
if (fault & VM_FAULT_SIGBUS) {
- info.si_signo = SIGBUS;
- info.si_code = BUS_ADRERR;
+ si_signo = SIGBUS;
+ si_code = BUS_ADRERR;
}
/* Address is not in the memory map */
else {
- info.si_signo = SIGSEGV;
- info.si_code = SEGV_ACCERR;
+ si_signo = SIGSEGV;
+ si_code = SEGV_ACCERR;
}
- info.si_errno = 0;
- info.si_addr = (void __user *)address;
- force_sig_info(info.si_signo, &info, current);
+ force_sig_fault(si_signo, si_code, (void __user *)address, current);
return;
bad_area:
up_read(&mm->mmap_sem);
if (user_mode(regs)) {
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void *)address;
- force_sig_info(info.si_signo, &info, current);
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address, current);
return;
}
/* Kernel-mode fault falls through */
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index bbe12a038d21..792437d526c6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -29,7 +29,6 @@ config IA64
select HAVE_FUNCTION_TRACER
select TTY
select HAVE_ARCH_TRACEHOOK
- select HAVE_DMA_API_DEBUG
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_VIRT_CPU_ACCOUNTING
@@ -54,6 +53,8 @@ config IA64
select MODULES_USE_ELF_RELA
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_AUDITSYSCALL
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
default y
help
The Itanium Processor Family is Intel's 64-bit successor to
@@ -78,18 +79,6 @@ config MMU
bool
default y
-config ARCH_DMA_ADDR_T_64BIT
- def_bool y
-
-config NEED_DMA_MAP_STATE
- def_bool y
-
-config NEED_SG_DMA_LENGTH
- def_bool y
-
-config SWIOTLB
- bool
-
config STACKTRACE_SUPPORT
def_bool y
@@ -146,7 +135,6 @@ config IA64_GENERIC
bool "generic"
select NUMA
select ACPI_NUMA
- select DMA_DIRECT_OPS
select SWIOTLB
select PCI_MSI
help
@@ -167,7 +155,6 @@ config IA64_GENERIC
config IA64_DIG
bool "DIG-compliant"
- select DMA_DIRECT_OPS
select SWIOTLB
config IA64_DIG_VTD
@@ -183,7 +170,6 @@ config IA64_HP_ZX1
config IA64_HP_ZX1_SWIOTLB
bool "HP-zx1/sx1000 with software I/O TLB"
- select DMA_DIRECT_OPS
select SWIOTLB
help
Build a kernel that runs on HP zx1 and sx1000 systems even when they
@@ -207,7 +193,6 @@ config IA64_SGI_UV
bool "SGI-UV"
select NUMA
select ACPI_NUMA
- select DMA_DIRECT_OPS
select SWIOTLB
help
Selecting this option will optimize the kernel for use on UV based
@@ -218,7 +203,6 @@ config IA64_SGI_UV
config IA64_HP_SIM
bool "Ski-simulator"
- select DMA_DIRECT_OPS
select SWIOTLB
depends on !PM
@@ -397,7 +381,7 @@ config ARCH_DISCONTIGMEM_ENABLE
Say Y to support efficient handling of discontiguous physical memory,
for architectures which are either NUMA (Non-Uniform Memory Access)
or have huge holes in the physical address space for other reasons.
- See <file:Documentation/vm/numa> for more.
+ See <file:Documentation/vm/numa.rst> for more.
config ARCH_FLATMEM_ENABLE
def_bool y
@@ -613,6 +597,3 @@ source "security/Kconfig"
source "crypto/Kconfig"
source "lib/Kconfig"
-
-config IOMMU_HELPER
- def_bool (IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB || IA64_GENERIC || SWIOTLB)
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index cb5cd86a5530..ee5b652d320a 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1845,9 +1845,6 @@ static void ioc_init(unsigned long hpa, struct ioc *ioc)
ioc_resource_init(ioc);
ioc_sac_init(ioc);
- if ((long) ~iovp_mask > (long) ia64_max_iommu_merge_mask)
- ia64_max_iommu_merge_mask = ~iovp_mask;
-
printk(KERN_INFO PFX
"%s %d.%d HPA 0x%lx IOVA space %dMb at 0x%lx\n",
ioc->name, (ioc->rev >> 4) & 0xF, ioc->rev & 0xF,
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 6dd867873364..557bbc8ba9f5 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -1,3 +1,4 @@
+generic-y += compat.h
generic-y += exec.h
generic-y += irq_work.h
generic-y += mcs_spinlock.h
diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h
index bdc4669c71c3..ccde7c2ba00f 100644
--- a/arch/ia64/include/asm/hardirq.h
+++ b/arch/ia64/include/asm/hardirq.h
@@ -13,7 +13,7 @@
#define __ARCH_IRQ_STAT 1
-#define local_softirq_pending() (local_cpu_data->softirq_pending)
+#define local_softirq_pending_ref ia64_cpu_info.softirq_pending
#include <linux/threads.h>
#include <linux/irq.h>
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
index b1d04e8bafc8..780e8744ba85 100644
--- a/arch/ia64/include/asm/pci.h
+++ b/arch/ia64/include/asm/pci.h
@@ -30,23 +30,6 @@ struct pci_vector_struct {
#define PCIBIOS_MIN_IO 0x1000
#define PCIBIOS_MIN_MEM 0x10000000
-/*
- * PCI_DMA_BUS_IS_PHYS should be set to 1 if there is _necessarily_ a direct
- * correspondence between device bus addresses and CPU physical addresses.
- * Platforms with a hardware I/O MMU _must_ turn this off to suppress the
- * bounce buffer handling code in the block and network device layers.
- * Platforms with separate bus address spaces _must_ turn this off and provide
- * a device DMA mapping implementation that takes care of the necessary
- * address translation.
- *
- * For now, the ia64 platforms which may have separate/multiple bus address
- * spaces all have I/O MMUs which support the merging of physically
- * discontiguous buffers, so we can use that as the sole factor to determine
- * the setting of PCI_DMA_BUS_IS_PHYS.
- */
-extern unsigned long ia64_max_iommu_merge_mask;
-#define PCI_DMA_BUS_IS_PHYS (ia64_max_iommu_merge_mask == ~0UL)
-
#define HAVE_PCI_MMAP
#define ARCH_GENERIC_PCI_MMAP_RESOURCE
#define arch_can_pci_mmap_wc() 1
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index c0527cfc48f0..3982e673e967 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -2,5 +2,9 @@
include include/uapi/asm-generic/Kbuild.asm
generic-y += bpf_perf_event.h
+generic-y += ipcbuf.h
generic-y += kvm_para.h
+generic-y += msgbuf.h
generic-y += poll.h
+generic-y += sembuf.h
+generic-y += shmbuf.h
diff --git a/arch/ia64/include/uapi/asm/ipcbuf.h b/arch/ia64/include/uapi/asm/ipcbuf.h
deleted file mode 100644
index 90d6445a14df..000000000000
--- a/arch/ia64/include/uapi/asm/ipcbuf.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/ipcbuf.h>
diff --git a/arch/ia64/include/uapi/asm/msgbuf.h b/arch/ia64/include/uapi/asm/msgbuf.h
deleted file mode 100644
index aa25df92d9dc..000000000000
--- a/arch/ia64/include/uapi/asm/msgbuf.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_MSGBUF_H
-#define _ASM_IA64_MSGBUF_H
-
-/*
- * The msqid64_ds structure for IA-64 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct msqid64_ds {
- struct ipc64_perm msg_perm;
- __kernel_time_t msg_stime; /* last msgsnd time */
- __kernel_time_t msg_rtime; /* last msgrcv time */
- __kernel_time_t msg_ctime; /* last change time */
- unsigned long msg_cbytes; /* current number of bytes on queue */
- unsigned long msg_qnum; /* number of messages in queue */
- unsigned long msg_qbytes; /* max number of bytes on queue */
- __kernel_pid_t msg_lspid; /* pid of last msgsnd */
- __kernel_pid_t msg_lrpid; /* last receive pid */
- unsigned long __unused1;
- unsigned long __unused2;
-};
-
-#endif /* _ASM_IA64_MSGBUF_H */
diff --git a/arch/ia64/include/uapi/asm/sembuf.h b/arch/ia64/include/uapi/asm/sembuf.h
deleted file mode 100644
index 6ed058760afc..000000000000
--- a/arch/ia64/include/uapi/asm/sembuf.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_SEMBUF_H
-#define _ASM_IA64_SEMBUF_H
-
-/*
- * The semid64_ds structure for IA-64 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct semid64_ds {
- struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
- __kernel_time_t sem_otime; /* last semop time */
- __kernel_time_t sem_ctime; /* last change time */
- unsigned long sem_nsems; /* no. of semaphores in array */
- unsigned long __unused1;
- unsigned long __unused2;
-};
-
-#endif /* _ASM_IA64_SEMBUF_H */
diff --git a/arch/ia64/include/uapi/asm/shmbuf.h b/arch/ia64/include/uapi/asm/shmbuf.h
deleted file mode 100644
index 6ef57cb70dee..000000000000
--- a/arch/ia64/include/uapi/asm/shmbuf.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_SHMBUF_H
-#define _ASM_IA64_SHMBUF_H
-
-/*
- * The shmid64_ds structure for IA-64 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct shmid64_ds {
- struct ipc64_perm shm_perm; /* operation perms */
- size_t shm_segsz; /* size of segment (bytes) */
- __kernel_time_t shm_atime; /* last attach time */
- __kernel_time_t shm_dtime; /* last detach time */
- __kernel_time_t shm_ctime; /* last change time */
- __kernel_pid_t shm_cpid; /* pid of creator */
- __kernel_pid_t shm_lpid; /* pid of last operator */
- unsigned long shm_nattch; /* no. of current attaches */
- unsigned long __unused1;
- unsigned long __unused2;
-};
-
-struct shminfo64 {
- unsigned long shmmax;
- unsigned long shmmin;
- unsigned long shmmni;
- unsigned long shmseg;
- unsigned long shmall;
- unsigned long __unused1;
- unsigned long __unused2;
- unsigned long __unused3;
- unsigned long __unused4;
-};
-
-#endif /* _ASM_IA64_SHMBUF_H */
diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h
index 5aa454ed89db..52b5af424511 100644
--- a/arch/ia64/include/uapi/asm/siginfo.h
+++ b/arch/ia64/include/uapi/asm/siginfo.h
@@ -27,11 +27,4 @@
#define __ISR_VALID_BIT 0
#define __ISR_VALID (1 << __ISR_VALID_BIT)
-/*
- * SIGFPE si_codes
- */
-#ifdef __KERNEL__
-#define FPE_FIXME 0 /* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
#endif /* _UAPI_ASM_IA64_SIGINFO_H */
diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c
index 9bcc908bc85e..a61f6c6a36f8 100644
--- a/arch/ia64/kernel/brl_emu.c
+++ b/arch/ia64/kernel/brl_emu.c
@@ -62,6 +62,7 @@ ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec)
struct illegal_op_return rv;
long tmp_taken, unimplemented_address;
+ clear_siginfo(&siginfo);
rv.fkt = (unsigned long) -1;
/*
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
index f2d57e66fd86..7a471d8d67d4 100644
--- a/arch/ia64/kernel/dma-mapping.c
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -9,16 +9,6 @@ int iommu_detected __read_mostly;
const struct dma_map_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init dma_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
- return 0;
-}
-fs_initcall(dma_init);
-
const struct dma_map_ops *dma_get_ops(struct device *dev)
{
return dma_ops;
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index dee56bcb993d..ad43cbf70628 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -124,18 +124,6 @@ unsigned long ia64_i_cache_stride_shift = ~0;
unsigned long ia64_cache_stride_shift = ~0;
/*
- * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). This
- * mask specifies a mask of address bits that must be 0 in order for two buffers to be
- * mergeable by the I/O MMU (i.e., the end address of the first buffer and the start
- * address of the second buffer must be aligned to (merge_mask+1) in order to be
- * mergeable). By default, we assume there is no I/O MMU which can merge physically
- * discontiguous buffers, so we set the merge_mask to ~0UL, which corresponds to a iommu
- * page-size of 2^64.
- */
-unsigned long ia64_max_iommu_merge_mask = ~0UL;
-EXPORT_SYMBOL(ia64_max_iommu_merge_mask);
-
-/*
* We use a special marker for the end of memory and it uses the extra (+1) slot
*/
struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata;
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 54547c7cf8a2..d1234a5ba4c5 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -153,6 +153,7 @@ ia64_rt_sigreturn (struct sigscratch *scr)
return retval;
give_sigsegv:
+ clear_siginfo(&si);
si.si_signo = SIGSEGV;
si.si_errno = 0;
si.si_code = SI_KERNEL;
@@ -236,6 +237,7 @@ force_sigsegv_info (int sig, void __user *addr)
unsigned long flags;
struct siginfo si;
+ clear_siginfo(&si);
if (sig == SIGSEGV) {
/*
* Acquiring siglock around the sa_handler-update is almost
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 6d4e76a4267f..c6f4932073a1 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -104,6 +104,7 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
int sig, code;
/* SIGILL, SIGFPE, SIGSEGV, and SIGBUS want these field initialized: */
+ clear_siginfo(&siginfo);
siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
siginfo.si_imm = break_num;
siginfo.si_flags = 0; /* clear __ISR_VALID */
@@ -293,7 +294,6 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
{
long exception, bundle[2];
unsigned long fault_ip;
- struct siginfo siginfo;
fault_ip = regs->cr_iip;
if (!fp_fault && (ia64_psr(regs)->ri == 0))
@@ -344,13 +344,16 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n");
return -1;
} else {
+ struct siginfo siginfo;
+
/* is next instruction a trap? */
if (exception & 2) {
ia64_increment_ip(regs);
}
+ clear_siginfo(&siginfo);
siginfo.si_signo = SIGFPE;
siginfo.si_errno = 0;
- siginfo.si_code = FPE_FIXME; /* default code */
+ siginfo.si_code = FPE_FLTUNK; /* default code */
siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
if (isr & 0x11) {
siginfo.si_code = FPE_FLTINV;
@@ -372,9 +375,12 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
return -1;
} else if (exception != 0) {
/* raise exception */
+ struct siginfo siginfo;
+
+ clear_siginfo(&siginfo);
siginfo.si_signo = SIGFPE;
siginfo.si_errno = 0;
- siginfo.si_code = FPE_FIXME; /* default code */
+ siginfo.si_code = FPE_FLTUNK; /* default code */
siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
if (isr & 0x880) {
siginfo.si_code = FPE_FLTOVF;
@@ -420,7 +426,7 @@ ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3,
if (die_if_kernel(buf, &regs, 0))
return rv;
- memset(&si, 0, sizeof(si));
+ clear_siginfo(&si);
si.si_signo = SIGILL;
si.si_code = ILL_ILLOPC;
si.si_addr = (void __user *) (regs.cr_iip + ia64_psr(&regs)->ri);
@@ -434,7 +440,6 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
long arg7, struct pt_regs regs)
{
unsigned long code, error = isr, iip;
- struct siginfo siginfo;
char buf[128];
int result, sig;
static const char *reason[] = {
@@ -485,6 +490,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
case 26: /* NaT Consumption */
if (user_mode(&regs)) {
+ struct siginfo siginfo;
void __user *addr;
if (((isr >> 4) & 0xf) == 2) {
@@ -499,6 +505,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
addr = (void __user *) (regs.cr_iip
+ ia64_psr(&regs)->ri);
}
+ clear_siginfo(&siginfo);
siginfo.si_signo = sig;
siginfo.si_code = code;
siginfo.si_errno = 0;
@@ -515,6 +522,9 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
case 31: /* Unsupported Data Reference */
if (user_mode(&regs)) {
+ struct siginfo siginfo;
+
+ clear_siginfo(&siginfo);
siginfo.si_signo = SIGILL;
siginfo.si_code = ILL_ILLOPN;
siginfo.si_errno = 0;
@@ -531,6 +541,10 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
case 29: /* Debug */
case 35: /* Taken Branch Trap */
case 36: /* Single Step Trap */
+ {
+ struct siginfo siginfo;
+
+ clear_siginfo(&siginfo);
if (fsys_mode(current, &regs)) {
extern char __kernel_syscall_via_break[];
/*
@@ -578,11 +592,15 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
siginfo.si_isr = isr;
force_sig_info(SIGTRAP, &siginfo, current);
return;
+ }
case 32: /* fp fault */
case 33: /* fp trap */
result = handle_fpu_swa((vector == 32) ? 1 : 0, &regs, isr);
if ((result < 0) || (current->thread.flags & IA64_THREAD_FPEMU_SIGFPE)) {
+ struct siginfo siginfo;
+
+ clear_siginfo(&siginfo);
siginfo.si_signo = SIGFPE;
siginfo.si_errno = 0;
siginfo.si_code = FPE_FLTINV;
@@ -616,6 +634,9 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
} else {
/* Unimplemented Instr. Address Trap */
if (user_mode(&regs)) {
+ struct siginfo siginfo;
+
+ clear_siginfo(&siginfo);
siginfo.si_signo = SIGILL;
siginfo.si_code = ILL_BADIADDR;
siginfo.si_errno = 0;
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index 72e9b4242564..e309f9859acc 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -1537,6 +1537,7 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
/* NOT_REACHED */
}
force_sigbus:
+ clear_siginfo(&si);
si.si_signo = SIGBUS;
si.si_errno = 0;
si.si_code = BUS_ADRALN;
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index dfdc152d6737..817fa120645f 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -85,7 +85,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
int signal = SIGSEGV, code = SEGV_MAPERR;
struct vm_area_struct *vma, *prev_vma;
struct mm_struct *mm = current->mm;
- struct siginfo si;
unsigned long mask;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -249,6 +248,9 @@ retry:
return;
}
if (user_mode(regs)) {
+ struct siginfo si;
+
+ clear_siginfo(&si);
si.si_signo = signal;
si.si_errno = 0;
si.si_code = code;
diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c
index 11f2275570fb..8479e9a7ce16 100644
--- a/arch/ia64/sn/kernel/io_common.c
+++ b/arch/ia64/sn/kernel/io_common.c
@@ -480,11 +480,6 @@ sn_io_early_init(void)
tioca_init_provider();
tioce_init_provider();
- /*
- * This is needed to avoid bounce limit checks in the blk layer
- */
- ia64_max_iommu_merge_mask = ~PAGE_MASK;
-
sn_irq_lh_init();
INIT_LIST_HEAD(&sn_sysdata_list);
sn_init_cpei_timer();
diff --git a/arch/m68k/68000/timers.c b/arch/m68k/68000/timers.c
index 252455bce144..71ddb4c98726 100644
--- a/arch/m68k/68000/timers.c
+++ b/arch/m68k/68000/timers.c
@@ -125,7 +125,9 @@ int m68328_hwclk(int set, struct rtc_time *t)
{
if (!set) {
long now = RTCTIME;
- t->tm_year = t->tm_mon = t->tm_mday = 1;
+ t->tm_year = 1;
+ t->tm_mon = 0;
+ t->tm_mday = 1;
t->tm_hour = (now >> 24) % 24;
t->tm_min = (now >> 16) % 60;
t->tm_sec = now % 60;
diff --git a/arch/m68k/apollo/config.c b/arch/m68k/apollo/config.c
index 0d27706f14d4..b2a6bc63f8cd 100644
--- a/arch/m68k/apollo/config.c
+++ b/arch/m68k/apollo/config.c
@@ -221,8 +221,10 @@ int dn_dummy_hwclk(int op, struct rtc_time *t) {
t->tm_hour=rtc->hours;
t->tm_mday=rtc->day_of_month;
t->tm_wday=rtc->day_of_week;
- t->tm_mon=rtc->month;
+ t->tm_mon = rtc->month - 1;
t->tm_year=rtc->year;
+ if (t->tm_year < 70)
+ t->tm_year += 100;
} else {
rtc->second=t->tm_sec;
rtc->minute=t->tm_min;
@@ -230,8 +232,8 @@ int dn_dummy_hwclk(int op, struct rtc_time *t) {
rtc->day_of_month=t->tm_mday;
if(t->tm_wday!=-1)
rtc->day_of_week=t->tm_wday;
- rtc->month=t->tm_mon;
- rtc->year=t->tm_year;
+ rtc->month = t->tm_mon + 1;
+ rtc->year = t->tm_year % 100;
}
return 0;
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index 37a8e5ab8728..a874e54404d1 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -98,8 +98,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -204,7 +204,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -233,12 +233,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -259,7 +259,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -310,7 +310,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -414,6 +413,7 @@ CONFIG_ARIADNE=y
# CONFIG_NET_VENDOR_MARVELL is not set
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
CONFIG_HYDRA=y
CONFIG_APNE=y
CONFIG_ZORRO8390=y
@@ -485,6 +485,7 @@ CONFIG_RTC_DRV_MSM6242=m
CONFIG_RTC_DRV_RP5C01=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_HEARTBEAT=y
CONFIG_PROC_HARDWARE=y
CONFIG_AMIGA_BUILTIN_SERIAL=y
@@ -621,6 +622,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -647,6 +649,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 6a466266b852..8ce39e23aa42 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -96,8 +96,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -202,7 +202,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -231,12 +231,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -257,7 +257,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -308,7 +308,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -392,6 +391,7 @@ CONFIG_VETH=m
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
# CONFIG_NET_VENDOR_ROCKER is not set
@@ -446,6 +446,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_HEARTBEAT=y
CONFIG_PROC_HARDWARE=y
CONFIG_EXT4_FS=y
@@ -580,6 +581,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -606,6 +608,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index b0691a7a3345..346c4e75edf8 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -96,8 +96,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -202,7 +202,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -231,12 +231,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -257,7 +257,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -308,7 +308,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -401,6 +400,7 @@ CONFIG_ATARILANCE=y
# CONFIG_NET_VENDOR_MARVELL is not set
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
CONFIG_NE2000=y
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
@@ -461,6 +461,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_HEARTBEAT=y
CONFIG_PROC_HARDWARE=y
CONFIG_NATFEAT=y
@@ -602,6 +603,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -628,6 +630,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 6f6470fa9a50..fca9c7aa71a3 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -94,8 +94,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -200,7 +200,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -229,12 +229,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -255,7 +255,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -306,7 +306,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -391,6 +390,7 @@ CONFIG_BVME6000_NET=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
# CONFIG_NET_VENDOR_ROCKER is not set
@@ -439,6 +439,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_PROC_HARDWARE=y
CONFIG_EXT4_FS=y
CONFIG_REISERFS_FS=m
@@ -572,6 +573,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -598,6 +600,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 31a1a2b5e860..f9eab174915c 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -96,8 +96,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -202,7 +202,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -231,12 +231,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -257,7 +257,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -308,7 +308,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -393,6 +392,7 @@ CONFIG_HPLANCE=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
# CONFIG_NET_VENDOR_ROCKER is not set
@@ -449,6 +449,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_PROC_HARDWARE=y
CONFIG_EXT4_FS=y
CONFIG_REISERFS_FS=m
@@ -582,6 +583,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -608,6 +610,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 390d4a87441c..b52e597899eb 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -95,8 +95,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -201,7 +201,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -230,12 +230,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -256,7 +256,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -310,7 +310,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -410,6 +409,7 @@ CONFIG_MAC89x0=y
# CONFIG_NET_VENDOR_MICREL is not set
CONFIG_MACSONIC=y
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
CONFIG_MAC8390=y
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
@@ -471,6 +471,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_PROC_HARDWARE=y
CONFIG_EXT4_FS=y
CONFIG_REISERFS_FS=m
@@ -604,6 +605,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -630,6 +632,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 77be97d82dc3..2a84eeec5b02 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -105,8 +105,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -211,7 +211,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -240,12 +240,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -266,7 +266,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -320,7 +320,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -452,6 +451,7 @@ CONFIG_MVME16x_NET=y
# CONFIG_NET_VENDOR_MICREL is not set
CONFIG_MACSONIC=y
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
CONFIG_HYDRA=y
CONFIG_MAC8390=y
CONFIG_NE2000=y
@@ -541,6 +541,7 @@ CONFIG_RTC_DRV_RP5C01=m
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_HEARTBEAT=y
CONFIG_PROC_HARDWARE=y
CONFIG_NATFEAT=y
@@ -684,6 +685,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -710,6 +712,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index 2ca140757b0f..476e69994340 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -93,8 +93,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -199,7 +199,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -228,12 +228,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -254,7 +254,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -305,7 +305,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -391,6 +390,7 @@ CONFIG_MVME147_NET=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
# CONFIG_NET_VENDOR_ROCKER is not set
@@ -439,6 +439,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_PROC_HARDWARE=y
CONFIG_EXT4_FS=y
CONFIG_REISERFS_FS=m
@@ -572,6 +573,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -598,6 +600,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 6a3b4dcc5aab..1477cda9146e 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -94,8 +94,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -200,7 +200,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -229,12 +229,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -255,7 +255,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -306,7 +306,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -391,6 +390,7 @@ CONFIG_MVME16x_NET=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
# CONFIG_NET_VENDOR_ROCKER is not set
@@ -439,6 +439,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_PROC_HARDWARE=y
CONFIG_EXT4_FS=y
CONFIG_REISERFS_FS=m
@@ -572,6 +573,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -598,6 +600,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 2a3e29c97652..b3a543dc48a0 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -94,8 +94,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -200,7 +200,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -229,12 +229,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -255,7 +255,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -306,7 +306,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -400,6 +399,7 @@ CONFIG_VETH=m
# CONFIG_NET_VENDOR_MARVELL is not set
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
CONFIG_NE2000=y
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
@@ -461,6 +461,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_HEARTBEAT=y
CONFIG_PROC_HARDWARE=y
CONFIG_EXT4_FS=y
@@ -595,6 +596,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -621,6 +623,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index cba2494c99b2..d543ed5dfa96 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -91,8 +91,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -197,7 +197,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -226,12 +226,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -252,7 +252,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -303,7 +303,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -388,6 +387,7 @@ CONFIG_SUN3_82586=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
# CONFIG_NET_VENDOR_ROCKER is not set
@@ -441,6 +441,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_PROC_HARDWARE=y
CONFIG_EXT4_FS=y
CONFIG_REISERFS_FS=m
@@ -573,6 +574,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -599,6 +601,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index d911561137fd..a67e54246023 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -91,8 +91,8 @@ CONFIG_NF_CONNTRACK_SANE=m
CONFIG_NF_CONNTRACK_SIP=m
CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_TABLES=m
-CONFIG_NF_TABLES_INET=m
-CONFIG_NF_TABLES_NETDEV=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_NETDEV=y
CONFIG_NFT_EXTHDR=m
CONFIG_NFT_META=m
CONFIG_NFT_RT=m
@@ -197,7 +197,7 @@ CONFIG_NF_SOCKET_IPV4=m
CONFIG_NFT_CHAIN_ROUTE_IPV4=m
CONFIG_NFT_DUP_IPV4=m
CONFIG_NFT_FIB_IPV4=m
-CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_TABLES_ARP=y
CONFIG_NF_FLOW_TABLE_IPV4=m
CONFIG_NF_LOG_ARP=m
CONFIG_NFT_CHAIN_NAT_IPV4=m
@@ -226,12 +226,12 @@ CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_CONNTRACK_IPV6=m
CONFIG_NF_SOCKET_IPV6=m
CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_DUP_IPV6=m
-CONFIG_NFT_FIB_IPV6=m
-CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_NFT_CHAIN_NAT_IPV6=m
CONFIG_NFT_MASQ_IPV6=m
CONFIG_NFT_REDIR_IPV6=m
+CONFIG_NFT_DUP_IPV6=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NF_FLOW_TABLE_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -252,7 +252,7 @@ CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_IP6_NF_TARGET_NPT=m
-CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_BRIDGE=y
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_LOG_BRIDGE=m
@@ -303,7 +303,6 @@ CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m
CONFIG_NET_NSH=m
-CONFIG_NET_L3_MASTER_DEV=y
CONFIG_AF_KCM=m
# CONFIG_WIRELESS is not set
CONFIG_PSAMPLE=m
@@ -389,6 +388,7 @@ CONFIG_SUN3LANCE=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RENESAS is not set
# CONFIG_NET_VENDOR_ROCKER is not set
@@ -441,6 +441,7 @@ CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=m
# CONFIG_VIRTIO_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_DAX=m
CONFIG_PROC_HARDWARE=y
CONFIG_EXT4_FS=y
CONFIG_REISERFS_FS=m
@@ -574,6 +575,7 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_KEYWRAP=m
@@ -600,6 +602,8 @@ CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 88a9d27df1ac..4d8d68c4e3dd 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -1,4 +1,5 @@
generic-y += barrier.h
+generic-y += compat.h
generic-y += device.h
generic-y += emergency-restart.h
generic-y += exec.h
diff --git a/arch/m68k/include/asm/delay.h b/arch/m68k/include/asm/delay.h
index 7f474121e4ca..751712f8beea 100644
--- a/arch/m68k/include/asm/delay.h
+++ b/arch/m68k/include/asm/delay.h
@@ -49,8 +49,6 @@ extern void __bad_udelay(void);
* The simpler m68k and ColdFire processors do not have a 32*32->64
* multiply instruction. So we need to handle them a little differently.
* We use a bit of shifting and a single 32*32->32 multiply to get close.
- * This is a macro so that the const version can factor out the first
- * multiply and shift.
*/
#define HZSCALE (268435456 / (1000000 / HZ))
@@ -115,6 +113,13 @@ static inline void __udelay(unsigned long usecs)
*/
#define HZSCALE (268435456 / (1000000 / HZ))
-#define ndelay(n) __delay(DIV_ROUND_UP((n) * ((((HZSCALE) >> 11) * (loops_per_jiffy >> 11)) >> 6), 1000))
+static inline void ndelay(unsigned long nsec)
+{
+ __delay(DIV_ROUND_UP(nsec *
+ ((((HZSCALE) >> 11) *
+ (loops_per_jiffy >> 11)) >> 6),
+ 1000));
+}
+#define ndelay(n) ndelay(n)
#endif /* defined(_M68K_DELAY_H) */
diff --git a/arch/m68k/include/asm/pci.h b/arch/m68k/include/asm/pci.h
index ef26fae8cf0b..5a4bc223743b 100644
--- a/arch/m68k/include/asm/pci.h
+++ b/arch/m68k/include/asm/pci.h
@@ -4,12 +4,6 @@
#include <asm-generic/pci.h>
-/* The PCI address space does equal the physical memory
- * address space. The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS (1)
-
#define pcibios_assign_all_busses() 1
#define PCIBIOS_MIN_IO 0x00000100
diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h
index 75c172e909ac..c4cb889660aa 100644
--- a/arch/m68k/include/asm/uaccess_mm.h
+++ b/arch/m68k/include/asm/uaccess_mm.h
@@ -141,10 +141,12 @@ asm volatile ("\n" \
case 4: \
__get_user_asm(__gu_err, x, ptr, u32, l, r, -EFAULT); \
break; \
-/* case 8: disabled because gcc-4.1 has a broken typeof \
- { \
- const void *__gu_ptr = (ptr); \
- u64 __gu_val; \
+ case 8: { \
+ const void *__gu_ptr = (ptr); \
+ union { \
+ u64 l; \
+ __typeof__(*(ptr)) t; \
+ } __gu_val; \
asm volatile ("\n" \
"1: "MOVES".l (%2)+,%1\n" \
"2: "MOVES".l (%2),%R1\n" \
@@ -162,13 +164,13 @@ asm volatile ("\n" \
" .long 1b,10b\n" \
" .long 2b,10b\n" \
" .previous" \
- : "+d" (__gu_err), "=&r" (__gu_val), \
+ : "+d" (__gu_err), "=&r" (__gu_val.l), \
"+a" (__gu_ptr) \
: "i" (-EFAULT) \
: "memory"); \
- (x) = (__force typeof(*(ptr)))__gu_val; \
+ (x) = __gu_val.t; \
break; \
- } */ \
+ } \
default: \
__gu_err = __get_user_bad(); \
break; \
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index c01b9b8f97bf..463572c4943f 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -9,6 +9,7 @@
#include <linux/dma-mapping.h>
#include <linux/device.h>
#include <linux/kernel.h>
+#include <linux/platform_device.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -165,3 +166,12 @@ const struct dma_map_ops m68k_dma_ops = {
.sync_sg_for_device = m68k_dma_sync_sg_for_device,
};
EXPORT_SYMBOL(m68k_dma_ops);
+
+void arch_setup_pdev_archdata(struct platform_device *pdev)
+{
+ if (pdev->dev.coherent_dma_mask == DMA_MASK_NONE &&
+ pdev->dev.dma_mask == NULL) {
+ pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
+ }
+}
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index f7cd5ecfacd3..72850b85ecf8 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -576,41 +576,42 @@ static inline int rt_save_fpu_state(struct ucontext __user *uc, struct pt_regs *
static inline void siginfo_build_tests(void)
{
- /* This needs to be tested on m68k as it has a lesser
- * alignment requirment than x86 and that can cause surprises.
+ /*
+ * This needs to be tested on m68k as it has a lesser
+ * alignment requirement than x86 and that can cause surprises.
*/
/* This is part of the ABI and can never change in size: */
BUILD_BUG_ON(sizeof(siginfo_t) != 128);
- /* Ensure the know fields never change in location */
+ /* Ensure the known fields never change in location */
BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0);
BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4);
BUILD_BUG_ON(offsetof(siginfo_t, si_code) != 8);
/* _kill */
- BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x0c);
BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x10);
/* _timer */
- BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x0c);
BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x10);
BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x14);
/* _rt */
- BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x0c);
BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x10);
BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x14);
/* _sigchld */
- BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x0c);
BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x10);
BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x14);
BUILD_BUG_ON(offsetof(siginfo_t, si_utime) != 0x18);
- BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x1C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x1c);
/* _sigfault */
- BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x0c);
/* _sigfault._mcerr */
BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x10);
@@ -623,11 +624,11 @@ static inline void siginfo_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12);
/* _sigpoll */
- BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x0c);
BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x10);
/* _sigsys */
- BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x0c);
BUILD_BUG_ON(offsetof(siginfo_t, si_syscall) != 0x10);
BUILD_BUG_ON(offsetof(siginfo_t, si_arch) != 0x14);
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index 97dd4e26f234..3a8b47f8f97b 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -71,23 +71,26 @@ static irqreturn_t timer_interrupt(int irq, void *dummy)
return IRQ_HANDLED;
}
-void read_persistent_clock(struct timespec *ts)
+#ifdef CONFIG_M68KCLASSIC
+#if !IS_BUILTIN(CONFIG_RTC_DRV_GENERIC)
+void read_persistent_clock64(struct timespec64 *ts)
{
struct rtc_time time;
+
ts->tv_sec = 0;
ts->tv_nsec = 0;
- if (mach_hwclk) {
- mach_hwclk(0, &time);
+ if (!mach_hwclk)
+ return;
- if ((time.tm_year += 1900) < 1970)
- time.tm_year += 100;
- ts->tv_sec = mktime(time.tm_year, time.tm_mon, time.tm_mday,
- time.tm_hour, time.tm_min, time.tm_sec);
- }
+ mach_hwclk(0, &time);
+
+ ts->tv_sec = mktime64(time.tm_year + 1900, time.tm_mon + 1, time.tm_mday,
+ time.tm_hour, time.tm_min, time.tm_sec);
}
+#endif
-#if defined(CONFIG_ARCH_USES_GETTIMEOFFSET) && IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
+#if IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
{
mach_hwclk(0, tm);
@@ -145,8 +148,8 @@ static int __init rtc_init(void)
}
module_init(rtc_init);
-
-#endif /* CONFIG_ARCH_USES_GETTIMEOFFSET */
+#endif /* CONFIG_RTC_DRV_GENERIC */
+#endif /* CONFIG M68KCLASSIC */
void __init time_init(void)
{
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index c1cc4e99aa94..b2fd000b9285 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -1007,9 +1007,9 @@ void bad_super_trap (struct frame *fp)
asmlinkage void trap_c(struct frame *fp)
{
- int sig;
+ int sig, si_code;
+ void __user *addr;
int vector = (fp->ptregs.vector >> 2) & 0xff;
- siginfo_t info;
if (fp->ptregs.sr & PS_S) {
if (vector == VEC_TRACE) {
@@ -1029,21 +1029,21 @@ asmlinkage void trap_c(struct frame *fp)
/* send the appropriate signal to the user program */
switch (vector) {
case VEC_ADDRERR:
- info.si_code = BUS_ADRALN;
+ si_code = BUS_ADRALN;
sig = SIGBUS;
break;
case VEC_ILLEGAL:
case VEC_LINE10:
case VEC_LINE11:
- info.si_code = ILL_ILLOPC;
+ si_code = ILL_ILLOPC;
sig = SIGILL;
break;
case VEC_PRIV:
- info.si_code = ILL_PRVOPC;
+ si_code = ILL_PRVOPC;
sig = SIGILL;
break;
case VEC_COPROC:
- info.si_code = ILL_COPROC;
+ si_code = ILL_COPROC;
sig = SIGILL;
break;
case VEC_TRAP1:
@@ -1060,76 +1060,74 @@ asmlinkage void trap_c(struct frame *fp)
case VEC_TRAP12:
case VEC_TRAP13:
case VEC_TRAP14:
- info.si_code = ILL_ILLTRP;
+ si_code = ILL_ILLTRP;
sig = SIGILL;
break;
case VEC_FPBRUC:
case VEC_FPOE:
case VEC_FPNAN:
- info.si_code = FPE_FLTINV;
+ si_code = FPE_FLTINV;
sig = SIGFPE;
break;
case VEC_FPIR:
- info.si_code = FPE_FLTRES;
+ si_code = FPE_FLTRES;
sig = SIGFPE;
break;
case VEC_FPDIVZ:
- info.si_code = FPE_FLTDIV;
+ si_code = FPE_FLTDIV;
sig = SIGFPE;
break;
case VEC_FPUNDER:
- info.si_code = FPE_FLTUND;
+ si_code = FPE_FLTUND;
sig = SIGFPE;
break;
case VEC_FPOVER:
- info.si_code = FPE_FLTOVF;
+ si_code = FPE_FLTOVF;
sig = SIGFPE;
break;
case VEC_ZERODIV:
- info.si_code = FPE_INTDIV;
+ si_code = FPE_INTDIV;
sig = SIGFPE;
break;
case VEC_CHK:
case VEC_TRAP:
- info.si_code = FPE_INTOVF;
+ si_code = FPE_INTOVF;
sig = SIGFPE;
break;
case VEC_TRACE: /* ptrace single step */
- info.si_code = TRAP_TRACE;
+ si_code = TRAP_TRACE;
sig = SIGTRAP;
break;
case VEC_TRAP15: /* breakpoint */
- info.si_code = TRAP_BRKPT;
+ si_code = TRAP_BRKPT;
sig = SIGTRAP;
break;
default:
- info.si_code = ILL_ILLOPC;
+ si_code = ILL_ILLOPC;
sig = SIGILL;
break;
}
- info.si_signo = sig;
- info.si_errno = 0;
switch (fp->ptregs.format) {
default:
- info.si_addr = (void *) fp->ptregs.pc;
+ addr = (void __user *) fp->ptregs.pc;
break;
case 2:
- info.si_addr = (void *) fp->un.fmt2.iaddr;
+ addr = (void __user *) fp->un.fmt2.iaddr;
break;
case 7:
- info.si_addr = (void *) fp->un.fmt7.effaddr;
+ addr = (void __user *) fp->un.fmt7.effaddr;
break;
case 9:
- info.si_addr = (void *) fp->un.fmt9.iaddr;
+ addr = (void __user *) fp->un.fmt9.iaddr;
break;
case 10:
- info.si_addr = (void *) fp->un.fmta.daddr;
+ addr = (void __user *) fp->un.fmta.daddr;
break;
case 11:
- info.si_addr = (void *) fp->un.fmtb.daddr;
+ addr = (void __user*) fp->un.fmtb.daddr;
break;
}
- force_sig_info (sig, &info, current);
+ force_sig_fault(sig, si_code, addr, current);
}
void die_if_kernel (char *str, struct pt_regs *fp, int nr)
@@ -1161,12 +1159,6 @@ asmlinkage void fpsp040_die(void)
#ifdef CONFIG_M68KFPU_EMU
asmlinkage void fpemu_signal(int signal, int code, void *addr)
{
- siginfo_t info;
-
- info.si_signo = signal;
- info.si_errno = 0;
- info.si_code = code;
- info.si_addr = addr;
- force_sig_info(signal, &info, current);
+ force_sig_fault(signal, code, addr, current);
}
#endif
diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
index 0c3275aa0197..e522307db47c 100644
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -1005,7 +1005,7 @@ int __init mac_platform_init(void)
struct resource swim_rsrc = {
.flags = IORESOURCE_MEM,
.start = (resource_size_t)swim_base,
- .end = (resource_size_t)swim_base + 0x2000,
+ .end = (resource_size_t)swim_base + 0x1FFF,
};
platform_device_register_simple("swim", -1, &swim_rsrc, 1);
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 03253c4f8e6a..f2ff3779875a 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -21,35 +21,32 @@ extern void die_if_kernel(char *, struct pt_regs *, long);
int send_fault_sig(struct pt_regs *regs)
{
- siginfo_t siginfo;
+ int signo, si_code;
+ void __user *addr;
- clear_siginfo(&siginfo);
- siginfo.si_signo = current->thread.signo;
- siginfo.si_code = current->thread.code;
- siginfo.si_addr = (void *)current->thread.faddr;
- pr_debug("send_fault_sig: %p,%d,%d\n", siginfo.si_addr,
- siginfo.si_signo, siginfo.si_code);
+ signo = current->thread.signo;
+ si_code = current->thread.code;
+ addr = (void __user *)current->thread.faddr;
+ pr_debug("send_fault_sig: %p,%d,%d\n", addr, signo, si_code);
if (user_mode(regs)) {
- force_sig_info(siginfo.si_signo,
- &siginfo, current);
+ force_sig_fault(signo, si_code, addr, current);
} else {
if (fixup_exception(regs))
return -1;
- //if (siginfo.si_signo == SIGBUS)
- // force_sig_info(siginfo.si_signo,
- // &siginfo, current);
+ //if (signo == SIGBUS)
+ // force_sig_fault(si_signo, si_code, addr, current);
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
*/
- if ((unsigned long)siginfo.si_addr < PAGE_SIZE)
+ if ((unsigned long)addr < PAGE_SIZE)
pr_alert("Unable to handle kernel NULL pointer dereference");
else
pr_alert("Unable to handle kernel access");
- pr_cont(" at virtual address %p\n", siginfo.si_addr);
+ pr_cont(" at virtual address %p\n", addr);
die_if_kernel("Oops", regs, 0 /*error_code*/);
do_exit(SIGKILL);
}
diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c
index c2a38321c96d..3b420f6d8822 100644
--- a/arch/m68k/mm/kmap.c
+++ b/arch/m68k/mm/kmap.c
@@ -89,7 +89,8 @@ static inline void free_io_area(void *addr)
for (p = &iolist ; (tmp = *p) ; p = &tmp->next) {
if (tmp->addr == addr) {
*p = tmp->next;
- __iounmap(tmp->addr, tmp->size);
+ /* remove gap added in get_io_area() */
+ __iounmap(tmp->addr, tmp->size - IO_SIZE);
kfree(tmp);
return;
}
diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c
index 8778612d1f31..f8a710fd84cd 100644
--- a/arch/m68k/mvme147/config.c
+++ b/arch/m68k/mvme147/config.c
@@ -153,12 +153,14 @@ int mvme147_hwclk(int op, struct rtc_time *t)
if (!op) {
m147_rtc->ctrl = RTC_READ;
t->tm_year = bcd2int (m147_rtc->bcd_year);
- t->tm_mon = bcd2int (m147_rtc->bcd_mth);
+ t->tm_mon = bcd2int(m147_rtc->bcd_mth) - 1;
t->tm_mday = bcd2int (m147_rtc->bcd_dom);
t->tm_hour = bcd2int (m147_rtc->bcd_hr);
t->tm_min = bcd2int (m147_rtc->bcd_min);
t->tm_sec = bcd2int (m147_rtc->bcd_sec);
m147_rtc->ctrl = 0;
+ if (t->tm_year < 70)
+ t->tm_year += 100;
}
return 0;
}
diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c
index 6fa06d4d16bf..4ffd9ef98de4 100644
--- a/arch/m68k/mvme16x/config.c
+++ b/arch/m68k/mvme16x/config.c
@@ -400,12 +400,14 @@ int mvme16x_hwclk(int op, struct rtc_time *t)
if (!op) {
rtc->ctrl = RTC_READ;
t->tm_year = bcd2int (rtc->bcd_year);
- t->tm_mon = bcd2int (rtc->bcd_mth);
+ t->tm_mon = bcd2int(rtc->bcd_mth) - 1;
t->tm_mday = bcd2int (rtc->bcd_dom);
t->tm_hour = bcd2int (rtc->bcd_hr);
t->tm_min = bcd2int (rtc->bcd_min);
t->tm_sec = bcd2int (rtc->bcd_sec);
rtc->ctrl = 0;
+ if (t->tm_year < 70)
+ t->tm_year += 100;
}
return 0;
}
diff --git a/arch/m68k/sun3/intersil.c b/arch/m68k/sun3/intersil.c
index 2cd0bcbe6f30..d911070af02a 100644
--- a/arch/m68k/sun3/intersil.c
+++ b/arch/m68k/sun3/intersil.c
@@ -48,9 +48,9 @@ int sun3_hwclk(int set, struct rtc_time *t)
todintersil->hour = t->tm_hour;
todintersil->minute = t->tm_min;
todintersil->second = t->tm_sec;
- todintersil->month = t->tm_mon;
+ todintersil->month = t->tm_mon + 1;
todintersil->day = t->tm_mday;
- todintersil->year = t->tm_year - 68;
+ todintersil->year = (t->tm_year - 68) % 100;
todintersil->weekday = t->tm_wday;
} else {
/* read clock */
@@ -58,10 +58,12 @@ int sun3_hwclk(int set, struct rtc_time *t)
t->tm_hour = todintersil->hour;
t->tm_min = todintersil->minute;
t->tm_sec = todintersil->second;
- t->tm_mon = todintersil->month;
+ t->tm_mon = todintersil->month - 1;
t->tm_mday = todintersil->day;
t->tm_year = todintersil->year + 68;
t->tm_wday = todintersil->weekday;
+ if (t->tm_year < 70)
+ t->tm_year += 100;
}
intersil_clock->cmd_reg = START_VAL;
diff --git a/arch/m68k/sun3x/time.c b/arch/m68k/sun3x/time.c
index 7a2c53d9f779..047e2bcee3d7 100644
--- a/arch/m68k/sun3x/time.c
+++ b/arch/m68k/sun3x/time.c
@@ -52,8 +52,8 @@ int sun3x_hwclk(int set, struct rtc_time *t)
h->hour = bin2bcd(t->tm_hour);
h->wday = bin2bcd(t->tm_wday);
h->mday = bin2bcd(t->tm_mday);
- h->month = bin2bcd(t->tm_mon);
- h->year = bin2bcd(t->tm_year);
+ h->month = bin2bcd(t->tm_mon + 1);
+ h->year = bin2bcd(t->tm_year % 100);
h->csr &= ~C_WRITE;
} else {
h->csr |= C_READ;
@@ -62,9 +62,11 @@ int sun3x_hwclk(int set, struct rtc_time *t)
t->tm_hour = bcd2bin(h->hour);
t->tm_wday = bcd2bin(h->wday);
t->tm_mday = bcd2bin(h->mday);
- t->tm_mon = bcd2bin(h->month);
+ t->tm_mon = bcd2bin(h->month) - 1;
t->tm_year = bcd2bin(h->year);
h->csr &= ~C_READ;
+ if (t->tm_year < 70)
+ t->tm_year += 100;
}
local_irq_restore(flags);
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3817a3e2146c..d14782100088 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -19,7 +19,6 @@ config MICROBLAZE
select HAVE_ARCH_HASH
select HAVE_ARCH_KGDB
select HAVE_DEBUG_KMEMLEAK
- select HAVE_DMA_API_DEBUG
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 3c80a5a308ed..fe6a6c6e5003 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -2,6 +2,7 @@ generic-y += barrier.h
generic-y += bitops.h
generic-y += bug.h
generic-y += bugs.h
+generic-y += compat.h
generic-y += device.h
generic-y += div64.h
generic-y += emergency-restart.h
diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h
index 5de871eb4a59..00337861472e 100644
--- a/arch/microblaze/include/asm/pci.h
+++ b/arch/microblaze/include/asm/pci.h
@@ -62,12 +62,6 @@ extern int pci_mmap_legacy_page_range(struct pci_bus *bus,
#define HAVE_PCI_LEGACY 1
-/* The PCI address space does equal the physical memory
- * address space (no IOMMU). The IDE and SCSI device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS (1)
-
extern void pcibios_claim_one_bus(struct pci_bus *b);
extern void pcibios_finish_adding_to_bus(struct pci_bus *bus);
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index c91e8cef98dd..3145e7dc8ab1 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -184,14 +184,3 @@ const struct dma_map_ops dma_nommu_ops = {
.sync_sg_for_device = dma_nommu_sync_sg_for_device,
};
EXPORT_SYMBOL(dma_nommu_ops);
-
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init dma_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
- return 0;
-}
-fs_initcall(dma_init);
diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c
index e6f338d0496b..eafff21fcb0e 100644
--- a/arch/microblaze/kernel/exceptions.c
+++ b/arch/microblaze/kernel/exceptions.c
@@ -60,16 +60,10 @@ asmlinkage void sw_exception(struct pt_regs *regs)
void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
{
- siginfo_t info;
-
if (kernel_mode(regs))
die("Exception in kernel mode", regs, signr);
- info.si_signo = signr;
- info.si_errno = 0;
- info.si_code = code;
- info.si_addr = (void __user *) addr;
- force_sig_info(signr, &info, current);
+ force_sig_fault(signr, code, (void __user *)addr, current);
}
asmlinkage void full_exception(struct pt_regs *regs, unsigned int type,
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index f91b30f8aaa8..af607447c683 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -88,7 +88,6 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- siginfo_t info;
int code = SEGV_MAPERR;
int is_write = error_code & ESR_S;
int fault;
@@ -269,11 +268,6 @@ bad_area_nosemaphore:
/* User mode accesses cause a SIGSEGV */
if (user_mode(regs)) {
_exception(SIGSEGV, regs, code, address);
-/* info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = code;
- info.si_addr = (void *) address;
- force_sig_info(SIGSEGV, &info, current);*/
return;
}
@@ -295,11 +289,7 @@ out_of_memory:
do_sigbus:
up_read(&mm->mmap_sem);
if (user_mode(regs)) {
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
return;
}
bad_page_fault(regs, address, SIGBUS);
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 225c95da23ce..7074b2215f36 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -42,7 +42,6 @@ config MIPS
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
- select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_EXIT_THREAD
@@ -132,7 +131,7 @@ config MIPS_GENERIC
config MIPS_ALCHEMY
bool "Alchemy processor based machines"
- select ARCH_PHYS_ADDR_T_64BIT
+ select PHYS_ADDR_T_64BIT
select CEVT_R4K
select CSRC_R4K
select IRQ_MIPS_CPU
@@ -890,7 +889,7 @@ config CAVIUM_OCTEON_SOC
bool "Cavium Networks Octeon SoC based boards"
select CEVT_R4K
select ARCH_HAS_PHYS_TO_DMA
- select ARCH_PHYS_ADDR_T_64BIT
+ select PHYS_ADDR_T_64BIT
select DMA_COHERENT
select SYS_SUPPORTS_64BIT_KERNEL
select SYS_SUPPORTS_BIG_ENDIAN
@@ -912,6 +911,7 @@ config CAVIUM_OCTEON_SOC
select MIPS_NR_CPU_NR_MAP_1024
select BUILTIN_DTB
select MTD_COMPLEX_MAPPINGS
+ select SWIOTLB
select SYS_SUPPORTS_RELOCATABLE
help
This option supports all of the Octeon reference boards from Cavium
@@ -936,7 +936,7 @@ config NLM_XLR_BOARD
select SWAP_IO_SPACE
select SYS_SUPPORTS_32BIT_KERNEL
select SYS_SUPPORTS_64BIT_KERNEL
- select ARCH_PHYS_ADDR_T_64BIT
+ select PHYS_ADDR_T_64BIT
select SYS_SUPPORTS_BIG_ENDIAN
select SYS_SUPPORTS_HIGHMEM
select DMA_COHERENT
@@ -962,7 +962,7 @@ config NLM_XLP_BOARD
select HW_HAS_PCI
select SYS_SUPPORTS_32BIT_KERNEL
select SYS_SUPPORTS_64BIT_KERNEL
- select ARCH_PHYS_ADDR_T_64BIT
+ select PHYS_ADDR_T_64BIT
select GPIOLIB
select SYS_SUPPORTS_BIG_ENDIAN
select SYS_SUPPORTS_LITTLE_ENDIAN
@@ -1101,9 +1101,6 @@ config GPIO_TXX9
config FW_CFE
bool
-config ARCH_DMA_ADDR_T_64BIT
- def_bool (HIGHMEM && ARCH_PHYS_ADDR_T_64BIT) || 64BIT
-
config ARCH_SUPPORTS_UPROBES
bool
@@ -1122,9 +1119,6 @@ config DMA_NONCOHERENT
bool
select NEED_DMA_MAP_STATE
-config NEED_DMA_MAP_STATE
- bool
-
config SYS_HAS_EARLY_PRINTK
bool
@@ -1373,6 +1367,7 @@ config CPU_LOONGSON3
select MIPS_PGD_C0_CONTEXT
select MIPS_L1_CACHE_SHIFT_6
select GPIOLIB
+ select SWIOTLB
help
The Loongson 3 processor implements the MIPS64R2 instruction
set with many extensions.
@@ -1770,7 +1765,7 @@ config CPU_MIPS32_R5_XPA
depends on SYS_SUPPORTS_HIGHMEM
select XPA
select HIGHMEM
- select ARCH_PHYS_ADDR_T_64BIT
+ select PHYS_ADDR_T_64BIT
default n
help
Choose this option if you want to enable the Extended Physical
@@ -2402,9 +2397,6 @@ config SB1_PASS_2_1_WORKAROUNDS
default y
-config ARCH_PHYS_ADDR_T_64BIT
- bool
-
choice
prompt "SmartMIPS or microMIPS ASE support"
@@ -2556,7 +2548,7 @@ config ARCH_DISCONTIGMEM_ENABLE
Say Y to support efficient handling of discontiguous physical memory,
for architectures which are either NUMA (Non-Uniform Memory Access)
or have huge holes in the physical address space for other reasons.
- See <file:Documentation/vm/numa> for more.
+ See <file:Documentation/vm/numa.rst> for more.
config ARCH_SPARSEMEM_ENABLE
bool
diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index b5eee1a57d6c..4984e462be30 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -67,18 +67,6 @@ config CAVIUM_OCTEON_LOCK_L2_MEMCPY
help
Lock the kernel's implementation of memcpy() into L2.
-config IOMMU_HELPER
- bool
-
-config NEED_SG_DMA_LENGTH
- bool
-
-config SWIOTLB
- def_bool y
- select DMA_DIRECT_OPS
- select IOMMU_HELPER
- select NEED_SG_DMA_LENGTH
-
config OCTEON_ILM
tristate "Module to measure interrupt latency using Octeon CIU Timer"
help
diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h
index 9a0fa66b81ac..78675f19440f 100644
--- a/arch/mips/include/asm/compat.h
+++ b/arch/mips/include/asm/compat.h
@@ -14,7 +14,6 @@
typedef u32 compat_size_t;
typedef s32 compat_ssize_t;
-typedef s32 compat_time_t;
typedef s32 compat_clock_t;
typedef s32 compat_suseconds_t;
@@ -38,24 +37,16 @@ typedef struct {
typedef s32 compat_timer_t;
typedef s32 compat_key_t;
+typedef s16 compat_short_t;
typedef s32 compat_int_t;
typedef s32 compat_long_t;
typedef s64 compat_s64;
+typedef u16 compat_ushort_t;
typedef u32 compat_uint_t;
typedef u32 compat_ulong_t;
typedef u64 compat_u64;
typedef u32 compat_uptr_t;
-struct compat_timespec {
- compat_time_t tv_sec;
- s32 tv_nsec;
-};
-
-struct compat_timeval {
- compat_time_t tv_sec;
- s32 tv_usec;
-};
-
struct compat_stat {
compat_dev_t st_dev;
s32 st_pad1[3];
@@ -168,35 +159,35 @@ struct compat_ipc64_perm {
struct compat_semid64_ds {
struct compat_ipc64_perm sem_perm;
- compat_time_t sem_otime;
- compat_time_t sem_ctime;
+ compat_ulong_t sem_otime;
+ compat_ulong_t sem_ctime;
compat_ulong_t sem_nsems;
- compat_ulong_t __unused1;
- compat_ulong_t __unused2;
+ compat_ulong_t sem_otime_high;
+ compat_ulong_t sem_ctime_high;
};
struct compat_msqid64_ds {
struct compat_ipc64_perm msg_perm;
#ifndef CONFIG_CPU_LITTLE_ENDIAN
- compat_ulong_t __unused1;
+ compat_ulong_t msg_stime_high;
#endif
- compat_time_t msg_stime;
+ compat_ulong_t msg_stime;
#ifdef CONFIG_CPU_LITTLE_ENDIAN
- compat_ulong_t __unused1;
+ compat_ulong_t msg_stime_high;
#endif
#ifndef CONFIG_CPU_LITTLE_ENDIAN
- compat_ulong_t __unused2;
+ compat_ulong_t msg_rtime_high;
#endif
- compat_time_t msg_rtime;
+ compat_ulong_t msg_rtime;
#ifdef CONFIG_CPU_LITTLE_ENDIAN
- compat_ulong_t __unused2;
+ compat_ulong_t msg_rtime_high;
#endif
#ifndef CONFIG_CPU_LITTLE_ENDIAN
- compat_ulong_t __unused3;
+ compat_ulong_t msg_ctime_high;
#endif
- compat_time_t msg_ctime;
+ compat_ulong_t msg_ctime;
#ifdef CONFIG_CPU_LITTLE_ENDIAN
- compat_ulong_t __unused3;
+ compat_ulong_t msg_ctime_high;
#endif
compat_ulong_t msg_cbytes;
compat_ulong_t msg_qnum;
@@ -210,14 +201,16 @@ struct compat_msqid64_ds {
struct compat_shmid64_ds {
struct compat_ipc64_perm shm_perm;
compat_size_t shm_segsz;
- compat_time_t shm_atime;
- compat_time_t shm_dtime;
- compat_time_t shm_ctime;
+ compat_ulong_t shm_atime;
+ compat_ulong_t shm_dtime;
+ compat_ulong_t shm_ctime;
compat_pid_t shm_cpid;
compat_pid_t shm_lpid;
compat_ulong_t shm_nattch;
- compat_ulong_t __unused1;
- compat_ulong_t __unused2;
+ compat_ushort_t shm_atime_high;
+ compat_ushort_t shm_dtime_high;
+ compat_ushort_t shm_ctime_high;
+ compat_ushort_t __unused2;
};
/* MIPS has unusual order of fields in stack_t */
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index 2339f42f047a..436099883022 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -121,13 +121,6 @@ extern unsigned long PCIBIOS_MIN_MEM;
#include <linux/string.h>
#include <asm/io.h>
-/*
- * The PCI address space does equal the physical memory address space.
- * The networking and block device layers use this boolean for bounce
- * buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS (1)
-
#ifdef CONFIG_PCI_DOMAINS_GENERIC
static inline int pci_proc_domain(struct pci_bus *bus)
{
diff --git a/arch/mips/include/uapi/asm/msgbuf.h b/arch/mips/include/uapi/asm/msgbuf.h
index eb4d0f9d7364..46aa15b13e4e 100644
--- a/arch/mips/include/uapi/asm/msgbuf.h
+++ b/arch/mips/include/uapi/asm/msgbuf.h
@@ -9,33 +9,15 @@
* between kernel and user space.
*
* Pad space is left for:
- * - extension of time_t to 64-bit on 32-bitsystem to solve the y2038 problem
* - 2 miscellaneous unsigned long values
*/
+#if defined(__mips64)
struct msqid64_ds {
struct ipc64_perm msg_perm;
-#if !defined(__mips64) && defined(__MIPSEB__)
- unsigned long __unused1;
-#endif
__kernel_time_t msg_stime; /* last msgsnd time */
-#if !defined(__mips64) && defined(__MIPSEL__)
- unsigned long __unused1;
-#endif
-#if !defined(__mips64) && defined(__MIPSEB__)
- unsigned long __unused2;
-#endif
__kernel_time_t msg_rtime; /* last msgrcv time */
-#if !defined(__mips64) && defined(__MIPSEL__)
- unsigned long __unused2;
-#endif
-#if !defined(__mips64) && defined(__MIPSEB__)
- unsigned long __unused3;
-#endif
__kernel_time_t msg_ctime; /* last change time */
-#if !defined(__mips64) && defined(__MIPSEL__)
- unsigned long __unused3;
-#endif
unsigned long msg_cbytes; /* current number of bytes on queue */
unsigned long msg_qnum; /* number of messages in queue */
unsigned long msg_qbytes; /* max number of bytes on queue */
@@ -44,5 +26,42 @@ struct msqid64_ds {
unsigned long __unused4;
unsigned long __unused5;
};
+#elif defined (__MIPSEB__)
+struct msqid64_ds {
+ struct ipc64_perm msg_perm;
+ unsigned long msg_stime_high;
+ unsigned long msg_stime; /* last msgsnd time */
+ unsigned long msg_rtime_high;
+ unsigned long msg_rtime; /* last msgrcv time */
+ unsigned long msg_ctime_high;
+ unsigned long msg_ctime; /* last change time */
+ unsigned long msg_cbytes; /* current number of bytes on queue */
+ unsigned long msg_qnum; /* number of messages in queue */
+ unsigned long msg_qbytes; /* max number of bytes on queue */
+ __kernel_pid_t msg_lspid; /* pid of last msgsnd */
+ __kernel_pid_t msg_lrpid; /* last receive pid */
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+#elif defined (__MIPSEL__)
+struct msqid64_ds {
+ struct ipc64_perm msg_perm;
+ unsigned long msg_stime; /* last msgsnd time */
+ unsigned long msg_stime_high;
+ unsigned long msg_rtime; /* last msgrcv time */
+ unsigned long msg_rtime_high;
+ unsigned long msg_ctime; /* last change time */
+ unsigned long msg_ctime_high;
+ unsigned long msg_cbytes; /* current number of bytes on queue */
+ unsigned long msg_qnum; /* number of messages in queue */
+ unsigned long msg_qbytes; /* max number of bytes on queue */
+ __kernel_pid_t msg_lspid; /* pid of last msgsnd */
+ __kernel_pid_t msg_lrpid; /* last receive pid */
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+#else
+#warning no endianess set
+#endif
#endif /* _ASM_MSGBUF_H */
diff --git a/arch/mips/include/uapi/asm/sembuf.h b/arch/mips/include/uapi/asm/sembuf.h
index 2c0f507ab7d1..60c89e6cb25b 100644
--- a/arch/mips/include/uapi/asm/sembuf.h
+++ b/arch/mips/include/uapi/asm/sembuf.h
@@ -7,10 +7,11 @@
* Note extra padding because this structure is passed back and forth
* between kernel and user space.
*
- * Pad space is left for:
- * - 2 miscellaneous 64-bit values
+ * Pad space is left for 2 miscellaneous 64-bit values on mips64,
+ * but used for the upper 32 bit of the time values on mips32.
*/
+#ifdef __mips64
struct semid64_ds {
struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
__kernel_time_t sem_otime; /* last semop time */
@@ -19,5 +20,15 @@ struct semid64_ds {
unsigned long __unused1;
unsigned long __unused2;
};
+#else
+struct semid64_ds {
+ struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_ctime; /* last change time */
+ unsigned long sem_nsems; /* no. of semaphores in array */
+ unsigned long sem_otime_high;
+ unsigned long sem_ctime_high;
+};
+#endif
#endif /* _ASM_SEMBUF_H */
diff --git a/arch/mips/include/uapi/asm/shmbuf.h b/arch/mips/include/uapi/asm/shmbuf.h
index 379e6bca518b..9b9bba3401f2 100644
--- a/arch/mips/include/uapi/asm/shmbuf.h
+++ b/arch/mips/include/uapi/asm/shmbuf.h
@@ -7,10 +7,13 @@
* Note extra padding because this structure is passed back and forth
* between kernel and user space.
*
- * Pad space is left for:
- * - 2 miscellaneous 32-bit rsp. 64-bit values
+ * As MIPS was lacking proper padding after shm_?time, we use 48 bits
+ * of the padding at the end to store a few additional bits of the time.
+ * libc implementations need to take care to convert this into a proper
+ * data structure when moving to 64-bit time_t.
*/
+#ifdef __mips64
struct shmid64_ds {
struct ipc64_perm shm_perm; /* operation perms */
size_t shm_segsz; /* size of segment (bytes) */
@@ -23,6 +26,22 @@ struct shmid64_ds {
unsigned long __unused1;
unsigned long __unused2;
};
+#else
+struct shmid64_ds {
+ struct ipc64_perm shm_perm; /* operation perms */
+ size_t shm_segsz; /* size of segment (bytes) */
+ unsigned long shm_atime; /* last attach time */
+ unsigned long shm_dtime; /* last detach time */
+ unsigned long shm_ctime; /* last change time */
+ __kernel_pid_t shm_cpid; /* pid of creator */
+ __kernel_pid_t shm_lpid; /* pid of last operator */
+ unsigned long shm_nattch; /* no. of current attaches */
+ unsigned short shm_atime_high;
+ unsigned short shm_dtime_high;
+ unsigned short shm_ctime_high;
+ unsigned short __unused1;
+};
+#endif
struct shminfo64 {
unsigned long shmmax;
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index c4db910a8794..b5d9e1784aff 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -8,13 +8,13 @@
* Copyright (C) 1999, 2000 Silicon Graphics, Inc.
* Copyright (C) 2016, Imagination Technologies Ltd.
*/
+#include <linux/compat.h>
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/syscalls.h>
-#include <asm/compat.h>
#include <asm/compat-signal.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 967e9e4e795e..d67fa74622ee 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -699,17 +699,11 @@ static int simulate_sync(struct pt_regs *regs, unsigned int opcode)
asmlinkage void do_ov(struct pt_regs *regs)
{
enum ctx_state prev_state;
- siginfo_t info;
-
- clear_siginfo(&info);
- info.si_signo = SIGFPE;
- info.si_code = FPE_INTOVF;
- info.si_addr = (void __user *)regs->cp0_epc;
prev_state = exception_enter();
die_if_kernel("Integer overflow", regs);
- force_sig_info(SIGFPE, &info, current);
+ force_sig_fault(SIGFPE, FPE_INTOVF, (void __user *)regs->cp0_epc, current);
exception_exit(prev_state);
}
@@ -722,32 +716,27 @@ asmlinkage void do_ov(struct pt_regs *regs)
void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr,
struct task_struct *tsk)
{
- struct siginfo si;
-
- clear_siginfo(&si);
- si.si_addr = fault_addr;
- si.si_signo = SIGFPE;
+ int si_code = FPE_FLTUNK;
if (fcr31 & FPU_CSR_INV_X)
- si.si_code = FPE_FLTINV;
+ si_code = FPE_FLTINV;
else if (fcr31 & FPU_CSR_DIV_X)
- si.si_code = FPE_FLTDIV;
+ si_code = FPE_FLTDIV;
else if (fcr31 & FPU_CSR_OVF_X)
- si.si_code = FPE_FLTOVF;
+ si_code = FPE_FLTOVF;
else if (fcr31 & FPU_CSR_UDF_X)
- si.si_code = FPE_FLTUND;
+ si_code = FPE_FLTUND;
else if (fcr31 & FPU_CSR_INE_X)
- si.si_code = FPE_FLTRES;
+ si_code = FPE_FLTRES;
- force_sig_info(SIGFPE, &si, tsk);
+ force_sig_fault(SIGFPE, si_code, fault_addr, tsk);
}
int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31)
{
- struct siginfo si;
+ int si_code;
struct vm_area_struct *vma;
- clear_siginfo(&si);
switch (sig) {
case 0:
return 0;
@@ -757,23 +746,18 @@ int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31)
return 1;
case SIGBUS:
- si.si_addr = fault_addr;
- si.si_signo = sig;
- si.si_code = BUS_ADRERR;
- force_sig_info(sig, &si, current);
+ force_sig_fault(SIGBUS, BUS_ADRERR, fault_addr, current);
return 1;
case SIGSEGV:
- si.si_addr = fault_addr;
- si.si_signo = sig;
down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, (unsigned long)fault_addr);
if (vma && (vma->vm_start <= (unsigned long)fault_addr))
- si.si_code = SEGV_ACCERR;
+ si_code = SEGV_ACCERR;
else
- si.si_code = SEGV_MAPERR;
+ si_code = SEGV_MAPERR;
up_read(&current->mm->mmap_sem);
- force_sig_info(sig, &si, current);
+ force_sig_fault(SIGSEGV, si_code, fault_addr, current);
return 1;
default:
@@ -896,10 +880,8 @@ out:
void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
const char *str)
{
- siginfo_t info;
char b[40];
- clear_siginfo(&info);
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_TRAP, str, regs, code, current->thread.trap_nr,
SIGTRAP) == NOTIFY_STOP)
@@ -921,13 +903,9 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
case BRK_DIVZERO:
scnprintf(b, sizeof(b), "%s instruction in kernel code", str);
die_if_kernel(b, regs);
- if (code == BRK_DIVZERO)
- info.si_code = FPE_INTDIV;
- else
- info.si_code = FPE_INTOVF;
- info.si_signo = SIGFPE;
- info.si_addr = (void __user *) regs->cp0_epc;
- force_sig_info(SIGFPE, &info, current);
+ force_sig_fault(SIGFPE,
+ code == BRK_DIVZERO ? FPE_INTDIV : FPE_INTOVF,
+ (void __user *) regs->cp0_epc, current);
break;
case BRK_BUG:
die_if_kernel("Kernel bug detected", regs);
@@ -952,9 +930,7 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code,
scnprintf(b, sizeof(b), "%s instruction in kernel code", str);
die_if_kernel(b, regs);
if (si_code) {
- info.si_signo = SIGTRAP;
- info.si_code = si_code;
- force_sig_info(SIGTRAP, &info, current);
+ force_sig_fault(SIGTRAP, si_code, NULL, current);
} else {
force_sig(SIGTRAP, current);
}
@@ -1506,13 +1482,8 @@ asmlinkage void do_mdmx(struct pt_regs *regs)
*/
asmlinkage void do_watch(struct pt_regs *regs)
{
- siginfo_t info;
enum ctx_state prev_state;
- clear_siginfo(&info);
- info.si_signo = SIGTRAP;
- info.si_code = TRAP_HWBKPT;
-
prev_state = exception_enter();
/*
* Clear WP (bit 22) bit of cause register so we don't loop
@@ -1528,7 +1499,7 @@ asmlinkage void do_watch(struct pt_regs *regs)
if (test_tsk_thread_flag(current, TIF_LOAD_WATCH)) {
mips_read_watch_registers();
local_irq_enable();
- force_sig_info(SIGTRAP, &info, current);
+ force_sig_fault(SIGTRAP, TRAP_HWBKPT, NULL, current);
} else {
mips_clear_watch_registers();
local_irq_enable();
diff --git a/arch/mips/loongson64/Kconfig b/arch/mips/loongson64/Kconfig
index 72af0c183969..c79e6a565572 100644
--- a/arch/mips/loongson64/Kconfig
+++ b/arch/mips/loongson64/Kconfig
@@ -130,21 +130,6 @@ config LOONGSON_UART_BASE
default y
depends on EARLY_PRINTK || SERIAL_8250
-config IOMMU_HELPER
- bool
-
-config NEED_SG_DMA_LENGTH
- bool
-
-config SWIOTLB
- bool "Soft IOMMU Support for All-Memory DMA"
- default y
- depends on CPU_LOONGSON3
- select DMA_DIRECT_OPS
- select IOMMU_HELPER
- select NEED_SG_DMA_LENGTH
- select NEED_DMA_MAP_STATE
-
config PHYS48_TO_HT40
bool
default y if CPU_LOONGSON3
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index dcafa43613b6..f9fef0028ca2 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -402,13 +402,3 @@ static const struct dma_map_ops mips_default_dma_map_ops = {
const struct dma_map_ops *mips_dma_map_ops = &mips_default_dma_map_ops;
EXPORT_SYMBOL(mips_dma_map_ops);
-
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init mips_dma_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
- return 0;
-}
-fs_initcall(mips_dma_init);
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 4f8f5bf46977..5f71f2b903b7 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -42,7 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
const int field = sizeof(unsigned long) * 2;
- siginfo_t info;
+ int si_code;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -63,7 +63,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
return;
#endif
- info.si_code = SEGV_MAPERR;
+ si_code = SEGV_MAPERR;
/*
* We fault-in kernel-space virtual memory on-demand. The
@@ -112,7 +112,7 @@ retry:
* we can handle it..
*/
good_area:
- info.si_code = SEGV_ACCERR;
+ si_code = SEGV_ACCERR;
if (write) {
if (!(vma->vm_flags & VM_WRITE))
@@ -223,11 +223,7 @@ bad_area_nosemaphore:
pr_cont("\n");
}
current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f;
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- /* info.si_code has been set above */
- info.si_addr = (void __user *) address;
- force_sig_info(SIGSEGV, &info, tsk);
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk);
return;
}
@@ -283,11 +279,7 @@ do_sigbus:
#endif
current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f;
tsk->thread.cp0_badvaddr = address;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *) address;
- force_sig_info(SIGBUS, &info, tsk);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk);
return;
#ifndef CONFIG_64BIT
diff --git a/arch/mips/netlogic/Kconfig b/arch/mips/netlogic/Kconfig
index 7fcfc7fe9f14..412351c5acc6 100644
--- a/arch/mips/netlogic/Kconfig
+++ b/arch/mips/netlogic/Kconfig
@@ -83,10 +83,4 @@ endif
config NLM_COMMON
bool
-config IOMMU_HELPER
- bool
-
-config NEED_SG_DMA_LENGTH
- bool
-
endif
diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig
index b7404f2dcf5b..6aed974276d8 100644
--- a/arch/nds32/Kconfig
+++ b/arch/nds32/Kconfig
@@ -5,10 +5,13 @@
config NDS32
def_bool y
+ select ARCH_HAS_SYNC_DMA_FOR_CPU
+ select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select ARCH_WANT_FRAME_POINTERS if FTRACE
select CLKSRC_MMIO
select CLONE_BACKWARDS
select COMMON_CLK
+ select DMA_NONCOHERENT_OPS
select GENERIC_ASHLDI3
select GENERIC_ASHRDI3
select GENERIC_LSHRDI3
diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild
index 142e612aa639..dbc4e5422550 100644
--- a/arch/nds32/include/asm/Kbuild
+++ b/arch/nds32/include/asm/Kbuild
@@ -9,10 +9,12 @@ generic-y += checksum.h
generic-y += clkdev.h
generic-y += cmpxchg.h
generic-y += cmpxchg-local.h
+generic-y += compat.h
generic-y += cputime.h
generic-y += device.h
generic-y += div64.h
generic-y += dma.h
+generic-y += dma-mapping.h
generic-y += emergency-restart.h
generic-y += errno.h
generic-y += exec.h
diff --git a/arch/nds32/include/asm/dma-mapping.h b/arch/nds32/include/asm/dma-mapping.h
deleted file mode 100644
index 2dd47d245c25..000000000000
--- a/arch/nds32/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2005-2017 Andes Technology Corporation
-
-#ifndef ASMNDS32_DMA_MAPPING_H
-#define ASMNDS32_DMA_MAPPING_H
-
-extern struct dma_map_ops nds32_dma_ops;
-
-static inline struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
- return &nds32_dma_ops;
-}
-
-#endif
diff --git a/arch/nds32/kernel/dma.c b/arch/nds32/kernel/dma.c
index d291800fc621..d0dbd4fe9645 100644
--- a/arch/nds32/kernel/dma.c
+++ b/arch/nds32/kernel/dma.c
@@ -3,17 +3,14 @@
#include <linux/types.h>
#include <linux/mm.h>
-#include <linux/export.h>
#include <linux/string.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-noncoherent.h>
#include <linux/io.h>
#include <linux/cache.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-#include <asm/dma-mapping.h>
#include <asm/proc-fns.h>
/*
@@ -22,11 +19,6 @@
static pte_t *consistent_pte;
static DEFINE_RAW_SPINLOCK(consistent_lock);
-enum master_type {
- FOR_CPU = 0,
- FOR_DEVICE = 1,
-};
-
/*
* VM region handling support.
*
@@ -124,10 +116,8 @@ out:
return c;
}
-/* FIXME: attrs is not used. */
-static void *nds32_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t * handle, gfp_t gfp,
- unsigned long attrs)
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
+ gfp_t gfp, unsigned long attrs)
{
struct page *page;
struct arch_vm_region *c;
@@ -232,8 +222,8 @@ no_page:
return NULL;
}
-static void nds32_dma_free(struct device *dev, size_t size, void *cpu_addr,
- dma_addr_t handle, unsigned long attrs)
+void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t handle, unsigned long attrs)
{
struct arch_vm_region *c;
unsigned long flags, addr;
@@ -333,145 +323,69 @@ static int __init consistent_init(void)
}
core_initcall(consistent_init);
-static void consistent_sync(void *vaddr, size_t size, int direction, int master_type);
-static dma_addr_t nds32_dma_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
-{
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- consistent_sync((void *)(page_address(page) + offset), size, dir, FOR_DEVICE);
- return page_to_phys(page) + offset;
-}
-
-static void nds32_dma_unmap_page(struct device *dev, dma_addr_t handle,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- consistent_sync(phys_to_virt(handle), size, dir, FOR_CPU);
-}
-
-/*
- * Make an area consistent for devices.
- */
-static void consistent_sync(void *vaddr, size_t size, int direction, int master_type)
-{
- unsigned long start = (unsigned long)vaddr;
- unsigned long end = start + size;
-
- if (master_type == FOR_CPU) {
- switch (direction) {
- case DMA_TO_DEVICE:
- break;
- case DMA_FROM_DEVICE:
- case DMA_BIDIRECTIONAL:
- cpu_dma_inval_range(start, end);
- break;
- default:
- BUG();
- }
- } else {
- /* FOR_DEVICE */
- switch (direction) {
- case DMA_FROM_DEVICE:
- break;
- case DMA_TO_DEVICE:
- case DMA_BIDIRECTIONAL:
- cpu_dma_wb_range(start, end);
- break;
- default:
- BUG();
- }
- }
-}
-static int nds32_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir,
- unsigned long attrs)
+static inline void cache_op(phys_addr_t paddr, size_t size,
+ void (*fn)(unsigned long start, unsigned long end))
{
- int i;
+ struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
+ unsigned offset = paddr & ~PAGE_MASK;
+ size_t left = size;
+ unsigned long start;
- for (i = 0; i < nents; i++, sg++) {
- void *virt;
- unsigned long pfn;
- struct page *page = sg_page(sg);
+ do {
+ size_t len = left;
- sg->dma_address = sg_phys(sg);
- pfn = page_to_pfn(page) + sg->offset / PAGE_SIZE;
- page = pfn_to_page(pfn);
if (PageHighMem(page)) {
- virt = kmap_atomic(page);
- consistent_sync(virt, sg->length, dir, FOR_CPU);
- kunmap_atomic(virt);
+ void *addr;
+
+ if (offset + len > PAGE_SIZE) {
+ if (offset >= PAGE_SIZE) {
+ page += offset >> PAGE_SHIFT;
+ offset &= ~PAGE_MASK;
+ }
+ len = PAGE_SIZE - offset;
+ }
+
+ addr = kmap_atomic(page);
+ start = (unsigned long)(addr + offset);
+ fn(start, start + len);
+ kunmap_atomic(addr);
} else {
- if (sg->offset > PAGE_SIZE)
- panic("sg->offset:%08x > PAGE_SIZE\n",
- sg->offset);
- virt = page_address(page) + sg->offset;
- consistent_sync(virt, sg->length, dir, FOR_CPU);
+ start = (unsigned long)phys_to_virt(paddr);
+ fn(start, start + size);
}
- }
- return nents;
+ offset = 0;
+ page++;
+ left -= len;
+ } while (left);
}
-static void nds32_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
- int nhwentries, enum dma_data_direction dir,
- unsigned long attrs)
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir)
{
-}
-
-static void
-nds32_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,
- size_t size, enum dma_data_direction dir)
-{
- consistent_sync((void *)phys_to_virt(handle), size, dir, FOR_CPU);
-}
-
-static void
-nds32_dma_sync_single_for_device(struct device *dev, dma_addr_t handle,
- size_t size, enum dma_data_direction dir)
-{
- consistent_sync((void *)phys_to_virt(handle), size, dir, FOR_DEVICE);
-}
-
-static void
-nds32_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir)
-{
- int i;
-
- for (i = 0; i < nents; i++, sg++) {
- char *virt =
- page_address((struct page *)sg->page_link) + sg->offset;
- consistent_sync(virt, sg->length, dir, FOR_CPU);
+ switch (dir) {
+ case DMA_FROM_DEVICE:
+ break;
+ case DMA_TO_DEVICE:
+ case DMA_BIDIRECTIONAL:
+ cache_op(paddr, size, cpu_dma_wb_range);
+ break;
+ default:
+ BUG();
}
}
-static void
-nds32_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir)
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir)
{
- int i;
-
- for (i = 0; i < nents; i++, sg++) {
- char *virt =
- page_address((struct page *)sg->page_link) + sg->offset;
- consistent_sync(virt, sg->length, dir, FOR_DEVICE);
+ switch (dir) {
+ case DMA_TO_DEVICE:
+ break;
+ case DMA_FROM_DEVICE:
+ case DMA_BIDIRECTIONAL:
+ cache_op(paddr, size, cpu_dma_inval_range);
+ break;
+ default:
+ BUG();
}
}
-
-struct dma_map_ops nds32_dma_ops = {
- .alloc = nds32_dma_alloc_coherent,
- .free = nds32_dma_free,
- .map_page = nds32_dma_map_page,
- .unmap_page = nds32_dma_unmap_page,
- .map_sg = nds32_dma_map_sg,
- .unmap_sg = nds32_dma_unmap_sg,
- .sync_single_for_device = nds32_dma_sync_single_for_device,
- .sync_single_for_cpu = nds32_dma_sync_single_for_cpu,
- .sync_sg_for_cpu = nds32_dma_sync_sg_for_cpu,
- .sync_sg_for_device = nds32_dma_sync_sg_for_device,
-};
-
-EXPORT_SYMBOL(nds32_dma_ops);
diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c
index 6e34eb9824a4..a6205fd4db52 100644
--- a/arch/nds32/kernel/traps.c
+++ b/arch/nds32/kernel/traps.c
@@ -222,19 +222,13 @@ void die_if_kernel(const char *str, struct pt_regs *regs, int err)
int bad_syscall(int n, struct pt_regs *regs)
{
- siginfo_t info;
-
if (current->personality != PER_LINUX) {
send_sig(SIGSEGV, current, 1);
return regs->uregs[0];
}
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_ILLTRP;
- info.si_addr = (void __user *)instruction_pointer(regs) - 4;
-
- force_sig_info(SIGILL, &info, current);
+ force_sig_fault(SIGILL, ILL_ILLTRP,
+ (void __user *)instruction_pointer(regs) - 4, current);
die_if_kernel("Oops - bad syscall", regs, n);
return regs->uregs[0];
}
@@ -287,16 +281,11 @@ void __init early_trap_init(void)
void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
int error_code, int si_code)
{
- struct siginfo info;
-
tsk->thread.trap_no = ENTRY_DEBUG_RELATED;
tsk->thread.error_code = error_code;
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGTRAP;
- info.si_code = si_code;
- info.si_addr = (void __user *)instruction_pointer(regs);
- force_sig_info(SIGTRAP, &info, tsk);
+ force_sig_fault(SIGTRAP, si_code,
+ (void __user *)instruction_pointer(regs), tsk);
}
void do_debug_trap(unsigned long entry, unsigned long addr,
@@ -318,29 +307,22 @@ void do_debug_trap(unsigned long entry, unsigned long addr,
void unhandled_interruption(struct pt_regs *regs)
{
- siginfo_t si;
pr_emerg("unhandled_interruption\n");
show_regs(regs);
if (!user_mode(regs))
do_exit(SIGKILL);
- si.si_signo = SIGKILL;
- si.si_errno = 0;
- force_sig_info(SIGKILL, &si, current);
+ force_sig(SIGKILL, current);
}
void unhandled_exceptions(unsigned long entry, unsigned long addr,
unsigned long type, struct pt_regs *regs)
{
- siginfo_t si;
pr_emerg("Unhandled Exception: entry: %lx addr:%lx itype:%lx\n", entry,
addr, type);
show_regs(regs);
if (!user_mode(regs))
do_exit(SIGKILL);
- si.si_signo = SIGKILL;
- si.si_errno = 0;
- si.si_addr = (void *)addr;
- force_sig_info(SIGKILL, &si, current);
+ force_sig(SIGKILL, current);
}
extern int do_page_fault(unsigned long entry, unsigned long addr,
@@ -363,14 +345,11 @@ void do_dispatch_tlb_misc(unsigned long entry, unsigned long addr,
void do_revinsn(struct pt_regs *regs)
{
- siginfo_t si;
pr_emerg("Reserved Instruction\n");
show_regs(regs);
if (!user_mode(regs))
do_exit(SIGILL);
- si.si_signo = SIGILL;
- si.si_errno = 0;
- force_sig_info(SIGILL, &si, current);
+ force_sig(SIGILL, current);
}
#ifdef CONFIG_ALIGNMENT_TRAP
diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c
index 3a246fb8098c..9bdb7c3ecbb6 100644
--- a/arch/nds32/mm/fault.c
+++ b/arch/nds32/mm/fault.c
@@ -72,7 +72,7 @@ void do_page_fault(unsigned long entry, unsigned long addr,
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct *vma;
- siginfo_t info;
+ int si_code;
int fault;
unsigned int mask = VM_READ | VM_WRITE | VM_EXEC;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -80,7 +80,7 @@ void do_page_fault(unsigned long entry, unsigned long addr,
error_code = error_code & (ITYPE_mskINST | ITYPE_mskETYPE);
tsk = current;
mm = tsk->mm;
- info.si_code = SEGV_MAPERR;
+ si_code = SEGV_MAPERR;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
@@ -161,7 +161,7 @@ retry:
*/
good_area:
- info.si_code = SEGV_ACCERR;
+ si_code = SEGV_ACCERR;
/* first do some preliminary protection checks */
if (entry == ENTRY_PTE_NOT_PRESENT) {
@@ -266,11 +266,7 @@ bad_area_nosemaphore:
tsk->thread.address = addr;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = entry;
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- /* info.si_code has been set above */
- info.si_addr = (void *)addr;
- force_sig_info(SIGSEGV, &info, tsk);
+ force_sig_fault(SIGSEGV, si_code, (void __user *)addr, tsk);
return;
}
@@ -339,11 +335,7 @@ do_sigbus:
tsk->thread.address = addr;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = entry;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void *)addr;
- force_sig_info(SIGBUS, &info, tsk);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr, tsk);
return;
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index d232da2cbb38..64ed3d656956 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -4,6 +4,7 @@ generic-y += bitops.h
generic-y += bug.h
generic-y += bugs.h
generic-y += cmpxchg.h
+generic-y += compat.h
generic-y += current.h
generic-y += device.h
generic-y += div64.h
diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c
index 8184e7d6b385..3bc3cd22b750 100644
--- a/arch/nios2/kernel/traps.c
+++ b/arch/nios2/kernel/traps.c
@@ -26,13 +26,7 @@ static DEFINE_SPINLOCK(die_lock);
static void _send_sig(int signo, int code, unsigned long addr)
{
- siginfo_t info;
-
- info.si_signo = signo;
- info.si_errno = 0;
- info.si_code = code;
- info.si_addr = (void __user *) addr;
- force_sig_info(signo, &info, current);
+ force_sig_fault(signo, code, (void __user *) addr, current);
}
void die(const char *str, struct pt_regs *regs, long err)
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index f05c722a21f8..65964d390b10 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -2,6 +2,7 @@ generic-y += barrier.h
generic-y += bug.h
generic-y += bugs.h
generic-y += checksum.h
+generic-y += compat.h
generic-y += current.h
generic-y += device.h
generic-y += div64.h
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index a945f00011b4..ec7fd45704d2 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -247,14 +247,3 @@ const struct dma_map_ops or1k_dma_map_ops = {
.sync_single_for_device = or1k_sync_single_for_device,
};
EXPORT_SYMBOL(or1k_dma_map_ops);
-
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init dma_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
- return 0;
-}
-fs_initcall(dma_init);
diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c
index 113c175fe469..fac246e6f37a 100644
--- a/arch/openrisc/kernel/traps.c
+++ b/arch/openrisc/kernel/traps.c
@@ -250,27 +250,16 @@ void __init trap_init(void)
asmlinkage void do_trap(struct pt_regs *regs, unsigned long address)
{
- siginfo_t info;
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGTRAP;
- info.si_code = TRAP_TRACE;
- info.si_addr = (void *)address;
- force_sig_info(SIGTRAP, &info, current);
+ force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)address, current);
regs->pc += 4;
}
asmlinkage void do_unaligned_access(struct pt_regs *regs, unsigned long address)
{
- siginfo_t info;
-
if (user_mode(regs)) {
/* Send a SIGBUS */
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRALN;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)address, current);
} else {
printk("KERNEL: Unaligned Access 0x%.8lx\n", address);
show_registers(regs);
@@ -281,15 +270,9 @@ asmlinkage void do_unaligned_access(struct pt_regs *regs, unsigned long address)
asmlinkage void do_bus_fault(struct pt_regs *regs, unsigned long address)
{
- siginfo_t info;
-
if (user_mode(regs)) {
/* Send a SIGBUS */
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void *)address;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
} else { /* Kernel mode */
printk("KERNEL: Bus error (SIGBUS) 0x%.8lx\n", address);
show_registers(regs);
@@ -464,7 +447,6 @@ static inline void simulate_swa(struct pt_regs *regs, unsigned long address,
asmlinkage void do_illegal_instruction(struct pt_regs *regs,
unsigned long address)
{
- siginfo_t info;
unsigned int op;
unsigned int insn = *((unsigned int *)address);
@@ -485,11 +467,7 @@ asmlinkage void do_illegal_instruction(struct pt_regs *regs,
if (user_mode(regs)) {
/* Send a SIGILL */
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_ILLOPC;
- info.si_addr = (void *)address;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)address, current);
} else { /* Kernel mode */
printk("KERNEL: Illegal instruction (SIGILL) 0x%.8lx\n",
address);
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index d0021dfae20a..9f011d16cc46 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -52,7 +52,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct *vma;
- siginfo_t info;
+ int si_code;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -97,7 +97,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
}
mm = tsk->mm;
- info.si_code = SEGV_MAPERR;
+ si_code = SEGV_MAPERR;
/*
* If we're in an interrupt or have no user
@@ -139,7 +139,7 @@ retry:
*/
good_area:
- info.si_code = SEGV_ACCERR;
+ si_code = SEGV_ACCERR;
/* first do some preliminary protection checks */
@@ -213,11 +213,7 @@ bad_area_nosemaphore:
/* User mode accesses just cause a SIGSEGV */
if (user_mode(regs)) {
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- /* info.si_code has been set above */
- info.si_addr = (void *)address;
- force_sig_info(SIGSEGV, &info, tsk);
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk);
return;
}
@@ -282,11 +278,7 @@ do_sigbus:
* Send a sigbus, regardless of whether we were in kernel
* or user mode.
*/
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void *)address;
- force_sig_info(SIGBUS, &info, tsk);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk);
/* Kernel mode? Handle exceptions or die */
if (!user_mode(regs))
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index fc5a574c3482..4d8f64d48597 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -51,6 +51,8 @@ config PARISC
select GENERIC_CLOCKEVENTS
select ARCH_NO_COHERENT_DMA_MMAP
select CPU_NO_EFFICIENT_FFS
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
help
The PA-RISC microprocessor is designed by Hewlett-Packard and used
@@ -111,12 +113,6 @@ config PM
config STACKTRACE_SUPPORT
def_bool y
-config NEED_DMA_MAP_STATE
- def_bool y
-
-config NEED_SG_DMA_LENGTH
- def_bool y
-
config ISA_DMA_API
bool
diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h
index 57b8b2a2fd4e..ab8a54771507 100644
--- a/arch/parisc/include/asm/compat.h
+++ b/arch/parisc/include/asm/compat.h
@@ -13,7 +13,6 @@
typedef u32 compat_size_t;
typedef s32 compat_ssize_t;
-typedef s32 compat_time_t;
typedef s32 compat_clock_t;
typedef s32 compat_pid_t;
typedef u32 __compat_uid_t;
@@ -40,16 +39,6 @@ typedef u32 compat_ulong_t;
typedef u64 compat_u64;
typedef u32 compat_uptr_t;
-struct compat_timespec {
- compat_time_t tv_sec;
- s32 tv_nsec;
-};
-
-struct compat_timeval {
- compat_time_t tv_sec;
- s32 tv_usec;
-};
-
struct compat_stat {
compat_dev_t st_dev; /* dev_t is 32 bits on parisc */
compat_ino_t st_ino; /* 32 bits */
@@ -149,10 +138,10 @@ struct compat_ipc64_perm {
struct compat_semid64_ds {
struct compat_ipc64_perm sem_perm;
- unsigned int __unused1;
- compat_time_t sem_otime;
- unsigned int __unused2;
- compat_time_t sem_ctime;
+ unsigned int sem_otime_high;
+ unsigned int sem_otime;
+ unsigned int sem_ctime_high;
+ unsigned int sem_ctime;
compat_ulong_t sem_nsems;
compat_ulong_t __unused3;
compat_ulong_t __unused4;
@@ -160,12 +149,12 @@ struct compat_semid64_ds {
struct compat_msqid64_ds {
struct compat_ipc64_perm msg_perm;
- unsigned int __unused1;
- compat_time_t msg_stime;
- unsigned int __unused2;
- compat_time_t msg_rtime;
- unsigned int __unused3;
- compat_time_t msg_ctime;
+ unsigned int msg_stime_high;
+ unsigned int msg_stime;
+ unsigned int msg_rtime_high;
+ unsigned int msg_rtime;
+ unsigned int msg_ctime_high;
+ unsigned int msg_ctime;
compat_ulong_t msg_cbytes;
compat_ulong_t msg_qnum;
compat_ulong_t msg_qbytes;
@@ -177,12 +166,12 @@ struct compat_msqid64_ds {
struct compat_shmid64_ds {
struct compat_ipc64_perm shm_perm;
- unsigned int __unused1;
- compat_time_t shm_atime;
- unsigned int __unused2;
- compat_time_t shm_dtime;
- unsigned int __unused3;
- compat_time_t shm_ctime;
+ unsigned int shm_atime_high;
+ unsigned int shm_atime;
+ unsigned int shm_dtime_high;
+ unsigned int shm_dtime;
+ unsigned int shm_ctime_high;
+ unsigned int shm_ctime;
unsigned int __unused4;
compat_size_t shm_segsz;
compat_pid_t shm_cpid;
diff --git a/arch/parisc/include/asm/hardirq.h b/arch/parisc/include/asm/hardirq.h
index 077815169258..1a1235a9d533 100644
--- a/arch/parisc/include/asm/hardirq.h
+++ b/arch/parisc/include/asm/hardirq.h
@@ -34,14 +34,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
#define __IRQ_STAT(cpu, member) (irq_stat[cpu].member)
#define inc_irq_stat(member) this_cpu_inc(irq_stat.member)
#define __inc_irq_stat(member) __this_cpu_inc(irq_stat.member)
-#define local_softirq_pending() this_cpu_read(irq_stat.__softirq_pending)
-
-#define __ARCH_SET_SOFTIRQ_PENDING
-
-#define set_softirq_pending(x) \
- this_cpu_write(irq_stat.__softirq_pending, (x))
-#define or_softirq_pending(x) this_cpu_or(irq_stat.__softirq_pending, (x))
-
#define ack_bad_irq(irq) WARN(1, "unexpected IRQ trap at vector %02x\n", irq)
#endif /* _PARISC_HARDIRQ_H */
diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h
index 96b7deec512d..3328fd17c19d 100644
--- a/arch/parisc/include/asm/pci.h
+++ b/arch/parisc/include/asm/pci.h
@@ -88,29 +88,6 @@ struct pci_hba_data {
#endif /* !CONFIG_64BIT */
/*
- * If the PCI device's view of memory is the same as the CPU's view of memory,
- * PCI_DMA_BUS_IS_PHYS is true. The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#ifdef CONFIG_PA20
-/* All PA-2.0 machines have an IOMMU. */
-#define PCI_DMA_BUS_IS_PHYS 0
-#define parisc_has_iommu() do { } while (0)
-#else
-
-#if defined(CONFIG_IOMMU_CCIO) || defined(CONFIG_IOMMU_SBA)
-extern int parisc_bus_is_phys; /* in arch/parisc/kernel/setup.c */
-#define PCI_DMA_BUS_IS_PHYS parisc_bus_is_phys
-#define parisc_has_iommu() do { parisc_bus_is_phys = 0; } while (0)
-#else
-#define PCI_DMA_BUS_IS_PHYS 1
-#define parisc_has_iommu() do { } while (0)
-#endif
-
-#endif /* !CONFIG_PA20 */
-
-
-/*
** Most PCI devices (eg Tulip, NCR720) also export the same registers
** to both MMIO and I/O port space. Due to poor performance of I/O Port
** access under HP PCI bus adapters, strongly recommend the use of MMIO
diff --git a/arch/parisc/include/uapi/asm/msgbuf.h b/arch/parisc/include/uapi/asm/msgbuf.h
index b48b810e626b..6a2e9ab2ef8d 100644
--- a/arch/parisc/include/uapi/asm/msgbuf.h
+++ b/arch/parisc/include/uapi/asm/msgbuf.h
@@ -10,31 +10,30 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*/
struct msqid64_ds {
struct ipc64_perm msg_perm;
-#if __BITS_PER_LONG != 64
- unsigned int __pad1;
-#endif
+#if __BITS_PER_LONG == 64
__kernel_time_t msg_stime; /* last msgsnd time */
-#if __BITS_PER_LONG != 64
- unsigned int __pad2;
-#endif
__kernel_time_t msg_rtime; /* last msgrcv time */
-#if __BITS_PER_LONG != 64
- unsigned int __pad3;
-#endif
__kernel_time_t msg_ctime; /* last change time */
- unsigned long msg_cbytes; /* current number of bytes on queue */
- unsigned long msg_qnum; /* number of messages in queue */
- unsigned long msg_qbytes; /* max number of bytes on queue */
- __kernel_pid_t msg_lspid; /* pid of last msgsnd */
- __kernel_pid_t msg_lrpid; /* last receive pid */
- unsigned long __unused1;
- unsigned long __unused2;
+#else
+ unsigned long msg_stime_high;
+ unsigned long msg_stime; /* last msgsnd time */
+ unsigned long msg_rtime_high;
+ unsigned long msg_rtime; /* last msgrcv time */
+ unsigned long msg_ctime_high;
+ unsigned long msg_ctime; /* last change time */
+#endif
+ unsigned long msg_cbytes; /* current number of bytes on queue */
+ unsigned long msg_qnum; /* number of messages in queue */
+ unsigned long msg_qbytes; /* max number of bytes on queue */
+ __kernel_pid_t msg_lspid; /* pid of last msgsnd */
+ __kernel_pid_t msg_lrpid; /* last receive pid */
+ unsigned long __unused1;
+ unsigned long __unused2;
};
#endif /* _PARISC_MSGBUF_H */
diff --git a/arch/parisc/include/uapi/asm/sembuf.h b/arch/parisc/include/uapi/asm/sembuf.h
index 746c5d86a9b1..3c31163b1241 100644
--- a/arch/parisc/include/uapi/asm/sembuf.h
+++ b/arch/parisc/include/uapi/asm/sembuf.h
@@ -10,21 +10,21 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*/
struct semid64_ds {
struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
-#if __BITS_PER_LONG != 64
- unsigned int __pad1;
-#endif
+#if __BITS_PER_LONG == 64
__kernel_time_t sem_otime; /* last semop time */
-#if __BITS_PER_LONG != 64
- unsigned int __pad2;
-#endif
__kernel_time_t sem_ctime; /* last change time */
- unsigned long sem_nsems; /* no. of semaphores in array */
+#else
+ unsigned long sem_otime_high;
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_ctime_high;
+ unsigned long sem_ctime; /* last change time */
+#endif
+ unsigned long sem_nsems; /* no. of semaphores in array */
unsigned long __unused1;
unsigned long __unused2;
};
diff --git a/arch/parisc/include/uapi/asm/shmbuf.h b/arch/parisc/include/uapi/asm/shmbuf.h
index cd4dbce55d0b..c89b3dd8db21 100644
--- a/arch/parisc/include/uapi/asm/shmbuf.h
+++ b/arch/parisc/include/uapi/asm/shmbuf.h
@@ -10,25 +10,22 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*/
struct shmid64_ds {
struct ipc64_perm shm_perm; /* operation perms */
-#if __BITS_PER_LONG != 64
- unsigned int __pad1;
-#endif
+#if __BITS_PER_LONG == 64
__kernel_time_t shm_atime; /* last attach time */
-#if __BITS_PER_LONG != 64
- unsigned int __pad2;
-#endif
__kernel_time_t shm_dtime; /* last detach time */
-#if __BITS_PER_LONG != 64
- unsigned int __pad3;
-#endif
__kernel_time_t shm_ctime; /* last change time */
-#if __BITS_PER_LONG != 64
+#else
+ unsigned long shm_atime_high;
+ unsigned long shm_atime; /* last attach time */
+ unsigned long shm_dtime_high;
+ unsigned long shm_dtime; /* last detach time */
+ unsigned long shm_ctime_high;
+ unsigned long shm_ctime; /* last change time */
unsigned int __pad4;
#endif
__kernel_size_t shm_segsz; /* size of segment (bytes) */
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index 1a2be6e639b5..7aa1d4d0d444 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -76,8 +76,6 @@ void user_enable_single_step(struct task_struct *task)
set_tsk_thread_flag(task, TIF_SINGLESTEP);
if (pa_psw(task)->n) {
- struct siginfo si;
-
/* Nullified, just crank over the queue. */
task_regs(task)->iaoq[0] = task_regs(task)->iaoq[1];
task_regs(task)->iasq[0] = task_regs(task)->iasq[1];
@@ -90,11 +88,9 @@ void user_enable_single_step(struct task_struct *task)
ptrace_disable(task);
/* Don't wake up the task, but let the
parent know something happened. */
- si.si_code = TRAP_TRACE;
- si.si_addr = (void __user *) (task_regs(task)->iaoq[0] & ~3);
- si.si_signo = SIGTRAP;
- si.si_errno = 0;
- force_sig_info(SIGTRAP, &si, task);
+ force_sig_fault(SIGTRAP, TRAP_TRACE,
+ (void __user *) (task_regs(task)->iaoq[0] & ~3),
+ task);
/* notify_parent(task, SIGCHLD); */
return;
}
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index 0e9675f857a5..8d3a7b80ac42 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -58,11 +58,6 @@ struct proc_dir_entry * proc_runway_root __read_mostly = NULL;
struct proc_dir_entry * proc_gsc_root __read_mostly = NULL;
struct proc_dir_entry * proc_mckinley_root __read_mostly = NULL;
-#if !defined(CONFIG_PA20) && (defined(CONFIG_IOMMU_CCIO) || defined(CONFIG_IOMMU_SBA))
-int parisc_bus_is_phys __read_mostly = 1; /* Assume no IOMMU is present */
-EXPORT_SYMBOL(parisc_bus_is_phys);
-#endif
-
void __init setup_cmdline(char **cmdline_p)
{
extern unsigned int boot_args[];
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 71d31274d782..4309ad31a874 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -297,13 +297,8 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err)
#define GDB_BREAK_INSN 0x10004
static void handle_gdb_break(struct pt_regs *regs, int wot)
{
- struct siginfo si;
-
- si.si_signo = SIGTRAP;
- si.si_errno = 0;
- si.si_code = wot;
- si.si_addr = (void __user *) (regs->iaoq[0] & ~3);
- force_sig_info(SIGTRAP, &si, current);
+ force_sig_fault(SIGTRAP, wot,
+ (void __user *) (regs->iaoq[0] & ~3), current);
}
static void handle_break(struct pt_regs *regs)
@@ -487,7 +482,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
{
unsigned long fault_address = 0;
unsigned long fault_space = 0;
- struct siginfo si;
+ int si_code;
if (code == 1)
pdc_console_restart(); /* switch back to pdc if HPMC */
@@ -571,7 +566,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
case 8:
/* Illegal instruction trap */
die_if_kernel("Illegal instruction", regs, code);
- si.si_code = ILL_ILLOPC;
+ si_code = ILL_ILLOPC;
goto give_sigill;
case 9:
@@ -582,7 +577,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
case 10:
/* Privileged operation trap */
die_if_kernel("Privileged operation", regs, code);
- si.si_code = ILL_PRVOPC;
+ si_code = ILL_PRVOPC;
goto give_sigill;
case 11:
@@ -605,20 +600,16 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
}
die_if_kernel("Privileged register usage", regs, code);
- si.si_code = ILL_PRVREG;
+ si_code = ILL_PRVREG;
give_sigill:
- si.si_signo = SIGILL;
- si.si_errno = 0;
- si.si_addr = (void __user *) regs->iaoq[0];
- force_sig_info(SIGILL, &si, current);
+ force_sig_fault(SIGILL, si_code,
+ (void __user *) regs->iaoq[0], current);
return;
case 12:
/* Overflow Trap, let the userland signal handler do the cleanup */
- si.si_signo = SIGFPE;
- si.si_code = FPE_INTOVF;
- si.si_addr = (void __user *) regs->iaoq[0];
- force_sig_info(SIGFPE, &si, current);
+ force_sig_fault(SIGFPE, FPE_INTOVF,
+ (void __user *) regs->iaoq[0], current);
return;
case 13:
@@ -626,13 +617,11 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
The condition succeeds in an instruction which traps
on condition */
if(user_mode(regs)){
- si.si_signo = SIGFPE;
/* Let userspace app figure it out from the insn pointed
* to by si_addr.
*/
- si.si_code = FPE_CONDTRAP;
- si.si_addr = (void __user *) regs->iaoq[0];
- force_sig_info(SIGFPE, &si, current);
+ force_sig_fault(SIGFPE, FPE_CONDTRAP,
+ (void __user *) regs->iaoq[0], current);
return;
}
/* The kernel doesn't want to handle condition codes */
@@ -741,14 +730,10 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
return;
die_if_kernel("Protection id trap", regs, code);
- si.si_code = SEGV_MAPERR;
- si.si_signo = SIGSEGV;
- si.si_errno = 0;
- if (code == 7)
- si.si_addr = (void __user *) regs->iaoq[0];
- else
- si.si_addr = (void __user *) regs->ior;
- force_sig_info(SIGSEGV, &si, current);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR,
+ (code == 7)?
+ ((void __user *) regs->iaoq[0]) :
+ ((void __user *) regs->ior), current);
return;
case 28:
@@ -762,11 +747,8 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
"handle_interruption() pid=%d command='%s'\n",
task_pid_nr(current), current->comm);
/* SIGBUS, for lack of a better one. */
- si.si_signo = SIGBUS;
- si.si_code = BUS_OBJERR;
- si.si_errno = 0;
- si.si_addr = (void __user *) regs->ior;
- force_sig_info(SIGBUS, &si, current);
+ force_sig_fault(SIGBUS, BUS_OBJERR,
+ (void __user *)regs->ior, current);
return;
}
pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC);
@@ -781,11 +763,8 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
"User fault %d on space 0x%08lx, pid=%d command='%s'\n",
code, fault_space,
task_pid_nr(current), current->comm);
- si.si_signo = SIGSEGV;
- si.si_errno = 0;
- si.si_code = SEGV_MAPERR;
- si.si_addr = (void __user *) regs->ior;
- force_sig_info(SIGSEGV, &si, current);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR,
+ (void __user *)regs->ior, current);
return;
}
}
diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c
index e36f7b75ab07..932bfc0b7cd8 100644
--- a/arch/parisc/kernel/unaligned.c
+++ b/arch/parisc/kernel/unaligned.c
@@ -452,7 +452,6 @@ void handle_unaligned(struct pt_regs *regs)
unsigned long newbase = R1(regs->iir)?regs->gr[R1(regs->iir)]:0;
int modify = 0;
int ret = ERR_NOTHANDLED;
- struct siginfo si;
register int flop=0; /* true if this is a flop */
__inc_irq_stat(irq_unaligned_count);
@@ -690,21 +689,15 @@ void handle_unaligned(struct pt_regs *regs)
if (ret == ERR_PAGEFAULT)
{
- si.si_signo = SIGSEGV;
- si.si_errno = 0;
- si.si_code = SEGV_MAPERR;
- si.si_addr = (void __user *)regs->ior;
- force_sig_info(SIGSEGV, &si, current);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR,
+ (void __user *)regs->ior, current);
}
else
{
force_sigbus:
/* couldn't handle it ... */
- si.si_signo = SIGBUS;
- si.si_errno = 0;
- si.si_code = BUS_ADRALN;
- si.si_addr = (void __user *)regs->ior;
- force_sig_info(SIGBUS, &si, current);
+ force_sig_fault(SIGBUS, BUS_ADRALN,
+ (void __user *)regs->ior, current);
}
return;
diff --git a/arch/parisc/math-emu/driver.c b/arch/parisc/math-emu/driver.c
index 2fb59d2e2b29..0590e05571d1 100644
--- a/arch/parisc/math-emu/driver.c
+++ b/arch/parisc/math-emu/driver.c
@@ -81,7 +81,6 @@ int
handle_fpe(struct pt_regs *regs)
{
extern void printbinary(unsigned long x, int nbits);
- struct siginfo si;
unsigned int orig_sw, sw;
int signalcode;
/* need an intermediate copy of float regs because FPU emulation
@@ -117,11 +116,8 @@ handle_fpe(struct pt_regs *regs)
memcpy(regs->fr, frcopy, sizeof regs->fr);
if (signalcode != 0) {
- si.si_signo = signalcode >> 24;
- si.si_errno = 0;
- si.si_code = signalcode & 0xffffff;
- si.si_addr = (void __user *) regs->iaoq[0];
- force_sig_info(si.si_signo, &si, current);
+ force_sig_fault(signalcode >> 24, signalcode & 0xffffff,
+ (void __user *) regs->iaoq[0], current);
return -1;
}
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index e247edbca68e..a80117980fc2 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -353,23 +353,22 @@ bad_area:
up_read(&mm->mmap_sem);
if (user_mode(regs)) {
- struct siginfo si;
- unsigned int lsb = 0;
+ int signo, si_code;
switch (code) {
case 15: /* Data TLB miss fault/Data page fault */
/* send SIGSEGV when outside of vma */
if (!vma ||
address < vma->vm_start || address >= vma->vm_end) {
- si.si_signo = SIGSEGV;
- si.si_code = SEGV_MAPERR;
+ signo = SIGSEGV;
+ si_code = SEGV_MAPERR;
break;
}
/* send SIGSEGV for wrong permissions */
if ((vma->vm_flags & acc_type) != acc_type) {
- si.si_signo = SIGSEGV;
- si.si_code = SEGV_ACCERR;
+ signo = SIGSEGV;
+ si_code = SEGV_ACCERR;
break;
}
@@ -377,43 +376,40 @@ bad_area:
/* fall through */
case 17: /* NA data TLB miss / page fault */
case 18: /* Unaligned access - PCXS only */
- si.si_signo = SIGBUS;
- si.si_code = (code == 18) ? BUS_ADRALN : BUS_ADRERR;
+ signo = SIGBUS;
+ si_code = (code == 18) ? BUS_ADRALN : BUS_ADRERR;
break;
case 16: /* Non-access instruction TLB miss fault */
case 26: /* PCXL: Data memory access rights trap */
default:
- si.si_signo = SIGSEGV;
- si.si_code = (code == 26) ? SEGV_ACCERR : SEGV_MAPERR;
+ signo = SIGSEGV;
+ si_code = (code == 26) ? SEGV_ACCERR : SEGV_MAPERR;
break;
}
-
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+ unsigned int lsb = 0;
printk(KERN_ERR
"MCE: Killing %s:%d due to hardware memory corruption fault at %08lx\n",
tsk->comm, tsk->pid, address);
- si.si_signo = SIGBUS;
- si.si_code = BUS_MCEERR_AR;
+ /*
+ * Either small page or large page may be poisoned.
+ * In other words, VM_FAULT_HWPOISON_LARGE and
+ * VM_FAULT_HWPOISON are mutually exclusive.
+ */
+ if (fault & VM_FAULT_HWPOISON_LARGE)
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ else if (fault & VM_FAULT_HWPOISON)
+ lsb = PAGE_SHIFT;
+
+ force_sig_mceerr(BUS_MCEERR_AR, (void __user *) address,
+ lsb, current);
+ return;
}
#endif
+ show_signal_msg(regs, code, address, tsk, vma);
- /*
- * Either small page or large page may be poisoned.
- * In other words, VM_FAULT_HWPOISON_LARGE and
- * VM_FAULT_HWPOISON are mutually exclusive.
- */
- if (fault & VM_FAULT_HWPOISON_LARGE)
- lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
- else if (fault & VM_FAULT_HWPOISON)
- lsb = PAGE_SHIFT;
- else
- show_signal_msg(regs, code, address, tsk, vma);
- si.si_addr_lsb = lsb;
-
- si.si_errno = 0;
- si.si_addr = (void __user *) address;
- force_sig_info(si.si_signo, &si, current);
+ force_sig_fault(signo, si_code, (void __user *) address, current);
return;
}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c32a181a7cbb..f674006dea2f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -13,12 +13,6 @@ config 64BIT
bool
default y if PPC64
-config ARCH_PHYS_ADDR_T_64BIT
- def_bool PPC64 || PHYS_64BIT
-
-config ARCH_DMA_ADDR_T_64BIT
- def_bool ARCH_PHYS_ADDR_T_64BIT
-
config MMU
bool
default y
@@ -187,7 +181,6 @@ config PPC
select HAVE_CONTEXT_TRACKING if PPC64
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
- select HAVE_DMA_API_DEBUG
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL
select HAVE_EBPF_JIT if PPC64
@@ -223,9 +216,11 @@ config PPC
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_IRQ_TIME_ACCOUNTING
+ select IOMMU_HELPER if PPC64
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_RELA
+ select NEED_SG_DMA_LENGTH
select NO_BOOTMEM
select OF
select OF_EARLY_FLATTREE
@@ -478,19 +473,6 @@ config MPROFILE_KERNEL
depends on PPC64 && CPU_LITTLE_ENDIAN
def_bool !DISABLE_MPROFILE_KERNEL
-config IOMMU_HELPER
- def_bool PPC64
-
-config SWIOTLB
- bool "SWIOTLB support"
- default n
- select IOMMU_HELPER
- ---help---
- Support for IO bounce buffering for systems without an IOMMU.
- This allows us to DMA to the full physical address space on
- platforms where the size of a physical address is larger
- than the bus address. Not all platforms support this.
-
config HOTPLUG_CPU
bool "Support for enabling/disabling CPUs"
depends on SMP && (PPC_PSERIES || \
@@ -883,7 +865,7 @@ config PPC_MEM_KEYS
page-based protections, but without requiring modification of the
page tables when an application changes protection domains.
- For details, see Documentation/vm/protection-keys.txt
+ For details, see Documentation/vm/protection-keys.rst
If unsure, say y.
@@ -913,9 +895,6 @@ config ZONE_DMA
config NEED_DMA_MAP_STATE
def_bool (PPC64 || NOT_COHERENT_CACHE)
-config NEED_SG_DMA_LENGTH
- def_bool y
-
config GENERIC_ISA_DMA
bool
depends on ISA_DMA_API
diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h
index 62168e1158f1..85c8af2bb272 100644
--- a/arch/powerpc/include/asm/compat.h
+++ b/arch/powerpc/include/asm/compat.h
@@ -17,7 +17,6 @@
typedef u32 compat_size_t;
typedef s32 compat_ssize_t;
-typedef s32 compat_time_t;
typedef s32 compat_clock_t;
typedef s32 compat_pid_t;
typedef u32 __compat_uid_t;
@@ -45,16 +44,6 @@ typedef u32 compat_ulong_t;
typedef u64 compat_u64;
typedef u32 compat_uptr_t;
-struct compat_timespec {
- compat_time_t tv_sec;
- s32 tv_nsec;
-};
-
-struct compat_timeval {
- compat_time_t tv_sec;
- s32 tv_usec;
-};
-
struct compat_stat {
compat_dev_t st_dev;
compat_ino_t st_ino;
@@ -173,10 +162,10 @@ struct compat_ipc64_perm {
struct compat_semid64_ds {
struct compat_ipc64_perm sem_perm;
- unsigned int __unused1;
- compat_time_t sem_otime;
- unsigned int __unused2;
- compat_time_t sem_ctime;
+ unsigned int sem_otime_high;
+ unsigned int sem_otime;
+ unsigned int sem_ctime_high;
+ unsigned int sem_ctime;
compat_ulong_t sem_nsems;
compat_ulong_t __unused3;
compat_ulong_t __unused4;
@@ -184,12 +173,12 @@ struct compat_semid64_ds {
struct compat_msqid64_ds {
struct compat_ipc64_perm msg_perm;
- unsigned int __unused1;
- compat_time_t msg_stime;
- unsigned int __unused2;
- compat_time_t msg_rtime;
- unsigned int __unused3;
- compat_time_t msg_ctime;
+ unsigned int msg_stime_high;
+ unsigned int msg_stime;
+ unsigned int msg_rtime_high;
+ unsigned int msg_rtime;
+ unsigned int msg_ctime_high;
+ unsigned int msg_ctime;
compat_ulong_t msg_cbytes;
compat_ulong_t msg_qnum;
compat_ulong_t msg_qbytes;
@@ -201,12 +190,12 @@ struct compat_msqid64_ds {
struct compat_shmid64_ds {
struct compat_ipc64_perm shm_perm;
- unsigned int __unused1;
- compat_time_t shm_atime;
- unsigned int __unused2;
- compat_time_t shm_dtime;
- unsigned int __unused3;
- compat_time_t shm_ctime;
+ unsigned int shm_atime_high;
+ unsigned int shm_atime;
+ unsigned int shm_dtime_high;
+ unsigned int shm_dtime;
+ unsigned int shm_ctime_high;
+ unsigned int shm_ctime;
unsigned int __unused4;
compat_size_t shm_segsz;
compat_pid_t shm_cpid;
diff --git a/arch/powerpc/include/asm/hardirq.h b/arch/powerpc/include/asm/hardirq.h
index 5986d473722b..383f628acbf8 100644
--- a/arch/powerpc/include/asm/hardirq.h
+++ b/arch/powerpc/include/asm/hardirq.h
@@ -25,15 +25,8 @@ typedef struct {
DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
#define __ARCH_IRQ_STAT
-
-#define local_softirq_pending() __this_cpu_read(irq_stat.__softirq_pending)
-
-#define __ARCH_SET_SOFTIRQ_PENDING
#define __ARCH_IRQ_EXIT_IRQS_DISABLED
-#define set_softirq_pending(x) __this_cpu_write(irq_stat.__softirq_pending, (x))
-#define or_softirq_pending(x) __this_cpu_or(irq_stat.__softirq_pending, (x))
-
static inline void ack_bad_irq(unsigned int irq)
{
printk(KERN_CRIT "unexpected IRQ trap at vector %02x\n", irq);
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 401c62aad5e4..2af9ded80540 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -92,24 +92,6 @@ extern int pci_mmap_legacy_page_range(struct pci_bus *bus,
#define HAVE_PCI_LEGACY 1
-#ifdef CONFIG_PPC64
-
-/* The PCI address space does not equal the physical memory address
- * space (we have an IOMMU). The IDE and SCSI device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS (0)
-
-#else /* 32-bit */
-
-/* The PCI address space does equal the physical memory
- * address space (no IOMMU). The IDE and SCSI device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS (1)
-
-#endif /* CONFIG_PPC64 */
-
extern void pcibios_claim_one_bus(struct pci_bus *b);
extern void pcibios_finish_adding_to_bus(struct pci_bus *bus);
diff --git a/arch/powerpc/include/uapi/asm/msgbuf.h b/arch/powerpc/include/uapi/asm/msgbuf.h
index 65beb0942500..2b1b37797a47 100644
--- a/arch/powerpc/include/uapi/asm/msgbuf.h
+++ b/arch/powerpc/include/uapi/asm/msgbuf.h
@@ -10,18 +10,18 @@
struct msqid64_ds {
struct ipc64_perm msg_perm;
-#ifndef __powerpc64__
- unsigned int __unused1;
-#endif
+#ifdef __powerpc64__
__kernel_time_t msg_stime; /* last msgsnd time */
-#ifndef __powerpc64__
- unsigned int __unused2;
-#endif
__kernel_time_t msg_rtime; /* last msgrcv time */
-#ifndef __powerpc64__
- unsigned int __unused3;
-#endif
__kernel_time_t msg_ctime; /* last change time */
+#else
+ unsigned long msg_stime_high;
+ unsigned long msg_stime; /* last msgsnd time */
+ unsigned long msg_rtime_high;
+ unsigned long msg_rtime; /* last msgrcv time */
+ unsigned long msg_ctime_high;
+ unsigned long msg_ctime; /* last change time */
+#endif
unsigned long msg_cbytes; /* current number of bytes on queue */
unsigned long msg_qnum; /* number of messages in queue */
unsigned long msg_qbytes; /* max number of bytes on queue */
diff --git a/arch/powerpc/include/uapi/asm/sembuf.h b/arch/powerpc/include/uapi/asm/sembuf.h
index 8f393d60f02d..3f60946f77e3 100644
--- a/arch/powerpc/include/uapi/asm/sembuf.h
+++ b/arch/powerpc/include/uapi/asm/sembuf.h
@@ -15,20 +15,20 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
+ * - 2 miscellaneous 32/64-bit values
*/
struct semid64_ds {
struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
#ifndef __powerpc64__
- unsigned long __unused1;
-#endif
+ unsigned long sem_otime_high;
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_ctime_high;
+ unsigned long sem_ctime; /* last change time */
+#else
__kernel_time_t sem_otime; /* last semop time */
-#ifndef __powerpc64__
- unsigned long __unused2;
-#endif
__kernel_time_t sem_ctime; /* last change time */
+#endif
unsigned long sem_nsems; /* no. of semaphores in array */
unsigned long __unused3;
unsigned long __unused4;
diff --git a/arch/powerpc/include/uapi/asm/shmbuf.h b/arch/powerpc/include/uapi/asm/shmbuf.h
index deb1c3e503d3..b591c4d7e4c5 100644
--- a/arch/powerpc/include/uapi/asm/shmbuf.h
+++ b/arch/powerpc/include/uapi/asm/shmbuf.h
@@ -16,25 +16,22 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*/
struct shmid64_ds {
struct ipc64_perm shm_perm; /* operation perms */
-#ifndef __powerpc64__
- unsigned long __unused1;
-#endif
+#ifdef __powerpc64__
__kernel_time_t shm_atime; /* last attach time */
-#ifndef __powerpc64__
- unsigned long __unused2;
-#endif
__kernel_time_t shm_dtime; /* last detach time */
-#ifndef __powerpc64__
- unsigned long __unused3;
-#endif
__kernel_time_t shm_ctime; /* last change time */
-#ifndef __powerpc64__
+#else
+ unsigned long shm_atime_high;
+ unsigned long shm_atime; /* last attach time */
+ unsigned long shm_dtime_high;
+ unsigned long shm_dtime; /* last detach time */
+ unsigned long shm_ctime_high;
+ unsigned long shm_ctime; /* last change time */
unsigned long __unused4;
#endif
size_t shm_segsz; /* size of segment (bytes) */
diff --git a/arch/powerpc/include/uapi/asm/siginfo.h b/arch/powerpc/include/uapi/asm/siginfo.h
index 9f142451a01f..1d51d9b88221 100644
--- a/arch/powerpc/include/uapi/asm/siginfo.h
+++ b/arch/powerpc/include/uapi/asm/siginfo.h
@@ -15,19 +15,4 @@
#include <asm-generic/siginfo.h>
-/*
- * SIGFPE si_codes
- */
-#ifdef __KERNEL__
-#define FPE_FIXME 0 /* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
-/*
- * SIGTRAP si_codes
- */
-#ifdef __KERNEL__
-#define TRAP_FIXME 0 /* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
-
#endif /* _ASM_POWERPC_SIGINFO_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 373dc1d6ef44..8817c5a6bcc2 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -13,6 +13,7 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/compat.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -42,7 +43,6 @@
#include <asm/paca.h>
#include <asm/lppaca.h>
#include <asm/cache.h>
-#include <asm/compat.h>
#include <asm/mmu.h>
#include <asm/hvcall.h>
#include <asm/xics.h>
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index da20569de9d4..138157deeadf 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -309,8 +309,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_coherent_mask);
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
int dma_set_mask(struct device *dev, u64 dma_mask)
{
if (ppc_md.dma_set_mask)
@@ -361,7 +359,6 @@ EXPORT_SYMBOL_GPL(dma_get_required_mask);
static int __init dma_init(void)
{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
#ifdef CONFIG_PCI
dma_debug_add_bus(&pci_bus_type);
#endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1237f13fed51..26ea9793d290 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -632,6 +632,7 @@ void do_break (struct pt_regs *regs, unsigned long address,
hw_breakpoint_disable();
/* Deliver the signal to userspace */
+ clear_siginfo(&info);
info.si_signo = SIGTRAP;
info.si_errno = 0;
info.si_code = TRAP_HWBKPT;
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 0904492e7032..0e17dcb48720 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -296,7 +296,6 @@ NOKPROBE_SYMBOL(die);
void user_single_step_siginfo(struct task_struct *tsk,
struct pt_regs *regs, siginfo_t *info)
{
- memset(info, 0, sizeof(*info));
info->si_signo = SIGTRAP;
info->si_code = TRAP_TRACE;
info->si_addr = (void __user *)regs->nip;
@@ -334,7 +333,7 @@ void _exception_pkey(int signr, struct pt_regs *regs, int code,
*/
thread_pkey_regs_save(&current->thread);
- memset(&info, 0, sizeof(info));
+ clear_siginfo(&info);
info.si_signo = signr;
info.si_code = code;
info.si_addr = (void __user *) addr;
@@ -970,7 +969,7 @@ void unknown_exception(struct pt_regs *regs)
printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
regs->nip, regs->msr, regs->trap);
- _exception(SIGTRAP, regs, TRAP_FIXME, 0);
+ _exception(SIGTRAP, regs, TRAP_UNK, 0);
exception_exit(prev_state);
}
@@ -992,7 +991,7 @@ bail:
void RunModeException(struct pt_regs *regs)
{
- _exception(SIGTRAP, regs, TRAP_FIXME, 0);
+ _exception(SIGTRAP, regs, TRAP_UNK, 0);
}
void single_step_exception(struct pt_regs *regs)
@@ -1032,7 +1031,7 @@ static void emulate_single_step(struct pt_regs *regs)
static inline int __parse_fpscr(unsigned long fpscr)
{
- int ret = FPE_FIXME;
+ int ret = FPE_FLTUNK;
/* Invalid operation */
if ((fpscr & FPSCR_VE) && (fpscr & FPSCR_VX))
@@ -1973,7 +1972,7 @@ void SPEFloatingPointException(struct pt_regs *regs)
extern int do_spe_mathemu(struct pt_regs *regs);
unsigned long spefscr;
int fpexc_mode;
- int code = FPE_FIXME;
+ int code = FPE_FLTUNK;
int err;
flush_spe_to_thread(current);
@@ -2042,7 +2041,7 @@ void SPEFloatingPointRoundException(struct pt_regs *regs)
printk(KERN_ERR "unrecognized spe instruction "
"in %s at %lx\n", current->comm, regs->nip);
} else {
- _exception(SIGFPE, regs, FPE_FIXME, regs->nip);
+ _exception(SIGFPE, regs, FPE_FLTUNK, regs->nip);
return;
}
}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index c01d627e687a..ef268d5d9db7 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -168,6 +168,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
return SIGBUS;
current->thread.trap_nr = BUS_ADRERR;
+ clear_siginfo(&info);
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_ADRERR;
diff --git a/arch/powerpc/oprofile/backtrace.c b/arch/powerpc/oprofile/backtrace.c
index ecc66d5f02c9..ad054dd0d666 100644
--- a/arch/powerpc/oprofile/backtrace.c
+++ b/arch/powerpc/oprofile/backtrace.c
@@ -7,6 +7,7 @@
* 2 of the License, or (at your option) any later version.
**/
+#include <linux/compat_time.h>
#include <linux/oprofile.h>
#include <linux/sched.h>
#include <asm/processor.h>
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 67d3125d0610..84b58abc08ee 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -222,6 +222,7 @@ config PTE_64BIT
config PHYS_64BIT
bool 'Large physical address support' if E500 || PPC_86xx
depends on (44x || E500 || PPC_86xx) && !PPC_83xx && !PPC_82xx
+ select PHYS_ADDR_T_64BIT
---help---
This option enables kernel support for larger than 32-bit physical
addresses. This feature may not be available on all cores.
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index 870c0a82d560..1e002e94d0f6 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -44,7 +44,7 @@ static void spufs_handle_event(struct spu_context *ctx,
return;
}
- memset(&info, 0, sizeof(info));
+ clear_siginfo(&info);
switch (type) {
case SPE_EVENT_INVALID_DMA:
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index cd4fd85fde84..274bc064c41f 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -3,8 +3,16 @@
# see Documentation/kbuild/kconfig-language.txt.
#
+config 64BIT
+ bool
+
+config 32BIT
+ bool
+
config RISCV
def_bool y
+ # even on 32-bit, physical (and DMA) addresses are > 32-bits
+ select PHYS_ADDR_T_64BIT
select OF
select OF_EARLY_FLATTREE
select OF_IRQ
@@ -22,7 +30,6 @@ config RISCV
select GENERIC_ATOMIC64 if !64BIT || !RISCV_ISA_A
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
- select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS
select HAVE_GENERIC_DMA_COHERENT
select IRQ_DOMAIN
@@ -39,16 +46,9 @@ config RISCV
config MMU
def_bool y
-# even on 32-bit, physical (and DMA) addresses are > 32-bits
-config ARCH_PHYS_ADDR_T_64BIT
- def_bool y
-
config ZONE_DMA32
bool
- default y
-
-config ARCH_DMA_ADDR_T_64BIT
- def_bool y
+ default y if 64BIT
config PAGE_OFFSET
hex
@@ -101,7 +101,6 @@ choice
config ARCH_RV32I
bool "RV32I"
- select CPU_SUPPORTS_32BIT_KERNEL
select 32BIT
select GENERIC_ASHLDI3
select GENERIC_ASHRDI3
@@ -109,13 +108,13 @@ config ARCH_RV32I
config ARCH_RV64I
bool "RV64I"
- select CPU_SUPPORTS_64BIT_KERNEL
select 64BIT
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
+ select SWIOTLB
endchoice
@@ -171,11 +170,6 @@ config NR_CPUS
depends on SMP
default "8"
-config CPU_SUPPORTS_32BIT_KERNEL
- bool
-config CPU_SUPPORTS_64BIT_KERNEL
- bool
-
choice
prompt "CPU Tuning"
default TUNE_GENERIC
@@ -202,24 +196,6 @@ endmenu
menu "Kernel type"
-choice
- prompt "Kernel code model"
- default 64BIT
-
-config 32BIT
- bool "32-bit kernel"
- depends on CPU_SUPPORTS_32BIT_KERNEL
- help
- Select this option to build a 32-bit kernel.
-
-config 64BIT
- bool "64-bit kernel"
- depends on CPU_SUPPORTS_64BIT_KERNEL
- help
- Select this option to build a 64-bit kernel.
-
-endchoice
-
source "mm/Kconfig"
source "kernel/Kconfig.preempt"
diff --git a/arch/riscv/include/asm/dma-mapping.h b/arch/riscv/include/asm/dma-mapping.h
new file mode 100644
index 000000000000..8facc1c8fa05
--- /dev/null
+++ b/arch/riscv/include/asm/dma-mapping.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _RISCV_ASM_DMA_MAPPING_H
+#define _RISCV_ASM_DMA_MAPPING_H 1
+
+#ifdef CONFIG_SWIOTLB
+#include <linux/swiotlb.h>
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
+{
+ return &swiotlb_dma_ops;
+}
+#else
+#include <asm-generic/dma-mapping.h>
+#endif /* CONFIG_SWIOTLB */
+
+#endif /* _RISCV_ASM_DMA_MAPPING_H */
diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h
index 0f2fc9ef20fc..b3638c505728 100644
--- a/arch/riscv/include/asm/pci.h
+++ b/arch/riscv/include/asm/pci.h
@@ -26,9 +26,6 @@
/* RISC-V shim does not initialize PCI bus */
#define pcibios_assign_all_busses() 1
-/* We do not have an IOMMU */
-#define PCI_DMA_BUS_IS_PHYS 1
-
extern int isa_dma_bridge_buggy;
#ifdef CONFIG_PCI
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index c11f40c1b2a8..ee44a48faf79 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -29,6 +29,7 @@
#include <linux/of_fdt.h>
#include <linux/of_platform.h>
#include <linux/sched/task.h>
+#include <linux/swiotlb.h>
#include <asm/setup.h>
#include <asm/sections.h>
@@ -206,6 +207,7 @@ void __init setup_arch(char **cmdline_p)
setup_bootmem();
paging_init();
unflatten_device_tree();
+ swiotlb_init(1);
#ifdef CONFIG_SMP
setup_smp();
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 93132cb59184..b99d9dd21fd0 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -63,18 +63,6 @@ void die(struct pt_regs *regs, const char *str)
do_exit(SIGSEGV);
}
-static inline void do_trap_siginfo(int signo, int code,
- unsigned long addr, struct task_struct *tsk)
-{
- siginfo_t info;
-
- info.si_signo = signo;
- info.si_errno = 0;
- info.si_code = code;
- info.si_addr = (void __user *)addr;
- force_sig_info(signo, &info, tsk);
-}
-
void do_trap(struct pt_regs *regs, int signo, int code,
unsigned long addr, struct task_struct *tsk)
{
@@ -87,7 +75,7 @@ void do_trap(struct pt_regs *regs, int signo, int code,
show_regs(regs);
}
- do_trap_siginfo(signo, code, addr, tsk);
+ force_sig_fault(signo, code, (void __user *)addr, tsk);
}
static void do_trap_error(struct pt_regs *regs, int signo, int code,
@@ -149,7 +137,7 @@ asmlinkage void do_trap_break(struct pt_regs *regs)
}
#endif /* CONFIG_GENERIC_BUG */
- do_trap_siginfo(SIGTRAP, TRAP_BRKPT, regs->sepc, current);
+ force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)(regs->sepc), current);
regs->sepc += 0x4;
}
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 199ac3e4da1d..6a64287ec1da 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -35,9 +35,6 @@ config GENERIC_BUG
config GENERIC_BUG_RELATIVE_POINTERS
def_bool y
-config ARCH_DMA_ADDR_T_64BIT
- def_bool y
-
config GENERIC_LOCKBREAK
def_bool y if SMP && PREEMPT
@@ -133,7 +130,6 @@ config S390
select HAVE_CMPXCHG_LOCAL
select HAVE_COPY_THREAD_TLS
select HAVE_DEBUG_KMEMLEAK
- select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS
select DMA_DIRECT_OPS
select HAVE_DYNAMIC_FTRACE
@@ -709,7 +705,11 @@ config QDIO
menuconfig PCI
bool "PCI support"
select PCI_MSI
+ select IOMMU_HELPER
select IOMMU_SUPPORT
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
+
help
Enable PCI support.
@@ -733,15 +733,6 @@ config PCI_DOMAINS
config HAS_IOMEM
def_bool PCI
-config IOMMU_HELPER
- def_bool PCI
-
-config NEED_SG_DMA_LENGTH
- def_bool PCI
-
-config NEED_DMA_MAP_STATE
- def_bool PCI
-
config CHSC_SCH
def_tristate m
prompt "Support for CHSC subchannels"
diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c
index ae0ed8dd5f1b..5d85a039391c 100644
--- a/arch/s390/hypfs/hypfs_sprp.c
+++ b/arch/s390/hypfs/hypfs_sprp.c
@@ -13,7 +13,6 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/uaccess.h>
-#include <asm/compat.h>
#include <asm/diag.h>
#include <asm/sclp.h>
#include "hypfs.h"
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index 9830fb6b076e..97db2fba546a 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -53,7 +53,6 @@
typedef u32 compat_size_t;
typedef s32 compat_ssize_t;
-typedef s32 compat_time_t;
typedef s32 compat_clock_t;
typedef s32 compat_pid_t;
typedef u16 __compat_uid_t;
@@ -97,16 +96,6 @@ typedef struct {
u32 gprs_high[NUM_GPRS];
} s390_compat_regs_high;
-struct compat_timespec {
- compat_time_t tv_sec;
- s32 tv_nsec;
-};
-
-struct compat_timeval {
- compat_time_t tv_sec;
- s32 tv_usec;
-};
-
struct compat_stat {
compat_dev_t st_dev;
u16 __pad1;
@@ -243,10 +232,10 @@ struct compat_ipc64_perm {
struct compat_semid64_ds {
struct compat_ipc64_perm sem_perm;
- compat_time_t sem_otime;
- compat_ulong_t __pad1;
- compat_time_t sem_ctime;
- compat_ulong_t __pad2;
+ compat_ulong_t sem_otime;
+ compat_ulong_t sem_otime_high;
+ compat_ulong_t sem_ctime;
+ compat_ulong_t sem_ctime_high;
compat_ulong_t sem_nsems;
compat_ulong_t __unused1;
compat_ulong_t __unused2;
@@ -254,12 +243,12 @@ struct compat_semid64_ds {
struct compat_msqid64_ds {
struct compat_ipc64_perm msg_perm;
- compat_time_t msg_stime;
- compat_ulong_t __pad1;
- compat_time_t msg_rtime;
- compat_ulong_t __pad2;
- compat_time_t msg_ctime;
- compat_ulong_t __pad3;
+ compat_ulong_t msg_stime;
+ compat_ulong_t msg_stime_high;
+ compat_ulong_t msg_rtime;
+ compat_ulong_t msg_rtime_high;
+ compat_ulong_t msg_ctime;
+ compat_ulong_t msg_ctime_high;
compat_ulong_t msg_cbytes;
compat_ulong_t msg_qnum;
compat_ulong_t msg_qbytes;
@@ -272,12 +261,12 @@ struct compat_msqid64_ds {
struct compat_shmid64_ds {
struct compat_ipc64_perm shm_perm;
compat_size_t shm_segsz;
- compat_time_t shm_atime;
- compat_ulong_t __pad1;
- compat_time_t shm_dtime;
- compat_ulong_t __pad2;
- compat_time_t shm_ctime;
- compat_ulong_t __pad3;
+ compat_ulong_t shm_atime;
+ compat_ulong_t shm_atime_high;
+ compat_ulong_t shm_dtime;
+ compat_ulong_t shm_dtime_high;
+ compat_ulong_t shm_ctime;
+ compat_ulong_t shm_ctime_high;
compat_pid_t shm_cpid;
compat_pid_t shm_lpid;
compat_ulong_t shm_nattch;
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 1a61b1b997f2..7d22a474a040 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -125,8 +125,9 @@
* ELF register definitions..
*/
+#include <linux/compat.h>
+
#include <asm/ptrace.h>
-#include <asm/compat.h>
#include <asm/syscall.h>
#include <asm/user.h>
@@ -136,7 +137,6 @@ typedef s390_regs elf_gregset_t;
typedef s390_fp_regs compat_elf_fpregset_t;
typedef s390_compat_regs compat_elf_gregset_t;
-#include <linux/compat.h>
#include <linux/sched/mm.h> /* for task_struct */
#include <asm/mmu_context.h>
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h
index a296c6acfd07..dfbc3c6c0674 100644
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -14,6 +14,8 @@
#include <asm/lowcore.h>
#define local_softirq_pending() (S390_lowcore.softirq_pending)
+#define set_softirq_pending(x) (S390_lowcore.softirq_pending = (x))
+#define or_softirq_pending(x) (S390_lowcore.softirq_pending |= (x))
#define __ARCH_IRQ_STAT
#define __ARCH_HAS_DO_SOFTIRQ
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 12fe3591034f..94f8db468c9b 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -2,8 +2,6 @@
#ifndef __ASM_S390_PCI_H
#define __ASM_S390_PCI_H
-/* must be set before including asm-generic/pci.h */
-#define PCI_DMA_BUS_IS_PHYS (0)
/* must be set before including pci_clp.h */
#define PCI_BAR_COUNT 6
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index faef3f7e8353..e364873e0d10 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -9,9 +9,12 @@ generic-y += errno.h
generic-y += fcntl.h
generic-y += ioctl.h
generic-y += mman.h
+generic-y += msgbuf.h
generic-y += param.h
generic-y += poll.h
generic-y += resource.h
+generic-y += sembuf.h
+generic-y += shmbuf.h
generic-y += sockios.h
generic-y += swab.h
generic-y += termbits.h
diff --git a/arch/s390/include/uapi/asm/msgbuf.h b/arch/s390/include/uapi/asm/msgbuf.h
deleted file mode 100644
index 604f847cd68c..000000000000
--- a/arch/s390/include/uapi/asm/msgbuf.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _S390_MSGBUF_H
-#define _S390_MSGBUF_H
-
-/*
- * The msqid64_ds structure for S/390 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct msqid64_ds {
- struct ipc64_perm msg_perm;
- __kernel_time_t msg_stime; /* last msgsnd time */
-#ifndef __s390x__
- unsigned long __unused1;
-#endif /* ! __s390x__ */
- __kernel_time_t msg_rtime; /* last msgrcv time */
-#ifndef __s390x__
- unsigned long __unused2;
-#endif /* ! __s390x__ */
- __kernel_time_t msg_ctime; /* last change time */
-#ifndef __s390x__
- unsigned long __unused3;
-#endif /* ! __s390x__ */
- unsigned long msg_cbytes; /* current number of bytes on queue */
- unsigned long msg_qnum; /* number of messages in queue */
- unsigned long msg_qbytes; /* max number of bytes on queue */
- __kernel_pid_t msg_lspid; /* pid of last msgsnd */
- __kernel_pid_t msg_lrpid; /* last receive pid */
- unsigned long __unused4;
- unsigned long __unused5;
-};
-
-#endif /* _S390_MSGBUF_H */
diff --git a/arch/s390/include/uapi/asm/sembuf.h b/arch/s390/include/uapi/asm/sembuf.h
deleted file mode 100644
index 3e917697b668..000000000000
--- a/arch/s390/include/uapi/asm/sembuf.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _S390_SEMBUF_H
-#define _S390_SEMBUF_H
-
-/*
- * The semid64_ds structure for S/390 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem (for !__s390x__)
- * - 2 miscellaneous 32-bit values
- */
-
-struct semid64_ds {
- struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
- __kernel_time_t sem_otime; /* last semop time */
-#ifndef __s390x__
- unsigned long __unused1;
-#endif /* ! __s390x__ */
- __kernel_time_t sem_ctime; /* last change time */
-#ifndef __s390x__
- unsigned long __unused2;
-#endif /* ! __s390x__ */
- unsigned long sem_nsems; /* no. of semaphores in array */
- unsigned long __unused3;
- unsigned long __unused4;
-};
-
-#endif /* _S390_SEMBUF_H */
diff --git a/arch/s390/include/uapi/asm/shmbuf.h b/arch/s390/include/uapi/asm/shmbuf.h
deleted file mode 100644
index 9cdce8d7ce60..000000000000
--- a/arch/s390/include/uapi/asm/shmbuf.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _S390_SHMBUF_H
-#define _S390_SHMBUF_H
-
-/*
- * The shmid64_ds structure for S/390 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem (for !__s390x__)
- * - 2 miscellaneous 32-bit values
- */
-
-struct shmid64_ds {
- struct ipc64_perm shm_perm; /* operation perms */
- size_t shm_segsz; /* size of segment (bytes) */
- __kernel_time_t shm_atime; /* last attach time */
-#ifndef __s390x__
- unsigned long __unused1;
-#endif /* ! __s390x__ */
- __kernel_time_t shm_dtime; /* last detach time */
-#ifndef __s390x__
- unsigned long __unused2;
-#endif /* ! __s390x__ */
- __kernel_time_t shm_ctime; /* last change time */
-#ifndef __s390x__
- unsigned long __unused3;
-#endif /* ! __s390x__ */
- __kernel_pid_t shm_cpid; /* pid of creator */
- __kernel_pid_t shm_lpid; /* pid of last operator */
- unsigned long shm_nattch; /* no. of current attaches */
- unsigned long __unused4;
- unsigned long __unused5;
-};
-
-struct shminfo64 {
- unsigned long shmmax;
- unsigned long shmmin;
- unsigned long shmmni;
- unsigned long shmseg;
- unsigned long shmall;
- unsigned long __unused1;
- unsigned long __unused2;
- unsigned long __unused3;
- unsigned long __unused4;
-};
-
-#endif /* _S390_SHMBUF_H */
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index a5297a22bc1e..8003b38c1688 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -44,14 +44,8 @@ int is_valid_bugaddr(unsigned long addr)
void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
{
- siginfo_t info;
-
if (user_mode(regs)) {
- info.si_signo = si_signo;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = get_trap_ip(regs);
- force_sig_info(si_signo, &info, current);
+ force_sig_fault(si_signo, si_code, get_trap_ip(regs), current);
report_user_fault(regs, si_signo, 0);
} else {
const struct exception_table_entry *fixup;
@@ -80,18 +74,12 @@ NOKPROBE_SYMBOL(do_trap);
void do_per_trap(struct pt_regs *regs)
{
- siginfo_t info;
-
if (notify_die(DIE_SSTEP, "sstep", regs, 0, 0, SIGTRAP) == NOTIFY_STOP)
return;
if (!current->ptrace)
return;
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_HWBKPT;
- info.si_addr =
- (void __force __user *) current->thread.per_event.address;
- force_sig_info(SIGTRAP, &info, current);
+ force_sig_fault(SIGTRAP, TRAP_HWBKPT,
+ (void __force __user *) current->thread.per_event.address, current);
}
NOKPROBE_SYMBOL(do_per_trap);
@@ -165,7 +153,6 @@ void translation_exception(struct pt_regs *regs)
void illegal_op(struct pt_regs *regs)
{
- siginfo_t info;
__u8 opcode[6];
__u16 __user *location;
int is_uprobe_insn = 0;
@@ -177,13 +164,9 @@ void illegal_op(struct pt_regs *regs)
if (get_user(*((__u16 *) opcode), (__u16 __user *) location))
return;
if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) {
- if (current->ptrace) {
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_BRKPT;
- info.si_addr = location;
- force_sig_info(SIGTRAP, &info, current);
- } else
+ if (current->ptrace)
+ force_sig_fault(SIGTRAP, TRAP_BRKPT, location, current);
+ else
signal = SIGILL;
#ifdef CONFIG_UPROBES
} else if (*((__u16 *) opcode) == UPROBE_SWBP_INSN) {
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index ebfa0442e569..a3bce0e84346 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -26,7 +26,6 @@
#include <asm/gmap.h>
#include <asm/io.h>
#include <asm/ptrace.h>
-#include <asm/compat.h>
#include <asm/sclp.h>
#include "gaccess.h"
#include "kvm-s390.h"
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 93faeca52284..e074480d3598 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -265,14 +265,10 @@ void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
*/
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
- struct siginfo si;
-
report_user_fault(regs, SIGSEGV, 1);
- si.si_signo = SIGSEGV;
- si.si_errno = 0;
- si.si_code = si_code;
- si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
- force_sig_info(SIGSEGV, &si, current);
+ force_sig_fault(SIGSEGV, si_code,
+ (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
+ current);
}
static noinline void do_no_context(struct pt_regs *regs)
@@ -316,18 +312,13 @@ static noinline void do_low_address(struct pt_regs *regs)
static noinline void do_sigbus(struct pt_regs *regs)
{
- struct task_struct *tsk = current;
- struct siginfo si;
-
/*
* Send a sigbus, regardless of whether we were in kernel
* or user mode.
*/
- si.si_signo = SIGBUS;
- si.si_errno = 0;
- si.si_code = BUS_ADRERR;
- si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
- force_sig_info(SIGBUS, &si, tsk);
+ force_sig_fault(SIGBUS, BUS_ADRERR,
+ (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
+ current);
}
static noinline int signal_return(struct pt_regs *regs)
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 93cd0f1ca12b..19b2d2a9b43d 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -19,7 +19,6 @@
#include <linux/uaccess.h>
#include <asm/pci_debug.h>
#include <asm/pci_clp.h>
-#include <asm/compat.h>
#include <asm/clp.h>
#include <uapi/asm/clp.h>
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 2d15d84c20ed..d387a0fbdd7e 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -668,15 +668,6 @@ void zpci_dma_exit(void)
kmem_cache_destroy(dma_region_table_cache);
}
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init dma_debug_do_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
- return 0;
-}
-fs_initcall(dma_debug_do_init);
-
const struct dma_map_ops s390_pci_dma_ops = {
.alloc = s390_dma_alloc,
.free = s390_dma_free,
@@ -685,8 +676,6 @@ const struct dma_map_ops s390_pci_dma_ops = {
.map_page = s390_dma_map_pages,
.unmap_page = s390_dma_unmap_pages,
.mapping_error = s390_mapping_error,
- /* if we support direct DMA this must be conditional */
- .is_phys = 0,
/* dma_supported is unconditionally true without a callback */
};
EXPORT_SYMBOL_GPL(s390_pci_dma_ops);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 1851eaeee131..a97538b607a4 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -14,7 +14,6 @@ config SUPERH
select HAVE_OPROFILE
select HAVE_GENERIC_DMA_COHERENT
select HAVE_ARCH_TRACEHOOK
- select HAVE_DMA_API_DEBUG
select HAVE_PERF_EVENTS
select HAVE_DEBUG_BUGVERBOSE
select ARCH_HAVE_CUSTOM_GPIO_H
@@ -51,6 +50,9 @@ config SUPERH
select HAVE_ARCH_AUDITSYSCALL
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_NMI
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
+
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
@@ -161,12 +163,6 @@ config DMA_COHERENT
config DMA_NONCOHERENT
def_bool !DMA_COHERENT
-config NEED_DMA_MAP_STATE
- def_bool DMA_NONCOHERENT
-
-config NEED_SG_DMA_LENGTH
- def_bool y
-
config PGTABLE_LEVELS
default 3 if X2TLB
default 2
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 1efcce74997b..46dd82ab2c29 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -1,3 +1,4 @@
+generic-y += compat.h
generic-y += current.h
generic-y += delay.h
generic-y += div64.h
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 0033f0df2b3b..10a36b1cf2ea 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -71,12 +71,6 @@ extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM;
* SuperH has everything mapped statically like x86.
*/
-/* The PCI address space does equal the physical memory
- * address space. The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-
#ifdef CONFIG_PCI
/*
* None of the SH PCI controllers support MWI, it is always treated as a
diff --git a/arch/sh/kernel/dma-nommu.c b/arch/sh/kernel/dma-nommu.c
index 178457d7620c..3e3a32fc676e 100644
--- a/arch/sh/kernel/dma-nommu.c
+++ b/arch/sh/kernel/dma-nommu.c
@@ -78,7 +78,6 @@ const struct dma_map_ops nommu_dma_ops = {
.sync_single_for_device = nommu_sync_single_for_device,
.sync_sg_for_device = nommu_sync_sg_for_device,
#endif
- .is_phys = 1,
};
void __init no_iommu_init(void)
diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c
index afe965712a69..8648ed05ccf0 100644
--- a/arch/sh/kernel/hw_breakpoint.c
+++ b/arch/sh/kernel/hw_breakpoint.c
@@ -347,13 +347,8 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
/* Deliver the signal to userspace */
if (!arch_check_bp_in_kernelspace(bp)) {
- siginfo_t info;
-
- info.si_signo = args->signr;
- info.si_errno = notifier_to_errno(rc);
- info.si_code = TRAP_HWBKPT;
-
- force_sig_info(args->signr, &info, current);
+ force_sig_fault(SIGTRAP, TRAP_HWBKPT,
+ (void __user *)NULL, current);
}
rcu_read_unlock();
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 245dbeb20afe..5717c7cbdd97 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -44,7 +44,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stat[j].__nmi_count);
+ seq_printf(p, "%10u ", nmi_count(j));
seq_printf(p, " Non-maskable interrupts\n");
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index b3770bb26211..60709ad17fc7 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -477,7 +477,6 @@ asmlinkage void do_address_error(struct pt_regs *regs,
{
unsigned long error_code = 0;
mm_segment_t oldfs;
- siginfo_t info;
insn_size_t instruction;
int tmp;
@@ -537,11 +536,7 @@ uspace_segv:
"access (PC %lx PR %lx)\n", current->comm, regs->pc,
regs->pr);
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, si_code, (void __user *)address, current);
} else {
inc_unaligned_kernel_access();
@@ -598,19 +593,20 @@ int is_dsp_inst(struct pt_regs *regs)
#ifdef CONFIG_CPU_SH2A
asmlinkage void do_divide_error(unsigned long r4)
{
- siginfo_t info;
+ int code;
switch (r4) {
case TRAP_DIVZERO_ERROR:
- info.si_code = FPE_INTDIV;
+ code = FPE_INTDIV;
break;
case TRAP_DIVOVF_ERROR:
- info.si_code = FPE_INTOVF;
+ code = FPE_INTOVF;
break;
+ default:
+ /* Let gcc know unhandled cases don't make it past here */
+ return;
}
-
- info.si_signo = SIGFPE;
- force_sig_info(info.si_signo, &info, current);
+ force_sig_fault(SIGFPE, code, NULL, current);
}
#endif
diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c
index c86f4360c6ce..a0fa8fc88739 100644
--- a/arch/sh/math-emu/math.c
+++ b/arch/sh/math-emu/math.c
@@ -507,7 +507,6 @@ static int ieee_fpe_handler(struct pt_regs *regs)
unsigned short insn = *(unsigned short *)regs->pc;
unsigned short finsn;
unsigned long nextpc;
- siginfo_t info;
int nib[4] = {
(insn >> 12) & 0xf,
(insn >> 8) & 0xf,
@@ -560,11 +559,8 @@ static int ieee_fpe_handler(struct pt_regs *regs)
~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK);
task_thread_info(tsk)->status |= TS_USEDFPU;
} else {
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = FPE_FLTINV;
- info.si_addr = (void __user *)regs->pc;
- force_sig_info(SIGFPE, &info, tsk);
+ force_sig_fault(SIGFPE, FPE_FLTINV,
+ (void __user *)regs->pc, tsk);
}
regs->pc = nextpc;
diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c
index f1b44697ad68..fceb2adfcac7 100644
--- a/arch/sh/mm/consistent.c
+++ b/arch/sh/mm/consistent.c
@@ -20,18 +20,9 @@
#include <asm/cacheflush.h>
#include <asm/addrspace.h>
-#define PREALLOC_DMA_DEBUG_ENTRIES 4096
-
const struct dma_map_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
-static int __init dma_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
- return 0;
-}
-fs_initcall(dma_init);
-
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp,
unsigned long attrs)
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 6fd1bf7481c7..b8e7bb84b6b1 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -42,14 +42,7 @@ static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
struct task_struct *tsk)
{
- siginfo_t info;
-
- info.si_signo = si_signo;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *)address;
-
- force_sig_info(si_signo, &info, tsk);
+ force_sig_fault(si_signo, si_code, (void __user *)address, tsk);
}
/*
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 8767e45f1b2b..435dbc033afe 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,7 +25,6 @@ config SPARC
select RTC_CLASS
select RTC_DRV_M48T59
select RTC_SYSTOHC
- select HAVE_DMA_API_DEBUG
select HAVE_ARCH_JUMP_LABEL if SPARC64
select GENERIC_IRQ_SHOW
select ARCH_WANT_IPC_PARSE_VERSION
@@ -44,6 +43,8 @@ config SPARC
select ARCH_HAS_SG_CHAIN
select CPU_NO_EFFICIENT_FFS
select LOCKDEP_SMALL if LOCKDEP
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
config SPARC32
def_bool !64BIT
@@ -67,6 +68,7 @@ config SPARC64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_CONTEXT_TRACKING
select HAVE_DEBUG_KMEMLEAK
+ select IOMMU_HELPER
select SPARSE_IRQ
select RTC_DRV_CMOS
select RTC_DRV_BQ4802
@@ -102,14 +104,6 @@ config ARCH_ATU
bool
default y if SPARC64
-config ARCH_DMA_ADDR_T_64BIT
- bool
- default y if ARCH_ATU
-
-config IOMMU_HELPER
- bool
- default y if SPARC64
-
config STACKTRACE_SUPPORT
bool
default y if SPARC64
@@ -146,12 +140,6 @@ config ZONE_DMA
bool
default y if SPARC32
-config NEED_DMA_MAP_STATE
- def_bool y
-
-config NEED_SG_DMA_LENGTH
- def_bool y
-
config GENERIC_ISA_DMA
bool
default y if SPARC32
diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index 615283e16f22..4eb51d2dae98 100644
--- a/arch/sparc/include/asm/compat.h
+++ b/arch/sparc/include/asm/compat.h
@@ -11,7 +11,6 @@
typedef u32 compat_size_t;
typedef s32 compat_ssize_t;
-typedef s32 compat_time_t;
typedef s32 compat_clock_t;
typedef s32 compat_pid_t;
typedef u16 __compat_uid_t;
@@ -39,16 +38,6 @@ typedef u32 compat_ulong_t;
typedef u64 compat_u64;
typedef u32 compat_uptr_t;
-struct compat_timespec {
- compat_time_t tv_sec;
- s32 tv_nsec;
-};
-
-struct compat_timeval {
- compat_time_t tv_sec;
- s32 tv_usec;
-};
-
struct compat_stat {
compat_dev_t st_dev;
compat_ino_t st_ino;
@@ -168,6 +157,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
return (u32)(unsigned long)uptr;
}
+#ifdef CONFIG_COMPAT
static inline void __user *arch_compat_alloc_user_space(long len)
{
struct pt_regs *regs = current_thread_info()->kregs;
@@ -184,6 +174,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
return (void __user *) usp;
}
+#endif
struct compat_ipc64_perm {
compat_key_t key;
@@ -201,10 +192,10 @@ struct compat_ipc64_perm {
struct compat_semid64_ds {
struct compat_ipc64_perm sem_perm;
- unsigned int __pad1;
- compat_time_t sem_otime;
- unsigned int __pad2;
- compat_time_t sem_ctime;
+ unsigned int sem_otime_high;
+ unsigned int sem_otime;
+ unsigned int sem_ctime_high;
+ unsigned int sem_ctime;
u32 sem_nsems;
u32 __unused1;
u32 __unused2;
@@ -212,12 +203,12 @@ struct compat_semid64_ds {
struct compat_msqid64_ds {
struct compat_ipc64_perm msg_perm;
- unsigned int __pad1;
- compat_time_t msg_stime;
- unsigned int __pad2;
- compat_time_t msg_rtime;
- unsigned int __pad3;
- compat_time_t msg_ctime;
+ unsigned int msg_stime_high;
+ unsigned int msg_stime;
+ unsigned int msg_rtime_high;
+ unsigned int msg_rtime;
+ unsigned int msg_ctime_high;
+ unsigned int msg_ctime;
unsigned int msg_cbytes;
unsigned int msg_qnum;
unsigned int msg_qbytes;
@@ -229,12 +220,12 @@ struct compat_msqid64_ds {
struct compat_shmid64_ds {
struct compat_ipc64_perm shm_perm;
- unsigned int __pad1;
- compat_time_t shm_atime;
- unsigned int __pad2;
- compat_time_t shm_dtime;
- unsigned int __pad3;
- compat_time_t shm_ctime;
+ unsigned int shm_atime_high;
+ unsigned int shm_atime;
+ unsigned int shm_dtime_high;
+ unsigned int shm_dtime;
+ unsigned int shm_ctime_high;
+ unsigned int shm_ctime;
compat_size_t shm_segsz;
compat_pid_t shm_cpid;
compat_pid_t shm_lpid;
@@ -243,6 +234,7 @@ struct compat_shmid64_ds {
unsigned int __unused2;
};
+#ifdef CONFIG_COMPAT
static inline int is_compat_task(void)
{
return test_thread_flag(TIF_32BIT);
@@ -254,5 +246,6 @@ static inline bool in_compat_syscall(void)
return pt_regs_trap_type(current_pt_regs()) == 0x110;
}
#define in_compat_syscall in_compat_syscall
+#endif
#endif /* _ASM_SPARC64_COMPAT_H */
diff --git a/arch/sparc/include/asm/hardirq_64.h b/arch/sparc/include/asm/hardirq_64.h
index f56540271993..75b92bfe04b5 100644
--- a/arch/sparc/include/asm/hardirq_64.h
+++ b/arch/sparc/include/asm/hardirq_64.h
@@ -10,8 +10,9 @@
#include <asm/cpudata.h>
#define __ARCH_IRQ_STAT
-#define local_softirq_pending() \
- (local_cpu_data().__softirq_pending)
+
+#define local_softirq_pending_ref \
+ __cpu_data.__softirq_pending
void ack_bad_irq(unsigned int irq);
diff --git a/include/linux/iommu-common.h b/arch/sparc/include/asm/iommu-common.h
index 802c90c79d1f..802c90c79d1f 100644
--- a/include/linux/iommu-common.h
+++ b/arch/sparc/include/asm/iommu-common.h
diff --git a/arch/sparc/include/asm/iommu_64.h b/arch/sparc/include/asm/iommu_64.h
index 9ed6b54caa4b..0ef6dedf747e 100644
--- a/arch/sparc/include/asm/iommu_64.h
+++ b/arch/sparc/include/asm/iommu_64.h
@@ -17,7 +17,7 @@
#define IOPTE_WRITE 0x0000000000000002UL
#define IOMMU_NUM_CTXS 4096
-#include <linux/iommu-common.h>
+#include <asm/iommu-common.h>
struct iommu_arena {
unsigned long *map;
diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h
index 98917e48727d..cfc0ee9476c6 100644
--- a/arch/sparc/include/asm/pci_32.h
+++ b/arch/sparc/include/asm/pci_32.h
@@ -17,10 +17,6 @@
#define PCI_IRQ_NONE 0xffffffff
-/* Dynamic DMA mapping stuff.
- */
-#define PCI_DMA_BUS_IS_PHYS (0)
-
#endif /* __KERNEL__ */
#ifndef CONFIG_LEON_PCI
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h
index 671274e36cfa..fac77813402c 100644
--- a/arch/sparc/include/asm/pci_64.h
+++ b/arch/sparc/include/asm/pci_64.h
@@ -17,12 +17,6 @@
#define PCI_IRQ_NONE 0xffffffff
-/* The PCI address space does not equal the physical memory
- * address space. The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS (0)
-
/* PCI IOMMU mapping bypass support. */
/* PCI 64-bit addressing works for all slots on all controller
diff --git a/arch/sparc/include/uapi/asm/msgbuf.h b/arch/sparc/include/uapi/asm/msgbuf.h
index b601c4f4d956..ffc46c211d6d 100644
--- a/arch/sparc/include/uapi/asm/msgbuf.h
+++ b/arch/sparc/include/uapi/asm/msgbuf.h
@@ -8,25 +8,22 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*/
-
-#if defined(__sparc__) && defined(__arch64__)
-# define PADDING(x)
-#else
-# define PADDING(x) unsigned int x;
-#endif
-
-
struct msqid64_ds {
struct ipc64_perm msg_perm;
- PADDING(__pad1)
+#if defined(__sparc__) && defined(__arch64__)
__kernel_time_t msg_stime; /* last msgsnd time */
- PADDING(__pad2)
__kernel_time_t msg_rtime; /* last msgrcv time */
- PADDING(__pad3)
__kernel_time_t msg_ctime; /* last change time */
+#else
+ unsigned long msg_stime_high;
+ unsigned long msg_stime; /* last msgsnd time */
+ unsigned long msg_rtime_high;
+ unsigned long msg_rtime; /* last msgrcv time */
+ unsigned long msg_ctime_high;
+ unsigned long msg_ctime; /* last change time */
+#endif
unsigned long msg_cbytes; /* current number of bytes on queue */
unsigned long msg_qnum; /* number of messages in queue */
unsigned long msg_qbytes; /* max number of bytes on queue */
@@ -35,5 +32,4 @@ struct msqid64_ds {
unsigned long __unused1;
unsigned long __unused2;
};
-#undef PADDING
#endif /* _SPARC_MSGBUF_H */
diff --git a/arch/sparc/include/uapi/asm/sembuf.h b/arch/sparc/include/uapi/asm/sembuf.h
index f49b0ffa0ab8..f3d309c2e1cd 100644
--- a/arch/sparc/include/uapi/asm/sembuf.h
+++ b/arch/sparc/include/uapi/asm/sembuf.h
@@ -8,25 +8,23 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*/
-#if defined(__sparc__) && defined(__arch64__)
-# define PADDING(x)
-#else
-# define PADDING(x) unsigned int x;
-#endif
struct semid64_ds {
struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
- PADDING(__pad1)
+#if defined(__sparc__) && defined(__arch64__)
__kernel_time_t sem_otime; /* last semop time */
- PADDING(__pad2)
__kernel_time_t sem_ctime; /* last change time */
+#else
+ unsigned long sem_otime_high;
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_ctime_high;
+ unsigned long sem_ctime; /* last change time */
+#endif
unsigned long sem_nsems; /* no. of semaphores in array */
unsigned long __unused1;
unsigned long __unused2;
};
-#undef PADDING
#endif /* _SPARC64_SEMBUF_H */
diff --git a/arch/sparc/include/uapi/asm/shmbuf.h b/arch/sparc/include/uapi/asm/shmbuf.h
index 286631db705c..06618b84822d 100644
--- a/arch/sparc/include/uapi/asm/shmbuf.h
+++ b/arch/sparc/include/uapi/asm/shmbuf.h
@@ -8,24 +8,23 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*/
-#if defined(__sparc__) && defined(__arch64__)
-# define PADDING(x)
-#else
-# define PADDING(x) unsigned int x;
-#endif
-
struct shmid64_ds {
struct ipc64_perm shm_perm; /* operation perms */
- PADDING(__pad1)
+#if defined(__sparc__) && defined(__arch64__)
__kernel_time_t shm_atime; /* last attach time */
- PADDING(__pad2)
__kernel_time_t shm_dtime; /* last detach time */
- PADDING(__pad3)
__kernel_time_t shm_ctime; /* last change time */
+#else
+ unsigned long shm_atime_high;
+ unsigned long shm_atime; /* last attach time */
+ unsigned long shm_dtime_high;
+ unsigned long shm_dtime; /* last detach time */
+ unsigned long shm_ctime_high;
+ unsigned long shm_ctime; /* last change time */
+#endif
size_t shm_segsz; /* size of segment (bytes) */
__kernel_pid_t shm_cpid; /* pid of creator */
__kernel_pid_t shm_lpid; /* pid of last operator */
@@ -46,6 +45,4 @@ struct shminfo64 {
unsigned long __unused4;
};
-#undef PADDING
-
#endif /* _SPARC_SHMBUF_H */
diff --git a/arch/sparc/include/uapi/asm/siginfo.h b/arch/sparc/include/uapi/asm/siginfo.h
index 896ce447d16a..e7049550ac82 100644
--- a/arch/sparc/include/uapi/asm/siginfo.h
+++ b/arch/sparc/include/uapi/asm/siginfo.h
@@ -18,13 +18,6 @@
#define SI_NOINFO 32767 /* no information in siginfo_t */
/*
- * SIGFPE si_codes
- */
-#ifdef __KERNEL__
-#define FPE_FIXME 0 /* Broken dup of SI_USER */
-#endif /* __KERNEL__ */
-
-/*
* SIGEMT si_codes
*/
#define EMT_TAGOVF 1 /* tag overflow */
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 76cb57750dda..cf8640841b7a 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -59,7 +59,7 @@ obj-$(CONFIG_SPARC32) += leon_pmc.o
obj-$(CONFIG_SPARC64) += reboot.o
obj-$(CONFIG_SPARC64) += sysfs.o
-obj-$(CONFIG_SPARC64) += iommu.o
+obj-$(CONFIG_SPARC64) += iommu.o iommu-common.o
obj-$(CONFIG_SPARC64) += central.o
obj-$(CONFIG_SPARC64) += starfire.o
obj-$(CONFIG_SPARC64) += power.o
@@ -74,8 +74,6 @@ obj-$(CONFIG_SPARC64) += pcr.o
obj-$(CONFIG_SPARC64) += nmi.o
obj-$(CONFIG_SPARC64_SMP) += cpumap.o
-obj-y += dma.o
-
obj-$(CONFIG_PCIC_PCI) += pcic.o
obj-$(CONFIG_LEON_PCI) += leon_pci.o
obj-$(CONFIG_SPARC_GRPCI2)+= leon_pci_grpci2.o
diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c
deleted file mode 100644
index f73e7597c971..000000000000
--- a/arch/sparc/kernel/dma.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-#include <linux/dma-debug.h>
-
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 15)
-
-static int __init dma_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
- return 0;
-}
-fs_initcall(dma_init);
diff --git a/lib/iommu-common.c b/arch/sparc/kernel/iommu-common.c
index 55b00de106b5..59cb16691322 100644
--- a/lib/iommu-common.c
+++ b/arch/sparc/kernel/iommu-common.c
@@ -8,9 +8,9 @@
#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/iommu-helper.h>
-#include <linux/iommu-common.h>
#include <linux/dma-mapping.h>
#include <linux/hash.h>
+#include <asm/iommu-common.h>
static unsigned long iommu_large_alloc = 15;
@@ -93,7 +93,6 @@ void iommu_tbl_pool_init(struct iommu_map_table *iommu,
p->hint = p->start;
p->end = num_entries;
}
-EXPORT_SYMBOL(iommu_tbl_pool_init);
unsigned long iommu_tbl_range_alloc(struct device *dev,
struct iommu_map_table *iommu,
@@ -224,7 +223,6 @@ bail:
return n;
}
-EXPORT_SYMBOL(iommu_tbl_range_alloc);
static struct iommu_pool *get_pool(struct iommu_map_table *tbl,
unsigned long entry)
@@ -264,4 +262,3 @@ void iommu_tbl_range_free(struct iommu_map_table *iommu, u64 dma_addr,
bitmap_clear(iommu->map, entry, npages);
spin_unlock_irqrestore(&(pool->lock), flags);
}
-EXPORT_SYMBOL(iommu_tbl_range_free);
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index b08dc3416f06..40d008b0bd3e 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -14,7 +14,7 @@
#include <linux/errno.h>
#include <linux/iommu-helper.h>
#include <linux/bitmap.h>
-#include <linux/iommu-common.h>
+#include <asm/iommu-common.h>
#ifdef CONFIG_PCI
#include <linux/pci.h>
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c
index 86b625f9d8dc..c0fa3ef6cf01 100644
--- a/arch/sparc/kernel/ldc.c
+++ b/arch/sparc/kernel/ldc.c
@@ -16,7 +16,7 @@
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bitmap.h>
-#include <linux/iommu-common.h>
+#include <asm/iommu-common.h>
#include <asm/hypervisor.h>
#include <asm/iommu.h>
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 249367228c33..565d9ac883d0 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -16,7 +16,7 @@
#include <linux/export.h>
#include <linux/log2.h>
#include <linux/of_device.h>
-#include <linux/iommu-common.h>
+#include <asm/iommu-common.h>
#include <asm/iommu.h>
#include <asm/irq.h>
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 454a8af28f13..6c086086ca8f 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -518,14 +518,7 @@ void synchronize_user_stack(void)
static void stack_unaligned(unsigned long sp)
{
- siginfo_t info;
-
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRALN;
- info.si_addr = (void __user *) sp;
- info.si_trapno = 0;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) sp, 0, current);
}
void fault_in_user_windows(void)
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index e8c3cb6b6d08..7f3d9c59719a 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -147,17 +147,11 @@ SYSCALL_DEFINE0(nis_syscall)
asmlinkage void
sparc_breakpoint (struct pt_regs *regs)
{
- siginfo_t info;
#ifdef DEBUG_SPARC_BREAKPOINT
printk ("TRAP: Entering kernel PC=%x, nPC=%x\n", regs->pc, regs->npc);
#endif
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_BRKPT;
- info.si_addr = (void __user *)regs->pc;
- info.si_trapno = 0;
- force_sig_info(SIGTRAP, &info, current);
+ force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0, current);
#ifdef DEBUG_SPARC_BREAKPOINT
printk ("TRAP: Returning to space: PC=%x nPC=%x\n", regs->pc, regs->npc);
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 9ef8de63f28b..7e49bbc925a5 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -502,7 +502,6 @@ SYSCALL_DEFINE0(nis_syscall)
asmlinkage void sparc_breakpoint(struct pt_regs *regs)
{
enum ctx_state prev_state = exception_enter();
- siginfo_t info;
if (test_thread_flag(TIF_32BIT)) {
regs->tpc &= 0xffffffff;
@@ -511,12 +510,7 @@ asmlinkage void sparc_breakpoint(struct pt_regs *regs)
#ifdef DEBUG_SPARC_BREAKPOINT
printk ("TRAP: Entering kernel PC=%lx, nPC=%lx\n", regs->tpc, regs->tnpc);
#endif
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_BRKPT;
- info.si_addr = (void __user *)regs->tpc;
- info.si_trapno = 0;
- force_sig_info(SIGTRAP, &info, current);
+ force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->tpc, 0, current);
#ifdef DEBUG_SPARC_BREAKPOINT
printk ("TRAP: Returning to space: PC=%lx nPC=%lx\n", regs->tpc, regs->tnpc);
#endif
diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c
index b1ed763e4787..bcdfc6168dd5 100644
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@@ -93,8 +93,6 @@ void __noreturn die_if_kernel(char *str, struct pt_regs *regs)
void do_hw_interrupt(struct pt_regs *regs, unsigned long type)
{
- siginfo_t info;
-
if(type < 0x80) {
/* Sun OS's puke from bad traps, Linux survives! */
printk("Unimplemented Sparc TRAP, type = %02lx\n", type);
@@ -104,19 +102,13 @@ void do_hw_interrupt(struct pt_regs *regs, unsigned long type)
if(regs->psr & PSR_PS)
die_if_kernel("Kernel bad trap", regs);
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_ILLTRP;
- info.si_addr = (void __user *)regs->pc;
- info.si_trapno = type - 0x80;
- force_sig_info(SIGILL, &info, current);
+ force_sig_fault(SIGILL, ILL_ILLTRP,
+ (void __user *)regs->pc, type - 0x80, current);
}
void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc,
unsigned long psr)
{
- siginfo_t info;
-
if(psr & PSR_PS)
die_if_kernel("Kernel illegal instruction", regs);
#ifdef TRAP_DEBUG
@@ -124,27 +116,15 @@ void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned lon
regs->pc, *(unsigned long *)regs->pc);
#endif
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_ILLOPC;
- info.si_addr = (void __user *)pc;
- info.si_trapno = 0;
- send_sig_info(SIGILL, &info, current);
+ send_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0, current);
}
void do_priv_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc,
unsigned long psr)
{
- siginfo_t info;
-
if(psr & PSR_PS)
die_if_kernel("Penguin instruction from Penguin mode??!?!", regs);
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_PRVOPC;
- info.si_addr = (void __user *)pc;
- info.si_trapno = 0;
- send_sig_info(SIGILL, &info, current);
+ send_sig_fault(SIGILL, ILL_PRVOPC, (void __user *)pc, 0, current);
}
/* XXX User may want to be allowed to do this. XXX */
@@ -152,8 +132,6 @@ void do_priv_instruction(struct pt_regs *regs, unsigned long pc, unsigned long n
void do_memaccess_unaligned(struct pt_regs *regs, unsigned long pc, unsigned long npc,
unsigned long psr)
{
- siginfo_t info;
-
if(regs->psr & PSR_PS) {
printk("KERNEL MNA at pc %08lx npc %08lx called by %08lx\n", pc, npc,
regs->u_regs[UREG_RETPC]);
@@ -165,12 +143,9 @@ void do_memaccess_unaligned(struct pt_regs *regs, unsigned long pc, unsigned lon
instruction_dump ((unsigned long *) regs->pc);
printk ("do_MNA!\n");
#endif
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRALN;
- info.si_addr = /* FIXME: Should dig out mna address */ (void *)0;
- info.si_trapno = 0;
- send_sig_info(SIGBUS, &info, current);
+ send_sig_fault(SIGBUS, BUS_ADRALN,
+ /* FIXME: Should dig out mna address */ (void *)0,
+ 0, current);
}
static unsigned long init_fsr = 0x0UL;
@@ -226,9 +201,9 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc,
unsigned long psr)
{
static int calls;
- siginfo_t info;
unsigned long fsr;
int ret = 0;
+ int code;
#ifndef CONFIG_SMP
struct task_struct *fpt = last_task_used_math;
#else
@@ -303,24 +278,20 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc,
}
fsr = fpt->thread.fsr;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_addr = (void __user *)pc;
- info.si_trapno = 0;
- info.si_code = FPE_FIXME;
+ code = FPE_FLTUNK;
if ((fsr & 0x1c000) == (1 << 14)) {
if (fsr & 0x10)
- info.si_code = FPE_FLTINV;
+ code = FPE_FLTINV;
else if (fsr & 0x08)
- info.si_code = FPE_FLTOVF;
+ code = FPE_FLTOVF;
else if (fsr & 0x04)
- info.si_code = FPE_FLTUND;
+ code = FPE_FLTUND;
else if (fsr & 0x02)
- info.si_code = FPE_FLTDIV;
+ code = FPE_FLTDIV;
else if (fsr & 0x01)
- info.si_code = FPE_FLTRES;
+ code = FPE_FLTRES;
}
- send_sig_info(SIGFPE, &info, fpt);
+ send_sig_fault(SIGFPE, code, (void __user *)pc, 0, fpt);
#ifndef CONFIG_SMP
last_task_used_math = NULL;
#endif
@@ -332,16 +303,9 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc,
void handle_tag_overflow(struct pt_regs *regs, unsigned long pc, unsigned long npc,
unsigned long psr)
{
- siginfo_t info;
-
if(psr & PSR_PS)
die_if_kernel("Penguin overflow trap from kernel mode", regs);
- info.si_signo = SIGEMT;
- info.si_errno = 0;
- info.si_code = EMT_TAGOVF;
- info.si_addr = (void __user *)pc;
- info.si_trapno = 0;
- send_sig_info(SIGEMT, &info, current);
+ send_sig_fault(SIGEMT, EMT_TAGOVF, (void __user *)pc, 0, current);
}
void handle_watchpoint(struct pt_regs *regs, unsigned long pc, unsigned long npc,
@@ -359,61 +323,33 @@ void handle_watchpoint(struct pt_regs *regs, unsigned long pc, unsigned long npc
void handle_reg_access(struct pt_regs *regs, unsigned long pc, unsigned long npc,
unsigned long psr)
{
- siginfo_t info;
-
#ifdef TRAP_DEBUG
printk("Register Access Exception at PC %08lx NPC %08lx PSR %08lx\n",
pc, npc, psr);
#endif
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_OBJERR;
- info.si_addr = (void __user *)pc;
- info.si_trapno = 0;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)pc, 0, current);
}
void handle_cp_disabled(struct pt_regs *regs, unsigned long pc, unsigned long npc,
unsigned long psr)
{
- siginfo_t info;
-
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_COPROC;
- info.si_addr = (void __user *)pc;
- info.si_trapno = 0;
- send_sig_info(SIGILL, &info, current);
+ send_sig_fault(SIGILL, ILL_COPROC, (void __user *)pc, 0, current);
}
void handle_cp_exception(struct pt_regs *regs, unsigned long pc, unsigned long npc,
unsigned long psr)
{
- siginfo_t info;
-
#ifdef TRAP_DEBUG
printk("Co-Processor Exception at PC %08lx NPC %08lx PSR %08lx\n",
pc, npc, psr);
#endif
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_COPROC;
- info.si_addr = (void __user *)pc;
- info.si_trapno = 0;
- send_sig_info(SIGILL, &info, current);
+ send_sig_fault(SIGILL, ILL_COPROC, (void __user *)pc, 0, current);
}
void handle_hw_divzero(struct pt_regs *regs, unsigned long pc, unsigned long npc,
unsigned long psr)
{
- siginfo_t info;
-
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = FPE_INTDIV;
- info.si_addr = (void __user *)pc;
- info.si_trapno = 0;
- send_sig_info(SIGFPE, &info, current);
+ send_sig_fault(SIGFPE, FPE_INTDIV, (void __user *)pc, 0, current);
}
#ifdef CONFIG_DEBUG_BUGVERBOSE
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 462a21abd105..aa624ed79db1 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -87,7 +87,6 @@ static void dump_tl1_traplog(struct tl1_traplog *p)
void bad_trap(struct pt_regs *regs, long lvl)
{
char buffer[36];
- siginfo_t info;
if (notify_die(DIE_TRAP, "bad trap", regs,
0, lvl, SIGTRAP) == NOTIFY_STOP)
@@ -107,12 +106,8 @@ void bad_trap(struct pt_regs *regs, long lvl)
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_ILLTRP;
- info.si_addr = (void __user *)regs->tpc;
- info.si_trapno = lvl;
- force_sig_info(SIGILL, &info, current);
+ force_sig_fault(SIGILL, ILL_ILLTRP,
+ (void __user *)regs->tpc, lvl, current);
}
void bad_trap_tl1(struct pt_regs *regs, long lvl)
@@ -191,7 +186,6 @@ EXPORT_SYMBOL_GPL(unregister_dimm_printer);
void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
{
enum ctx_state prev_state = exception_enter();
- siginfo_t info;
if (notify_die(DIE_TRAP, "instruction access exception", regs,
0, 0x8, SIGTRAP) == NOTIFY_STOP)
@@ -206,12 +200,8 @@ void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, un
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = SEGV_MAPERR;
- info.si_addr = (void __user *)regs->tpc;
- info.si_trapno = 0;
- force_sig_info(SIGSEGV, &info, current);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR,
+ (void __user *)regs->tpc, 0, current);
out:
exception_exit(prev_state);
}
@@ -230,7 +220,6 @@ void sun4v_insn_access_exception(struct pt_regs *regs, unsigned long addr, unsig
{
unsigned short type = (type_ctx >> 16);
unsigned short ctx = (type_ctx & 0xffff);
- siginfo_t info;
if (notify_die(DIE_TRAP, "instruction access exception", regs,
0, 0x8, SIGTRAP) == NOTIFY_STOP)
@@ -247,12 +236,7 @@ void sun4v_insn_access_exception(struct pt_regs *regs, unsigned long addr, unsig
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = SEGV_MAPERR;
- info.si_addr = (void __user *) addr;
- info.si_trapno = 0;
- force_sig_info(SIGSEGV, &info, current);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *) addr, 0, current);
}
void sun4v_insn_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
@@ -307,7 +291,6 @@ bool is_no_fault_exception(struct pt_regs *regs)
void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
{
enum ctx_state prev_state = exception_enter();
- siginfo_t info;
if (notify_die(DIE_TRAP, "data access exception", regs,
0, 0x30, SIGTRAP) == NOTIFY_STOP)
@@ -338,12 +321,7 @@ void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, un
if (is_no_fault_exception(regs))
return;
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = SEGV_MAPERR;
- info.si_addr = (void __user *)sfar;
- info.si_trapno = 0;
- force_sig_info(SIGSEGV, &info, current);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)sfar, 0, current);
out:
exception_exit(prev_state);
}
@@ -559,8 +537,6 @@ static void spitfire_cee_log(unsigned long afsr, unsigned long afar, unsigned lo
static void spitfire_ue_log(unsigned long afsr, unsigned long afar, unsigned long udbh, unsigned long udbl, unsigned long tt, int tl1, struct pt_regs *regs)
{
- siginfo_t info;
-
printk(KERN_WARNING "CPU[%d]: Uncorrectable Error AFSR[%lx] "
"AFAR[%lx] UDBL[%lx] UDBH[%ld] TT[%lx] TL>1[%d]\n",
smp_processor_id(), afsr, afar, udbl, udbh, tt, tl1);
@@ -595,12 +571,7 @@ static void spitfire_ue_log(unsigned long afsr, unsigned long afar, unsigned lon
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_OBJERR;
- info.si_addr = (void *)0;
- info.si_trapno = 0;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void *)0, 0, current);
}
void spitfire_access_error(struct pt_regs *regs, unsigned long status_encoded, unsigned long afar)
@@ -2190,7 +2161,6 @@ bool sun4v_nonresum_error_user_handled(struct pt_regs *regs,
if (attrs & SUN4V_ERR_ATTRS_MEMORY) {
unsigned long addr = ent->err_raddr;
- siginfo_t info;
if (addr == ~(u64)0) {
/* This seems highly unlikely to ever occur */
@@ -2211,21 +2181,13 @@ bool sun4v_nonresum_error_user_handled(struct pt_regs *regs,
addr += PAGE_SIZE;
}
}
- info.si_signo = SIGKILL;
- info.si_errno = 0;
- info.si_trapno = 0;
- force_sig_info(info.si_signo, &info, current);
+ force_sig(SIGKILL, current);
return true;
}
if (attrs & SUN4V_ERR_ATTRS_PIO) {
- siginfo_t info;
-
- info.si_signo = SIGBUS;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)sun4v_get_vaddr(regs);
- force_sig_info(info.si_signo, &info, current);
-
+ force_sig_fault(SIGBUS, BUS_ADRERR,
+ (void __user *)sun4v_get_vaddr(regs), 0, current);
return true;
}
@@ -2362,30 +2324,27 @@ static void do_fpe_common(struct pt_regs *regs)
regs->tnpc += 4;
} else {
unsigned long fsr = current_thread_info()->xfsr[0];
- siginfo_t info;
+ int code;
if (test_thread_flag(TIF_32BIT)) {
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_addr = (void __user *)regs->tpc;
- info.si_trapno = 0;
- info.si_code = FPE_FIXME;
+ code = FPE_FLTUNK;
if ((fsr & 0x1c000) == (1 << 14)) {
if (fsr & 0x10)
- info.si_code = FPE_FLTINV;
+ code = FPE_FLTINV;
else if (fsr & 0x08)
- info.si_code = FPE_FLTOVF;
+ code = FPE_FLTOVF;
else if (fsr & 0x04)
- info.si_code = FPE_FLTUND;
+ code = FPE_FLTUND;
else if (fsr & 0x02)
- info.si_code = FPE_FLTDIV;
+ code = FPE_FLTDIV;
else if (fsr & 0x01)
- info.si_code = FPE_FLTRES;
+ code = FPE_FLTRES;
}
- force_sig_info(SIGFPE, &info, current);
+ force_sig_fault(SIGFPE, code,
+ (void __user *)regs->tpc, 0, current);
}
}
@@ -2428,7 +2387,6 @@ out:
void do_tof(struct pt_regs *regs)
{
enum ctx_state prev_state = exception_enter();
- siginfo_t info;
if (notify_die(DIE_TRAP, "tagged arithmetic overflow", regs,
0, 0x26, SIGEMT) == NOTIFY_STOP)
@@ -2440,12 +2398,8 @@ void do_tof(struct pt_regs *regs)
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}
- info.si_signo = SIGEMT;
- info.si_errno = 0;
- info.si_code = EMT_TAGOVF;
- info.si_addr = (void __user *)regs->tpc;
- info.si_trapno = 0;
- force_sig_info(SIGEMT, &info, current);
+ force_sig_fault(SIGEMT, EMT_TAGOVF,
+ (void __user *)regs->tpc, 0, current);
out:
exception_exit(prev_state);
}
@@ -2453,7 +2407,6 @@ out:
void do_div0(struct pt_regs *regs)
{
enum ctx_state prev_state = exception_enter();
- siginfo_t info;
if (notify_die(DIE_TRAP, "integer division by zero", regs,
0, 0x28, SIGFPE) == NOTIFY_STOP)
@@ -2465,12 +2418,8 @@ void do_div0(struct pt_regs *regs)
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = FPE_INTDIV;
- info.si_addr = (void __user *)regs->tpc;
- info.si_trapno = 0;
- force_sig_info(SIGFPE, &info, current);
+ force_sig_fault(SIGFPE, FPE_INTDIV,
+ (void __user *)regs->tpc, 0, current);
out:
exception_exit(prev_state);
}
@@ -2632,7 +2581,6 @@ void do_illegal_instruction(struct pt_regs *regs)
unsigned long pc = regs->tpc;
unsigned long tstate = regs->tstate;
u32 insn;
- siginfo_t info;
if (notify_die(DIE_TRAP, "illegal instruction", regs,
0, 0x10, SIGILL) == NOTIFY_STOP)
@@ -2666,12 +2614,7 @@ void do_illegal_instruction(struct pt_regs *regs)
}
}
}
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_ILLOPC;
- info.si_addr = (void __user *)pc;
- info.si_trapno = 0;
- force_sig_info(SIGILL, &info, current);
+ force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0, current);
out:
exception_exit(prev_state);
}
@@ -2679,7 +2622,6 @@ out:
void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr)
{
enum ctx_state prev_state = exception_enter();
- siginfo_t info;
if (notify_die(DIE_TRAP, "memory address unaligned", regs,
0, 0x34, SIGSEGV) == NOTIFY_STOP)
@@ -2692,20 +2634,13 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo
if (is_no_fault_exception(regs))
return;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRALN;
- info.si_addr = (void __user *)sfar;
- info.si_trapno = 0;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)sfar, 0, current);
out:
exception_exit(prev_state);
}
void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
{
- siginfo_t info;
-
if (notify_die(DIE_TRAP, "memory address unaligned", regs,
0, 0x34, SIGSEGV) == NOTIFY_STOP)
return;
@@ -2717,12 +2652,7 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c
if (is_no_fault_exception(regs))
return;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRALN;
- info.si_addr = (void __user *) addr;
- info.si_trapno = 0;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) addr, 0, current);
}
/* sun4v_mem_corrupt_detect_precise() - Handle precise exception on an ADI
@@ -2775,7 +2705,6 @@ void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs, unsigned long addr,
void do_privop(struct pt_regs *regs)
{
enum ctx_state prev_state = exception_enter();
- siginfo_t info;
if (notify_die(DIE_TRAP, "privileged operation", regs,
0, 0x11, SIGILL) == NOTIFY_STOP)
@@ -2785,12 +2714,8 @@ void do_privop(struct pt_regs *regs)
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_PRVOPC;
- info.si_addr = (void __user *)regs->tpc;
- info.si_trapno = 0;
- force_sig_info(SIGILL, &info, current);
+ force_sig_fault(SIGILL, ILL_PRVOPC,
+ (void __user *)regs->tpc, 0, current);
out:
exception_exit(prev_state);
}
diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c
index 7642d7e4f0d9..64ac8c0c1429 100644
--- a/arch/sparc/kernel/unaligned_32.c
+++ b/arch/sparc/kernel/unaligned_32.c
@@ -311,14 +311,9 @@ static inline int ok_for_user(struct pt_regs *regs, unsigned int insn,
static void user_mna_trap_fault(struct pt_regs *regs, unsigned int insn)
{
- siginfo_t info;
-
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRALN;
- info.si_addr = (void __user *)safe_compute_effective_address(regs, insn);
- info.si_trapno = 0;
- send_sig_info(SIGBUS, &info, current);
+ send_sig_fault(SIGBUS, BUS_ADRALN,
+ (void __user *)safe_compute_effective_address(regs, insn),
+ 0, current);
}
asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn)
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index a8103a84b4ac..9f75b6444bf1 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -127,19 +127,11 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
static void __do_fault_siginfo(int code, int sig, struct pt_regs *regs,
unsigned long addr)
{
- siginfo_t info;
-
- info.si_signo = sig;
- info.si_code = code;
- info.si_errno = 0;
- info.si_addr = (void __user *) addr;
- info.si_trapno = 0;
-
if (unlikely(show_unhandled_signals))
- show_signal_msg(regs, sig, info.si_code,
+ show_signal_msg(regs, sig, code,
addr, current);
- force_sig_info (sig, &info, current);
+ force_sig_fault(sig, code, (void __user *) addr, 0, current);
}
static unsigned long compute_si_addr(struct pt_regs *regs, int text_fault)
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 41363f46797b..63166fcf9e25 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -170,11 +170,7 @@ static void do_fault_siginfo(int code, int sig, struct pt_regs *regs,
int fault_code)
{
unsigned long addr;
- siginfo_t info;
- info.si_code = code;
- info.si_signo = sig;
- info.si_errno = 0;
if (fault_code & FAULT_CODE_ITLB) {
addr = regs->tpc;
} else {
@@ -187,13 +183,11 @@ static void do_fault_siginfo(int code, int sig, struct pt_regs *regs,
else
addr = fault_addr;
}
- info.si_addr = (void __user *) addr;
- info.si_trapno = 0;
if (unlikely(show_unhandled_signals))
show_signal_msg(regs, sig, code, addr, current);
- force_sig_info(sig, &info, current);
+ force_sig_fault(sig, code, (void __user *) addr, 0, current);
}
static unsigned int get_fault_insn(struct pt_regs *regs, unsigned int insn)
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index bb5a196c3061..b10dde6cb793 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -1,6 +1,7 @@
generic-y += barrier.h
generic-y += bpf_perf_event.h
generic-y += bug.h
+generic-y += compat.h
generic-y += current.h
generic-y += delay.h
generic-y += device.h
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index bc2a516c190f..1a1d88a4d940 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -115,17 +115,10 @@ long arch_ptrace(struct task_struct *child, long request,
static void send_sigtrap(struct task_struct *tsk, struct uml_pt_regs *regs,
int error_code)
{
- struct siginfo info;
-
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGTRAP;
- info.si_code = TRAP_BRKPT;
-
- /* User-mode eip? */
- info.si_addr = UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL;
-
/* Send us the fake SIGTRAP */
- force_sig_info(SIGTRAP, &info, tsk);
+ force_sig_fault(SIGTRAP, TRAP_BRKPT,
+ /* User-mode eip? */
+ UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL, tsk);
}
/*
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index b2b02df9896e..ec9a42c14c56 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -162,13 +162,9 @@ static void show_segv_info(struct uml_pt_regs *regs)
static void bad_segv(struct faultinfo fi, unsigned long ip)
{
- struct siginfo si;
-
- si.si_signo = SIGSEGV;
- si.si_code = SEGV_ACCERR;
- si.si_addr = (void __user *) FAULT_ADDRESS(fi);
current->thread.arch.faultinfo = fi;
- force_sig_info(SIGSEGV, &si, current);
+ force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi),
+ current);
}
void fatal_sigsegv(void)
@@ -214,8 +210,8 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
struct uml_pt_regs *regs)
{
- struct siginfo si;
jmp_buf *catcher;
+ int si_code;
int err;
int is_write = FAULT_WRITE(fi);
unsigned long address = FAULT_ADDRESS(fi);
@@ -239,7 +235,7 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
if (SEGV_IS_FIXABLE(&fi))
err = handle_page_fault(address, ip, is_write, is_user,
- &si.si_code);
+ &si_code);
else {
err = -EFAULT;
/*
@@ -271,18 +267,14 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
show_segv_info(regs);
if (err == -EACCES) {
- si.si_signo = SIGBUS;
- si.si_errno = 0;
- si.si_code = BUS_ADRERR;
- si.si_addr = (void __user *)address;
current->thread.arch.faultinfo = fi;
- force_sig_info(SIGBUS, &si, current);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address,
+ current);
} else {
BUG_ON(err != -EFAULT);
- si.si_signo = SIGSEGV;
- si.si_addr = (void __user *) address;
current->thread.arch.faultinfo = fi;
- force_sig_info(SIGSEGV, &si, current);
+ force_sig_fault(SIGSEGV, si_code, (void __user *) address,
+ current);
}
out:
@@ -294,9 +286,7 @@ out:
void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
{
- struct faultinfo *fi;
- struct siginfo clean_si;
-
+ int code, err;
if (!UPT_IS_USER(regs)) {
if (sig == SIGBUS)
printk(KERN_ERR "Bus error - the host /dev/shm or /tmp "
@@ -306,29 +296,21 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
arch_examine_signal(sig, regs);
- clear_siginfo(&clean_si);
- clean_si.si_signo = si->si_signo;
- clean_si.si_errno = si->si_errno;
- clean_si.si_code = si->si_code;
- switch (sig) {
- case SIGILL:
- case SIGFPE:
- case SIGSEGV:
- case SIGBUS:
- case SIGTRAP:
- fi = UPT_FAULTINFO(regs);
- clean_si.si_addr = (void __user *) FAULT_ADDRESS(*fi);
+ /* Is the signal layout for the signal known?
+ * Signal data must be scrubbed to prevent information leaks.
+ */
+ code = si->si_code;
+ err = si->si_errno;
+ if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) {
+ struct faultinfo *fi = UPT_FAULTINFO(regs);
current->thread.arch.faultinfo = *fi;
-#ifdef __ARCH_SI_TRAPNO
- clean_si.si_trapno = si->si_trapno;
-#endif
- break;
- default:
- printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d)\n",
- sig, si->si_code);
+ force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi),
+ current);
+ } else {
+ printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
+ sig, code, err);
+ force_sig(sig, current);
}
-
- force_sig_info(sig, &clean_si, current);
}
void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs)
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 462e59a7ae78..03f991e44288 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -19,6 +19,8 @@ config UNICORE32
select ARCH_WANT_FRAME_POINTERS
select GENERIC_IOMAP
select MODULES_USE_ELF_REL
+ select NEED_DMA_MAP_STATE
+ select SWIOTLB
help
UniCore-32 is 32-bit Instruction Set Architecture,
including a series of low-power-consumption RISC chip
@@ -61,9 +63,6 @@ config ARCH_MAY_HAVE_PC_FDC
config ZONE_DMA
def_bool y
-config NEED_DMA_MAP_STATE
- def_bool y
-
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 6f70c76c81fc..bfc7abe77905 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -1,5 +1,6 @@
generic-y += atomic.h
generic-y += bugs.h
+generic-y += compat.h
generic-y += current.h
generic-y += device.h
generic-y += div64.h
diff --git a/arch/unicore32/kernel/fpu-ucf64.c b/arch/unicore32/kernel/fpu-ucf64.c
index 12c8c9527b8e..8594b168f25e 100644
--- a/arch/unicore32/kernel/fpu-ucf64.c
+++ b/arch/unicore32/kernel/fpu-ucf64.c
@@ -52,14 +52,14 @@
* Raise a SIGFPE for the current process.
* sicode describes the signal being raised.
*/
-void ucf64_raise_sigfpe(unsigned int sicode, struct pt_regs *regs)
+void ucf64_raise_sigfpe(struct pt_regs *regs)
{
siginfo_t info;
- memset(&info, 0, sizeof(info));
+ clear_siginfo(&info);
info.si_signo = SIGFPE;
- info.si_code = sicode;
+ info.si_code = FPE_FLTUNK;
info.si_addr = (void __user *)(instruction_pointer(regs) - 4);
/*
@@ -94,7 +94,7 @@ void ucf64_exchandler(u32 inst, u32 fpexc, struct pt_regs *regs)
pr_debug("UniCore-F64 FPSCR 0x%08x INST 0x%08x\n",
cff(FPSCR), inst);
- ucf64_raise_sigfpe(0, regs);
+ ucf64_raise_sigfpe(regs);
return;
}
diff --git a/arch/unicore32/mm/Kconfig b/arch/unicore32/mm/Kconfig
index e9154a59d561..82759b6aba67 100644
--- a/arch/unicore32/mm/Kconfig
+++ b/arch/unicore32/mm/Kconfig
@@ -39,14 +39,3 @@ config CPU_TLB_SINGLE_ENTRY_DISABLE
default y
help
Say Y here to disable the TLB single entry operations.
-
-config SWIOTLB
- def_bool y
- select DMA_DIRECT_OPS
-
-config IOMMU_HELPER
- def_bool SWIOTLB
-
-config NEED_SG_DMA_LENGTH
- def_bool SWIOTLB
-
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index bbefcc46a45e..381473412937 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -125,6 +125,7 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr,
tsk->thread.address = addr;
tsk->thread.error_code = fsr;
tsk->thread.trap_no = 14;
+ clear_siginfo(&si);
si.si_signo = sig;
si.si_errno = 0;
si.si_code = code;
@@ -472,6 +473,7 @@ asmlinkage void do_DataAbort(unsigned long addr, unsigned int fsr,
printk(KERN_ALERT "Unhandled fault: %s (0x%03x) at 0x%08lx\n",
inf->name, fsr, addr);
+ clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
@@ -491,6 +493,7 @@ asmlinkage void do_PrefetchAbort(unsigned long addr,
printk(KERN_ALERT "Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n",
inf->name, ifsr, addr);
+ clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c07f492b871a..1fe24b624d44 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,8 @@ config X86_64
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
+ select NEED_DMA_MAP_STATE
+ select SWIOTLB
select X86_DEV_DMA_OPS
select ARCH_HAS_SYSCALL_WRAPPER
@@ -60,6 +62,7 @@ config X86
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_REFCOUNT
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
+ select ARCH_HAS_UACCESS_MCSAFE if X86_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
@@ -134,7 +137,6 @@ config X86
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
- select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
@@ -184,6 +186,7 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select IRQ_FORCED_THREADING
+ select NEED_SG_DMA_LENGTH
select PCI_LOCKLESS_CONFIG
select PERF_EVENTS
select RTC_LIB
@@ -236,13 +239,6 @@ config ARCH_MMAP_RND_COMPAT_BITS_MAX
config SBUS
bool
-config NEED_DMA_MAP_STATE
- def_bool y
- depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
-
-config NEED_SG_DMA_LENGTH
- def_bool y
-
config GENERIC_ISA_DMA
def_bool y
depends on ISA_DMA_API
@@ -875,6 +871,7 @@ config DMI
config GART_IOMMU
bool "Old AMD GART IOMMU support"
+ select IOMMU_HELPER
select SWIOTLB
depends on X86_64 && PCI && AMD_NB
---help---
@@ -896,6 +893,7 @@ config GART_IOMMU
config CALGARY_IOMMU
bool "IBM Calgary IOMMU support"
+ select IOMMU_HELPER
select SWIOTLB
depends on X86_64 && PCI
---help---
@@ -923,20 +921,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
Calgary anyway, pass 'iommu=calgary' on the kernel command line.
If unsure, say Y.
-# need this always selected by IOMMU for the VIA workaround
-config SWIOTLB
- def_bool y if X86_64
- ---help---
- Support for software bounce buffers used on x86-64 systems
- which don't have a hardware IOMMU. Using this PCI devices
- which can only access 32-bits of memory can be used on systems
- with more than 3 GB of memory.
- If unsure, say Y.
-
-config IOMMU_HELPER
- def_bool y
- depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
-
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL
@@ -1458,6 +1442,7 @@ config HIGHMEM
config X86_PAE
bool "PAE (Physical Address Extension) Support"
depends on X86_32 && !HIGHMEM4G
+ select PHYS_ADDR_T_64BIT
select SWIOTLB
---help---
PAE is required for NX support, and furthermore enables
@@ -1485,14 +1470,6 @@ config X86_5LEVEL
Say N if unsure.
-config ARCH_PHYS_ADDR_T_64BIT
- def_bool y
- depends on X86_64 || X86_PAE
-
-config ARCH_DMA_ADDR_T_64BIT
- def_bool y
- depends on X86_64 || HIGHMEM64G
-
config X86_DIRECT_GBPAGES
def_bool y
depends on X86_64 && !DEBUG_PAGEALLOC
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 0cb325734cfb..af6cda0b7900 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
-#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
+#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL
static unsigned long fs;
static inline void set_fs(unsigned long seg)
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 09f36c0d9d4f..a8a8642d2b0b 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -109,23 +109,34 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
}
static efi_status_t
-__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
+__setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
{
struct pci_setup_rom *rom = NULL;
efi_status_t status;
unsigned long size;
- uint64_t attributes;
+ uint64_t attributes, romsize;
+ void *romimage;
- status = efi_early->call(pci->attributes, pci,
- EfiPciIoAttributeOperationGet, 0, 0,
- &attributes);
+ status = efi_call_proto(efi_pci_io_protocol, attributes, pci,
+ EfiPciIoAttributeOperationGet, 0, 0,
+ &attributes);
if (status != EFI_SUCCESS)
return status;
- if (!pci->romimage || !pci->romsize)
+ /*
+ * Some firmware images contain EFI function pointers at the place where the
+ * romimage and romsize fields are supposed to be. Typically the EFI
+ * code is mapped at high addresses, translating to an unrealistically
+ * large romsize. The UEFI spec limits the size of option ROMs to 16
+ * MiB so we reject any ROMs over 16 MiB in size to catch this.
+ */
+ romimage = (void *)(unsigned long)efi_table_attr(efi_pci_io_protocol,
+ romimage, pci);
+ romsize = efi_table_attr(efi_pci_io_protocol, romsize, pci);
+ if (!romimage || !romsize || romsize > SZ_16M)
return EFI_INVALID_PARAMETER;
- size = pci->romsize + sizeof(*rom);
+ size = romsize + sizeof(*rom);
status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
if (status != EFI_SUCCESS) {
@@ -141,30 +152,32 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
rom->pcilen = pci->romsize;
*__rom = rom;
- status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
- PCI_VENDOR_ID, 1, &(rom->vendor));
+ status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
+ EfiPciIoWidthUint16, PCI_VENDOR_ID, 1,
+ &rom->vendor);
if (status != EFI_SUCCESS) {
efi_printk(sys_table, "Failed to read rom->vendor\n");
goto free_struct;
}
- status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
- PCI_DEVICE_ID, 1, &(rom->devid));
+ status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
+ EfiPciIoWidthUint16, PCI_DEVICE_ID, 1,
+ &rom->devid);
if (status != EFI_SUCCESS) {
efi_printk(sys_table, "Failed to read rom->devid\n");
goto free_struct;
}
- status = efi_early->call(pci->get_location, pci, &(rom->segment),
- &(rom->bus), &(rom->device), &(rom->function));
+ status = efi_call_proto(efi_pci_io_protocol, get_location, pci,
+ &rom->segment, &rom->bus, &rom->device,
+ &rom->function);
if (status != EFI_SUCCESS)
goto free_struct;
- memcpy(rom->romdata, (void *)(unsigned long)pci->romimage,
- pci->romsize);
+ memcpy(rom->romdata, romimage, romsize);
return status;
free_struct:
@@ -176,7 +189,7 @@ static void
setup_efi_pci32(struct boot_params *params, void **pci_handle,
unsigned long size)
{
- efi_pci_io_protocol_32 *pci = NULL;
+ efi_pci_io_protocol_t *pci = NULL;
efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
u32 *handles = (u32 *)(unsigned long)pci_handle;
efi_status_t status;
@@ -203,7 +216,7 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle,
if (!pci)
continue;
- status = __setup_efi_pci32(pci, &rom);
+ status = __setup_efi_pci(pci, &rom);
if (status != EFI_SUCCESS)
continue;
@@ -217,74 +230,11 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle,
}
}
-static efi_status_t
-__setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom)
-{
- struct pci_setup_rom *rom;
- efi_status_t status;
- unsigned long size;
- uint64_t attributes;
-
- status = efi_early->call(pci->attributes, pci,
- EfiPciIoAttributeOperationGet, 0,
- &attributes);
- if (status != EFI_SUCCESS)
- return status;
-
- if (!pci->romimage || !pci->romsize)
- return EFI_INVALID_PARAMETER;
-
- size = pci->romsize + sizeof(*rom);
-
- status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
- if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to alloc mem for rom\n");
- return status;
- }
-
- rom->data.type = SETUP_PCI;
- rom->data.len = size - sizeof(struct setup_data);
- rom->data.next = 0;
- rom->pcilen = pci->romsize;
- *__rom = rom;
-
- status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
- PCI_VENDOR_ID, 1, &(rom->vendor));
-
- if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to read rom->vendor\n");
- goto free_struct;
- }
-
- status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
- PCI_DEVICE_ID, 1, &(rom->devid));
-
- if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to read rom->devid\n");
- goto free_struct;
- }
-
- status = efi_early->call(pci->get_location, pci, &(rom->segment),
- &(rom->bus), &(rom->device), &(rom->function));
-
- if (status != EFI_SUCCESS)
- goto free_struct;
-
- memcpy(rom->romdata, (void *)(unsigned long)pci->romimage,
- pci->romsize);
- return status;
-
-free_struct:
- efi_call_early(free_pool, rom);
- return status;
-
-}
-
static void
setup_efi_pci64(struct boot_params *params, void **pci_handle,
unsigned long size)
{
- efi_pci_io_protocol_64 *pci = NULL;
+ efi_pci_io_protocol_t *pci = NULL;
efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
u64 *handles = (u64 *)(unsigned long)pci_handle;
efi_status_t status;
@@ -311,7 +261,7 @@ setup_efi_pci64(struct boot_params *params, void **pci_handle,
if (!pci)
continue;
- status = __setup_efi_pci64(pci, &rom);
+ status = __setup_efi_pci(pci, &rom);
if (status != EFI_SUCCESS)
continue;
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 8169e8b7a4dc..64037895b085 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -365,6 +365,7 @@ ENTRY(startup_64)
* this function call.
*/
pushq %rsi
+ movq %rsi, %rdi /* real mode address */
call paging_prepare
popq %rsi
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index a0a50b91ecef..b87a7582853d 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -47,7 +47,7 @@
#include <linux/decompress/mm.h>
#ifdef CONFIG_X86_5LEVEL
-unsigned int pgtable_l5_enabled __ro_after_init;
+unsigned int __pgtable_l5_enabled;
unsigned int pgdir_shift __ro_after_init = 39;
unsigned int ptrs_per_p4d __ro_after_init = 1;
#endif
@@ -734,7 +734,7 @@ void choose_random_location(unsigned long input,
#ifdef CONFIG_X86_5LEVEL
if (__read_cr4() & X86_CR4_LA57) {
- pgtable_l5_enabled = 1;
+ __pgtable_l5_enabled = 1;
pgdir_shift = 48;
ptrs_per_p4d = 512;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 9e11be4cae19..a423bdb42686 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -12,10 +12,8 @@
#undef CONFIG_PARAVIRT_SPINLOCKS
#undef CONFIG_KASAN
-#ifdef CONFIG_X86_5LEVEL
-/* cpu_feature_enabled() cannot be used that early */
-#define pgtable_l5_enabled __pgtable_l5_enabled
-#endif
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
#include <linux/linkage.h>
#include <linux/screen_info.h>
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index a362fa0b849c..8c5107545251 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -31,16 +31,23 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
*/
unsigned long *trampoline_32bit __section(.data);
-struct paging_config paging_prepare(void)
+extern struct boot_params *boot_params;
+int cmdline_find_option_bool(const char *option);
+
+struct paging_config paging_prepare(void *rmode)
{
struct paging_config paging_config = {};
unsigned long bios_start, ebda_start;
+ /* Initialize boot_params. Required for cmdline_find_option_bool(). */
+ boot_params = rmode;
+
/*
* Check if LA57 is desired and supported.
*
- * There are two parts to the check:
+ * There are several parts to the check:
* - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
+ * - if user asked to disable 5-level paging: no5lvl in cmdline
* - if the machine supports 5-level paging:
* + CPUID leaf 7 is supported
* + the leaf has the feature bit set
@@ -48,6 +55,7 @@ struct paging_config paging_prepare(void)
* That's substitute for boot_cpu_has() in early boot code.
*/
if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
+ !cmdline_find_option_bool("no5lvl") &&
native_cpuid_eax(0) >= 7 &&
(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
paging_config.l5_required = 1;
@@ -130,7 +138,7 @@ void cleanup_trampoline(void *pgtable)
{
void *trampoline_pgtable;
- trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET;
+ trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long);
/*
* Move the top level page table out of trampoline memory,
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index d6b27dab1b30..14a2f996e543 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -396,3 +396,4 @@
382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free
383 i386 statx sys_statx __ia32_sys_statx
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
+385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 4dfe42666d0c..cd36232ab62f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -341,6 +341,7 @@
330 common pkey_alloc __x64_sys_pkey_alloc
331 common pkey_free __x64_sys_pkey_free
332 common statx __x64_sys_statx
+333 common io_pgetevents __x64_sys_io_pgetevents
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index d998a487c9b1..261802b1cc50 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -44,14 +44,14 @@ obj-y += $(vdso_img_objs)
targets += $(vdso_img_cfiles)
targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so)
-export CPPFLAGS_vdso.lds += -P -C
+CPPFLAGS_vdso.lds += -P -C
VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
-Wl,--no-undefined \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \
$(DISABLE_LTO)
-$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
+$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso)
HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
@@ -100,11 +100,8 @@ VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
-Wl,-z,max-page-size=4096 \
-Wl,-z,common-page-size=4096
-# 64-bit objects to re-brand as x32
-vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y))
-
# x32-rebranded versions
-vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o)
+vobjx32s-y := $(vobjs-y:.o=-x32.o)
# same thing, but in the output directory
vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
@@ -122,7 +119,7 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg
$(call if_changed,objcopy)
-$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
+$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 70b7845434cb..7782cdbcd67d 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -107,7 +107,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
thread->cr2 = ptr;
thread->trap_nr = X86_TRAP_PF;
- memset(&info, 0, sizeof(info));
+ clear_siginfo(&info);
info.si_signo = SIGSEGV;
info.si_errno = 0;
info.si_code = SEGV_MAPERR;
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 786fd875de92..4b98101209a1 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -889,7 +889,7 @@ static void force_ibs_eilvt_setup(void)
if (!ibs_eilvt_valid())
goto out;
- pr_info("IBS: LVT offset %d assigned\n", offset);
+ pr_info("LVT offset %d assigned\n", offset);
return;
out:
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index f5cbbba99283..981ba5e8241b 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -19,6 +19,7 @@
#include <asm/cpufeature.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
+#include <asm/smp.h>
#define NUM_COUNTERS_NB 4
#define NUM_COUNTERS_L2 4
@@ -399,26 +400,8 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
}
if (amd_uncore_llc) {
- unsigned int apicid = cpu_data(cpu).apicid;
- unsigned int nshared, subleaf, prev_eax = 0;
-
uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
- /*
- * Iterate over Cache Topology Definition leaves until no
- * more cache descriptions are available.
- */
- for (subleaf = 0; subleaf < 5; subleaf++) {
- cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx);
-
- /* EAX[0:4] gives type of cache */
- if (!(eax & 0x1f))
- break;
-
- prev_eax = eax;
- }
- nshared = ((prev_eax >> 14) & 0xfff) + 1;
-
- uncore->id = apicid - (apicid % nshared);
+ uncore->id = per_cpu(cpu_llc_id, cpu);
uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
*per_cpu_ptr(amd_uncore_llc, cpu) = uncore;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 45b2b1c93d04..6e461fb1e0d4 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2397,7 +2397,7 @@ static unsigned long get_segment_base(unsigned int segment)
#ifdef CONFIG_IA32_EMULATION
-#include <asm/compat.h>
+#include <linux/compat.h>
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 3b993942a0e4..8d016ce5b80d 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1194,7 +1194,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
filter->action == PERF_ADDR_FILTER_ACTION_START)
return -EOPNOTSUPP;
- if (!filter->inode) {
+ if (!filter->path.dentry) {
if (!valid_kernel_ip(filter->offset))
return -EINVAL;
@@ -1221,7 +1221,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
return;
list_for_each_entry(filter, &head->list, entry) {
- if (filter->inode && !offs[range]) {
+ if (filter->path.dentry && !offs[range]) {
msr_a = msr_b = 0;
} else {
/* apply the offset */
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index a7956fc7ca1d..15b07379e72d 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -203,7 +203,7 @@ static void uncore_assign_hw_event(struct intel_uncore_box *box,
hwc->idx = idx;
hwc->last_tag = ++box->tags[idx];
- if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
+ if (uncore_pmc_fixed(hwc->idx)) {
hwc->event_base = uncore_fixed_ctr(box);
hwc->config_base = uncore_fixed_ctl(box);
return;
@@ -218,7 +218,9 @@ void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *e
u64 prev_count, new_count, delta;
int shift;
- if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
+ if (uncore_pmc_freerunning(event->hw.idx))
+ shift = 64 - uncore_freerunning_bits(box, event);
+ else if (uncore_pmc_fixed(event->hw.idx))
shift = 64 - uncore_fixed_ctr_bits(box);
else
shift = 64 - uncore_perf_ctr_bits(box);
@@ -449,15 +451,30 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
return ret ? -EINVAL : 0;
}
-static void uncore_pmu_event_start(struct perf_event *event, int flags)
+void uncore_pmu_event_start(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
int idx = event->hw.idx;
- if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
return;
- if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
+ /*
+ * Free running counter is read-only and always active.
+ * Use the current counter value as start point.
+ * There is no overflow interrupt for free running counter.
+ * Use hrtimer to periodically poll the counter to avoid overflow.
+ */
+ if (uncore_pmc_freerunning(event->hw.idx)) {
+ list_add_tail(&event->active_entry, &box->active_list);
+ local64_set(&event->hw.prev_count,
+ uncore_read_counter(box, event));
+ if (box->n_active++ == 0)
+ uncore_pmu_start_hrtimer(box);
+ return;
+ }
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
return;
event->hw.state = 0;
@@ -474,11 +491,20 @@ static void uncore_pmu_event_start(struct perf_event *event, int flags)
}
}
-static void uncore_pmu_event_stop(struct perf_event *event, int flags)
+void uncore_pmu_event_stop(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
struct hw_perf_event *hwc = &event->hw;
+ /* Cannot disable free running counter which is read-only */
+ if (uncore_pmc_freerunning(hwc->idx)) {
+ list_del(&event->active_entry);
+ if (--box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ uncore_perf_event_update(box, event);
+ return;
+ }
+
if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
uncore_disable_event(box, event);
box->n_active--;
@@ -502,7 +528,7 @@ static void uncore_pmu_event_stop(struct perf_event *event, int flags)
}
}
-static int uncore_pmu_event_add(struct perf_event *event, int flags)
+int uncore_pmu_event_add(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
struct hw_perf_event *hwc = &event->hw;
@@ -512,6 +538,17 @@ static int uncore_pmu_event_add(struct perf_event *event, int flags)
if (!box)
return -ENODEV;
+ /*
+ * The free funning counter is assigned in event_init().
+ * The free running counter event and free running counter
+ * are 1:1 mapped. It doesn't need to be tracked in event_list.
+ */
+ if (uncore_pmc_freerunning(hwc->idx)) {
+ if (flags & PERF_EF_START)
+ uncore_pmu_event_start(event, 0);
+ return 0;
+ }
+
ret = n = uncore_collect_events(box, event, false);
if (ret < 0)
return ret;
@@ -563,13 +600,21 @@ static int uncore_pmu_event_add(struct perf_event *event, int flags)
return 0;
}
-static void uncore_pmu_event_del(struct perf_event *event, int flags)
+void uncore_pmu_event_del(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
int i;
uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+ /*
+ * The event for free running counter is not tracked by event_list.
+ * It doesn't need to force event->hw.idx = -1 to reassign the counter.
+ * Because the event and the free running counter are 1:1 mapped.
+ */
+ if (uncore_pmc_freerunning(event->hw.idx))
+ return;
+
for (i = 0; i < box->n_events; i++) {
if (event == box->event_list[i]) {
uncore_put_event_constraint(box, event);
@@ -603,6 +648,10 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu,
struct intel_uncore_box *fake_box;
int ret = -EINVAL, n;
+ /* The free running counter is always active. */
+ if (uncore_pmc_freerunning(event->hw.idx))
+ return 0;
+
fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
if (!fake_box)
return -ENOMEM;
@@ -690,6 +739,17 @@ static int uncore_pmu_event_init(struct perf_event *event)
/* fixed counters have event field hardcoded to zero */
hwc->config = 0ULL;
+ } else if (is_freerunning_event(event)) {
+ if (!check_valid_freerunning_event(box, event))
+ return -EINVAL;
+ event->hw.idx = UNCORE_PMC_IDX_FREERUNNING;
+ /*
+ * The free running counter event and free running counter
+ * are always 1:1 mapped.
+ * The free running counter is always active.
+ * Assign the free running counter here.
+ */
+ event->hw.event_base = uncore_freerunning_counter(box, event);
} else {
hwc->config = event->attr.config &
(pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32));
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 414dc7e7c950..c9e1e0bef3c3 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -12,8 +12,13 @@
#define UNCORE_FIXED_EVENT 0xff
#define UNCORE_PMC_IDX_MAX_GENERIC 8
+#define UNCORE_PMC_IDX_MAX_FIXED 1
+#define UNCORE_PMC_IDX_MAX_FREERUNNING 1
#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
-#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
+#define UNCORE_PMC_IDX_FREERUNNING (UNCORE_PMC_IDX_FIXED + \
+ UNCORE_PMC_IDX_MAX_FIXED)
+#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FREERUNNING + \
+ UNCORE_PMC_IDX_MAX_FREERUNNING)
#define UNCORE_PCI_DEV_FULL_DATA(dev, func, type, idx) \
((dev << 24) | (func << 16) | (type << 8) | idx)
@@ -35,6 +40,7 @@ struct intel_uncore_ops;
struct intel_uncore_pmu;
struct intel_uncore_box;
struct uncore_event_desc;
+struct freerunning_counters;
struct intel_uncore_type {
const char *name;
@@ -42,6 +48,7 @@ struct intel_uncore_type {
int num_boxes;
int perf_ctr_bits;
int fixed_ctr_bits;
+ int num_freerunning_types;
unsigned perf_ctr;
unsigned event_ctl;
unsigned event_mask;
@@ -59,6 +66,7 @@ struct intel_uncore_type {
struct intel_uncore_pmu *pmus;
struct intel_uncore_ops *ops;
struct uncore_event_desc *event_descs;
+ struct freerunning_counters *freerunning;
const struct attribute_group *attr_groups[4];
struct pmu *pmu; /* for custom pmu ops */
};
@@ -129,6 +137,14 @@ struct uncore_event_desc {
const char *config;
};
+struct freerunning_counters {
+ unsigned int counter_base;
+ unsigned int counter_offset;
+ unsigned int box_offset;
+ unsigned int num_counters;
+ unsigned int bits;
+};
+
struct pci2phy_map {
struct list_head list;
int segment;
@@ -157,6 +173,16 @@ static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
+static inline bool uncore_pmc_fixed(int idx)
+{
+ return idx == UNCORE_PMC_IDX_FIXED;
+}
+
+static inline bool uncore_pmc_freerunning(int idx)
+{
+ return idx == UNCORE_PMC_IDX_FREERUNNING;
+}
+
static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
{
return box->pmu->type->box_ctl;
@@ -214,6 +240,60 @@ static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
}
+
+/*
+ * In the uncore document, there is no event-code assigned to free running
+ * counters. Some events need to be defined to indicate the free running
+ * counters. The events are encoded as event-code + umask-code.
+ *
+ * The event-code for all free running counters is 0xff, which is the same as
+ * the fixed counters.
+ *
+ * The umask-code is used to distinguish a fixed counter and a free running
+ * counter, and different types of free running counters.
+ * - For fixed counters, the umask-code is 0x0X.
+ * X indicates the index of the fixed counter, which starts from 0.
+ * - For free running counters, the umask-code uses the rest of the space.
+ * It would bare the format of 0xXY.
+ * X stands for the type of free running counters, which starts from 1.
+ * Y stands for the index of free running counters of same type, which
+ * starts from 0.
+ *
+ * For example, there are three types of IIO free running counters on Skylake
+ * server, IO CLOCKS counters, BANDWIDTH counters and UTILIZATION counters.
+ * The event-code for all the free running counters is 0xff.
+ * 'ioclk' is the first counter of IO CLOCKS. IO CLOCKS is the first type,
+ * which umask-code starts from 0x10.
+ * So 'ioclk' is encoded as event=0xff,umask=0x10
+ * 'bw_in_port2' is the third counter of BANDWIDTH counters. BANDWIDTH is
+ * the second type, which umask-code starts from 0x20.
+ * So 'bw_in_port2' is encoded as event=0xff,umask=0x22
+ */
+static inline unsigned int uncore_freerunning_idx(u64 config)
+{
+ return ((config >> 8) & 0xf);
+}
+
+#define UNCORE_FREERUNNING_UMASK_START 0x10
+
+static inline unsigned int uncore_freerunning_type(u64 config)
+{
+ return ((((config >> 8) - UNCORE_FREERUNNING_UMASK_START) >> 4) & 0xf);
+}
+
+static inline
+unsigned int uncore_freerunning_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->attr.config);
+ unsigned int idx = uncore_freerunning_idx(event->attr.config);
+ struct intel_uncore_pmu *pmu = box->pmu;
+
+ return pmu->type->freerunning[type].counter_base +
+ pmu->type->freerunning[type].counter_offset * idx +
+ pmu->type->freerunning[type].box_offset * pmu->pmu_idx;
+}
+
static inline
unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
{
@@ -276,11 +356,52 @@ static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
return box->pmu->type->fixed_ctr_bits;
}
+static inline
+unsigned int uncore_freerunning_bits(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->attr.config);
+
+ return box->pmu->type->freerunning[type].bits;
+}
+
+static inline int uncore_num_freerunning(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->attr.config);
+
+ return box->pmu->type->freerunning[type].num_counters;
+}
+
+static inline int uncore_num_freerunning_types(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ return box->pmu->type->num_freerunning_types;
+}
+
+static inline bool check_valid_freerunning_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->attr.config);
+ unsigned int idx = uncore_freerunning_idx(event->attr.config);
+
+ return (type < uncore_num_freerunning_types(box, event)) &&
+ (idx < uncore_num_freerunning(box, event));
+}
+
static inline int uncore_num_counters(struct intel_uncore_box *box)
{
return box->pmu->type->num_counters;
}
+static inline bool is_freerunning_event(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ return ((cfg & UNCORE_FIXED_EVENT) == UNCORE_FIXED_EVENT) &&
+ (((cfg >> 8) & 0xff) >= UNCORE_FREERUNNING_UMASK_START);
+}
+
static inline void uncore_disable_box(struct intel_uncore_box *box)
{
if (box->pmu->type->ops->disable_box)
@@ -346,6 +467,10 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu
u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_start(struct perf_event *event, int flags);
+void uncore_pmu_event_stop(struct perf_event *event, int flags);
+int uncore_pmu_event_add(struct perf_event *event, int flags);
+void uncore_pmu_event_del(struct perf_event *event, int flags);
void uncore_pmu_event_read(struct perf_event *event);
void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
struct event_constraint *
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index 93e7a8397cde..173e2674be6e 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -246,7 +246,7 @@ static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct p
{
struct hw_perf_event *hwc = &event->hw;
- if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+ if (hwc->idx == UNCORE_PMC_IDX_FIXED)
wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index aee5e8496be4..8527c3e1038b 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -285,6 +285,15 @@ static struct uncore_event_desc snb_uncore_imc_events[] = {
#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+enum perf_snb_uncore_imc_freerunning_types {
+ SNB_PCI_UNCORE_IMC_DATA = 0,
+ SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters snb_uncore_imc_freerunning[] = {
+ [SNB_PCI_UNCORE_IMC_DATA] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, 0x4, 0x0, 2, 32 },
+};
+
static struct attribute *snb_uncore_imc_formats_attr[] = {
&format_attr_event.attr,
NULL,
@@ -341,9 +350,8 @@ static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf
}
/*
- * custom event_init() function because we define our own fixed, free
- * running counters, so we do not want to conflict with generic uncore
- * logic. Also simplifies processing
+ * Keep the custom event_init() function compatible with old event
+ * encoding for free running counters.
*/
static int snb_uncore_imc_event_init(struct perf_event *event)
{
@@ -405,11 +413,11 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
switch (cfg) {
case SNB_UNCORE_PCI_IMC_DATA_READS:
base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
- idx = UNCORE_PMC_IDX_FIXED;
+ idx = UNCORE_PMC_IDX_FREERUNNING;
break;
case SNB_UNCORE_PCI_IMC_DATA_WRITES:
base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
- idx = UNCORE_PMC_IDX_FIXED + 1;
+ idx = UNCORE_PMC_IDX_FREERUNNING;
break;
default:
return -EINVAL;
@@ -430,75 +438,6 @@ static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_ev
return 0;
}
-static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- u64 count;
-
- if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
- return;
-
- event->hw.state = 0;
- box->n_active++;
-
- list_add_tail(&event->active_entry, &box->active_list);
-
- count = snb_uncore_imc_read_counter(box, event);
- local64_set(&event->hw.prev_count, count);
-
- if (box->n_active == 1)
- uncore_pmu_start_hrtimer(box);
-}
-
-static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- struct hw_perf_event *hwc = &event->hw;
-
- if (!(hwc->state & PERF_HES_STOPPED)) {
- box->n_active--;
-
- WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
- hwc->state |= PERF_HES_STOPPED;
-
- list_del(&event->active_entry);
-
- if (box->n_active == 0)
- uncore_pmu_cancel_hrtimer(box);
- }
-
- if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- /*
- * Drain the remaining delta count out of a event
- * that we are disabling:
- */
- uncore_perf_event_update(box, event);
- hwc->state |= PERF_HES_UPTODATE;
- }
-}
-
-static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- struct hw_perf_event *hwc = &event->hw;
-
- if (!box)
- return -ENODEV;
-
- hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
- if (!(flags & PERF_EF_START))
- hwc->state |= PERF_HES_ARCH;
-
- snb_uncore_imc_event_start(event, 0);
-
- return 0;
-}
-
-static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
-{
- snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
-}
-
int snb_pci2phy_map_init(int devid)
{
struct pci_dev *dev = NULL;
@@ -530,10 +469,10 @@ int snb_pci2phy_map_init(int devid)
static struct pmu snb_uncore_imc_pmu = {
.task_ctx_nr = perf_invalid_context,
.event_init = snb_uncore_imc_event_init,
- .add = snb_uncore_imc_event_add,
- .del = snb_uncore_imc_event_del,
- .start = snb_uncore_imc_event_start,
- .stop = snb_uncore_imc_event_stop,
+ .add = uncore_pmu_event_add,
+ .del = uncore_pmu_event_del,
+ .start = uncore_pmu_event_start,
+ .stop = uncore_pmu_event_stop,
.read = uncore_pmu_event_read,
};
@@ -552,12 +491,10 @@ static struct intel_uncore_type snb_uncore_imc = {
.name = "imc",
.num_counters = 2,
.num_boxes = 1,
- .fixed_ctr_bits = 32,
- .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
+ .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .freerunning = snb_uncore_imc_freerunning,
.event_descs = snb_uncore_imc_events,
.format_group = &snb_uncore_imc_format_group,
- .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
- .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
.ops = &snb_uncore_imc_ops,
.pmu = &snb_uncore_imc_pmu,
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 77076a102e34..87dc0263a2e1 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3522,6 +3522,87 @@ static struct intel_uncore_type skx_uncore_iio = {
.format_group = &skx_uncore_iio_format_group,
};
+enum perf_uncore_iio_freerunning_type_id {
+ SKX_IIO_MSR_IOCLK = 0,
+ SKX_IIO_MSR_BW = 1,
+ SKX_IIO_MSR_UTIL = 2,
+
+ SKX_IIO_FREERUNNING_TYPE_MAX,
+};
+
+
+static struct freerunning_counters skx_iio_freerunning[] = {
+ [SKX_IIO_MSR_IOCLK] = { 0xa45, 0x1, 0x20, 1, 36 },
+ [SKX_IIO_MSR_BW] = { 0xb00, 0x1, 0x10, 8, 36 },
+ [SKX_IIO_MSR_UTIL] = { 0xb08, 0x1, 0x10, 8, 36 },
+};
+
+static struct uncore_event_desc skx_uncore_iio_freerunning_events[] = {
+ /* Free-Running IO CLOCKS Counter */
+ INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
+ /* Free-Running IIO BANDWIDTH Counters */
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x24"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x25"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x26"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x27"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"),
+ /* Free-running IIO UTILIZATION Counters */
+ INTEL_UNCORE_EVENT_DESC(util_in_port0, "event=0xff,umask=0x30"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port0, "event=0xff,umask=0x31"),
+ INTEL_UNCORE_EVENT_DESC(util_in_port1, "event=0xff,umask=0x32"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port1, "event=0xff,umask=0x33"),
+ INTEL_UNCORE_EVENT_DESC(util_in_port2, "event=0xff,umask=0x34"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port2, "event=0xff,umask=0x35"),
+ INTEL_UNCORE_EVENT_DESC(util_in_port3, "event=0xff,umask=0x36"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port3, "event=0xff,umask=0x37"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops skx_uncore_iio_freerunning_ops = {
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct attribute *skx_uncore_iio_freerunning_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_uncore_iio_freerunning_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_iio_freerunning_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_iio_free_running = {
+ .name = "iio_free_running",
+ .num_counters = 17,
+ .num_boxes = 6,
+ .num_freerunning_types = SKX_IIO_FREERUNNING_TYPE_MAX,
+ .freerunning = skx_iio_freerunning,
+ .ops = &skx_uncore_iio_freerunning_ops,
+ .event_descs = skx_uncore_iio_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
static struct attribute *skx_uncore_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
@@ -3595,6 +3676,7 @@ static struct intel_uncore_type *skx_msr_uncores[] = {
&skx_uncore_ubox,
&skx_uncore_chabox,
&skx_uncore_iio,
+ &skx_uncore_iio_free_running,
&skx_uncore_irp,
&skx_uncore_pcu,
NULL,
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 367a8203cfcf..b173d404e3df 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1 +1,2 @@
-obj-y := hv_init.o mmu.o
+obj-y := hv_init.o mmu.o
+obj-$(CONFIG_X86_64) += hv_apic.o
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
new file mode 100644
index 000000000000..f68855499391
--- /dev/null
+++ b/arch/x86/hyperv/hv_apic.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V specific APIC code.
+ *
+ * Copyright (C) 2018, Microsoft, Inc.
+ *
+ * Author : K. Y. Srinivasan <kys@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/hyperv.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
+#include <asm/apic.h>
+
+static struct apic orig_apic;
+
+static u64 hv_apic_icr_read(void)
+{
+ u64 reg_val;
+
+ rdmsrl(HV_X64_MSR_ICR, reg_val);
+ return reg_val;
+}
+
+static void hv_apic_icr_write(u32 low, u32 id)
+{
+ u64 reg_val;
+
+ reg_val = SET_APIC_DEST_FIELD(id);
+ reg_val = reg_val << 32;
+ reg_val |= low;
+
+ wrmsrl(HV_X64_MSR_ICR, reg_val);
+}
+
+static u32 hv_apic_read(u32 reg)
+{
+ u32 reg_val, hi;
+
+ switch (reg) {
+ case APIC_EOI:
+ rdmsr(HV_X64_MSR_EOI, reg_val, hi);
+ return reg_val;
+ case APIC_TASKPRI:
+ rdmsr(HV_X64_MSR_TPR, reg_val, hi);
+ return reg_val;
+
+ default:
+ return native_apic_mem_read(reg);
+ }
+}
+
+static void hv_apic_write(u32 reg, u32 val)
+{
+ switch (reg) {
+ case APIC_EOI:
+ wrmsr(HV_X64_MSR_EOI, val, 0);
+ break;
+ case APIC_TASKPRI:
+ wrmsr(HV_X64_MSR_TPR, val, 0);
+ break;
+ default:
+ native_apic_mem_write(reg, val);
+ }
+}
+
+static void hv_apic_eoi_write(u32 reg, u32 val)
+{
+ wrmsr(HV_X64_MSR_EOI, val, 0);
+}
+
+/*
+ * IPI implementation on Hyper-V.
+ */
+static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
+{
+ struct ipi_arg_ex **arg;
+ struct ipi_arg_ex *ipi_arg;
+ unsigned long flags;
+ int nr_bank = 0;
+ int ret = 1;
+
+ local_irq_save(flags);
+ arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ ipi_arg = *arg;
+ if (unlikely(!ipi_arg))
+ goto ipi_mask_ex_done;
+
+ ipi_arg->vector = vector;
+ ipi_arg->reserved = 0;
+ ipi_arg->vp_set.valid_bank_mask = 0;
+
+ if (!cpumask_equal(mask, cpu_present_mask)) {
+ ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
+ }
+ if (!nr_bank)
+ ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
+
+ ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
+ ipi_arg, NULL);
+
+ipi_mask_ex_done:
+ local_irq_restore(flags);
+ return ((ret == 0) ? true : false);
+}
+
+static bool __send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ int cur_cpu, vcpu;
+ struct ipi_arg_non_ex **arg;
+ struct ipi_arg_non_ex *ipi_arg;
+ int ret = 1;
+ unsigned long flags;
+
+ if (cpumask_empty(mask))
+ return true;
+
+ if (!hv_hypercall_pg)
+ return false;
+
+ if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+ return false;
+
+ if ((ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return __send_ipi_mask_ex(mask, vector);
+
+ local_irq_save(flags);
+ arg = (struct ipi_arg_non_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ ipi_arg = *arg;
+ if (unlikely(!ipi_arg))
+ goto ipi_mask_done;
+
+ ipi_arg->vector = vector;
+ ipi_arg->reserved = 0;
+ ipi_arg->cpu_mask = 0;
+
+ for_each_cpu(cur_cpu, mask) {
+ vcpu = hv_cpu_number_to_vp_number(cur_cpu);
+ /*
+ * This particular version of the IPI hypercall can
+ * only target upto 64 CPUs.
+ */
+ if (vcpu >= 64)
+ goto ipi_mask_done;
+
+ __set_bit(vcpu, (unsigned long *)&ipi_arg->cpu_mask);
+ }
+
+ ret = hv_do_hypercall(HVCALL_SEND_IPI, ipi_arg, NULL);
+
+ipi_mask_done:
+ local_irq_restore(flags);
+ return ((ret == 0) ? true : false);
+}
+
+static bool __send_ipi_one(int cpu, int vector)
+{
+ struct cpumask mask = CPU_MASK_NONE;
+
+ cpumask_set_cpu(cpu, &mask);
+ return __send_ipi_mask(&mask, vector);
+}
+
+static void hv_send_ipi(int cpu, int vector)
+{
+ if (!__send_ipi_one(cpu, vector))
+ orig_apic.send_IPI(cpu, vector);
+}
+
+static void hv_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ if (!__send_ipi_mask(mask, vector))
+ orig_apic.send_IPI_mask(mask, vector);
+}
+
+static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ struct cpumask new_mask;
+ const struct cpumask *local_mask;
+
+ cpumask_copy(&new_mask, mask);
+ cpumask_clear_cpu(this_cpu, &new_mask);
+ local_mask = &new_mask;
+ if (!__send_ipi_mask(local_mask, vector))
+ orig_apic.send_IPI_mask_allbutself(mask, vector);
+}
+
+static void hv_send_ipi_allbutself(int vector)
+{
+ hv_send_ipi_mask_allbutself(cpu_online_mask, vector);
+}
+
+static void hv_send_ipi_all(int vector)
+{
+ if (!__send_ipi_mask(cpu_online_mask, vector))
+ orig_apic.send_IPI_all(vector);
+}
+
+static void hv_send_ipi_self(int vector)
+{
+ if (!__send_ipi_one(smp_processor_id(), vector))
+ orig_apic.send_IPI_self(vector);
+}
+
+void __init hv_apic_init(void)
+{
+ if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) {
+ if ((ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ pr_info("Hyper-V: Using ext hypercalls for IPI\n");
+ else
+ pr_info("Hyper-V: Using IPI hypercalls\n");
+ /*
+ * Set the IPI entry points.
+ */
+ orig_apic = *apic;
+
+ apic->send_IPI = hv_send_ipi;
+ apic->send_IPI_mask = hv_send_ipi_mask;
+ apic->send_IPI_mask_allbutself = hv_send_ipi_mask_allbutself;
+ apic->send_IPI_allbutself = hv_send_ipi_allbutself;
+ apic->send_IPI_all = hv_send_ipi_all;
+ apic->send_IPI_self = hv_send_ipi_self;
+ }
+
+ if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
+ pr_info("Hyper-V: Using MSR based APIC access\n");
+ apic_set_eoi_write(hv_apic_eoi_write);
+ apic->read = hv_apic_read;
+ apic->write = hv_apic_write;
+ apic->icr_write = hv_apic_icr_write;
+ apic->icr_read = hv_apic_icr_read;
+ }
+}
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index cfecc2272f2d..4c431e1c1eff 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -91,12 +91,19 @@ EXPORT_SYMBOL_GPL(hv_vp_index);
struct hv_vp_assist_page **hv_vp_assist_page;
EXPORT_SYMBOL_GPL(hv_vp_assist_page);
+void __percpu **hyperv_pcpu_input_arg;
+EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
+
u32 hv_max_vp_index;
static int hv_cpu_init(unsigned int cpu)
{
u64 msr_vp_index;
struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
+ void **input_arg;
+
+ input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ *input_arg = page_address(alloc_page(GFP_KERNEL));
hv_get_vp_index(msr_vp_index);
@@ -217,6 +224,16 @@ static int hv_cpu_die(unsigned int cpu)
{
struct hv_reenlightenment_control re_ctrl;
unsigned int new_cpu;
+ unsigned long flags;
+ void **input_arg;
+ void *input_pg = NULL;
+
+ local_irq_save(flags);
+ input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ input_pg = *input_arg;
+ *input_arg = NULL;
+ local_irq_restore(flags);
+ free_page((unsigned long)input_pg);
if (hv_vp_assist_page && hv_vp_assist_page[cpu])
wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
@@ -242,8 +259,9 @@ static int hv_cpu_die(unsigned int cpu)
*
* 1. Setup the hypercall page.
* 2. Register Hyper-V specific clocksource.
+ * 3. Setup Hyper-V specific APIC entry points.
*/
-void hyperv_init(void)
+void __init hyperv_init(void)
{
u64 guest_id, required_msrs;
union hv_x64_msr_hypercall_contents hypercall_msr;
@@ -259,6 +277,16 @@ void hyperv_init(void)
if ((ms_hyperv.features & required_msrs) != required_msrs)
return;
+ /*
+ * Allocate the per-CPU state for the hypercall input arg.
+ * If this allocation fails, we will not be able to setup
+ * (per-CPU) hypercall input page and thus this failure is
+ * fatal on Hyper-V.
+ */
+ hyperv_pcpu_input_arg = alloc_percpu(void *);
+
+ BUG_ON(hyperv_pcpu_input_arg == NULL);
+
/* Allocate percpu VP index */
hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
GFP_KERNEL);
@@ -296,7 +324,7 @@ void hyperv_init(void)
hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
- hyper_alloc_mmu();
+ hv_apic_init();
/*
* Register Hyper-V specific clocksource.
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 56c9ebac946f..5f053d7d1bd9 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -25,20 +25,13 @@ struct hv_flush_pcpu {
struct hv_flush_pcpu_ex {
u64 address_space;
u64 flags;
- struct {
- u64 format;
- u64 valid_bank_mask;
- u64 bank_contents[];
- } hv_vp_set;
+ struct hv_vpset hv_vp_set;
u64 gva_list[];
};
/* Each gva in gva_list encodes up to 4096 pages to flush */
#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
-static struct hv_flush_pcpu __percpu **pcpu_flush;
-
-static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
/*
* Fills in gva_list starting from offset. Returns the number of items added.
@@ -70,41 +63,6 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
return gva_n - offset;
}
-/* Return the number of banks in the resulting vp_set */
-static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
- const struct cpumask *cpus)
-{
- int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
-
- /* valid_bank_mask can represent up to 64 banks */
- if (hv_max_vp_index / 64 >= 64)
- return 0;
-
- /*
- * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
- * structs are not cleared between calls, we risk flushing unneeded
- * vCPUs otherwise.
- */
- for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
- flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
-
- /*
- * Some banks may end up being empty but this is acceptable.
- */
- for_each_cpu(cpu, cpus) {
- vcpu = hv_cpu_number_to_vp_number(cpu);
- vcpu_bank = vcpu / 64;
- vcpu_offset = vcpu % 64;
- __set_bit(vcpu_offset, (unsigned long *)
- &flush->hv_vp_set.bank_contents[vcpu_bank]);
- if (vcpu_bank >= nr_bank)
- nr_bank = vcpu_bank + 1;
- }
- flush->hv_vp_set.valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0);
-
- return nr_bank;
-}
-
static void hyperv_flush_tlb_others(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
@@ -116,7 +74,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
trace_hyperv_mmu_flush_tlb_others(cpus, info);
- if (!pcpu_flush || !hv_hypercall_pg)
+ if (!hv_hypercall_pg)
goto do_native;
if (cpumask_empty(cpus))
@@ -124,10 +82,8 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
local_irq_save(flags);
- flush_pcpu = this_cpu_ptr(pcpu_flush);
-
- if (unlikely(!*flush_pcpu))
- *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+ flush_pcpu = (struct hv_flush_pcpu **)
+ this_cpu_ptr(hyperv_pcpu_input_arg);
flush = *flush_pcpu;
@@ -203,7 +159,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
trace_hyperv_mmu_flush_tlb_others(cpus, info);
- if (!pcpu_flush_ex || !hv_hypercall_pg)
+ if (!hv_hypercall_pg)
goto do_native;
if (cpumask_empty(cpus))
@@ -211,10 +167,8 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
local_irq_save(flags);
- flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
-
- if (unlikely(!*flush_pcpu))
- *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+ flush_pcpu = (struct hv_flush_pcpu_ex **)
+ this_cpu_ptr(hyperv_pcpu_input_arg);
flush = *flush_pcpu;
@@ -239,8 +193,8 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
flush->hv_vp_set.valid_bank_mask = 0;
if (!cpumask_equal(cpus, cpu_present_mask)) {
- flush->hv_vp_set.format = HV_GENERIC_SET_SPARCE_4K;
- nr_bank = cpumask_to_vp_set(flush, cpus);
+ flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
}
if (!nr_bank) {
@@ -296,14 +250,3 @@ void hyperv_setup_mmu_ops(void)
pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex;
}
}
-
-void hyper_alloc_mmu(void)
-{
- if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED))
- return;
-
- if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
- pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
- else
- pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
-}
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
new file mode 100644
index 000000000000..e958e28f7ab5
--- /dev/null
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CACHEINFO_H
+#define _ASM_X86_CACHEINFO_H
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id);
+
+#endif /* _ASM_X86_CACHEINFO_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index e1c8dab86670..fb97cf7c4137 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -17,7 +17,6 @@
typedef u32 compat_size_t;
typedef s32 compat_ssize_t;
-typedef s32 compat_time_t;
typedef s32 compat_clock_t;
typedef s32 compat_pid_t;
typedef u16 __compat_uid_t;
@@ -46,16 +45,6 @@ typedef u32 compat_u32;
typedef u64 __attribute__((aligned(4))) compat_u64;
typedef u32 compat_uptr_t;
-struct compat_timespec {
- compat_time_t tv_sec;
- s32 tv_nsec;
-};
-
-struct compat_timeval {
- compat_time_t tv_sec;
- s32 tv_usec;
-};
-
struct compat_stat {
compat_dev_t st_dev;
u16 __pad1;
@@ -145,10 +134,10 @@ struct compat_ipc64_perm {
struct compat_semid64_ds {
struct compat_ipc64_perm sem_perm;
- compat_time_t sem_otime;
- compat_ulong_t __unused1;
- compat_time_t sem_ctime;
- compat_ulong_t __unused2;
+ compat_ulong_t sem_otime;
+ compat_ulong_t sem_otime_high;
+ compat_ulong_t sem_ctime;
+ compat_ulong_t sem_ctime_high;
compat_ulong_t sem_nsems;
compat_ulong_t __unused3;
compat_ulong_t __unused4;
@@ -156,12 +145,12 @@ struct compat_semid64_ds {
struct compat_msqid64_ds {
struct compat_ipc64_perm msg_perm;
- compat_time_t msg_stime;
- compat_ulong_t __unused1;
- compat_time_t msg_rtime;
- compat_ulong_t __unused2;
- compat_time_t msg_ctime;
- compat_ulong_t __unused3;
+ compat_ulong_t msg_stime;
+ compat_ulong_t msg_stime_high;
+ compat_ulong_t msg_rtime;
+ compat_ulong_t msg_rtime_high;
+ compat_ulong_t msg_ctime;
+ compat_ulong_t msg_ctime_high;
compat_ulong_t msg_cbytes;
compat_ulong_t msg_qnum;
compat_ulong_t msg_qbytes;
@@ -174,12 +163,12 @@ struct compat_msqid64_ds {
struct compat_shmid64_ds {
struct compat_ipc64_perm shm_perm;
compat_size_t shm_segsz;
- compat_time_t shm_atime;
- compat_ulong_t __unused1;
- compat_time_t shm_dtime;
- compat_ulong_t __unused2;
- compat_time_t shm_ctime;
- compat_ulong_t __unused3;
+ compat_ulong_t shm_atime;
+ compat_ulong_t shm_atime_high;
+ compat_ulong_t shm_dtime;
+ compat_ulong_t shm_dtime_high;
+ compat_ulong_t shm_ctime;
+ compat_ulong_t shm_ctime_high;
compat_pid_t shm_cpid;
compat_pid_t shm_lpid;
compat_ulong_t shm_nattch;
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 89ce4bfd241f..ce4d176b3d13 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -30,10 +30,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
return dma_ops;
}
-int arch_dma_supported(struct device *dev, u64 mask);
-#define arch_dma_supported arch_dma_supported
-
-bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+bool arch_dma_alloc_attrs(struct device **dev);
#define arch_dma_alloc_attrs arch_dma_alloc_attrs
#endif
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index cc8f8fcf9b4a..c18ed65287d5 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -63,7 +63,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
#ifndef COMPILE_OFFSETS
#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION)
-#include <asm/compat.h>
+#include <linux/compat.h>
/*
* Because ia32 syscalls do not map to x86_64 syscall numbers
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 5ea2afd4c871..740a428acf1e 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -50,14 +50,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
#define inc_irq_stat(member) this_cpu_inc(irq_stat.member)
-#define local_softirq_pending() this_cpu_read(irq_stat.__softirq_pending)
-
-#define __ARCH_SET_SOFTIRQ_PENDING
-
-#define set_softirq_pending(x) \
- this_cpu_write(irq_stat.__softirq_pending, (x))
-#define or_softirq_pending(x) this_cpu_or(irq_stat.__softirq_pending, (x))
-
extern void ack_bad_irq(unsigned int irq);
extern u64 arch_irq_stat_cpu(unsigned int cpu);
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 416cb0e0c496..3bfa92c2793c 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -164,6 +164,11 @@
*/
#define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9)
+/*
+ * Recommend using cluster IPI hypercalls.
+ */
+#define HV_X64_CLUSTER_IPI_RECOMMENDED (1 << 10)
+
/* Recommend using the newer ExProcessorMasks interface */
#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
@@ -329,12 +334,17 @@ struct hv_tsc_emulation_status {
#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
(~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
+#define HV_IPI_LOW_VECTOR 0x10
+#define HV_IPI_HIGH_VECTOR 0xff
+
/* Declare the various hypercall operations. */
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003
#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008
+#define HVCALL_SEND_IPI 0x000b
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014
+#define HVCALL_SEND_IPI_EX 0x0015
#define HVCALL_POST_MESSAGE 0x005c
#define HVCALL_SIGNAL_EVENT 0x005d
@@ -360,7 +370,7 @@ struct hv_tsc_emulation_status {
#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)
enum HV_GENERIC_SET_FORMAT {
- HV_GENERIC_SET_SPARCE_4K,
+ HV_GENERIC_SET_SPARSE_4K,
HV_GENERIC_SET_ALL,
};
@@ -706,4 +716,22 @@ struct hv_enlightened_vmcs {
#define HV_STIMER_AUTOENABLE (1ULL << 3)
#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
+struct ipi_arg_non_ex {
+ u32 vector;
+ u32 reserved;
+ u64 cpu_mask;
+};
+
+struct hv_vpset {
+ u64 format;
+ u64 valid_bank_mask;
+ u64 bank_contents[];
+};
+
+struct ipi_arg_ex {
+ u32 vector;
+ u32 reserved;
+ struct hv_vpset vp_set;
+};
+
#endif
diff --git a/arch/x86/include/asm/intel_mid_vrtc.h b/arch/x86/include/asm/intel_mid_vrtc.h
index 35555016b1be..0b44b1abe4d9 100644
--- a/arch/x86/include/asm/intel_mid_vrtc.h
+++ b/arch/x86/include/asm/intel_mid_vrtc.h
@@ -4,7 +4,7 @@
extern unsigned char vrtc_cmos_read(unsigned char reg);
extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
-extern void vrtc_get_time(struct timespec *now);
-extern int vrtc_set_mmss(const struct timespec *now);
+extern void vrtc_get_time(struct timespec64 *now);
+extern int vrtc_set_mmss(const struct timespec64 *now);
#endif
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f6e5b9375d8c..6de64840dd22 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -94,10 +94,10 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
#ifdef CONFIG_X86_64
-build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
-build_mmio_read(__readq, "q", unsigned long, "=r", )
-build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
-build_mmio_write(__writeq, "q", unsigned long, "r", )
+build_mmio_read(readq, "q", u64, "=r", :"memory")
+build_mmio_read(__readq, "q", u64, "=r", )
+build_mmio_write(writeq, "q", u64, "r", :"memory")
+build_mmio_write(__writeq, "q", u64, "r", )
#define readq_relaxed(a) __readq(a)
#define writeq_relaxed(v, a) __writeq(v, a)
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 1775a32f7ea6..97198001e567 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -95,8 +95,8 @@ static inline unsigned char current_lock_cmos_reg(void)
unsigned char rtc_cmos_read(unsigned char addr);
void rtc_cmos_write(unsigned char val, unsigned char addr);
-extern int mach_set_rtc_mmss(const struct timespec *now);
-extern void mach_get_cmos_time(struct timespec *now);
+extern int mach_set_rtc_mmss(const struct timespec64 *now);
+extern void mach_get_cmos_time(struct timespec64 *now);
#define RTC_IRQ 8
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index b90e79610cf7..997192131b7b 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -122,6 +122,7 @@ static inline void hv_disable_stimer0_percpu_irq(int irq) {}
#if IS_ENABLED(CONFIG_HYPERV)
extern struct clocksource *hyperv_cs;
extern void *hv_hypercall_pg;
+extern void __percpu **hyperv_pcpu_input_arg;
static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
{
@@ -258,9 +259,41 @@ static inline int hv_cpu_number_to_vp_number(int cpu_number)
return hv_vp_index[cpu_number];
}
-void hyperv_init(void);
+static inline int cpumask_to_vpset(struct hv_vpset *vpset,
+ const struct cpumask *cpus)
+{
+ int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
+
+ /* valid_bank_mask can represent up to 64 banks */
+ if (hv_max_vp_index / 64 >= 64)
+ return 0;
+
+ /*
+ * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+ * structs are not cleared between calls, we risk flushing unneeded
+ * vCPUs otherwise.
+ */
+ for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+ vpset->bank_contents[vcpu_bank] = 0;
+
+ /*
+ * Some banks may end up being empty but this is acceptable.
+ */
+ for_each_cpu(cpu, cpus) {
+ vcpu = hv_cpu_number_to_vp_number(cpu);
+ vcpu_bank = vcpu / 64;
+ vcpu_offset = vcpu % 64;
+ __set_bit(vcpu_offset, (unsigned long *)
+ &vpset->bank_contents[vcpu_bank]);
+ if (vcpu_bank >= nr_bank)
+ nr_bank = vcpu_bank + 1;
+ }
+ vpset->valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0);
+ return nr_bank;
+}
+
+void __init hyperv_init(void);
void hyperv_setup_mmu_ops(void);
-void hyper_alloc_mmu(void);
void hyperv_report_panic(struct pt_regs *regs, long err);
bool hv_is_hyperv_initialized(void);
void hyperv_cleanup(void);
@@ -269,6 +302,13 @@ void hyperv_reenlightenment_intr(struct pt_regs *regs);
void set_hv_tscchange_cb(void (*cb)(void));
void clear_hv_tscchange_cb(void);
void hyperv_stop_tsc_emulation(void);
+
+#ifdef CONFIG_X86_64
+void hv_apic_init(void);
+#else
+static inline void hv_apic_init(void) {}
+#endif
+
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline bool hv_is_hyperv_initialized(void) { return false; }
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 2c5a966dc222..6afac386a434 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -53,7 +53,7 @@
#define __PHYSICAL_MASK_SHIFT 52
#ifdef CONFIG_X86_5LEVEL
-#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled ? 56 : 47)
+#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
#else
#define __VIRTUAL_MASK_SHIFT 47
#endif
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9be2bf13825b..d49bbf4bb5c8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -574,14 +574,14 @@ static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
}
#define set_pgd(pgdp, pgdval) do { \
- if (pgtable_l5_enabled) \
+ if (pgtable_l5_enabled()) \
__set_pgd(pgdp, pgdval); \
else \
set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \
} while (0)
#define pgd_clear(pgdp) do { \
- if (pgtable_l5_enabled) \
+ if (pgtable_l5_enabled()) \
set_pgd(pgdp, __pgd(0)); \
} while (0)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d32175e30259..662963681ea6 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -117,9 +117,6 @@ void native_restore_msi_irqs(struct pci_dev *dev);
#define native_setup_msi_irqs NULL
#define native_teardown_msi_irq NULL
#endif
-
-#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-
#endif /* __KERNEL__ */
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 263c142a6a6c..ada6410fd2ec 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -167,7 +167,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
#if CONFIG_PGTABLE_LEVELS > 4
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
- if (!pgtable_l5_enabled)
+ if (!pgtable_l5_enabled())
return;
paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
@@ -193,7 +193,7 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long address)
{
- if (pgtable_l5_enabled)
+ if (pgtable_l5_enabled())
___p4d_free_tlb(tlb, p4d);
}
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f1633de5a675..99ecde23c3ec 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags;
#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
-#define pgd_clear(pgd) (pgtable_l5_enabled ? native_pgd_clear(pgd) : 0)
+#define pgd_clear(pgd) (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
#endif
#ifndef set_p4d
@@ -881,7 +881,7 @@ static inline unsigned long p4d_index(unsigned long address)
#if CONFIG_PGTABLE_LEVELS > 4
static inline int pgd_present(pgd_t pgd)
{
- if (!pgtable_l5_enabled)
+ if (!pgtable_l5_enabled())
return 1;
return pgd_flags(pgd) & _PAGE_PRESENT;
}
@@ -898,9 +898,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd))
/* to find an entry in a page-table-directory. */
-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static __always_inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
- if (!pgtable_l5_enabled)
+ if (!pgtable_l5_enabled())
return (p4d_t *)pgd;
return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}
@@ -909,7 +909,7 @@ static inline int pgd_bad(pgd_t pgd)
{
unsigned long ignore_flags = _PAGE_USER;
- if (!pgtable_l5_enabled)
+ if (!pgtable_l5_enabled())
return 0;
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
@@ -920,7 +920,7 @@ static inline int pgd_bad(pgd_t pgd)
static inline int pgd_none(pgd_t pgd)
{
- if (!pgtable_l5_enabled)
+ if (!pgtable_l5_enabled())
return 0;
/*
* There is no need to do a workaround for the KNL stray
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index e3225e83db7d..d9a001a4a872 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -15,7 +15,7 @@
# include <asm/pgtable-2level_types.h>
#endif
-#define pgtable_l5_enabled 0
+#define pgtable_l5_enabled() 0
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 877bc27718ae..3c5385f9a88f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -220,7 +220,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
pgd_t pgd;
- if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
+ if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
*p4dp = p4d;
return;
}
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index adb47552e6bb..054765ab2da2 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -22,12 +22,23 @@ typedef struct { pteval_t pte; } pte_t;
#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled;
-#ifndef pgtable_l5_enabled
-#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57)
-#endif
+
+#ifdef USE_EARLY_PGTABLE_L5
+/*
+ * cpu_feature_enabled() is not available in early boot code.
+ * Use variable instead.
+ */
+static inline bool pgtable_l5_enabled(void)
+{
+ return __pgtable_l5_enabled;
+}
#else
-#define pgtable_l5_enabled 0
-#endif
+#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
+#endif /* USE_EARLY_PGTABLE_L5 */
+
+#else
+#define pgtable_l5_enabled() 0
+#endif /* CONFIG_X86_5LEVEL */
extern unsigned int pgdir_shift;
extern unsigned int ptrs_per_p4d;
@@ -102,7 +113,7 @@ extern unsigned int ptrs_per_p4d;
#define LDT_PGD_ENTRY_L4 -3UL
#define LDT_PGD_ENTRY_L5 -112UL
-#define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
+#define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#define __VMALLOC_BASE_L4 0xffffc90000000000UL
@@ -116,7 +127,7 @@ extern unsigned int ptrs_per_p4d;
#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
# define VMALLOC_START vmalloc_base
-# define VMALLOC_SIZE_TB (pgtable_l5_enabled ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
+# define VMALLOC_SIZE_TB (pgtable_l5_enabled() ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
# define VMEMMAP_START vmemmap_base
#else
# define VMALLOC_START __VMALLOC_BASE_L4
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 21a114914ba4..e28add6b791f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -186,15 +186,6 @@ extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern u32 get_scattered_cpuid_leaf(unsigned int level,
- unsigned int sub_leaf,
- enum cpuid_regs_idx reg);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
-
-extern void detect_extended_topology(struct cpuinfo_x86 *c);
-extern void detect_ht(struct cpuinfo_x86 *c);
#ifdef CONFIG_X86_32
extern int have_cpuid_p(void);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index a7471dcd2205..b6033680d458 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -12,7 +12,7 @@ void pvclock_set_flags(u8 flags);
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu,
- struct timespec *ts);
+ struct timespec64 *ts);
void pvclock_resume(void);
void pvclock_touch_watchdogs(void);
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 5e16b5d40d32..3e70bed8a978 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -7,6 +7,14 @@
#include <asm-generic/qspinlock_types.h>
#include <asm/paravirt.h>
+#define _Q_PENDING_LOOPS (1 << 9)
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+
#define queued_spin_unlock queued_spin_unlock
/**
* queued_spin_unlock - release a queued spinlock
@@ -16,15 +24,9 @@
*/
static inline void native_queued_spin_unlock(struct qspinlock *lock)
{
- smp_store_release((u8 *)lock, 0);
+ smp_store_release(&lock->locked, 0);
}
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __pv_init_lock_hash(void);
-extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
-
static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
pv_queued_spin_lock_slowpath(lock, val);
@@ -40,11 +42,6 @@ static inline bool vcpu_is_preempted(long cpu)
{
return pv_vcpu_is_preempted(cpu);
}
-#else
-static inline void queued_spin_unlock(struct qspinlock *lock)
-{
- native_queued_spin_unlock(lock);
-}
#endif
#ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 923307ea11c7..9ef5ee03d2d7 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -22,8 +22,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
*
* void __pv_queued_spin_unlock(struct qspinlock *lock)
* {
- * struct __qspinlock *l = (void *)lock;
- * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0);
*
* if (likely(lockval == _Q_LOCKED_VAL))
* return;
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f75bff8f9d82..547c4fe50711 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -171,7 +171,6 @@ static inline int wbinvd_on_all_cpus(void)
wbinvd();
return 0;
}
-#define smp_num_siblings 1
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus;
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4617a2bf123c..199218719a86 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,8 +27,8 @@
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
-# define MAX_PHYSADDR_BITS (pgtable_l5_enabled ? 52 : 44)
-# define MAX_PHYSMEM_BITS (pgtable_l5_enabled ? 52 : 46)
+# define MAX_PHYSADDR_BITS (pgtable_l5_enabled() ? 52 : 44)
+# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
#endif
#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 133d9425fced..b6dc698f992a 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -111,4 +111,6 @@ static inline unsigned long caller_frame_pointer(void)
return (unsigned long)frame;
}
+void show_opcodes(u8 *rip, const char *loglvl);
+void show_ip(struct pt_regs *regs, const char *loglvl);
#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 533f74c300c2..d33f92b9fa22 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -116,7 +116,8 @@ int strcmp(const char *cs, const char *ct);
#endif
#define __HAVE_ARCH_MEMCPY_MCSAFE 1
-__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
+__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src,
+ size_t cnt);
DECLARE_STATIC_KEY_FALSE(mcsafe_key);
/**
@@ -131,14 +132,15 @@ DECLARE_STATIC_KEY_FALSE(mcsafe_key);
* actually do machine check recovery. Everyone else can just
* use memcpy().
*
- * Return 0 for success, -EFAULT for fail
+ * Return 0 for success, or number of bytes not copied if there was an
+ * exception.
*/
-static __always_inline __must_check int
+static __always_inline __must_check unsigned long
memcpy_mcsafe(void *dst, const void *src, size_t cnt)
{
#ifdef CONFIG_X86_MCE
if (static_branch_unlikely(&mcsafe_key))
- return memcpy_mcsafe_unrolled(dst, src, cnt);
+ return __memcpy_mcsafe(dst, src, cnt);
else
#endif
memcpy(dst, src, cnt);
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 62546b3a398e..62acb613114b 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -47,6 +47,17 @@ copy_user_generic(void *to, const void *from, unsigned len)
}
static __always_inline __must_check unsigned long
+copy_to_user_mcsafe(void *to, const void *from, unsigned len)
+{
+ unsigned long ret;
+
+ __uaccess_begin();
+ ret = memcpy_mcsafe(to, from, len);
+ __uaccess_end();
+ return ret;
+}
+
+static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
int ret = 0;
@@ -194,4 +205,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
unsigned long
copy_user_handle_tail(char *to, char *from, unsigned len);
+unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len);
+
#endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index ce8b4da07e35..2d27236c16a3 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -170,7 +170,7 @@ struct x86_cpuinit_ops {
void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
};
-struct timespec;
+struct timespec64;
/**
* struct x86_legacy_devices - legacy x86 devices
@@ -264,8 +264,8 @@ struct x86_hyper_runtime {
struct x86_platform_ops {
unsigned long (*calibrate_cpu)(void);
unsigned long (*calibrate_tsc)(void);
- void (*get_wallclock)(struct timespec *ts);
- int (*set_wallclock)(const struct timespec *ts);
+ void (*get_wallclock)(struct timespec64 *ts);
+ int (*set_wallclock)(const struct timespec64 *ts);
void (*iommu_shutdown)(void);
bool (*is_untracked_pat_range)(u64 start, u64 end);
void (*nmi_init)(void);
diff --git a/arch/x86/include/uapi/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h
index cabd7476bd6c..89de6cd9f0a7 100644
--- a/arch/x86/include/uapi/asm/sembuf.h
+++ b/arch/x86/include/uapi/asm/sembuf.h
@@ -8,15 +8,24 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
+ *
+ * x86_64 and x32 incorrectly added padding here, so the structures
+ * are still incompatible with the padding on x86.
*/
struct semid64_ds {
struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
+#ifdef __i386__
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_otime_high;
+ unsigned long sem_ctime; /* last change time */
+ unsigned long sem_ctime_high;
+#else
__kernel_time_t sem_otime; /* last semop time */
__kernel_ulong_t __unused1;
__kernel_time_t sem_ctime; /* last change time */
__kernel_ulong_t __unused2;
+#endif
__kernel_ulong_t sem_nsems; /* no. of semaphores in array */
__kernel_ulong_t __unused3;
__kernel_ulong_t __unused4;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index cadeafabf167..5d0de79fdab0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -2433,7 +2433,7 @@ MODULE_PARM_DESC(idle_threshold,
"System idle percentage above which to make APM BIOS idle calls");
module_param(idle_period, int, 0444);
MODULE_PARM_DESC(idle_period,
- "Period (in sec/100) over which to caculate the idle percentage");
+ "Period (in sec/100) over which to calculate the idle percentage");
module_param(smp, bool, 0444);
MODULE_PARM_DESC(smp,
"Set this to enable APM use on an SMP platform. Use with caution on older systems");
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a66229f51b12..7a40196967cb 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -17,7 +17,7 @@ KCOV_INSTRUMENT_perf_event.o := n
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
-obj-y := intel_cacheinfo.o scattered.o topology.o
+obj-y := cacheinfo.o scattered.o topology.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1b18be3f35a8..082d7875cef8 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -9,6 +9,7 @@
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/apic.h>
+#include <asm/cacheinfo.h>
#include <asm/cpu.h>
#include <asm/spec-ctrl.h>
#include <asm/smp.h>
@@ -298,7 +299,6 @@ static int nearby_node(int apicid)
}
#endif
-#ifdef CONFIG_SMP
/*
* Fix up cpu_core_id for pre-F17h systems to be in the
* [0 .. cores_per_node - 1] range. Not really needed but
@@ -328,6 +328,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
/* get information required for multi-node processors */
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ int err;
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
@@ -346,21 +347,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
}
/*
- * We may have multiple LLCs if L3 caches exist, so check if we
- * have an L3 cache by looking at the L3 cache CPUID leaf.
+ * In case leaf B is available, use it to derive
+ * topology information.
*/
- if (cpuid_edx(0x80000006)) {
- if (c->x86 == 0x17) {
- /*
- * LLC is at the core complex level.
- * Core complex id is ApicId[3].
- */
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
- } else {
- /* LLC is at the node level. */
- per_cpu(cpu_llc_id, cpu) = node_id;
- }
- }
+ err = detect_extended_topology(c);
+ if (!err)
+ c->x86_coreid_bits = get_count_order(c->x86_max_cores);
+
+ cacheinfo_amd_init_llc_id(c, cpu, node_id);
+
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
@@ -376,7 +371,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
legacy_fixup_core_id(c);
}
}
-#endif
/*
* On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
@@ -384,7 +378,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
unsigned bits;
int cpu = smp_processor_id();
@@ -395,17 +388,11 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
- amd_get_topology(c);
-#endif
}
u16 amd_get_nb_id(int cpu)
{
- u16 id = 0;
-#ifdef CONFIG_SMP
- id = per_cpu(cpu_llc_id, cpu);
-#endif
- return id;
+ return per_cpu(cpu_llc_id, cpu);
}
EXPORT_SYMBOL_GPL(amd_get_nb_id);
@@ -864,6 +851,7 @@ static void init_amd(struct cpuinfo_x86 *c)
/* Multi core CPU? */
if (c->extended_cpuid_level >= 0x80000008) {
amd_detect_cmp(c);
+ amd_get_topology(c);
srat_detect_node(c);
}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 54d04d574148..38354c66df81 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -20,6 +20,8 @@
#include <asm/amd_nb.h>
#include <asm/smp.h>
+#include "cpu.h"
+
#define LVL_1_INST 1
#define LVL_1_DATA 2
#define LVL_2 3
@@ -637,6 +639,45 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
return i;
}
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+{
+ /*
+ * We may have multiple LLCs if L3 caches exist, so check if we
+ * have an L3 cache by looking at the L3 cache CPUID leaf.
+ */
+ if (!cpuid_edx(0x80000006))
+ return;
+
+ if (c->x86 < 0x17) {
+ /* LLC is at the node level. */
+ per_cpu(cpu_llc_id, cpu) = node_id;
+ } else if (c->x86 == 0x17 &&
+ c->x86_model >= 0 && c->x86_model <= 0x1F) {
+ /*
+ * LLC is at the core complex level.
+ * Core complex ID is ApicId[3] for these processors.
+ */
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ } else {
+ /*
+ * LLC ID is calculated from the number of threads sharing the
+ * cache.
+ * */
+ u32 eax, ebx, ecx, edx, num_sharing_cache = 0;
+ u32 llc_index = find_num_cache_leaves(c) - 1;
+
+ cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx);
+ if (eax)
+ num_sharing_cache = ((eax >> 14) & 0xfff) + 1;
+
+ if (num_sharing_cache) {
+ int bits = get_count_order(num_sharing_cache) - 1;
+
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+ }
+ }
+}
+
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
{
@@ -650,7 +691,7 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c)
}
}
-unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
{
/* Cache sizes */
unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
@@ -802,7 +843,8 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
- return l2;
+ if (!l2)
+ cpu_detect_cache_sizes(c);
}
static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e5ec0f11c0de..14433ff5b828 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -18,6 +18,13 @@
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
+
static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -112,6 +119,31 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
}
}
+static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c)
+{
+ u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+ msr_ctl = vmx_msr_high | vmx_msr_low;
+
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ vmx_msr_low, vmx_msr_high);
+ msr_ctl2 = vmx_msr_high | vmx_msr_low;
+ if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+ (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+ set_cpu_cap(c, X86_FEATURE_VPID);
+ }
+}
+
static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
@@ -128,6 +160,24 @@ static void init_centaur(struct cpuinfo_x86 *c)
clear_cpu_cap(c, 0*32+31);
#endif
early_init_centaur(c);
+ init_intel_cacheinfo(c);
+ detect_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+ * Check for version and the number of counters
+ * Version(eax[7:0]) can't be 0;
+ * Counters(eax[15:8]) should be greater than 1;
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
switch (c->x86) {
#ifdef CONFIG_X86_32
case 5:
@@ -199,6 +249,9 @@ static void init_centaur(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
+
+ if (cpu_has(c, X86_FEATURE_VMX))
+ centaur_detect_vmx_virtcap(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 38276f58d3bf..95c8e507580d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -66,6 +66,13 @@ cpumask_var_t cpu_callin_mask;
/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+EXPORT_SYMBOL(smp_num_siblings);
+
+/* Last level cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+
/* correctly size the local cpu masks */
void __init setup_cpu_local_masks(void)
{
@@ -577,6 +584,19 @@ static void get_model_name(struct cpuinfo_x86 *c)
*(s + 1) = '\0';
}
+void detect_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ c->x86_max_cores = 1;
+ if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4)
+ return;
+
+ cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
+ if (eax & 0x1f)
+ c->x86_max_cores = (eax >> 26) + 1;
+}
+
void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -1044,6 +1064,21 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
#endif
+
+ /*
+ * Later in the boot process pgtable_l5_enabled() relies on
+ * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
+ * enabled by this point we need to clear the feature bit to avoid
+ * false-positives at the later stage.
+ *
+ * pgtable_l5_enabled() can be false here for several reasons:
+ * - 5-level paging is disabled compile-time;
+ * - it's 32-bit kernel;
+ * - machine doesn't support 5-level paging;
+ * - user specified 'no5lvl' in kernel command line.
+ */
+ if (!pgtable_l5_enabled())
+ setup_clear_cpu_cap(X86_FEATURE_LA57);
}
void __init early_cpu_init(void)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 37672d299e35..38216f678fc3 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -47,6 +47,16 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern u32 get_scattered_cpuid_leaf(unsigned int level,
+ unsigned int sub_leaf,
+ enum cpuid_regs_idx reg);
+extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+
+extern void detect_num_cpu_cores(struct cpuinfo_x86 *c);
+extern int detect_extended_topology(struct cpuinfo_x86 *c);
+extern void detect_ht(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 577e7f7ae273..eb75564f2d25 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -456,24 +456,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-/*
- * find out the number of processor cores on the die
- */
-static int intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
- unsigned int eax, ebx, ecx, edx;
-
- if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4)
- return 1;
-
- /* Intel has a non-standard dependency on %ecx for this CPUID level. */
- cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
- if (eax & 0x1f)
- return (eax >> 26) + 1;
- else
- return 1;
-}
-
static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
{
/* Intel VMX MSR indicated features */
@@ -656,8 +638,6 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
static void init_intel(struct cpuinfo_x86 *c)
{
- unsigned int l2 = 0;
-
early_init_intel(c);
intel_workarounds(c);
@@ -674,19 +654,13 @@ static void init_intel(struct cpuinfo_x86 *c)
* let's use the legacy cpuid vector 0x1 and 0x4 for topology
* detection.
*/
- c->x86_max_cores = intel_num_cpu_cores(c);
+ detect_num_cpu_cores(c);
#ifdef CONFIG_X86_32
detect_ht(c);
#endif
}
- l2 = init_intel_cacheinfo(c);
-
- /* Detect legacy cache sizes if init_intel_cacheinfo did not */
- if (l2 == 0) {
- cpu_detect_cache_sizes(c);
- l2 = c->x86_cache_size;
- }
+ init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
@@ -699,7 +673,8 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (boot_cpu_has(X86_FEATURE_DS)) {
- unsigned int l1;
+ unsigned int l1, l2;
+
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
if (!(l1 & (1<<11)))
set_cpu_cap(c, X86_FEATURE_BTS);
@@ -727,6 +702,7 @@ static void init_intel(struct cpuinfo_x86 *c)
* Dixon is NOT a Celeron.
*/
if (c->x86 == 6) {
+ unsigned int l2 = c->x86_cache_size;
char *p = NULL;
switch (c->x86_model) {
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 589b948e6e01..24bfa63e86cf 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,8 +33,8 @@
#include <asm/intel_rdt_sched.h>
#include "intel_rdt.h"
-#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
+#define MBA_MAX_MBPS U32_MAX
/* Mutex to protect rdtgroup access. */
DEFINE_MUTEX(rdtgroup_mutex);
@@ -178,7 +178,7 @@ struct rdt_resource rdt_resources_all[] = {
.msr_update = mba_wrmsr,
.cache_level = 3,
.parse_ctrlval = parse_bw,
- .format_str = "%d=%*d",
+ .format_str = "%d=%*u",
.fflags = RFTYPE_RES_MB,
},
};
@@ -230,6 +230,14 @@ static inline void cache_alloc_hsw_probe(void)
rdt_alloc_capable = true;
}
+bool is_mba_sc(struct rdt_resource *r)
+{
+ if (!r)
+ return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc;
+
+ return r->membw.mba_sc;
+}
+
/*
* rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
* exposed to user interface and the h/w understandable delay values.
@@ -341,7 +349,7 @@ static int get_cache_id(int cpu, int level)
* that can be written to QOS_MSRs.
* There are currently no SKUs which support non linear delay values.
*/
-static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
{
if (r->membw.delay_linear)
return MAX_MBA_BW - bw;
@@ -431,25 +439,40 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
return NULL;
}
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
+{
+ int i;
+
+ /*
+ * Initialize the Control MSRs to having no control.
+ * For Cache Allocation: Set all bits in cbm
+ * For Memory Allocation: Set b/w requested to 100%
+ * and the bandwidth in MBps to U32_MAX
+ */
+ for (i = 0; i < r->num_closid; i++, dc++, dm++) {
+ *dc = r->default_ctrl;
+ *dm = MBA_MAX_MBPS;
+ }
+}
+
static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
{
struct msr_param m;
- u32 *dc;
- int i;
+ u32 *dc, *dm;
dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);
if (!dc)
return -ENOMEM;
- d->ctrl_val = dc;
+ dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL);
+ if (!dm) {
+ kfree(dc);
+ return -ENOMEM;
+ }
- /*
- * Initialize the Control MSRs to having no control.
- * For Cache Allocation: Set all bits in cbm
- * For Memory Allocation: Set b/w requested to 100
- */
- for (i = 0; i < r->num_closid; i++, dc++)
- *dc = r->default_ctrl;
+ d->ctrl_val = dc;
+ d->mbps_val = dm;
+ setup_default_ctrlval(r, dc, dm);
m.low = 0;
m.high = r->num_closid;
@@ -588,6 +611,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
}
kfree(d->ctrl_val);
+ kfree(d->mbps_val);
kfree(d->rmid_busy_llc);
kfree(d->mbm_total);
kfree(d->mbm_local);
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 3fd7a70ee04a..39752825e376 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -28,6 +28,7 @@
#define MBM_CNTR_WIDTH 24
#define MBM_OVERFLOW_INTERVAL 1000
+#define MAX_MBA_BW 100u
#define RMID_VAL_ERROR BIT_ULL(63)
#define RMID_VAL_UNAVAIL BIT_ULL(62)
@@ -180,10 +181,20 @@ struct rftype {
* struct mbm_state - status for each MBM counter in each domain
* @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
* @prev_msr Value of IA32_QM_CTR for this RMID last time we read it
+ * @chunks_bw Total local data moved. Used for bandwidth calculation
+ * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
+ * @prev_bw The most recent bandwidth in MBps
+ * @delta_bw Difference between the current and previous bandwidth
+ * @delta_comp Indicates whether to compute the delta_bw
*/
struct mbm_state {
u64 chunks;
u64 prev_msr;
+ u64 chunks_bw;
+ u64 prev_bw_msr;
+ u32 prev_bw;
+ u32 delta_bw;
+ bool delta_comp;
};
/**
@@ -202,6 +213,7 @@ struct mbm_state {
* @cqm_work_cpu:
* worker cpu for CQM h/w counters
* @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps
* @new_ctrl: new ctrl value to be loaded
* @have_new_ctrl: did user provide new_ctrl for this domain
*/
@@ -217,6 +229,7 @@ struct rdt_domain {
int mbm_work_cpu;
int cqm_work_cpu;
u32 *ctrl_val;
+ u32 *mbps_val;
u32 new_ctrl;
bool have_new_ctrl;
};
@@ -259,6 +272,7 @@ struct rdt_cache {
* @min_bw: Minimum memory bandwidth percentage user can request
* @bw_gran: Granularity at which the memory bandwidth is allocated
* @delay_linear: True if memory B/W delay is in linear scale
+ * @mba_sc: True if MBA software controller(mba_sc) is enabled
* @mb_map: Mapping of memory B/W percentage to memory B/W delay
*/
struct rdt_membw {
@@ -266,6 +280,7 @@ struct rdt_membw {
u32 min_bw;
u32 bw_gran;
u32 delay_linear;
+ bool mba_sc;
u32 *mb_map;
};
@@ -445,6 +460,9 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
void mbm_setup_overflow_handler(struct rdt_domain *dom,
unsigned long delay_ms);
void mbm_handle_overflow(struct work_struct *work);
+bool is_mba_sc(struct rdt_resource *r);
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
void cqm_handle_limbo(struct work_struct *work);
bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 23e1d5c249c6..116d57b248d3 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -53,7 +53,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return false;
}
- if (bw < r->membw.min_bw || bw > r->default_ctrl) {
+ if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
+ !is_mba_sc(r)) {
rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
r->membw.min_bw, r->default_ctrl);
return false;
@@ -179,6 +180,8 @@ static int update_domains(struct rdt_resource *r, int closid)
struct msr_param msr_param;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
+ bool mba_sc;
+ u32 *dc;
int cpu;
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
@@ -188,13 +191,20 @@ static int update_domains(struct rdt_resource *r, int closid)
msr_param.high = msr_param.low + 1;
msr_param.res = r;
+ mba_sc = is_mba_sc(r);
list_for_each_entry(d, &r->domains, list) {
- if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) {
+ dc = !mba_sc ? d->ctrl_val : d->mbps_val;
+ if (d->have_new_ctrl && d->new_ctrl != dc[closid]) {
cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
- d->ctrl_val[closid] = d->new_ctrl;
+ dc[closid] = d->new_ctrl;
}
}
- if (cpumask_empty(cpu_mask))
+
+ /*
+ * Avoid writing the control msr with control values when
+ * MBA software controller is enabled
+ */
+ if (cpumask_empty(cpu_mask) || mba_sc)
goto done;
cpu = get_cpu();
/* Update CBM on this cpu if it's in cpu_mask. */
@@ -282,13 +292,17 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
{
struct rdt_domain *dom;
bool sep = false;
+ u32 ctrl_val;
seq_printf(s, "%*s:", max_name_width, r->name);
list_for_each_entry(dom, &r->domains, list) {
if (sep)
seq_puts(s, ";");
+
+ ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] :
+ dom->mbps_val[closid]);
seq_printf(s, r->format_str, dom->id, max_data_width,
- dom->ctrl_val[closid]);
+ ctrl_val);
sep = true;
}
seq_puts(s, "\n");
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 681450eee428..b0f3aed76b75 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -225,10 +225,18 @@ void free_rmid(u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
+{
+ u64 shift = 64 - MBM_CNTR_WIDTH, chunks;
+
+ chunks = (cur_msr << shift) - (prev_msr << shift);
+ return chunks >>= shift;
+}
+
static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{
- u64 chunks, shift, tval;
struct mbm_state *m;
+ u64 chunks, tval;
tval = __rmid_read(rmid, rr->evtid);
if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
@@ -254,14 +262,12 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
}
if (rr->first) {
- m->prev_msr = tval;
- m->chunks = 0;
+ memset(m, 0, sizeof(struct mbm_state));
+ m->prev_bw_msr = m->prev_msr = tval;
return 0;
}
- shift = 64 - MBM_CNTR_WIDTH;
- chunks = (tval << shift) - (m->prev_msr << shift);
- chunks >>= shift;
+ chunks = mbm_overflow_count(m->prev_msr, tval);
m->chunks += chunks;
m->prev_msr = tval;
@@ -270,6 +276,32 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
}
/*
+ * Supporting function to calculate the memory bandwidth
+ * and delta bandwidth in MBps.
+ */
+static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct mbm_state *m = &rr->d->mbm_local[rmid];
+ u64 tval, cur_bw, chunks;
+
+ tval = __rmid_read(rmid, rr->evtid);
+ if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ chunks = mbm_overflow_count(m->prev_bw_msr, tval);
+ m->chunks_bw += chunks;
+ m->chunks = m->chunks_bw;
+ cur_bw = (chunks * r->mon_scale) >> 20;
+
+ if (m->delta_comp)
+ m->delta_bw = abs(cur_bw - m->prev_bw);
+ m->delta_comp = false;
+ m->prev_bw = cur_bw;
+ m->prev_bw_msr = tval;
+}
+
+/*
* This is called via IPI to read the CQM/MBM counters
* on a domain.
*/
@@ -297,6 +329,118 @@ void mon_event_count(void *info)
}
}
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ * current bandwdith(cur_bw) < user specified bandwidth(user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid uncecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth where as MBM measures the
+ * L3 external bandwidth the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
+ * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
+ * after some time rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values. To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+ u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+ struct mbm_state *pmbm_data, *cmbm_data;
+ u32 cur_bw, delta_bw, user_bw;
+ struct rdt_resource *r_mba;
+ struct rdt_domain *dom_mba;
+ struct list_head *head;
+ struct rdtgroup *entry;
+
+ r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+ closid = rgrp->closid;
+ rmid = rgrp->mon.rmid;
+ pmbm_data = &dom_mbm->mbm_local[rmid];
+
+ dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+ if (!dom_mba) {
+ pr_warn_once("Failure to get domain for MBA update\n");
+ return;
+ }
+
+ cur_bw = pmbm_data->prev_bw;
+ user_bw = dom_mba->mbps_val[closid];
+ delta_bw = pmbm_data->delta_bw;
+ cur_msr_val = dom_mba->ctrl_val[closid];
+
+ /*
+ * For Ctrl groups read data from child monitor groups.
+ */
+ head = &rgrp->mon.crdtgrp_list;
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
+ cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+ cur_bw += cmbm_data->prev_bw;
+ delta_bw += cmbm_data->delta_bw;
+ }
+
+ /*
+ * Scale up/down the bandwidth linearly for the ctrl group. The
+ * bandwidth step is the bandwidth granularity specified by the
+ * hardware.
+ *
+ * The delta_bw is used when increasing the bandwidth so that we
+ * dont alternately increase and decrease the control values
+ * continuously.
+ *
+ * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
+ * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
+ * switching between 90 and 110 continuously if we only check
+ * cur_bw < user_bw.
+ */
+ if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+ new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+ } else if (cur_msr_val < MAX_MBA_BW &&
+ (user_bw > (cur_bw + delta_bw))) {
+ new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+ } else {
+ return;
+ }
+
+ cur_msr = r_mba->msr_base + closid;
+ wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
+ dom_mba->ctrl_val[closid] = new_msr_val;
+
+ /*
+ * Delta values are updated dynamically package wise for each
+ * rdtgrp everytime the throttle MSR changes value.
+ *
+ * This is because (1)the increase in bandwidth is not perfectly
+ * linear and only "approximately" linear even when the hardware
+ * says it is linear.(2)Also since MBA is a core specific
+ * mechanism, the delta values vary based on number of cores used
+ * by the rdtgrp.
+ */
+ pmbm_data->delta_comp = true;
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
+ cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+ cmbm_data->delta_comp = true;
+ }
+}
+
static void mbm_update(struct rdt_domain *d, int rmid)
{
struct rmid_read rr;
@@ -314,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
}
if (is_mbm_local_enabled()) {
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
- __mon_event_count(rmid, &rr);
+
+ /*
+ * Call the MBA software controller only for the
+ * control groups and when user has enabled
+ * the software controller explicitly.
+ */
+ if (!is_mba_sc(NULL))
+ __mon_event_count(rmid, &rr);
+ else
+ mbm_bw_count(rmid, &rr);
}
}
@@ -385,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list)
mbm_update(d, crgrp->mon.rmid);
+
+ if (is_mba_sc(NULL))
+ update_mba_bw(prgrp, d);
}
schedule_delayed_work_on(cpu, &d->mbm_over, delay);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index fca759d272a1..749856a2e736 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -1005,6 +1005,11 @@ static void l2_qos_cfg_update(void *arg)
wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
}
+static inline bool is_mba_linear(void)
+{
+ return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
+}
+
static int set_cache_qos_cfg(int level, bool enable)
{
void (*update)(void *arg);
@@ -1041,6 +1046,28 @@ static int set_cache_qos_cfg(int level, bool enable)
return 0;
}
+/*
+ * Enable or disable the MBA software controller
+ * which helps user specify bandwidth in MBps.
+ * MBA software controller is supported only if
+ * MBM is supported and MBA is in linear scale.
+ */
+static int set_mba_sc(bool mba_sc)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
+ struct rdt_domain *d;
+
+ if (!is_mbm_enabled() || !is_mba_linear() ||
+ mba_sc == is_mba_sc(r))
+ return -EINVAL;
+
+ r->membw.mba_sc = mba_sc;
+ list_for_each_entry(d, &r->domains, list)
+ setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
+
+ return 0;
+}
+
static int cdp_enable(int level, int data_type, int code_type)
{
struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
@@ -1123,6 +1150,10 @@ static int parse_rdtgroupfs_options(char *data)
ret = cdpl2_enable();
if (ret)
goto out;
+ } else if (!strcmp(token, "mba_MBps")) {
+ ret = set_mba_sc(true);
+ if (ret)
+ goto out;
} else {
ret = -EINVAL;
goto out;
@@ -1445,6 +1476,8 @@ static void rdt_kill_sb(struct super_block *sb)
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
+ set_mba_sc(false);
+
/*Put everything back to default values. */
for_each_alloc_enabled_rdt_resource(r)
reset_all_ctrls(r);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 475cb4f5f14f..c805a06e14c3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -48,7 +48,7 @@ static struct dentry *dfs_inj;
static u8 n_banks;
-#define MAX_FLAG_OPT_SIZE 3
+#define MAX_FLAG_OPT_SIZE 4
#define NBCFG 0x44
enum injection_type {
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 42cf2880d0ed..cd76380af79f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1727,6 +1727,21 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
}
}
+static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ /*
+ * All newer Centaur CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
+ c->x86 > 6) {
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+ }
+}
+
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
@@ -1739,6 +1754,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_amd_feature_init(c);
break;
}
+ case X86_VENDOR_CENTAUR:
+ mce_centaur_feature_init(c);
+ break;
default:
break;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index c8e038800591..f591b01930db 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -436,8 +436,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
-static u32 smca_get_block_address(unsigned int cpu, unsigned int bank,
- unsigned int block)
+static u32 smca_get_block_address(unsigned int bank, unsigned int block)
{
u32 low, high;
u32 addr = 0;
@@ -456,13 +455,13 @@ static u32 smca_get_block_address(unsigned int cpu, unsigned int bank,
* For SMCA enabled processors, BLKPTR field of the first MISC register
* (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
*/
- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
goto out;
if (!(low & MCI_CONFIG_MCAX))
goto out;
- if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+ if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
@@ -471,7 +470,7 @@ out:
return addr;
}
-static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high,
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block)
{
u32 addr = 0, offset = 0;
@@ -480,7 +479,7 @@ static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 hi
return addr;
if (mce_flags.smca)
- return smca_get_block_address(cpu, bank, block);
+ return smca_get_block_address(bank, block);
/* Fall back to method we used for older processors: */
switch (block) {
@@ -558,7 +557,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
smca_configure(bank, cpu);
for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(cpu, address, low, high, bank, block);
+ address = get_block_address(address, low, high, bank, block);
if (!address)
break;
@@ -1175,7 +1174,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
if (err)
goto out_free;
recurse:
- address = get_block_address(cpu, address, low, high, bank, ++block);
+ address = get_block_address(address, low, high, bank, ++block);
if (!address)
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index ad9e5ed81181..2ad9107ee980 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
-obj-y := main.o if.o generic.o cleanup.o
+obj-y := mtrr.o if.o generic.o cleanup.o
obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 7468de429087..9a19c800fe40 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -46,6 +46,7 @@
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>
+#include <linux/rcupdate.h>
#include <asm/cpufeature.h>
#include <asm/e820/api.h>
@@ -100,7 +101,7 @@ static int have_wrcomb(void)
if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
dev->revision <= 5) {
- pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+ pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");
pci_dev_put(dev);
return 0;
}
@@ -110,7 +111,7 @@ static int have_wrcomb(void)
*/
if (dev->vendor == PCI_VENDOR_ID_INTEL &&
dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
- pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
+ pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");
pci_dev_put(dev);
return 0;
}
@@ -312,24 +313,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,
return error;
if (type >= MTRR_NUM_TYPES) {
- pr_warn("mtrr: type: %u invalid\n", type);
+ pr_warn("type: %u invalid\n", type);
return -EINVAL;
}
/* If the type is WC, check that this processor supports it */
if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
- pr_warn("mtrr: your processor doesn't support write-combining\n");
+ pr_warn("your processor doesn't support write-combining\n");
return -ENOSYS;
}
if (!size) {
- pr_warn("mtrr: zero sized request\n");
+ pr_warn("zero sized request\n");
return -EINVAL;
}
if ((base | (base + size - 1)) >>
(boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
- pr_warn("mtrr: base or size exceeds the MTRR width\n");
+ pr_warn("base or size exceeds the MTRR width\n");
return -EINVAL;
}
@@ -360,8 +361,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
} else if (types_compatible(type, ltype))
continue;
}
- pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing"
- " 0x%lx000,0x%lx000\n", base, size, lbase,
+ pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,
lsize);
goto out;
}
@@ -369,7 +369,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
if (ltype != type) {
if (types_compatible(type, ltype))
continue;
- pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
+ pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",
base, size, mtrr_attrib_to_str(ltype),
mtrr_attrib_to_str(type));
goto out;
@@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
}
}
} else {
- pr_info("mtrr: no more MTRRs available\n");
+ pr_info("no more MTRRs available\n");
}
error = i;
out:
@@ -407,8 +407,8 @@ int mtrr_add_page(unsigned long base, unsigned long size,
static int mtrr_check(unsigned long base, unsigned long size)
{
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
- pr_warn("mtrr: size and base must be multiples of 4 kiB\n");
- pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
+ pr_warn("size and base must be multiples of 4 kiB\n");
+ pr_debug("size: 0x%lx base: 0x%lx\n", size, base);
dump_stack();
return -1;
}
@@ -499,22 +499,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
}
}
if (reg < 0) {
- pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
+ pr_debug("no MTRR for %lx000,%lx000 found\n",
base, size);
goto out;
}
}
if (reg >= max) {
- pr_warn("mtrr: register: %d too big\n", reg);
+ pr_warn("register: %d too big\n", reg);
goto out;
}
mtrr_if->get(reg, &lbase, &lsize, &ltype);
if (lsize < 1) {
- pr_warn("mtrr: MTRR %d not used\n", reg);
+ pr_warn("MTRR %d not used\n", reg);
goto out;
}
if (mtrr_usage_table[reg] < 1) {
- pr_warn("mtrr: reg: %d has count=0\n", reg);
+ pr_warn("reg: %d has count=0\n", reg);
goto out;
}
if (--mtrr_usage_table[reg] < 1)
@@ -775,7 +775,7 @@ void __init mtrr_bp_init(void)
}
if (!mtrr_enabled()) {
- pr_info("MTRR: Disabled\n");
+ pr_info("Disabled\n");
/*
* PAT initialization relies on MTRR's rendezvous handler.
@@ -793,6 +793,9 @@ void mtrr_ap_init(void)
if (!use_intel() || mtrr_aps_delayed_init)
return;
+
+ rcu_cpu_starting(smp_processor_id());
+
/*
* Ideally we should hold mtrr_mutex here to avoid mtrr entries
* changed, but this routine will be called in cpu boot time,
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index b099024d339c..81c0afb39d0a 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -27,7 +27,7 @@
* exists, use it for populating initial_apicid and cpu topology
* detection.
*/
-void detect_extended_topology(struct cpuinfo_x86 *c)
+int detect_extended_topology(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
unsigned int eax, ebx, ecx, edx, sub_index;
@@ -36,7 +36,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
static bool printed;
if (c->cpuid_level < 0xb)
- return;
+ return -1;
cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
@@ -44,7 +44,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
* check if the cpuid leaf 0xb is actually implemented.
*/
if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
- return;
+ return -1;
set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
@@ -95,6 +95,6 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
c->cpu_core_id);
printed = 1;
}
- return;
#endif
+ return 0;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 18fa9d74c182..666a284116ac 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,11 +22,14 @@
#include <asm/stacktrace.h>
#include <asm/unwind.h>
+#define OPCODE_BUFSIZE 64
+
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
-static unsigned int code_bytes = 64;
static int die_counter;
+static struct pt_regs exec_summary_regs;
+
bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info)
{
@@ -69,9 +72,62 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
+/*
+ * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus:
+ *
+ * In case where we don't have the exact kernel image (which, if we did, we can
+ * simply disassemble and navigate to the RIP), the purpose of the bigger
+ * prologue is to have more context and to be able to correlate the code from
+ * the different toolchains better.
+ *
+ * In addition, it helps in recreating the register allocation of the failing
+ * kernel and thus make sense of the register dump.
+ *
+ * What is more, the additional complication of a variable length insn arch like
+ * x86 warrants having longer byte sequence before rIP so that the disassembler
+ * can "sync" up properly and find instruction boundaries when decoding the
+ * opcode bytes.
+ *
+ * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random
+ * guesstimate in attempt to achieve all of the above.
+ */
+void show_opcodes(u8 *rip, const char *loglvl)
+{
+ unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3;
+ u8 opcodes[OPCODE_BUFSIZE];
+ u8 *ip;
+ int i;
+
+ printk("%sCode: ", loglvl);
+
+ ip = (u8 *)rip - code_prologue;
+ if (probe_kernel_read(opcodes, ip, OPCODE_BUFSIZE)) {
+ pr_cont("Bad RIP value.\n");
+ return;
+ }
+
+ for (i = 0; i < OPCODE_BUFSIZE; i++, ip++) {
+ if (ip == rip)
+ pr_cont("<%02x> ", opcodes[i]);
+ else
+ pr_cont("%02x ", opcodes[i]);
+ }
+ pr_cont("\n");
+}
+
+void show_ip(struct pt_regs *regs, const char *loglvl)
+{
+#ifdef CONFIG_X86_32
+ printk("%sEIP: %pS\n", loglvl, (void *)regs->ip);
+#else
+ printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
+#endif
+ show_opcodes((u8 *)regs->ip, loglvl);
+}
+
void show_iret_regs(struct pt_regs *regs)
{
- printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+ show_ip(regs, KERN_DEFAULT);
printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
regs->sp, regs->flags);
}
@@ -267,7 +323,6 @@ unsigned long oops_begin(void)
bust_spinlocks(1);
return flags;
}
-EXPORT_SYMBOL_GPL(oops_begin);
NOKPROBE_SYMBOL(oops_begin);
void __noreturn rewind_stack_do_exit(int signr);
@@ -287,6 +342,9 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
raw_local_irq_restore(flags);
oops_exit();
+ /* Executive summary in case the oops scrolled away */
+ __show_regs(&exec_summary_regs, true);
+
if (!signr)
return;
if (in_interrupt())
@@ -305,10 +363,10 @@ NOKPROBE_SYMBOL(oops_end);
int __die(const char *str, struct pt_regs *regs, long err)
{
-#ifdef CONFIG_X86_32
- unsigned short ss;
- unsigned long sp;
-#endif
+ /* Save the regs of the first oops for the executive summary later. */
+ if (!die_counter)
+ exec_summary_regs = *regs;
+
printk(KERN_DEFAULT
"%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
@@ -318,26 +376,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
+ show_regs(regs);
+ print_modules();
+
if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
return 1;
- print_modules();
- show_regs(regs);
-#ifdef CONFIG_X86_32
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss;
- } else {
- sp = kernel_stack_pointer(regs);
- savesegment(ss, ss);
- }
- printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n",
- (void *)regs->ip, ss, sp);
-#else
- /* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp);
-#endif
return 0;
}
NOKPROBE_SYMBOL(__die);
@@ -356,30 +401,9 @@ void die(const char *str, struct pt_regs *regs, long err)
oops_end(flags, regs, sig);
}
-static int __init code_bytes_setup(char *s)
-{
- ssize_t ret;
- unsigned long val;
-
- if (!s)
- return -EINVAL;
-
- ret = kstrtoul(s, 0, &val);
- if (ret)
- return ret;
-
- code_bytes = val;
- if (code_bytes > 8192)
- code_bytes = 8192;
-
- return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
-
void show_regs(struct pt_regs *regs)
{
bool all = true;
- int i;
show_regs_print_info(KERN_DEFAULT);
@@ -389,36 +413,8 @@ void show_regs(struct pt_regs *regs)
__show_regs(regs, all);
/*
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
+ * When in-kernel, we also print out the stack at the time of the fault..
*/
- if (!user_mode(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
+ if (!user_mode(regs))
show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
-
- printk(KERN_DEFAULT "Code: ");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- pr_cont(" Bad RIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- pr_cont("<%02x> ", c);
- else
- pr_cont("%02x ", c);
- }
- }
- pr_cont("\n");
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 6a2cb1442e05..d1f25c831447 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -155,7 +155,8 @@ static void __init __e820__range_add(struct e820_table *table, u64 start, u64 si
int x = table->nr_entries;
if (x >= ARRAY_SIZE(table->entries)) {
- pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1);
+ pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
+ start, start + size - 1);
return;
}
@@ -190,9 +191,10 @@ void __init e820__print_table(char *who)
int i;
for (i = 0; i < e820_table->nr_entries; i++) {
- pr_info("%s: [mem %#018Lx-%#018Lx] ", who,
- e820_table->entries[i].addr,
- e820_table->entries[i].addr + e820_table->entries[i].size - 1);
+ pr_info("%s: [mem %#018Lx-%#018Lx] ",
+ who,
+ e820_table->entries[i].addr,
+ e820_table->entries[i].addr + e820_table->entries[i].size - 1);
e820_print_type(e820_table->entries[i].type);
pr_cont("\n");
@@ -574,7 +576,7 @@ void __init e820__update_table_print(void)
if (e820__update_table(e820_table))
return;
- pr_info("e820: modified physical RAM map:\n");
+ pr_info("modified physical RAM map:\n");
e820__print_table("modified");
}
@@ -636,9 +638,8 @@ __init void e820__setup_pci_gap(void)
if (!found) {
#ifdef CONFIG_X86_64
gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
- pr_err(
- "e820: Cannot find an available gap in the 32-bit address range\n"
- "e820: PCI devices with unassigned 32-bit BARs may not work!\n");
+ pr_err("Cannot find an available gap in the 32-bit address range\n");
+ pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
#else
gapstart = 0x10000000;
#endif
@@ -649,7 +650,8 @@ __init void e820__setup_pci_gap(void)
*/
pci_mem_start = gapstart;
- pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1);
+ pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
+ gapstart, gapstart + gapsize - 1);
}
/*
@@ -711,7 +713,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
early_memunmap(sdata, data_len);
- pr_info("e820: extended physical RAM map:\n");
+ pr_info("extended physical RAM map:\n");
e820__print_table("extended");
}
@@ -780,7 +782,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
if (addr) {
e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
- pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n");
+ pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
e820__update_table_kexec();
}
@@ -830,8 +832,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
if (last_pfn > max_arch_pfn)
last_pfn = max_arch_pfn;
- pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
- last_pfn, max_arch_pfn);
+ pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
+ last_pfn, max_arch_pfn);
return last_pfn;
}
@@ -1005,7 +1007,7 @@ void __init e820__finish_early_params(void)
if (e820__update_table(e820_table) < 0)
early_panic("Invalid user supplied memory map");
- pr_info("e820: user-defined physical RAM map:\n");
+ pr_info("user-defined physical RAM map:\n");
e820__print_table("user");
}
}
@@ -1238,7 +1240,7 @@ void __init e820__memory_setup(void)
memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
- pr_info("e820: BIOS-provided physical RAM map:\n");
+ pr_info("BIOS-provided physical RAM map:\n");
e820__print_table(who);
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index bae0d32e327b..da5d8ac60062 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -28,8 +28,6 @@
#include <asm/irq_remapping.h>
#include <asm/early_ioremap.h>
-#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg)
-
static void __init fix_hypertransport_config(int num, int slot, int func)
{
u32 htcfg;
@@ -617,7 +615,8 @@ static void __init apple_airport_reset(int bus, int slot, int func)
pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
- dev_err("Cannot power up Apple AirPort card\n");
+ pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n",
+ bus, slot, func);
return;
}
}
@@ -628,7 +627,8 @@ static void __init apple_airport_reset(int bus, int slot, int func)
mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
if (!mmio) {
- dev_err("Cannot iomap Apple AirPort card\n");
+ pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n",
+ bus, slot, func);
return;
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2d29e47c056e..a21d6ace648e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -6,6 +6,10 @@
*/
#define DISABLE_BRANCH_PROFILING
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
@@ -32,11 +36,6 @@
#include <asm/microcode.h>
#include <asm/kasan.h>
-#ifdef CONFIG_X86_5LEVEL
-#undef pgtable_l5_enabled
-#define pgtable_l5_enabled __pgtable_l5_enabled
-#endif
-
/*
* Manage page tables very early on.
*/
@@ -45,8 +44,7 @@ static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
#ifdef CONFIG_X86_5LEVEL
-unsigned int __pgtable_l5_enabled __ro_after_init;
-EXPORT_SYMBOL(__pgtable_l5_enabled);
+unsigned int __pgtable_l5_enabled __initdata;
unsigned int pgdir_shift __ro_after_init = 39;
EXPORT_SYMBOL(pgdir_shift);
unsigned int ptrs_per_p4d __ro_after_init = 1;
@@ -82,13 +80,14 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
static bool __head check_la57_support(unsigned long physaddr)
{
- if (native_cpuid_eax(0) < 7)
- return false;
-
- if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+ /*
+ * 5-level paging is detected and enabled at kernel decomression
+ * stage. Only check if it has been enabled there.
+ */
+ if (!(native_read_cr4() & X86_CR4_LA57))
return false;
- *fixup_int(&pgtable_l5_enabled, physaddr) = 1;
+ *fixup_int(&__pgtable_l5_enabled, physaddr) = 1;
*fixup_int(&pgdir_shift, physaddr) = 48;
*fixup_int(&ptrs_per_p4d, physaddr) = 512;
*fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
@@ -281,7 +280,7 @@ again:
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
- if (!pgtable_l5_enabled)
+ if (!pgtable_l5_enabled())
p4d_p = pgd_p;
else if (pgd)
p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 8ce4212e2b8d..b6be34ee88e9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -975,8 +975,7 @@ int __init hpet_enable(void)
cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
hpet_writel(cfg, HPET_CFG);
if (cfg)
- pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
- cfg);
+ pr_warn("Unrecognized bits %#x set in global cfg\n", cfg);
for (i = 0; i <= last; ++i) {
cfg = hpet_readl(HPET_Tn_CFG(i));
@@ -988,7 +987,7 @@ int __init hpet_enable(void)
| HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
| HPET_TN_FSB | HPET_TN_FSB_CAP);
if (cfg)
- pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
+ pr_warn("Unrecognized bits %#x set in cfg#%u\n",
cfg, i);
}
hpet_print_config();
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index a15fe0e92cf9..108c48d0d40e 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -37,7 +37,7 @@ static uint32_t __init jailhouse_detect(void)
return jailhouse_cpuid_base();
}
-static void jailhouse_get_wallclock(struct timespec *now)
+static void jailhouse_get_wallclock(struct timespec64 *now)
{
memset(now, 0, sizeof(*now));
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 8b26c9e01cc4..bf8d1eb7fca3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -53,7 +53,7 @@ static struct pvclock_wall_clock *wall_clock;
* have elapsed since the hypervisor wrote the data. So we try to account for
* that with system time
*/
-static void kvm_get_wallclock(struct timespec *now)
+static void kvm_get_wallclock(struct timespec64 *now)
{
struct pvclock_vcpu_time_info *vcpu_time;
int low, high;
@@ -72,7 +72,7 @@ static void kvm_get_wallclock(struct timespec *now)
put_cpu();
}
-static int kvm_set_wallclock(const struct timespec *now)
+static int kvm_set_wallclock(const struct timespec64 *now)
{
return -ENODEV;
}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6010449ca6d2..4c8acdfdc5a7 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -354,7 +354,8 @@ void arch_crash_save_vmcoreinfo(void)
{
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
- VMCOREINFO_NUMBER(pgtable_l5_enabled);
+ vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
+ pgtable_l5_enabled());
#ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 77625b60a510..ab5d9dd668d2 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -15,13 +15,11 @@
#include <asm/x86_init.h>
#include <asm/iommu_table.h>
-static int forbid_dac __read_mostly;
+static bool disable_dac_quirk __read_mostly;
const struct dma_map_ops *dma_ops = &dma_direct_ops;
EXPORT_SYMBOL(dma_ops);
-static int iommu_sac_force __read_mostly;
-
#ifdef CONFIG_IOMMU_DEBUG
int panic_on_overflow __read_mostly = 1;
int force_iommu __read_mostly = 1;
@@ -55,9 +53,6 @@ struct device x86_dma_fallback_dev = {
};
EXPORT_SYMBOL(x86_dma_fallback_dev);
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES 65536
-
void __init pci_iommu_alloc(void)
{
struct iommu_table_entry *p;
@@ -76,7 +71,7 @@ void __init pci_iommu_alloc(void)
}
}
-bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
+bool arch_dma_alloc_attrs(struct device **dev)
{
if (!*dev)
*dev = &x86_dma_fallback_dev;
@@ -125,13 +120,13 @@ static __init int iommu_setup(char *p)
if (!strncmp(p, "nomerge", 7))
iommu_merge = 0;
if (!strncmp(p, "forcesac", 8))
- iommu_sac_force = 1;
+ pr_warn("forcesac option ignored.\n");
if (!strncmp(p, "allowdac", 8))
- forbid_dac = 0;
+ pr_warn("allowdac option ignored.\n");
if (!strncmp(p, "nodac", 5))
- forbid_dac = 1;
+ pr_warn("nodac option ignored.\n");
if (!strncmp(p, "usedac", 6)) {
- forbid_dac = -1;
+ disable_dac_quirk = true;
return 1;
}
#ifdef CONFIG_SWIOTLB
@@ -156,40 +151,9 @@ static __init int iommu_setup(char *p)
}
early_param("iommu", iommu_setup);
-int arch_dma_supported(struct device *dev, u64 mask)
-{
-#ifdef CONFIG_PCI
- if (mask > 0xffffffff && forbid_dac > 0) {
- dev_info(dev, "PCI: Disallowing DAC for device\n");
- return 0;
- }
-#endif
-
- /* Tell the device to use SAC when IOMMU force is on. This
- allows the driver to use cheaper accesses in some cases.
-
- Problem with this is that if we overflow the IOMMU area and
- return DAC as fallback address the device may not handle it
- correctly.
-
- As a special case some controllers have a 39bit address
- mode that is as efficient as 32bit (aic79xx). Don't force
- SAC for these. Assume all masks <= 40 bits are of this
- type. Normally this doesn't make any difference, but gives
- more gentle handling of IOMMU overflow. */
- if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
- dev_info(dev, "Force SAC with mask %Lx\n", mask);
- return 0;
- }
-
- return 1;
-}
-EXPORT_SYMBOL(arch_dma_supported);
-
static int __init pci_iommu_init(void)
{
struct iommu_table_entry *p;
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
#ifdef CONFIG_PCI
dma_debug_add_bus(&pci_bus_type);
@@ -209,11 +173,17 @@ rootfs_initcall(pci_iommu_init);
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+static int via_no_dac_cb(struct pci_dev *pdev, void *data)
+{
+ pdev->dev.dma_32bit_limit = true;
+ return 0;
+}
+
static void via_no_dac(struct pci_dev *dev)
{
- if (forbid_dac == 0) {
+ if (!disable_dac_quirk) {
dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
- forbid_dac = 1;
+ pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL);
}
}
DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index e47b2dbbdef3..c06c4c16c6b6 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -151,17 +151,19 @@ void perf_get_regs_user(struct perf_regs *regs_user,
regs_user_copy->sp = user_regs->sp;
regs_user_copy->cs = user_regs->cs;
regs_user_copy->ss = user_regs->ss;
-
/*
- * Most system calls don't save these registers, don't report them.
+ * Store user space frame-pointer value on sample
+ * to facilitate stack unwinding for cases when
+ * user space executable code has such support
+ * enabled at compile time:
*/
+ regs_user_copy->bp = user_regs->bp;
+
regs_user_copy->bx = -1;
- regs_user_copy->bp = -1;
regs_user_copy->r12 = -1;
regs_user_copy->r13 = -1;
regs_user_copy->r14 = -1;
regs_user_copy->r15 = -1;
-
/*
* For this to be at all useful, we need a reasonable guess for
* the ABI. Be careful: we're in NMI context, and we're
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 5224c6099184..0ae659de21eb 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -76,16 +76,14 @@ void __show_regs(struct pt_regs *regs, int all)
savesegment(gs, gs);
}
- printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
- printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
- raw_smp_processor_id());
+ show_ip(regs, KERN_DEFAULT);
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
regs->si, regs->di, regs->bp, sp);
- printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
- (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+ printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
+ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags);
if (!all)
return;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index ed5c4cdf0a34..e2ee403865eb 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1377,7 +1377,6 @@ static void fill_sigtrap_info(struct task_struct *tsk,
tsk->thread.trap_nr = X86_TRAP_DB;
tsk->thread.error_code = error_code;
- memset(info, 0, sizeof(*info));
info->si_signo = SIGTRAP;
info->si_code = si_code;
info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
@@ -1395,6 +1394,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
{
struct siginfo info;
+ clear_siginfo(&info);
fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
/* Send us the fake SIGTRAP */
force_sig_info(SIGTRAP, &info, tsk);
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 761f6af6efa5..637982efecd8 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -123,28 +123,35 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
struct pvclock_vcpu_time_info *vcpu_time,
- struct timespec *ts)
+ struct timespec64 *ts)
{
u32 version;
u64 delta;
- struct timespec now;
+ struct timespec64 now;
/* get wallclock at system boot */
do {
version = wall_clock->version;
rmb(); /* fetch version before time */
+ /*
+ * Note: wall_clock->sec is a u32 value, so it can
+ * only store dates between 1970 and 2106. To allow
+ * times beyond that, we need to create a new hypercall
+ * interface with an extended pvclock_wall_clock structure
+ * like ARM has.
+ */
now.tv_sec = wall_clock->sec;
now.tv_nsec = wall_clock->nsec;
rmb(); /* fetch time before checking version */
} while ((wall_clock->version & 1) || (version != wall_clock->version));
delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
- delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+ delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
- set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+ set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
}
void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index f7b82ed7b5b5..586f718b8e95 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -39,7 +39,7 @@ EXPORT_SYMBOL(rtc_lock);
* jump to the next second precisely 500 ms later. Check the Motorola
* MC146818A or Dallas DS12887 data sheet for details.
*/
-int mach_set_rtc_mmss(const struct timespec *now)
+int mach_set_rtc_mmss(const struct timespec64 *now)
{
unsigned long long nowtime = now->tv_sec;
struct rtc_time tm;
@@ -60,7 +60,7 @@ int mach_set_rtc_mmss(const struct timespec *now)
return retval;
}
-void mach_get_cmos_time(struct timespec *now)
+void mach_get_cmos_time(struct timespec64 *now)
{
unsigned int status, year, mon, day, hour, min, sec, century = 0;
unsigned long flags;
@@ -118,7 +118,7 @@ void mach_get_cmos_time(struct timespec *now)
} else
year += CMOS_YEARS_OFFS;
- now->tv_sec = mktime(year, mon, day, hour, min, sec);
+ now->tv_sec = mktime64(year, mon, day, hour, min, sec);
now->tv_nsec = 0;
}
@@ -145,13 +145,13 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
}
EXPORT_SYMBOL(rtc_cmos_write);
-int update_persistent_clock(struct timespec now)
+int update_persistent_clock64(struct timespec64 now)
{
return x86_platform.set_wallclock(&now);
}
/* not static: needed by APM */
-void read_persistent_clock(struct timespec *ts)
+void read_persistent_clock64(struct timespec64 *ts)
{
x86_platform.get_wallclock(ts);
}
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 14c057f29979..9ccbf0576cd0 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGFPE != 15);
BUILD_BUG_ON(NSIGSEGV != 7);
BUILD_BUG_ON(NSIGBUS != 5);
- BUILD_BUG_ON(NSIGTRAP != 4);
+ BUILD_BUG_ON(NSIGTRAP != 5);
BUILD_BUG_ON(NSIGCHLD != 6);
BUILD_BUG_ON(NSIGSYS != 1);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9dd324ae4832..c2f7d1d2a5c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -81,13 +81,6 @@
#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
-
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
-
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index a3f15ed545b5..6a78d4b36a79 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
@@ -19,7 +20,6 @@
#include <linux/elf.h>
#include <asm/elf.h>
-#include <asm/compat.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
#include <asm/mpx.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 03f3d7695dac..a535dd64de63 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -299,6 +299,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
NOTIFY_STOP) {
cond_local_irq_enable(regs);
+ clear_siginfo(&info);
do_trap(trapnr, signr, str, regs, error_code,
fill_trap_info(regs, signr, trapnr, &info));
}
@@ -854,6 +855,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
task->thread.trap_nr = trapnr;
task->thread.error_code = error_code;
+ clear_siginfo(&info);
info.si_signo = SIGFPE;
info.si_errno = 0;
info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
@@ -929,6 +931,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
local_irq_enable();
+ clear_siginfo(&info);
info.si_signo = SIGILL;
info.si_errno = 0;
info.si_code = ILL_BADSTK;
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index f44ce0fb3583..ff20b35e98dd 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -278,6 +278,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE;
tsk->thread.trap_nr = X86_TRAP_PF;
+ clear_siginfo(&info);
info.si_signo = SIGSEGV;
info.si_errno = 0;
info.si_code = SEGV_MAPERR;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index c84bb5396958..58d8d800875d 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1083,8 +1083,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
return orig_ret_vaddr;
if (nleft != rasize) {
- pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
- "%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
+ pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
+ current->pid, regs->sp, regs->ip);
force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 795f3a80e576..5e1458f609a1 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -117,11 +117,11 @@ SECTIONS
#ifdef CONFIG_X86_64
. = ALIGN(PAGE_SIZE);
- VMLINUX_SYMBOL(__entry_trampoline_start) = .;
+ __entry_trampoline_start = .;
_entry_trampoline = .;
*(.entry_trampoline)
. = ALIGN(PAGE_SIZE);
- VMLINUX_SYMBOL(__entry_trampoline_end) = .;
+ __entry_trampoline_end = .;
ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8494dbae41b9..d634f0332c0f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3007,6 +3007,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
{
siginfo_t info;
+ clear_siginfo(&info);
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_MCEERR_AR;
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 9a53a06e5a3e..c3b527a9f95d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -184,11 +184,11 @@ ENDPROC(memcpy_orig)
#ifndef CONFIG_UML
/*
- * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
+ * __memcpy_mcsafe - memory copy with machine check exception handling
* Note that we only catch machine checks when reading the source addresses.
* Writes to target are posted and don't generate machine checks.
*/
-ENTRY(memcpy_mcsafe_unrolled)
+ENTRY(__memcpy_mcsafe)
cmpl $8, %edx
/* Less than 8 bytes? Go to byte copy loop */
jb .L_no_whole_words
@@ -204,58 +204,29 @@ ENTRY(memcpy_mcsafe_unrolled)
subl $8, %ecx
negl %ecx
subl %ecx, %edx
-.L_copy_leading_bytes:
+.L_read_leading_bytes:
movb (%rsi), %al
+.L_write_leading_bytes:
movb %al, (%rdi)
incq %rsi
incq %rdi
decl %ecx
- jnz .L_copy_leading_bytes
+ jnz .L_read_leading_bytes
.L_8byte_aligned:
- /* Figure out how many whole cache lines (64-bytes) to copy */
- movl %edx, %ecx
- andl $63, %edx
- shrl $6, %ecx
- jz .L_no_whole_cache_lines
-
- /* Loop copying whole cache lines */
-.L_cache_w0: movq (%rsi), %r8
-.L_cache_w1: movq 1*8(%rsi), %r9
-.L_cache_w2: movq 2*8(%rsi), %r10
-.L_cache_w3: movq 3*8(%rsi), %r11
- movq %r8, (%rdi)
- movq %r9, 1*8(%rdi)
- movq %r10, 2*8(%rdi)
- movq %r11, 3*8(%rdi)
-.L_cache_w4: movq 4*8(%rsi), %r8
-.L_cache_w5: movq 5*8(%rsi), %r9
-.L_cache_w6: movq 6*8(%rsi), %r10
-.L_cache_w7: movq 7*8(%rsi), %r11
- movq %r8, 4*8(%rdi)
- movq %r9, 5*8(%rdi)
- movq %r10, 6*8(%rdi)
- movq %r11, 7*8(%rdi)
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
- decl %ecx
- jnz .L_cache_w0
-
- /* Are there any trailing 8-byte words? */
-.L_no_whole_cache_lines:
movl %edx, %ecx
andl $7, %edx
shrl $3, %ecx
jz .L_no_whole_words
- /* Copy trailing words */
-.L_copy_trailing_words:
+.L_read_words:
movq (%rsi), %r8
- mov %r8, (%rdi)
- leaq 8(%rsi), %rsi
- leaq 8(%rdi), %rdi
+.L_write_words:
+ movq %r8, (%rdi)
+ addq $8, %rsi
+ addq $8, %rdi
decl %ecx
- jnz .L_copy_trailing_words
+ jnz .L_read_words
/* Any trailing bytes? */
.L_no_whole_words:
@@ -264,38 +235,53 @@ ENTRY(memcpy_mcsafe_unrolled)
/* Copy trailing bytes */
movl %edx, %ecx
-.L_copy_trailing_bytes:
+.L_read_trailing_bytes:
movb (%rsi), %al
+.L_write_trailing_bytes:
movb %al, (%rdi)
incq %rsi
incq %rdi
decl %ecx
- jnz .L_copy_trailing_bytes
+ jnz .L_read_trailing_bytes
/* Copy successful. Return zero */
.L_done_memcpy_trap:
xorq %rax, %rax
ret
-ENDPROC(memcpy_mcsafe_unrolled)
-EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
+ENDPROC(__memcpy_mcsafe)
+EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
.section .fixup, "ax"
- /* Return -EFAULT for any failure */
-.L_memcpy_mcsafe_fail:
- mov $-EFAULT, %rax
+ /*
+ * Return number of bytes not copied for any failure. Note that
+ * there is no "tail" handling since the source buffer is 8-byte
+ * aligned and poison is cacheline aligned.
+ */
+.E_read_words:
+ shll $3, %ecx
+.E_leading_bytes:
+ addl %edx, %ecx
+.E_trailing_bytes:
+ mov %ecx, %eax
ret
+ /*
+ * For write fault handling, given the destination is unaligned,
+ * we handle faults on multi-byte writes with a byte-by-byte
+ * copy up to the write-protected page.
+ */
+.E_write_words:
+ shll $3, %ecx
+ addl %edx, %ecx
+ movl %ecx, %edx
+ jmp mcsafe_handle_tail
+
.previous
- _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
- _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
+ _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
+ _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
+ _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
+ _ASM_EXTABLE(.L_write_words, .E_write_words)
+ _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 75d3776123cc..9c5606d88f61 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -23,13 +23,13 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
asm volatile(
" testq %[size8],%[size8]\n"
" jz 4f\n"
- "0: movq %[zero],(%[dst])\n"
- " addq %[eight],%[dst]\n"
+ "0: movq $0,(%[dst])\n"
+ " addq $8,%[dst]\n"
" decl %%ecx ; jnz 0b\n"
"4: movq %[size1],%%rcx\n"
" testl %%ecx,%%ecx\n"
" jz 2f\n"
- "1: movb %b[zero],(%[dst])\n"
+ "1: movb $0,(%[dst])\n"
" incq %[dst]\n"
" decl %%ecx ; jnz 1b\n"
"2:\n"
@@ -40,8 +40,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
_ASM_EXTABLE(0b,3b)
_ASM_EXTABLE(1b,2b)
: [size8] "=&c"(size), [dst] "=&D" (__d0)
- : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
- [zero] "r" (0UL), [eight] "r" (8UL));
+ : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
clac();
return size;
}
@@ -75,6 +74,27 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
return len;
}
+/*
+ * Similar to copy_user_handle_tail, probe for the write fault point,
+ * but reuse __memcpy_mcsafe in case a new read error is encountered.
+ * clac() is handled in _copy_to_iter_mcsafe().
+ */
+__visible unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len)
+{
+ for (; len; --len, to++, from++) {
+ /*
+ * Call the assembly routine back directly since
+ * memcpy_mcsafe() may silently fallback to memcpy.
+ */
+ unsigned long rem = __memcpy_mcsafe(to, from, 1);
+
+ if (rem)
+ break;
+ }
+ return len;
+}
+
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
* clean_cache_range - write back a cache range with CLWB
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index cc7ff5957194..2f3c9196b834 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -360,7 +360,7 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
void *pt)
{
if (__pa(pt) == __pa(kasan_zero_pmd) ||
- (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) ||
+ (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) ||
__pa(pt) == __pa(kasan_zero_pud)) {
pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
note_page(m, st, __pgprot(prot), 0, 5);
@@ -476,8 +476,8 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
}
}
-#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
-#define pgd_none(a) (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
+#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
+#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
static inline bool is_hypervisor_range(int idx)
{
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 73bd8c95ac71..9a84a0d08727 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -209,6 +209,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
unsigned lsb = 0;
siginfo_t info;
+ clear_siginfo(&info);
info.si_signo = si_signo;
info.si_errno = 0;
info.si_code = si_code;
@@ -439,7 +440,7 @@ static noinline int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd_k))
return -1;
- if (pgtable_l5_enabled) {
+ if (pgtable_l5_enabled()) {
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_k);
arch_flush_lazy_mmu_mode();
@@ -454,7 +455,7 @@ static noinline int vmalloc_fault(unsigned long address)
if (p4d_none(*p4d_k))
return -1;
- if (p4d_none(*p4d) && !pgtable_l5_enabled) {
+ if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
set_p4d(p4d, *p4d_k);
arch_flush_lazy_mmu_mode();
} else {
@@ -828,6 +829,8 @@ static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
unsigned long address, struct task_struct *tsk)
{
+ const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
+
if (!unhandled_signal(tsk, SIGSEGV))
return;
@@ -835,13 +838,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
return;
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address,
+ loglvl, tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
print_vma_addr(KERN_CONT " in ", regs->ip);
printk(KERN_CONT "\n");
+
+ show_opcodes((u8 *)regs->ip, loglvl);
}
static void
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index a2f0c7e20fb0..fe7a12599d8e 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -123,7 +123,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
- if (pgtable_l5_enabled) {
+ if (pgtable_l5_enabled()) {
set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
} else {
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0a400606dea0..17383f9677fa 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -180,7 +180,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end)
*/
void sync_global_pgds(unsigned long start, unsigned long end)
{
- if (pgtable_l5_enabled)
+ if (pgtable_l5_enabled())
sync_global_pgds_l5(start, end);
else
sync_global_pgds_l4(start, end);
@@ -643,7 +643,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
unsigned long vaddr = (unsigned long)__va(paddr);
int i = p4d_index(vaddr);
- if (!pgtable_l5_enabled)
+ if (!pgtable_l5_enabled())
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
@@ -723,7 +723,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
page_size_mask);
spin_lock(&init_mm.page_table_lock);
- if (pgtable_l5_enabled)
+ if (pgtable_l5_enabled())
pgd_populate(&init_mm, pgd, p4d);
else
p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
@@ -1100,7 +1100,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
* 5-level case we should free them. This code will have to change
* to adapt for boot-time switching between 4 and 5 level page tables.
*/
- if (pgtable_l5_enabled)
+ if (pgtable_l5_enabled())
free_pud_table(pud_base, p4d);
}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 980dbebd0ca7..e3e77527f8df 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -2,10 +2,8 @@
#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
-#ifdef CONFIG_X86_5LEVEL
-/* Too early to use cpu_feature_enabled() */
-#define pgtable_l5_enabled __pgtable_l5_enabled
-#endif
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
#include <linux/bootmem.h>
#include <linux/kasan.h>
@@ -182,7 +180,7 @@ static void __init clear_pgds(unsigned long start,
* With folded p4d, pgd_clear() is nop, use p4d_clear()
* instead.
*/
- if (pgtable_l5_enabled)
+ if (pgtable_l5_enabled())
pgd_clear(pgd);
else
p4d_clear(p4d_offset(pgd, start));
@@ -197,7 +195,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
{
unsigned long p4d;
- if (!pgtable_l5_enabled)
+ if (!pgtable_l5_enabled())
return (p4d_t *)pgd;
p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
@@ -284,7 +282,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
- for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++)
+ for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
@@ -315,7 +313,7 @@ void __init kasan_init(void)
* bunch of things like kernel code, modules, EFI mapping, etc.
* We need to take extra steps to not overwrite them.
*/
- if (pgtable_l5_enabled) {
+ if (pgtable_l5_enabled()) {
void *ptr;
ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 615cc03ced84..61db77b0eda9 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -78,7 +78,7 @@ void __init kernel_randomize_memory(void)
struct rnd_state rand_state;
unsigned long remain_entropy;
- vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
+ vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
vaddr = vaddr_start;
/*
@@ -124,7 +124,7 @@ void __init kernel_randomize_memory(void)
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
- if (pgtable_l5_enabled)
+ if (pgtable_l5_enabled())
entropy = (rand % (entropy + 1)) & P4D_MASK;
else
entropy = (rand % (entropy + 1)) & PUD_MASK;
@@ -136,7 +136,7 @@ void __init kernel_randomize_memory(void)
* randomization alignment.
*/
vaddr += get_padding(&kaslr_regions[i]);
- if (pgtable_l5_enabled)
+ if (pgtable_l5_enabled())
vaddr = round_up(vaddr + 1, P4D_SIZE);
else
vaddr = round_up(vaddr + 1, PUD_SIZE);
@@ -212,7 +212,7 @@ void __meminit init_trampoline(void)
return;
}
- if (pgtable_l5_enabled)
+ if (pgtable_l5_enabled())
init_trampoline_p4d();
else
init_trampoline_pud();
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 25504d5aa816..fa150855647c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -136,13 +136,13 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
/* whine about and ignore invalid blks */
if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
- pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
- nid, start, end - 1);
+ pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+ nid, start, end - 1);
return 0;
}
if (mi->nr_blks >= NR_NODE_MEMBLKS) {
- pr_err("NUMA: too many memblk ranges\n");
+ pr_err("too many memblk ranges\n");
return -EINVAL;
}
@@ -267,14 +267,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
*/
if (bi->end > bj->start && bi->start < bj->end) {
if (bi->nid != bj->nid) {
- pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+ pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
bi->nid, bi->start, bi->end - 1,
bj->nid, bj->start, bj->end - 1);
return -EINVAL;
}
- pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
- bi->nid, bi->start, bi->end - 1,
- bj->start, bj->end - 1);
+ pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->start, bj->end - 1);
}
/*
@@ -364,7 +364,7 @@ static int __init numa_alloc_distance(void)
phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
size, PAGE_SIZE);
if (!phys) {
- pr_warning("NUMA: Warning: can't allocate distance table!\n");
+ pr_warn("Warning: can't allocate distance table!\n");
/* don't retry until explicitly reset */
numa_distance = (void *)1LU;
return -ENOMEM;
@@ -410,14 +410,14 @@ void __init numa_set_distance(int from, int to, int distance)
if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
from < 0 || to < 0) {
- pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
- from, to, distance);
+ pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+ from, to, distance);
return;
}
if ((u8)distance != distance ||
(from == to && distance != LOCAL_DISTANCE)) {
- pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+ pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
from, to, distance);
return;
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e055d1a06699..6eb1f34c3c85 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
unsigned long sp = current_stack_pointer;
pgd_t *pgd = pgd_offset(mm, sp);
- if (pgtable_l5_enabled) {
+ if (pgtable_l5_enabled()) {
if (unlikely(pgd_none(*pgd))) {
pgd_t *pgd_ref = pgd_offset_k(sp);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 263c8453815e..d765acedc05c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,4 +1,5 @@
-/* bpf_jit_comp.c : BPF JIT compiler
+/*
+ * bpf_jit_comp.c: BPF JIT compiler
*
* Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
* Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
@@ -17,7 +18,7 @@
#include <asm/nospec-branch.h>
/*
- * assembly code in arch/x86/net/bpf_jit.S
+ * Assembly code in arch/x86/net/bpf_jit.S
*/
extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
@@ -45,14 +46,15 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
#define EMIT1_off32(b1, off) \
- do {EMIT1(b1); EMIT(off, 4); } while (0)
+ do { EMIT1(b1); EMIT(off, 4); } while (0)
#define EMIT2_off32(b1, b2, off) \
- do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+ do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
#define EMIT3_off32(b1, b2, b3, off) \
- do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+ do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
#define EMIT4_off32(b1, b2, b3, b4, off) \
- do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+ do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
static bool is_imm8(int value)
{
@@ -70,9 +72,10 @@ static bool is_uimm32(u64 value)
}
/* mov dst, src */
-#define EMIT_mov(DST, SRC) \
- do {if (DST != SRC) \
- EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
+#define EMIT_mov(DST, SRC) \
+ do { \
+ if (DST != SRC) \
+ EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
} while (0)
static int bpf_size_to_x86_bytes(int bpf_size)
@@ -89,7 +92,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
return 0;
}
-/* list of x86 cond jumps opcodes (. + s8)
+/*
+ * List of x86 cond jumps opcodes (. + s8)
* Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
*/
#define X86_JB 0x72
@@ -106,35 +110,37 @@ static int bpf_size_to_x86_bytes(int bpf_size)
#define CHOOSE_LOAD_FUNC(K, func) \
((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-/* pick a register outside of BPF range for JIT internal work */
+/* Pick a register outside of BPF range for JIT internal work */
#define AUX_REG (MAX_BPF_JIT_REG + 1)
-/* The following table maps BPF registers to x64 registers.
+/*
+ * The following table maps BPF registers to x86-64 registers.
*
- * x64 register r12 is unused, since if used as base address
+ * x86-64 register R12 is unused, since if used as base address
* register in load/store instructions, it always needs an
* extra byte of encoding and is callee saved.
*
- * r9 caches skb->len - skb->data_len
- * r10 caches skb->data, and used for blinding (if enabled)
+ * R9 caches skb->len - skb->data_len
+ * R10 caches skb->data, and used for blinding (if enabled)
*/
static const int reg2hex[] = {
- [BPF_REG_0] = 0, /* rax */
- [BPF_REG_1] = 7, /* rdi */
- [BPF_REG_2] = 6, /* rsi */
- [BPF_REG_3] = 2, /* rdx */
- [BPF_REG_4] = 1, /* rcx */
- [BPF_REG_5] = 0, /* r8 */
- [BPF_REG_6] = 3, /* rbx callee saved */
- [BPF_REG_7] = 5, /* r13 callee saved */
- [BPF_REG_8] = 6, /* r14 callee saved */
- [BPF_REG_9] = 7, /* r15 callee saved */
- [BPF_REG_FP] = 5, /* rbp readonly */
- [BPF_REG_AX] = 2, /* r10 temp register */
- [AUX_REG] = 3, /* r11 temp register */
+ [BPF_REG_0] = 0, /* RAX */
+ [BPF_REG_1] = 7, /* RDI */
+ [BPF_REG_2] = 6, /* RSI */
+ [BPF_REG_3] = 2, /* RDX */
+ [BPF_REG_4] = 1, /* RCX */
+ [BPF_REG_5] = 0, /* R8 */
+ [BPF_REG_6] = 3, /* RBX callee saved */
+ [BPF_REG_7] = 5, /* R13 callee saved */
+ [BPF_REG_8] = 6, /* R14 callee saved */
+ [BPF_REG_9] = 7, /* R15 callee saved */
+ [BPF_REG_FP] = 5, /* RBP readonly */
+ [BPF_REG_AX] = 2, /* R10 temp register */
+ [AUX_REG] = 3, /* R11 temp register */
};
-/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15
+/*
+ * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
* which need extra byte of encoding.
* rax,rcx,...,rbp have simpler encoding
*/
@@ -153,7 +159,7 @@ static bool is_axreg(u32 reg)
return reg == BPF_REG_0;
}
-/* add modifiers if 'reg' maps to x64 registers r8..r15 */
+/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */
static u8 add_1mod(u8 byte, u32 reg)
{
if (is_ereg(reg))
@@ -170,13 +176,13 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
return byte;
}
-/* encode 'dst_reg' register into x64 opcode 'byte' */
+/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
static u8 add_1reg(u8 byte, u32 dst_reg)
{
return byte + reg2hex[dst_reg];
}
-/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */
+/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
{
return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
@@ -184,27 +190,28 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
static void jit_fill_hole(void *area, unsigned int size)
{
- /* fill whole space with int3 instructions */
+ /* Fill whole space with INT3 instructions */
memset(area, 0xcc, size);
}
struct jit_context {
- int cleanup_addr; /* epilogue code offset */
+ int cleanup_addr; /* Epilogue code offset */
bool seen_ld_abs;
bool seen_ax_reg;
};
-/* maximum number of bytes emitted while JITing one eBPF insn */
+/* Maximum number of bytes emitted while JITing one eBPF insn */
#define BPF_MAX_INSN_SIZE 128
#define BPF_INSN_SAFETY 64
#define AUX_STACK_SPACE \
- (32 /* space for rbx, r13, r14, r15 */ + \
- 8 /* space for skb_copy_bits() buffer */)
+ (32 /* Space for RBX, R13, R14, R15 */ + \
+ 8 /* Space for skb_copy_bits() buffer */)
#define PROLOGUE_SIZE 37
-/* emit x64 prologue code for BPF program and check it's size.
+/*
+ * Emit x86-64 prologue code for BPF program and check its size.
* bpf_tail_call helper will skip it while jumping into another program
*/
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
@@ -212,8 +219,11 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
u8 *prog = *pprog;
int cnt = 0;
- EMIT1(0x55); /* push rbp */
- EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+ /* push rbp */
+ EMIT1(0x55);
+
+ /* mov rbp,rsp */
+ EMIT3(0x48, 0x89, 0xE5);
/* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */
EMIT3_off32(0x48, 0x81, 0xEC,
@@ -222,14 +232,15 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
/* sub rbp, AUX_STACK_SPACE */
EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
- /* all classic BPF filters use R6(rbx) save it */
+ /* All classic BPF filters use R6(rbx) save it */
/* mov qword ptr [rbp+0],rbx */
EMIT4(0x48, 0x89, 0x5D, 0);
- /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
- * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
- * R8(r14). R9(r15) spill could be made conditional, but there is only
+ /*
+ * bpf_convert_filter() maps classic BPF register X to R7 and uses R8
+ * as temporary, so all tcpdump filters need to spill/fill R7(R13) and
+ * R8(R14). R9(R15) spill could be made conditional, but there is only
* one 'bpf_error' return path out of helper functions inside bpf_jit.S
* The overhead of extra spill is negligible for any filter other
* than synthetic ones. Therefore not worth adding complexity.
@@ -243,9 +254,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
EMIT4(0x4C, 0x89, 0x7D, 24);
if (!ebpf_from_cbpf) {
- /* Clear the tail call counter (tail_call_cnt): for eBPF tail
+ /*
+ * Clear the tail call counter (tail_call_cnt): for eBPF tail
* calls we need to reset the counter to 0. It's done in two
- * instructions, resetting rax register to 0, and moving it
+ * instructions, resetting RAX register to 0, and moving it
* to the counter location.
*/
@@ -260,7 +272,9 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
*pprog = prog;
}
-/* generate the following code:
+/*
+ * Generate the following code:
+ *
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
* if (index >= array->map.max_entries)
* goto out;
@@ -278,23 +292,26 @@ static void emit_bpf_tail_call(u8 **pprog)
int label1, label2, label3;
int cnt = 0;
- /* rdi - pointer to ctx
+ /*
+ * rdi - pointer to ctx
* rsi - pointer to bpf_array
* rdx - index in bpf_array
*/
- /* if (index >= array->map.max_entries)
- * goto out;
+ /*
+ * if (index >= array->map.max_entries)
+ * goto out;
*/
EMIT2(0x89, 0xD2); /* mov edx, edx */
EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */
offsetof(struct bpf_array, map.max_entries));
-#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */
+#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */
EMIT2(X86_JBE, OFFSET1); /* jbe out */
label1 = cnt;
- /* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
- * goto out;
+ /*
+ * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
*/
EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
@@ -308,8 +325,9 @@ static void emit_bpf_tail_call(u8 **pprog)
EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
offsetof(struct bpf_array, ptrs));
- /* if (prog == NULL)
- * goto out;
+ /*
+ * if (prog == NULL)
+ * goto out;
*/
EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */
#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE)
@@ -321,7 +339,8 @@ static void emit_bpf_tail_call(u8 **pprog)
offsetof(struct bpf_prog, bpf_func));
EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */
- /* now we're ready to jump into next BPF program
+ /*
+ * Wow we're ready to jump into next BPF program
* rdi == ctx (1st arg)
* rax == prog->bpf_func + prologue_size
*/
@@ -340,7 +359,8 @@ static void emit_load_skb_data_hlen(u8 **pprog)
u8 *prog = *pprog;
int cnt = 0;
- /* r9d = skb->len - skb->data_len (headlen)
+ /*
+ * r9d = skb->len - skb->data_len (headlen)
* r10 = skb->data
*/
/* mov %r9d, off32(%rdi) */
@@ -361,7 +381,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
u8 b1, b2, b3;
int cnt = 0;
- /* optimization: if imm32 is positive, use 'mov %eax, imm32'
+ /*
+ * Optimization: if imm32 is positive, use 'mov %eax, imm32'
* (which zero-extends imm32) to save 2 bytes.
*/
if (sign_propagate && (s32)imm32 < 0) {
@@ -373,7 +394,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
goto done;
}
- /* optimization: if imm32 is zero, use 'xor %eax, %eax'
+ /*
+ * Optimization: if imm32 is zero, use 'xor %eax, %eax'
* to save 3 bytes.
*/
if (imm32 == 0) {
@@ -400,7 +422,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
int cnt = 0;
if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
- /* For emitting plain u32, where sign bit must not be
+ /*
+ * For emitting plain u32, where sign bit must not be
* propagated LLVM tends to load imm64 over mov32
* directly, so save couple of bytes by just doing
* 'mov %eax, imm32' instead.
@@ -525,7 +548,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
else if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
- /* b3 holds 'normal' opcode, b2 short form only valid
+ /*
+ * b3 holds 'normal' opcode, b2 short form only valid
* in case dst is eax/rax.
*/
switch (BPF_OP(insn->code)) {
@@ -593,7 +617,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
/* mov rax, dst_reg */
EMIT_mov(BPF_REG_0, dst_reg);
- /* xor edx, edx
+ /*
+ * xor edx, edx
* equivalent to 'xor rdx, rdx', but one byte less
*/
EMIT2(0x31, 0xd2);
@@ -655,7 +680,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
}
break;
}
- /* shifts */
+ /* Shifts */
case BPF_ALU | BPF_LSH | BPF_K:
case BPF_ALU | BPF_RSH | BPF_K:
case BPF_ALU | BPF_ARSH | BPF_K:
@@ -686,7 +711,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_RSH | BPF_X:
case BPF_ALU64 | BPF_ARSH | BPF_X:
- /* check for bad case when dst_reg == rcx */
+ /* Check for bad case when dst_reg == rcx */
if (dst_reg == BPF_REG_4) {
/* mov r11, dst_reg */
EMIT_mov(AUX_REG, dst_reg);
@@ -724,13 +749,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU | BPF_END | BPF_FROM_BE:
switch (imm32) {
case 16:
- /* emit 'ror %ax, 8' to swap lower 2 bytes */
+ /* Emit 'ror %ax, 8' to swap lower 2 bytes */
EMIT1(0x66);
if (is_ereg(dst_reg))
EMIT1(0x41);
EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
- /* emit 'movzwl eax, ax' */
+ /* Emit 'movzwl eax, ax' */
if (is_ereg(dst_reg))
EMIT3(0x45, 0x0F, 0xB7);
else
@@ -738,7 +763,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
break;
case 32:
- /* emit 'bswap eax' to swap lower 4 bytes */
+ /* Emit 'bswap eax' to swap lower 4 bytes */
if (is_ereg(dst_reg))
EMIT2(0x41, 0x0F);
else
@@ -746,7 +771,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
EMIT1(add_1reg(0xC8, dst_reg));
break;
case 64:
- /* emit 'bswap rax' to swap 8 bytes */
+ /* Emit 'bswap rax' to swap 8 bytes */
EMIT3(add_1mod(0x48, dst_reg), 0x0F,
add_1reg(0xC8, dst_reg));
break;
@@ -756,7 +781,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU | BPF_END | BPF_FROM_LE:
switch (imm32) {
case 16:
- /* emit 'movzwl eax, ax' to zero extend 16-bit
+ /*
+ * Emit 'movzwl eax, ax' to zero extend 16-bit
* into 64 bit
*/
if (is_ereg(dst_reg))
@@ -766,7 +792,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
break;
case 32:
- /* emit 'mov eax, eax' to clear upper 32-bits */
+ /* Emit 'mov eax, eax' to clear upper 32-bits */
if (is_ereg(dst_reg))
EMIT1(0x45);
EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
@@ -809,9 +835,9 @@ st: if (is_imm8(insn->off))
/* STX: *(u8*)(dst_reg + off) = src_reg */
case BPF_STX | BPF_MEM | BPF_B:
- /* emit 'mov byte ptr [rax + off], al' */
+ /* Emit 'mov byte ptr [rax + off], al' */
if (is_ereg(dst_reg) || is_ereg(src_reg) ||
- /* have to add extra byte for x86 SIL, DIL regs */
+ /* We have to add extra byte for x86 SIL, DIL regs */
src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
else
@@ -840,25 +866,26 @@ stx: if (is_imm8(insn->off))
/* LDX: dst_reg = *(u8*)(src_reg + off) */
case BPF_LDX | BPF_MEM | BPF_B:
- /* emit 'movzx rax, byte ptr [rax + off]' */
+ /* Emit 'movzx rax, byte ptr [rax + off]' */
EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
goto ldx;
case BPF_LDX | BPF_MEM | BPF_H:
- /* emit 'movzx rax, word ptr [rax + off]' */
+ /* Emit 'movzx rax, word ptr [rax + off]' */
EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
goto ldx;
case BPF_LDX | BPF_MEM | BPF_W:
- /* emit 'mov eax, dword ptr [rax+0x14]' */
+ /* Emit 'mov eax, dword ptr [rax+0x14]' */
if (is_ereg(dst_reg) || is_ereg(src_reg))
EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
else
EMIT1(0x8B);
goto ldx;
case BPF_LDX | BPF_MEM | BPF_DW:
- /* emit 'mov rax, qword ptr [rax+0x14]' */
+ /* Emit 'mov rax, qword ptr [rax+0x14]' */
EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
-ldx: /* if insn->off == 0 we can save one extra byte, but
- * special case of x86 r13 which always needs an offset
+ldx: /*
+ * If insn->off == 0 we can save one extra byte, but
+ * special case of x86 R13 which always needs an offset
* is not worth the hassle
*/
if (is_imm8(insn->off))
@@ -870,7 +897,7 @@ ldx: /* if insn->off == 0 we can save one extra byte, but
/* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
case BPF_STX | BPF_XADD | BPF_W:
- /* emit 'lock add dword ptr [rax + off], eax' */
+ /* Emit 'lock add dword ptr [rax + off], eax' */
if (is_ereg(dst_reg) || is_ereg(src_reg))
EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
else
@@ -897,14 +924,15 @@ xadd: if (is_imm8(insn->off))
} else {
EMIT2(0x41, 0x52); /* push %r10 */
EMIT2(0x41, 0x51); /* push %r9 */
- /* need to adjust jmp offset, since
+ /*
+ * We need to adjust jmp offset, since
* pop %r9, pop %r10 take 4 bytes after call insn
*/
jmp_offset += 4;
}
}
if (!imm32 || !is_simm32(jmp_offset)) {
- pr_err("unsupported bpf func %d addr %p image %p\n",
+ pr_err("unsupported BPF func %d addr %p image %p\n",
imm32, func, image);
return -EINVAL;
}
@@ -970,7 +998,7 @@ xadd: if (is_imm8(insn->off))
else
EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);
-emit_cond_jmp: /* convert BPF opcode to x86 */
+emit_cond_jmp: /* Convert BPF opcode to x86 */
switch (BPF_OP(insn->code)) {
case BPF_JEQ:
jmp_cond = X86_JE;
@@ -996,22 +1024,22 @@ emit_cond_jmp: /* convert BPF opcode to x86 */
jmp_cond = X86_JBE;
break;
case BPF_JSGT:
- /* signed '>', GT in x86 */
+ /* Signed '>', GT in x86 */
jmp_cond = X86_JG;
break;
case BPF_JSLT:
- /* signed '<', LT in x86 */
+ /* Signed '<', LT in x86 */
jmp_cond = X86_JL;
break;
case BPF_JSGE:
- /* signed '>=', GE in x86 */
+ /* Signed '>=', GE in x86 */
jmp_cond = X86_JGE;
break;
case BPF_JSLE:
- /* signed '<=', LE in x86 */
+ /* Signed '<=', LE in x86 */
jmp_cond = X86_JLE;
break;
- default: /* to silence gcc warning */
+ default: /* to silence GCC warning */
return -EFAULT;
}
jmp_offset = addrs[i + insn->off] - addrs[i];
@@ -1039,7 +1067,7 @@ emit_cond_jmp: /* convert BPF opcode to x86 */
jmp_offset = addrs[i + insn->off] - addrs[i];
if (!jmp_offset)
- /* optimize out nop jumps */
+ /* Optimize out nop jumps */
break;
emit_jmp:
if (is_imm8(jmp_offset)) {
@@ -1061,7 +1089,7 @@ common_load:
ctx->seen_ld_abs = seen_ld_abs = true;
jmp_offset = func - (image + addrs[i]);
if (!func || !is_simm32(jmp_offset)) {
- pr_err("unsupported bpf func %d addr %p image %p\n",
+ pr_err("unsupported BPF func %d addr %p image %p\n",
imm32, func, image);
return -EINVAL;
}
@@ -1080,7 +1108,8 @@ common_load:
EMIT2_off32(0x81, 0xC6, imm32);
}
}
- /* skb pointer is in R6 (%rbx), it will be copied into
+ /*
+ * skb pointer is in R6 (%rbx), it will be copied into
* %rdi if skb_copy_bits() call is necessary.
* sk_load_* helpers also use %r10 and %r9d.
* See bpf_jit.S
@@ -1111,7 +1140,7 @@ common_load:
goto emit_jmp;
}
seen_exit = true;
- /* update cleanup_addr */
+ /* Update cleanup_addr */
ctx->cleanup_addr = proglen;
/* mov rbx, qword ptr [rbp+0] */
EMIT4(0x48, 0x8B, 0x5D, 0);
@@ -1129,10 +1158,11 @@ common_load:
break;
default:
- /* By design x64 JIT should support all BPF instructions
+ /*
+ * By design x86-64 JIT should support all BPF instructions.
* This error will be seen if new instruction was added
- * to interpreter, but not to JIT
- * or if there is junk in bpf_prog
+ * to the interpreter, but not to the JIT, or if there is
+ * junk in bpf_prog.
*/
pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
return -EINVAL;
@@ -1184,7 +1214,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
return orig_prog;
tmp = bpf_jit_blind_constants(prog);
- /* If blinding was requested and we failed during blinding,
+ /*
+ * If blinding was requested and we failed during blinding,
* we must fall back to the interpreter.
*/
if (IS_ERR(tmp))
@@ -1218,8 +1249,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
goto out_addrs;
}
- /* Before first pass, make a rough estimation of addrs[]
- * each bpf instruction is translated to less than 64 bytes
+ /*
+ * Before first pass, make a rough estimation of addrs[]
+ * each BPF instruction is translated to less than 64 bytes
*/
for (proglen = 0, i = 0; i < prog->len; i++) {
proglen += 64;
@@ -1228,10 +1260,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
ctx.cleanup_addr = proglen;
skip_init_addrs:
- /* JITed image shrinks with every pass and the loop iterates
- * until the image stops shrinking. Very large bpf programs
+ /*
+ * JITed image shrinks with every pass and the loop iterates
+ * until the image stops shrinking. Very large BPF programs
* may converge on the last pass. In such case do one more
- * pass to emit the final image
+ * pass to emit the final image.
*/
for (pass = 0; pass < 20 || image; pass++) {
proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index bed7e7f4e44c..e01f7ceb9e7a 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -225,7 +225,7 @@ int __init efi_alloc_page_tables(void)
pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
if (!pud) {
- if (pgtable_l5_enabled)
+ if (pgtable_l5_enabled())
free_page((unsigned long) pgd_page_vaddr(*pgd));
free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
return -ENOMEM;
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
index 58024862a7eb..a52914aa3b6c 100644
--- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c
+++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
@@ -57,7 +57,7 @@ void vrtc_cmos_write(unsigned char val, unsigned char reg)
}
EXPORT_SYMBOL_GPL(vrtc_cmos_write);
-void vrtc_get_time(struct timespec *now)
+void vrtc_get_time(struct timespec64 *now)
{
u8 sec, min, hour, mday, mon;
unsigned long flags;
@@ -83,18 +83,18 @@ void vrtc_get_time(struct timespec *now)
pr_info("vRTC: sec: %d min: %d hour: %d day: %d "
"mon: %d year: %d\n", sec, min, hour, mday, mon, year);
- now->tv_sec = mktime(year, mon, mday, hour, min, sec);
+ now->tv_sec = mktime64(year, mon, mday, hour, min, sec);
now->tv_nsec = 0;
}
-int vrtc_set_mmss(const struct timespec *now)
+int vrtc_set_mmss(const struct timespec64 *now)
{
unsigned long flags;
struct rtc_time tm;
int year;
int retval = 0;
- rtc_time_to_tm(now->tv_sec, &tm);
+ rtc_time64_to_tm(now->tv_sec, &tm);
if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) {
/*
* tm.year is the number of years since 1900, and the
@@ -110,8 +110,8 @@ int vrtc_set_mmss(const struct timespec *now)
vrtc_cmos_write(tm.tm_sec, RTC_SECONDS);
spin_unlock_irqrestore(&rtc_lock, flags);
} else {
- pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n",
- __func__, now->tv_sec);
+ pr_err("%s: Invalid vRTC value: write of %llx to vRTC failed\n",
+ __func__, (s64)now->tv_sec);
retval = -EINVAL;
}
return retval;
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index ccf4a49bb065..67ccf64c8bd8 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -72,7 +72,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
* tables used by the image kernel.
*/
- if (pgtable_l5_enabled) {
+ if (pgtable_l5_enabled()) {
p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
if (!p4d)
return -ENOMEM;
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index 10003359e633..b2d6967262b2 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -23,14 +23,14 @@ $(obj)/vdso.o: $(obj)/vdso.so
targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
-export CPPFLAGS_vdso.lds += -P -C
+CPPFLAGS_vdso.lds += -P -C
VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
-$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
+$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso)
$(obj)/%.so: OBJCOPYFLAGS := -S
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index a18703be9ead..1804b27f9632 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -115,6 +115,61 @@ static efi_system_table_t __init *xen_efi_probe(void)
return &efi_systab_xen;
}
+/*
+ * Determine whether we're in secure boot mode.
+ *
+ * Please keep the logic in sync with
+ * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot().
+ */
+static enum efi_secureboot_mode xen_efi_get_secureboot(void)
+{
+ static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
+ static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID;
+ efi_status_t status;
+ u8 moksbstate, secboot, setupmode;
+ unsigned long size;
+
+ size = sizeof(secboot);
+ status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
+ NULL, &size, &secboot);
+
+ if (status == EFI_NOT_FOUND)
+ return efi_secureboot_mode_disabled;
+
+ if (status != EFI_SUCCESS)
+ goto out_efi_err;
+
+ size = sizeof(setupmode);
+ status = efi.get_variable(L"SetupMode", &efi_variable_guid,
+ NULL, &size, &setupmode);
+
+ if (status != EFI_SUCCESS)
+ goto out_efi_err;
+
+ if (secboot == 0 || setupmode == 1)
+ return efi_secureboot_mode_disabled;
+
+ /* See if a user has put the shim into insecure mode. */
+ size = sizeof(moksbstate);
+ status = efi.get_variable(L"MokSBStateRT", &shim_guid,
+ NULL, &size, &moksbstate);
+
+ /* If it fails, we don't care why. Default to secure. */
+ if (status != EFI_SUCCESS)
+ goto secure_boot_enabled;
+
+ if (moksbstate == 1)
+ return efi_secureboot_mode_disabled;
+
+ secure_boot_enabled:
+ pr_info("UEFI Secure Boot is enabled.\n");
+ return efi_secureboot_mode_enabled;
+
+ out_efi_err:
+ pr_err("Could not determine UEFI Secure Boot status.\n");
+ return efi_secureboot_mode_unknown;
+}
+
void __init xen_efi_init(void)
{
efi_system_table_t *efi_systab_xen;
@@ -129,6 +184,8 @@ void __init xen_efi_init(void)
boot_params.efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
boot_params.efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
+ boot_params.secure_boot = xen_efi_get_secureboot();
+
set_bit(EFI_BOOT, &efi.flags);
set_bit(EFI_PARAVIRT, &efi.flags);
set_bit(EFI_64BIT, &efi.flags);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 29163c43ebbd..e0f1bcf01d63 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -57,7 +57,7 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs)
return xen_clocksource_read();
}
-static void xen_read_wallclock(struct timespec *ts)
+static void xen_read_wallclock(struct timespec64 *ts)
{
struct shared_info *s = HYPERVISOR_shared_info;
struct pvclock_wall_clock *wall_clock = &(s->wc);
@@ -68,12 +68,12 @@ static void xen_read_wallclock(struct timespec *ts)
put_cpu_var(xen_vcpu);
}
-static void xen_get_wallclock(struct timespec *now)
+static void xen_get_wallclock(struct timespec64 *now)
{
xen_read_wallclock(now);
}
-static int xen_set_wallclock(const struct timespec *now)
+static int xen_set_wallclock(const struct timespec64 *now)
{
return -ENODEV;
}
@@ -461,7 +461,7 @@ static void __init xen_time_init(void)
{
struct pvclock_vcpu_time_info *pvti;
int cpu = smp_processor_id();
- struct timespec tp;
+ struct timespec64 tp;
/* As Dom0 is never moved, no penalty on using TSC there */
if (xen_initial_domain())
@@ -479,7 +479,7 @@ static void __init xen_time_init(void)
/* Set initial system time with full resolution */
xen_read_wallclock(&tp);
- do_settimeofday(&tp);
+ do_settimeofday64(&tp);
setup_force_cpu_cap(X86_FEATURE_TSC);
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index c921e8bccdc8..17df332269b2 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -19,7 +19,6 @@ config XTENSA
select HAVE_ARCH_KASAN if MMU
select HAVE_CC_STACKPROTECTOR
select HAVE_DEBUG_KMEMLEAK
- select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS
select HAVE_EXIT_THREAD
select HAVE_FUNCTION_TRACER
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 436b20337168..e5e1e61c538c 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -1,4 +1,5 @@
generic-y += bug.h
+generic-y += compat.h
generic-y += device.h
generic-y += div64.h
generic-y += dma-contiguous.h
diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h
index d5a82153a7c5..6ddf0a30c60d 100644
--- a/arch/xtensa/include/asm/pci.h
+++ b/arch/xtensa/include/asm/pci.h
@@ -42,8 +42,6 @@ extern struct pci_controller* pcibios_alloc_controller(void);
* decisions.
*/
-#define PCI_DMA_BUS_IS_PHYS (1)
-
/* Tell PCI code what kind of PCI resource mappings we support */
#define HAVE_PCI_MMAP 1
#define ARCH_GENERIC_PCI_MMAP_RESOURCE 1
diff --git a/arch/xtensa/include/uapi/asm/msgbuf.h b/arch/xtensa/include/uapi/asm/msgbuf.h
index 36e2e103ca38..d6915e9f071c 100644
--- a/arch/xtensa/include/uapi/asm/msgbuf.h
+++ b/arch/xtensa/include/uapi/asm/msgbuf.h
@@ -7,7 +7,6 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*
* This file is subject to the terms and conditions of the GNU General
@@ -21,19 +20,19 @@
struct msqid64_ds {
struct ipc64_perm msg_perm;
#ifdef __XTENSA_EB__
- unsigned int __unused1;
- __kernel_time_t msg_stime; /* last msgsnd time */
- unsigned int __unused2;
- __kernel_time_t msg_rtime; /* last msgrcv time */
- unsigned int __unused3;
- __kernel_time_t msg_ctime; /* last change time */
+ unsigned long msg_stime_high;
+ unsigned long msg_stime; /* last msgsnd time */
+ unsigned long msg_rtime_high;
+ unsigned long msg_rtime; /* last msgrcv time */
+ unsigned long msg_ctime_high;
+ unsigned long msg_ctime; /* last change time */
#elif defined(__XTENSA_EL__)
- __kernel_time_t msg_stime; /* last msgsnd time */
- unsigned int __unused1;
- __kernel_time_t msg_rtime; /* last msgrcv time */
- unsigned int __unused2;
- __kernel_time_t msg_ctime; /* last change time */
- unsigned int __unused3;
+ unsigned long msg_stime; /* last msgsnd time */
+ unsigned long msg_stime_high;
+ unsigned long msg_rtime; /* last msgrcv time */
+ unsigned long msg_rtime_high;
+ unsigned long msg_ctime; /* last change time */
+ unsigned long msg_ctime_high;
#else
# error processor byte order undefined!
#endif
diff --git a/arch/xtensa/include/uapi/asm/sembuf.h b/arch/xtensa/include/uapi/asm/sembuf.h
index f61b6331a10c..09f348d643f1 100644
--- a/arch/xtensa/include/uapi/asm/sembuf.h
+++ b/arch/xtensa/include/uapi/asm/sembuf.h
@@ -14,7 +14,6 @@
* between kernel and user space.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*
*/
@@ -27,15 +26,15 @@
struct semid64_ds {
struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
#ifdef __XTENSA_EL__
- __kernel_time_t sem_otime; /* last semop time */
- unsigned long __unused1;
- __kernel_time_t sem_ctime; /* last change time */
- unsigned long __unused2;
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_otime_high;
+ unsigned long sem_ctime; /* last change time */
+ unsigned long sem_ctime_high;
#else
- unsigned long __unused1;
- __kernel_time_t sem_otime; /* last semop time */
- unsigned long __unused2;
- __kernel_time_t sem_ctime; /* last change time */
+ unsigned long sem_otime_high;
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_ctime_high;
+ unsigned long sem_ctime; /* last change time */
#endif
unsigned long sem_nsems; /* no. of semaphores in array */
unsigned long __unused3;
diff --git a/arch/xtensa/include/uapi/asm/shmbuf.h b/arch/xtensa/include/uapi/asm/shmbuf.h
index 26550bdc8430..554a57a6a90f 100644
--- a/arch/xtensa/include/uapi/asm/shmbuf.h
+++ b/arch/xtensa/include/uapi/asm/shmbuf.h
@@ -4,10 +4,10 @@
*
* The shmid64_ds structure for Xtensa architecture.
* Note extra padding because this structure is passed back and forth
- * between kernel and user space.
+ * between kernel and user space, but the padding is on the wrong
+ * side for big-endian xtensa, for historic reasons.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*
* This file is subject to the terms and conditions of the GNU General Public
@@ -20,42 +20,21 @@
#ifndef _XTENSA_SHMBUF_H
#define _XTENSA_SHMBUF_H
-#if defined (__XTENSA_EL__)
struct shmid64_ds {
struct ipc64_perm shm_perm; /* operation perms */
size_t shm_segsz; /* size of segment (bytes) */
- __kernel_time_t shm_atime; /* last attach time */
- unsigned long __unused1;
- __kernel_time_t shm_dtime; /* last detach time */
- unsigned long __unused2;
- __kernel_time_t shm_ctime; /* last change time */
- unsigned long __unused3;
+ unsigned long shm_atime; /* last attach time */
+ unsigned long shm_atime_high;
+ unsigned long shm_dtime; /* last detach time */
+ unsigned long shm_dtime_high;
+ unsigned long shm_ctime; /* last change time */
+ unsigned long shm_ctime_high;
__kernel_pid_t shm_cpid; /* pid of creator */
__kernel_pid_t shm_lpid; /* pid of last operator */
unsigned long shm_nattch; /* no. of current attaches */
unsigned long __unused4;
unsigned long __unused5;
};
-#elif defined (__XTENSA_EB__)
-struct shmid64_ds {
- struct ipc64_perm shm_perm; /* operation perms */
- size_t shm_segsz; /* size of segment (bytes) */
- __kernel_time_t shm_atime; /* last attach time */
- unsigned long __unused1;
- __kernel_time_t shm_dtime; /* last detach time */
- unsigned long __unused2;
- __kernel_time_t shm_ctime; /* last change time */
- unsigned long __unused3;
- __kernel_pid_t shm_cpid; /* pid of creator */
- __kernel_pid_t shm_lpid; /* pid of last operator */
- unsigned long shm_nattch; /* no. of current attaches */
- unsigned long __unused4;
- unsigned long __unused5;
-};
-#else
-# error endian order not defined
-#endif
-
struct shminfo64 {
unsigned long shmmax;
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 732631ce250f..392b4a80ebc2 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -261,12 +261,3 @@ const struct dma_map_ops xtensa_dma_map_ops = {
.mapping_error = xtensa_dma_mapping_error,
};
EXPORT_SYMBOL(xtensa_dma_map_ops);
-
-#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-
-static int __init xtensa_dma_init(void)
-{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
- return 0;
-}
-fs_initcall(xtensa_dma_init);
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 32c5207f1226..86507fa7c2d7 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -323,8 +323,6 @@ do_illegal_instruction(struct pt_regs *regs)
void
do_unaligned_user (struct pt_regs *regs)
{
- siginfo_t info;
-
__die_if_kernel("Unhandled unaligned exception in kernel",
regs, SIGKILL);
@@ -334,12 +332,7 @@ do_unaligned_user (struct pt_regs *regs)
"(pid = %d, pc = %#010lx)\n",
regs->excvaddr, current->comm,
task_pid_nr(current), regs->pc);
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRALN;
- info.si_addr = (void *) regs->excvaddr;
- force_sig_info(SIGSEGV, &info, current);
-
+ force_sig_fault(SIGBUS, BUS_ADRALN, (void *) regs->excvaddr, current);
}
#endif
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 8b9b6f44bb06..c111a833205a 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -39,13 +39,13 @@ void do_page_fault(struct pt_regs *regs)
struct mm_struct *mm = current->mm;
unsigned int exccause = regs->exccause;
unsigned int address = regs->excvaddr;
- siginfo_t info;
+ int code;
int is_write, is_exec;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- info.si_code = SEGV_MAPERR;
+ code = SEGV_MAPERR;
/* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
@@ -91,7 +91,7 @@ retry:
*/
good_area:
- info.si_code = SEGV_ACCERR;
+ code = SEGV_ACCERR;
if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
@@ -157,11 +157,7 @@ bad_area:
if (user_mode(regs)) {
current->thread.bad_vaddr = address;
current->thread.error_code = is_write;
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- /* info.si_code has been set above */
- info.si_addr = (void *) address;
- force_sig_info(SIGSEGV, &info, current);
+ force_sig_fault(SIGSEGV, code, (void *) address, current);
return;
}
bad_page_fault(regs, address, SIGSEGV);
@@ -186,11 +182,7 @@ do_sigbus:
* or user mode.
*/
current->thread.bad_vaddr = address;
- info.si_code = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void *) address;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void *) address, current);
/* Kernel mode? Handle exceptions or die */
if (!user_mode(regs))
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 7846c0c20cfe..89ed613c017e 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -347,7 +347,6 @@ static const struct proto_ops alg_proto_ops = {
.sendpage = sock_no_sendpage,
.sendmsg = sock_no_sendmsg,
.recvmsg = sock_no_recvmsg,
- .poll = sock_no_poll,
.bind = alg_bind,
.release = af_alg_release,
@@ -1061,19 +1060,12 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err)
}
EXPORT_SYMBOL_GPL(af_alg_async_cb);
-/**
- * af_alg_poll - poll system call handler
- */
-__poll_t af_alg_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
struct alg_sock *ask = alg_sk(sk);
struct af_alg_ctx *ctx = ask->private;
- __poll_t mask;
-
- sock_poll_wait(file, sk_sleep(sk), wait);
- mask = 0;
+ __poll_t mask = 0;
if (!ctx->more || ctx->used)
mask |= EPOLLIN | EPOLLRDNORM;
@@ -1083,7 +1075,7 @@ __poll_t af_alg_poll(struct file *file, struct socket *sock,
return mask;
}
-EXPORT_SYMBOL_GPL(af_alg_poll);
+EXPORT_SYMBOL_GPL(af_alg_poll_mask);
/**
* af_alg_alloc_areq - allocate struct af_alg_async_req
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 4b07edd5a9ff..330cf9f2b767 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -375,7 +375,7 @@ static struct proto_ops algif_aead_ops = {
.sendmsg = aead_sendmsg,
.sendpage = af_alg_sendpage,
.recvmsg = aead_recvmsg,
- .poll = af_alg_poll,
+ .poll_mask = af_alg_poll_mask,
};
static int aead_check_key(struct socket *sock)
@@ -471,7 +471,7 @@ static struct proto_ops algif_aead_ops_nokey = {
.sendmsg = aead_sendmsg_nokey,
.sendpage = aead_sendpage_nokey,
.recvmsg = aead_recvmsg_nokey,
- .poll = af_alg_poll,
+ .poll_mask = af_alg_poll_mask,
};
static void *aead_bind(const char *name, u32 type, u32 mask)
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 6c9b1927a520..bfcf595fd8f9 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -288,7 +288,6 @@ static struct proto_ops algif_hash_ops = {
.mmap = sock_no_mmap,
.bind = sock_no_bind,
.setsockopt = sock_no_setsockopt,
- .poll = sock_no_poll,
.release = af_alg_release,
.sendmsg = hash_sendmsg,
@@ -396,7 +395,6 @@ static struct proto_ops algif_hash_ops_nokey = {
.mmap = sock_no_mmap,
.bind = sock_no_bind,
.setsockopt = sock_no_setsockopt,
- .poll = sock_no_poll,
.release = af_alg_release,
.sendmsg = hash_sendmsg_nokey,
diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c
index 150c2b6480ed..22df3799a17b 100644
--- a/crypto/algif_rng.c
+++ b/crypto/algif_rng.c
@@ -106,7 +106,6 @@ static struct proto_ops algif_rng_ops = {
.bind = sock_no_bind,
.accept = sock_no_accept,
.setsockopt = sock_no_setsockopt,
- .poll = sock_no_poll,
.sendmsg = sock_no_sendmsg,
.sendpage = sock_no_sendpage,
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index c4e885df4564..15cf3c5222e0 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -205,7 +205,7 @@ static struct proto_ops algif_skcipher_ops = {
.sendmsg = skcipher_sendmsg,
.sendpage = af_alg_sendpage,
.recvmsg = skcipher_recvmsg,
- .poll = af_alg_poll,
+ .poll_mask = af_alg_poll_mask,
};
static int skcipher_check_key(struct socket *sock)
@@ -301,7 +301,7 @@ static struct proto_ops algif_skcipher_ops_nokey = {
.sendmsg = skcipher_sendmsg_nokey,
.sendpage = skcipher_sendpage_nokey,
.recvmsg = skcipher_recvmsg_nokey,
- .poll = af_alg_poll,
+ .poll_mask = af_alg_poll_mask,
};
static void *skcipher_bind(const char *name, u32 type, u32 mask)
diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index 4a3ac31c07d0..3b0118786b43 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -20,6 +20,7 @@
#include <linux/sizes.h>
#include <linux/limits.h>
#include <linux/clk/clk-conf.h>
+#include <linux/platform_device.h>
#include <asm/irq.h>
@@ -193,14 +194,16 @@ static const struct dev_pm_ops amba_pm = {
/*
* Primecells are part of the Advanced Microcontroller Bus Architecture,
* so we call the bus "amba".
+ * DMA configuration for platform and AMBA bus is same. So here we reuse
+ * platform's DMA config routine.
*/
struct bus_type amba_bustype = {
.name = "amba",
.dev_groups = amba_dev_groups,
.match = amba_match,
.uevent = amba_uevent,
+ .dma_configure = platform_dma_configure,
.pm = &amba_pm,
- .force_dma = true,
};
static int __init amba_init(void)
diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c
index d82566d6e237..f831a582209c 100644
--- a/drivers/base/dma-mapping.c
+++ b/drivers/base/dma-mapping.c
@@ -329,36 +329,13 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
#endif
/*
- * Common configuration to enable DMA API use for a device
+ * enables DMA API use for a device
*/
-#include <linux/pci.h>
-
int dma_configure(struct device *dev)
{
- struct device *bridge = NULL, *dma_dev = dev;
- enum dev_dma_attr attr;
- int ret = 0;
-
- if (dev_is_pci(dev)) {
- bridge = pci_get_host_bridge_device(to_pci_dev(dev));
- dma_dev = bridge;
- if (IS_ENABLED(CONFIG_OF) && dma_dev->parent &&
- dma_dev->parent->of_node)
- dma_dev = dma_dev->parent;
- }
-
- if (dma_dev->of_node) {
- ret = of_dma_configure(dev, dma_dev->of_node);
- } else if (has_acpi_companion(dma_dev)) {
- attr = acpi_get_dma_attr(to_acpi_device_node(dma_dev->fwnode));
- if (attr != DEV_DMA_NOT_SUPPORTED)
- ret = acpi_dma_configure(dev, attr);
- }
-
- if (bridge)
- pci_put_host_bridge_device(bridge);
-
- return ret;
+ if (dev->bus->dma_configure)
+ return dev->bus->dma_configure(dev);
+ return 0;
}
void dma_deconfigure(struct device *dev)
diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c
index 8e22073aeeed..60d6cc618f1c 100644
--- a/drivers/base/platform-msi.c
+++ b/drivers/base/platform-msi.c
@@ -101,6 +101,9 @@ static void platform_msi_update_chip_ops(struct msi_domain_info *info)
chip->irq_set_affinity = msi_domain_set_affinity;
if (!chip->irq_write_msi_msg)
chip->irq_write_msi_msg = platform_msi_write_msg;
+ if (WARN_ON((info->flags & MSI_FLAG_LEVEL_CAPABLE) &&
+ !(chip->flags & IRQCHIP_SUPPORTS_LEVEL_MSI)))
+ info->flags &= ~MSI_FLAG_LEVEL_CAPABLE;
}
static void platform_msi_free_descs(struct device *dev, int base, int nvec)
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 8075ddc70a17..c0ff1e73a634 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -1130,6 +1130,22 @@ int platform_pm_restore(struct device *dev)
#endif /* CONFIG_HIBERNATE_CALLBACKS */
+int platform_dma_configure(struct device *dev)
+{
+ enum dev_dma_attr attr;
+ int ret = 0;
+
+ if (dev->of_node) {
+ ret = of_dma_configure(dev, dev->of_node, true);
+ } else if (has_acpi_companion(dev)) {
+ attr = acpi_get_dma_attr(to_acpi_device_node(dev->fwnode));
+ if (attr != DEV_DMA_NOT_SUPPORTED)
+ ret = acpi_dma_configure(dev, attr);
+ }
+
+ return ret;
+}
+
static const struct dev_pm_ops platform_dev_pm_ops = {
.runtime_suspend = pm_generic_runtime_suspend,
.runtime_resume = pm_generic_runtime_resume,
@@ -1141,8 +1157,8 @@ struct bus_type platform_bus_type = {
.dev_groups = platform_dev_groups,
.match = platform_match,
.uevent = platform_uevent,
+ .dma_configure = platform_dma_configure,
.pm = &platform_dev_pm_ops,
- .force_dma = true,
};
EXPORT_SYMBOL_GPL(platform_bus_type);
diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c
index 5cadfd3394d8..8741fb5f8f54 100644
--- a/drivers/base/regmap/regmap-mmio.c
+++ b/drivers/base/regmap/regmap-mmio.c
@@ -206,7 +206,8 @@ static void regmap_mmio_free_context(void *context)
if (!IS_ERR(ctx->clk)) {
clk_unprepare(ctx->clk);
- clk_put(ctx->clk);
+ if (!ctx->attached_clk)
+ clk_put(ctx->clk);
}
kfree(context);
}
diff --git a/drivers/base/regmap/regmap-slimbus.c b/drivers/base/regmap/regmap-slimbus.c
index c90bee81d954..91d501eda8a9 100644
--- a/drivers/base/regmap/regmap-slimbus.c
+++ b/drivers/base/regmap/regmap-slimbus.c
@@ -41,7 +41,7 @@ static struct regmap_bus regmap_slimbus_bus = {
static const struct regmap_bus *regmap_get_slimbus(struct slim_device *slim,
const struct regmap_config *config)
{
- if (config->val_bits == 8 && config->reg_bits == 8)
+ if (config->val_bits == 8 && config->reg_bits == 16)
return &regmap_slimbus_bus;
return ERR_PTR(-ENOTSUPP);
diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c
index e6986c7608f1..fc1f4acdd189 100644
--- a/drivers/bcma/main.c
+++ b/drivers/bcma/main.c
@@ -207,7 +207,7 @@ static void bcma_of_fill_device(struct device *parent,
core->irq = bcma_of_get_irq(parent, core, 0);
- of_dma_configure(&core->dev, node);
+ of_dma_configure(&core->dev, node, false);
}
unsigned int bcma_core_irq(struct bcma_device *core, int num)
diff --git a/drivers/bus/fsl-mc/fsl-mc-msi.c b/drivers/bus/fsl-mc/fsl-mc-msi.c
index ec35e255b496..8b9c66d7c4ff 100644
--- a/drivers/bus/fsl-mc/fsl-mc-msi.c
+++ b/drivers/bus/fsl-mc/fsl-mc-msi.c
@@ -163,6 +163,8 @@ struct irq_domain *fsl_mc_msi_create_irq_domain(struct fwnode_handle *fwnode,
{
struct irq_domain *domain;
+ if (WARN_ON((info->flags & MSI_FLAG_LEVEL_CAPABLE)))
+ info->flags &= ~MSI_FLAG_LEVEL_CAPABLE;
if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
fsl_mc_msi_update_dom_ops(info);
if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index cd888d4ee605..a8fb0020ba5c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -402,8 +402,7 @@ static struct poolinfo {
/*
* Static global variables
*/
-static DECLARE_WAIT_QUEUE_HEAD(random_read_wait);
-static DECLARE_WAIT_QUEUE_HEAD(random_write_wait);
+static DECLARE_WAIT_QUEUE_HEAD(random_wait);
static struct fasync_struct *fasync;
static DEFINE_SPINLOCK(random_ready_list_lock);
@@ -722,8 +721,8 @@ retry:
/* should we wake readers? */
if (entropy_bits >= random_read_wakeup_bits &&
- wq_has_sleeper(&random_read_wait)) {
- wake_up_interruptible(&random_read_wait);
+ wq_has_sleeper(&random_wait)) {
+ wake_up_interruptible_poll(&random_wait, POLLIN);
kill_fasync(&fasync, SIGIO, POLL_IN);
}
/* If the input pool is getting full, send some
@@ -1397,7 +1396,7 @@ retry:
trace_debit_entropy(r->name, 8 * ibytes);
if (ibytes &&
(r->entropy_count >> ENTROPY_SHIFT) < random_write_wakeup_bits) {
- wake_up_interruptible(&random_write_wait);
+ wake_up_interruptible_poll(&random_wait, POLLOUT);
kill_fasync(&fasync, SIGIO, POLL_OUT);
}
@@ -1839,7 +1838,7 @@ _random_read(int nonblock, char __user *buf, size_t nbytes)
if (nonblock)
return -EAGAIN;
- wait_event_interruptible(random_read_wait,
+ wait_event_interruptible(random_wait,
ENTROPY_BITS(&input_pool) >=
random_read_wakeup_bits);
if (signal_pending(current))
@@ -1876,14 +1875,17 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
return ret;
}
+static struct wait_queue_head *
+random_get_poll_head(struct file *file, __poll_t events)
+{
+ return &random_wait;
+}
+
static __poll_t
-random_poll(struct file *file, poll_table * wait)
+random_poll_mask(struct file *file, __poll_t events)
{
- __poll_t mask;
+ __poll_t mask = 0;
- poll_wait(file, &random_read_wait, wait);
- poll_wait(file, &random_write_wait, wait);
- mask = 0;
if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits)
mask |= EPOLLIN | EPOLLRDNORM;
if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits)
@@ -1990,7 +1992,8 @@ static int random_fasync(int fd, struct file *filp, int on)
const struct file_operations random_fops = {
.read = random_read,
.write = random_write,
- .poll = random_poll,
+ .get_poll_head = random_get_poll_head,
+ .poll_mask = random_poll_mask,
.unlocked_ioctl = random_ioctl,
.fasync = random_fasync,
.llseek = noop_llseek,
@@ -2323,7 +2326,7 @@ void add_hwgenerator_randomness(const char *buffer, size_t count,
* We'll be woken up again once below random_write_wakeup_thresh,
* or when the calling thread is about to terminate.
*/
- wait_event_interruptible(random_write_wait, kthread_should_stop() ||
+ wait_event_interruptible(random_wait, kthread_should_stop() ||
ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits);
mix_pool_bytes(poolp, buffer, count);
credit_entropy_bits(poolp, entropy);
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 8e8a09755d10..dec0dd88ec15 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -450,8 +450,10 @@ config MTK_TIMER
Support for Mediatek timer driver.
config SPRD_TIMER
- bool "Spreadtrum timer driver" if COMPILE_TEST
+ bool "Spreadtrum timer driver" if EXPERT
depends on HAS_IOMEM
+ depends on (ARCH_SPRD || COMPILE_TEST)
+ default ARCH_SPRD
select TIMER_OF
help
Enables support for the Spreadtrum timer driver.
diff --git a/drivers/clocksource/arc_timer.c b/drivers/clocksource/arc_timer.c
index 471b428d8034..20da9b1d7f7d 100644
--- a/drivers/clocksource/arc_timer.c
+++ b/drivers/clocksource/arc_timer.c
@@ -61,6 +61,20 @@ static u64 arc_read_gfrc(struct clocksource *cs)
unsigned long flags;
u32 l, h;
+ /*
+ * From a programming model pov, there seems to be just one instance of
+ * MCIP_CMD/MCIP_READBACK however micro-architecturally there's
+ * an instance PER ARC CORE (not per cluster), and there are dedicated
+ * hardware decode logic (per core) inside ARConnect to handle
+ * simultaneous read/write accesses from cores via those two registers.
+ * So several concurrent commands to ARConnect are OK if they are
+ * trying to access two different sub-components (like GFRC,
+ * inter-core interrupt, etc...). HW also supports simultaneously
+ * accessing GFRC by multiple cores.
+ * That's why it is safe to disable hard interrupts on the local CPU
+ * before access to GFRC instead of taking global MCIP spinlock
+ * defined in arch/arc/kernel/mcip.c
+ */
local_irq_save(flags);
__mcip_cmd(CMD_GFRC_READ_LO, 0);
diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c
index 986b6796b631..54f8a331b53a 100644
--- a/drivers/clocksource/mips-gic-timer.c
+++ b/drivers/clocksource/mips-gic-timer.c
@@ -5,6 +5,9 @@
*
* Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
*/
+
+#define pr_fmt(fmt) "mips-gic-timer: " fmt
+
#include <linux/clk.h>
#include <linux/clockchips.h>
#include <linux/cpu.h>
@@ -136,8 +139,7 @@ static int gic_clockevent_init(void)
ret = setup_percpu_irq(gic_timer_irq, &gic_compare_irqaction);
if (ret < 0) {
- pr_err("GIC timer IRQ %d setup failed: %d\n",
- gic_timer_irq, ret);
+ pr_err("IRQ %d setup failed (%d)\n", gic_timer_irq, ret);
return ret;
}
@@ -176,7 +178,7 @@ static int __init __gic_clocksource_init(void)
ret = clocksource_register_hz(&gic_clocksource, gic_frequency);
if (ret < 0)
- pr_warn("GIC: Unable to register clocksource\n");
+ pr_warn("Unable to register clocksource\n");
return ret;
}
@@ -188,7 +190,7 @@ static int __init gic_clocksource_of_init(struct device_node *node)
if (!mips_gic_present() || !node->parent ||
!of_device_is_compatible(node->parent, "mti,gic")) {
- pr_warn("No DT definition for the mips gic driver\n");
+ pr_warn("No DT definition\n");
return -ENXIO;
}
@@ -196,7 +198,7 @@ static int __init gic_clocksource_of_init(struct device_node *node)
if (!IS_ERR(clk)) {
ret = clk_prepare_enable(clk);
if (ret < 0) {
- pr_err("GIC failed to enable clock\n");
+ pr_err("Failed to enable clock\n");
clk_put(clk);
return ret;
}
@@ -204,12 +206,12 @@ static int __init gic_clocksource_of_init(struct device_node *node)
gic_frequency = clk_get_rate(clk);
} else if (of_property_read_u32(node, "clock-frequency",
&gic_frequency)) {
- pr_err("GIC frequency not specified.\n");
+ pr_err("Frequency not specified\n");
return -EINVAL;
}
gic_timer_irq = irq_of_parse_and_map(node, 0);
if (!gic_timer_irq) {
- pr_err("GIC timer IRQ not specified.\n");
+ pr_err("IRQ not specified\n");
return -EINVAL;
}
@@ -220,7 +222,7 @@ static int __init gic_clocksource_of_init(struct device_node *node)
ret = gic_clockevent_init();
if (!ret && !IS_ERR(clk)) {
if (clk_notifier_register(clk, &gic_clk_nb) < 0)
- pr_warn("GIC: Unable to register clock notifier\n");
+ pr_warn("Unable to register clock notifier\n");
}
/* And finally start the counter */
diff --git a/drivers/clocksource/mxs_timer.c b/drivers/clocksource/mxs_timer.c
index a03434e9fe8f..f6ddae30933f 100644
--- a/drivers/clocksource/mxs_timer.c
+++ b/drivers/clocksource/mxs_timer.c
@@ -1,24 +1,10 @@
-/*
- * Copyright (C) 2000-2001 Deep Blue Solutions
- * Copyright (C) 2002 Shane Nay (shane@minirl.com)
- * Copyright (C) 2006-2007 Pavel Pisa (ppisa@pikron.com)
- * Copyright (C) 2008 Juergen Beisert (kernel@pengutronix.de)
- * Copyright (C) 2010 Freescale Semiconductor, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- * MA 02110-1301, USA.
- */
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Copyright (C) 2000-2001 Deep Blue Solutions
+// Copyright (C) 2002 Shane Nay (shane@minirl.com)
+// Copyright (C) 2006-2007 Pavel Pisa (ppisa@pikron.com)
+// Copyright (C) 2008 Juergen Beisert (kernel@pengutronix.de)
+// Copyright (C) 2010 Freescale Semiconductor, Inc. All Rights Reserved.
#include <linux/err.h>
#include <linux/interrupt.h>
diff --git a/drivers/clocksource/timer-imx-gpt.c b/drivers/clocksource/timer-imx-gpt.c
index 6ec6d79b237c..165fbbb1c9a0 100644
--- a/drivers/clocksource/timer-imx-gpt.c
+++ b/drivers/clocksource/timer-imx-gpt.c
@@ -1,25 +1,9 @@
-/*
- * linux/arch/arm/plat-mxc/time.c
- *
- * Copyright (C) 2000-2001 Deep Blue Solutions
- * Copyright (C) 2002 Shane Nay (shane@minirl.com)
- * Copyright (C) 2006-2007 Pavel Pisa (ppisa@pikron.com)
- * Copyright (C) 2008 Juergen Beisert (kernel@pengutronix.de)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- * MA 02110-1301, USA.
- */
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Copyright (C) 2000-2001 Deep Blue Solutions
+// Copyright (C) 2002 Shane Nay (shane@minirl.com)
+// Copyright (C) 2006-2007 Pavel Pisa (ppisa@pikron.com)
+// Copyright (C) 2008 Juergen Beisert (kernel@pengutronix.de)
#include <linux/interrupt.h>
#include <linux/irq.h>
diff --git a/drivers/clocksource/timer-imx-tpm.c b/drivers/clocksource/timer-imx-tpm.c
index 6c8318470b48..b7aa2b817078 100644
--- a/drivers/clocksource/timer-imx-tpm.c
+++ b/drivers/clocksource/timer-imx-tpm.c
@@ -1,12 +1,7 @@
-/*
- * Copyright 2016 Freescale Semiconductor, Inc.
- * Copyright 2017 NXP
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- */
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Copyright 2016 Freescale Semiconductor, Inc.
+// Copyright 2017 NXP
#include <linux/clk.h>
#include <linux/clockchips.h>
diff --git a/drivers/dma/qcom/hidma_mgmt.c b/drivers/dma/qcom/hidma_mgmt.c
index 000c7019ca7d..d64edeb6771a 100644
--- a/drivers/dma/qcom/hidma_mgmt.c
+++ b/drivers/dma/qcom/hidma_mgmt.c
@@ -398,7 +398,7 @@ static int __init hidma_mgmt_of_populate_channels(struct device_node *np)
}
of_node_get(child);
new_pdev->dev.of_node = child;
- of_dma_configure(&new_pdev->dev, child);
+ of_dma_configure(&new_pdev->dev, child, true);
/*
* It is assumed that calling of_msi_configure is safe on
* platforms with or without MSI support.
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 3098410abad8..781a4a337557 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -174,6 +174,11 @@ config UEFI_CPER_ARM
depends on UEFI_CPER && ( ARM || ARM64 )
default y
+config UEFI_CPER_X86
+ bool
+ depends on UEFI_CPER && X86
+ default y
+
config EFI_DEV_PATH_PARSER
bool
depends on ACPI
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index cb805374f4bc..5f9f5039de50 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -31,3 +31,4 @@ obj-$(CONFIG_ARM) += $(arm-obj-y)
obj-$(CONFIG_ARM64) += $(arm-obj-y)
obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o
obj-$(CONFIG_UEFI_CPER_ARM) += cper-arm.o
+obj-$(CONFIG_UEFI_CPER_X86) += cper-x86.o
diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c
index e456f4602df1..96688986da56 100644
--- a/drivers/firmware/efi/capsule-loader.c
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -134,10 +134,16 @@ static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info)
/* Indicate capsule binary uploading is done */
cap_info->index = NO_FURTHER_WRITE_ACTION;
- pr_info("Successfully upload capsule file with reboot type '%s'\n",
- !cap_info->reset_type ? "RESET_COLD" :
- cap_info->reset_type == 1 ? "RESET_WARM" :
- "RESET_SHUTDOWN");
+
+ if (cap_info->header.flags & EFI_CAPSULE_PERSIST_ACROSS_RESET) {
+ pr_info("Successfully uploaded capsule file with reboot type '%s'\n",
+ !cap_info->reset_type ? "RESET_COLD" :
+ cap_info->reset_type == 1 ? "RESET_WARM" :
+ "RESET_SHUTDOWN");
+ } else {
+ pr_info("Successfully processed capsule file\n");
+ }
+
return 0;
}
diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c
index 698e5c8e0c8d..502811344e81 100644
--- a/drivers/firmware/efi/cper-arm.c
+++ b/drivers/firmware/efi/cper-arm.c
@@ -30,8 +30,6 @@
#include <acpi/ghes.h>
#include <ras/ras_event.h>
-#define INDENT_SP " "
-
static const char * const arm_reg_ctx_strs[] = {
"AArch32 general purpose registers",
"AArch32 EL1 context registers",
@@ -283,7 +281,7 @@ void cper_print_proc_arm(const char *pfx,
pfx, proc->psci_state);
}
- snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
+ snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
err_info = (struct cper_arm_err_info *)(proc + 1);
for (i = 0; i < proc->err_info_num; i++) {
@@ -310,7 +308,7 @@ void cper_print_proc_arm(const char *pfx,
if (err_info->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO) {
printk("%serror_info: 0x%016llx\n", newpfx,
err_info->error_info);
- snprintf(infopfx, sizeof(infopfx), "%s%s", newpfx, INDENT_SP);
+ snprintf(infopfx, sizeof(infopfx), "%s ", newpfx);
cper_print_arm_err_info(infopfx, err_info->type,
err_info->error_info);
}
diff --git a/drivers/firmware/efi/cper-x86.c b/drivers/firmware/efi/cper-x86.c
new file mode 100644
index 000000000000..2531de49f56c
--- /dev/null
+++ b/drivers/firmware/efi/cper-x86.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+#include <linux/cper.h>
+
+/*
+ * We don't need a "CPER_IA" prefix since these are all locally defined.
+ * This will save us a lot of line space.
+ */
+#define VALID_LAPIC_ID BIT_ULL(0)
+#define VALID_CPUID_INFO BIT_ULL(1)
+#define VALID_PROC_ERR_INFO_NUM(bits) (((bits) & GENMASK_ULL(7, 2)) >> 2)
+#define VALID_PROC_CXT_INFO_NUM(bits) (((bits) & GENMASK_ULL(13, 8)) >> 8)
+
+#define INFO_ERR_STRUCT_TYPE_CACHE \
+ GUID_INIT(0xA55701F5, 0xE3EF, 0x43DE, 0xAC, 0x72, 0x24, 0x9B, \
+ 0x57, 0x3F, 0xAD, 0x2C)
+#define INFO_ERR_STRUCT_TYPE_TLB \
+ GUID_INIT(0xFC06B535, 0x5E1F, 0x4562, 0x9F, 0x25, 0x0A, 0x3B, \
+ 0x9A, 0xDB, 0x63, 0xC3)
+#define INFO_ERR_STRUCT_TYPE_BUS \
+ GUID_INIT(0x1CF3F8B3, 0xC5B1, 0x49a2, 0xAA, 0x59, 0x5E, 0xEF, \
+ 0x92, 0xFF, 0xA6, 0x3C)
+#define INFO_ERR_STRUCT_TYPE_MS \
+ GUID_INIT(0x48AB7F57, 0xDC34, 0x4f6c, 0xA7, 0xD3, 0xB0, 0xB5, \
+ 0xB0, 0xA7, 0x43, 0x14)
+
+#define INFO_VALID_CHECK_INFO BIT_ULL(0)
+#define INFO_VALID_TARGET_ID BIT_ULL(1)
+#define INFO_VALID_REQUESTOR_ID BIT_ULL(2)
+#define INFO_VALID_RESPONDER_ID BIT_ULL(3)
+#define INFO_VALID_IP BIT_ULL(4)
+
+#define CHECK_VALID_TRANS_TYPE BIT_ULL(0)
+#define CHECK_VALID_OPERATION BIT_ULL(1)
+#define CHECK_VALID_LEVEL BIT_ULL(2)
+#define CHECK_VALID_PCC BIT_ULL(3)
+#define CHECK_VALID_UNCORRECTED BIT_ULL(4)
+#define CHECK_VALID_PRECISE_IP BIT_ULL(5)
+#define CHECK_VALID_RESTARTABLE_IP BIT_ULL(6)
+#define CHECK_VALID_OVERFLOW BIT_ULL(7)
+
+#define CHECK_VALID_BUS_PART_TYPE BIT_ULL(8)
+#define CHECK_VALID_BUS_TIME_OUT BIT_ULL(9)
+#define CHECK_VALID_BUS_ADDR_SPACE BIT_ULL(10)
+
+#define CHECK_VALID_BITS(check) (((check) & GENMASK_ULL(15, 0)))
+#define CHECK_TRANS_TYPE(check) (((check) & GENMASK_ULL(17, 16)) >> 16)
+#define CHECK_OPERATION(check) (((check) & GENMASK_ULL(21, 18)) >> 18)
+#define CHECK_LEVEL(check) (((check) & GENMASK_ULL(24, 22)) >> 22)
+#define CHECK_PCC BIT_ULL(25)
+#define CHECK_UNCORRECTED BIT_ULL(26)
+#define CHECK_PRECISE_IP BIT_ULL(27)
+#define CHECK_RESTARTABLE_IP BIT_ULL(28)
+#define CHECK_OVERFLOW BIT_ULL(29)
+
+#define CHECK_BUS_PART_TYPE(check) (((check) & GENMASK_ULL(31, 30)) >> 30)
+#define CHECK_BUS_TIME_OUT BIT_ULL(32)
+#define CHECK_BUS_ADDR_SPACE(check) (((check) & GENMASK_ULL(34, 33)) >> 33)
+
+#define CHECK_VALID_MS_ERR_TYPE BIT_ULL(0)
+#define CHECK_VALID_MS_PCC BIT_ULL(1)
+#define CHECK_VALID_MS_UNCORRECTED BIT_ULL(2)
+#define CHECK_VALID_MS_PRECISE_IP BIT_ULL(3)
+#define CHECK_VALID_MS_RESTARTABLE_IP BIT_ULL(4)
+#define CHECK_VALID_MS_OVERFLOW BIT_ULL(5)
+
+#define CHECK_MS_ERR_TYPE(check) (((check) & GENMASK_ULL(18, 16)) >> 16)
+#define CHECK_MS_PCC BIT_ULL(19)
+#define CHECK_MS_UNCORRECTED BIT_ULL(20)
+#define CHECK_MS_PRECISE_IP BIT_ULL(21)
+#define CHECK_MS_RESTARTABLE_IP BIT_ULL(22)
+#define CHECK_MS_OVERFLOW BIT_ULL(23)
+
+#define CTX_TYPE_MSR 1
+#define CTX_TYPE_MMREG 7
+
+enum err_types {
+ ERR_TYPE_CACHE = 0,
+ ERR_TYPE_TLB,
+ ERR_TYPE_BUS,
+ ERR_TYPE_MS,
+ N_ERR_TYPES
+};
+
+static enum err_types cper_get_err_type(const guid_t *err_type)
+{
+ if (guid_equal(err_type, &INFO_ERR_STRUCT_TYPE_CACHE))
+ return ERR_TYPE_CACHE;
+ else if (guid_equal(err_type, &INFO_ERR_STRUCT_TYPE_TLB))
+ return ERR_TYPE_TLB;
+ else if (guid_equal(err_type, &INFO_ERR_STRUCT_TYPE_BUS))
+ return ERR_TYPE_BUS;
+ else if (guid_equal(err_type, &INFO_ERR_STRUCT_TYPE_MS))
+ return ERR_TYPE_MS;
+ else
+ return N_ERR_TYPES;
+}
+
+static const char * const ia_check_trans_type_strs[] = {
+ "Instruction",
+ "Data Access",
+ "Generic",
+};
+
+static const char * const ia_check_op_strs[] = {
+ "generic error",
+ "generic read",
+ "generic write",
+ "data read",
+ "data write",
+ "instruction fetch",
+ "prefetch",
+ "eviction",
+ "snoop",
+};
+
+static const char * const ia_check_bus_part_type_strs[] = {
+ "Local Processor originated request",
+ "Local Processor responded to request",
+ "Local Processor observed",
+ "Generic",
+};
+
+static const char * const ia_check_bus_addr_space_strs[] = {
+ "Memory Access",
+ "Reserved",
+ "I/O",
+ "Other Transaction",
+};
+
+static const char * const ia_check_ms_error_type_strs[] = {
+ "No Error",
+ "Unclassified",
+ "Microcode ROM Parity Error",
+ "External Error",
+ "FRC Error",
+ "Internal Unclassified",
+};
+
+static const char * const ia_reg_ctx_strs[] = {
+ "Unclassified Data",
+ "MSR Registers (Machine Check and other MSRs)",
+ "32-bit Mode Execution Context",
+ "64-bit Mode Execution Context",
+ "FXSAVE Context",
+ "32-bit Mode Debug Registers (DR0-DR7)",
+ "64-bit Mode Debug Registers (DR0-DR7)",
+ "Memory Mapped Registers",
+};
+
+static inline void print_bool(char *str, const char *pfx, u64 check, u64 bit)
+{
+ printk("%s%s: %s\n", pfx, str, (check & bit) ? "true" : "false");
+}
+
+static void print_err_info_ms(const char *pfx, u16 validation_bits, u64 check)
+{
+ if (validation_bits & CHECK_VALID_MS_ERR_TYPE) {
+ u8 err_type = CHECK_MS_ERR_TYPE(check);
+
+ printk("%sError Type: %u, %s\n", pfx, err_type,
+ err_type < ARRAY_SIZE(ia_check_ms_error_type_strs) ?
+ ia_check_ms_error_type_strs[err_type] : "unknown");
+ }
+
+ if (validation_bits & CHECK_VALID_MS_PCC)
+ print_bool("Processor Context Corrupt", pfx, check, CHECK_MS_PCC);
+
+ if (validation_bits & CHECK_VALID_MS_UNCORRECTED)
+ print_bool("Uncorrected", pfx, check, CHECK_MS_UNCORRECTED);
+
+ if (validation_bits & CHECK_VALID_MS_PRECISE_IP)
+ print_bool("Precise IP", pfx, check, CHECK_MS_PRECISE_IP);
+
+ if (validation_bits & CHECK_VALID_MS_RESTARTABLE_IP)
+ print_bool("Restartable IP", pfx, check, CHECK_MS_RESTARTABLE_IP);
+
+ if (validation_bits & CHECK_VALID_MS_OVERFLOW)
+ print_bool("Overflow", pfx, check, CHECK_MS_OVERFLOW);
+}
+
+static void print_err_info(const char *pfx, u8 err_type, u64 check)
+{
+ u16 validation_bits = CHECK_VALID_BITS(check);
+
+ /*
+ * The MS Check structure varies a lot from the others, so use a
+ * separate function for decoding.
+ */
+ if (err_type == ERR_TYPE_MS)
+ return print_err_info_ms(pfx, validation_bits, check);
+
+ if (validation_bits & CHECK_VALID_TRANS_TYPE) {
+ u8 trans_type = CHECK_TRANS_TYPE(check);
+
+ printk("%sTransaction Type: %u, %s\n", pfx, trans_type,
+ trans_type < ARRAY_SIZE(ia_check_trans_type_strs) ?
+ ia_check_trans_type_strs[trans_type] : "unknown");
+ }
+
+ if (validation_bits & CHECK_VALID_OPERATION) {
+ u8 op = CHECK_OPERATION(check);
+
+ /*
+ * CACHE has more operation types than TLB or BUS, though the
+ * name and the order are the same.
+ */
+ u8 max_ops = (err_type == ERR_TYPE_CACHE) ? 9 : 7;
+
+ printk("%sOperation: %u, %s\n", pfx, op,
+ op < max_ops ? ia_check_op_strs[op] : "unknown");
+ }
+
+ if (validation_bits & CHECK_VALID_LEVEL)
+ printk("%sLevel: %llu\n", pfx, CHECK_LEVEL(check));
+
+ if (validation_bits & CHECK_VALID_PCC)
+ print_bool("Processor Context Corrupt", pfx, check, CHECK_PCC);
+
+ if (validation_bits & CHECK_VALID_UNCORRECTED)
+ print_bool("Uncorrected", pfx, check, CHECK_UNCORRECTED);
+
+ if (validation_bits & CHECK_VALID_PRECISE_IP)
+ print_bool("Precise IP", pfx, check, CHECK_PRECISE_IP);
+
+ if (validation_bits & CHECK_VALID_RESTARTABLE_IP)
+ print_bool("Restartable IP", pfx, check, CHECK_RESTARTABLE_IP);
+
+ if (validation_bits & CHECK_VALID_OVERFLOW)
+ print_bool("Overflow", pfx, check, CHECK_OVERFLOW);
+
+ if (err_type != ERR_TYPE_BUS)
+ return;
+
+ if (validation_bits & CHECK_VALID_BUS_PART_TYPE) {
+ u8 part_type = CHECK_BUS_PART_TYPE(check);
+
+ printk("%sParticipation Type: %u, %s\n", pfx, part_type,
+ part_type < ARRAY_SIZE(ia_check_bus_part_type_strs) ?
+ ia_check_bus_part_type_strs[part_type] : "unknown");
+ }
+
+ if (validation_bits & CHECK_VALID_BUS_TIME_OUT)
+ print_bool("Time Out", pfx, check, CHECK_BUS_TIME_OUT);
+
+ if (validation_bits & CHECK_VALID_BUS_ADDR_SPACE) {
+ u8 addr_space = CHECK_BUS_ADDR_SPACE(check);
+
+ printk("%sAddress Space: %u, %s\n", pfx, addr_space,
+ addr_space < ARRAY_SIZE(ia_check_bus_addr_space_strs) ?
+ ia_check_bus_addr_space_strs[addr_space] : "unknown");
+ }
+}
+
+void cper_print_proc_ia(const char *pfx, const struct cper_sec_proc_ia *proc)
+{
+ int i;
+ struct cper_ia_err_info *err_info;
+ struct cper_ia_proc_ctx *ctx_info;
+ char newpfx[64], infopfx[64];
+ u8 err_type;
+
+ if (proc->validation_bits & VALID_LAPIC_ID)
+ printk("%sLocal APIC_ID: 0x%llx\n", pfx, proc->lapic_id);
+
+ if (proc->validation_bits & VALID_CPUID_INFO) {
+ printk("%sCPUID Info:\n", pfx);
+ print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, proc->cpuid,
+ sizeof(proc->cpuid), 0);
+ }
+
+ snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
+
+ err_info = (struct cper_ia_err_info *)(proc + 1);
+ for (i = 0; i < VALID_PROC_ERR_INFO_NUM(proc->validation_bits); i++) {
+ printk("%sError Information Structure %d:\n", pfx, i);
+
+ err_type = cper_get_err_type(&err_info->err_type);
+ printk("%sError Structure Type: %s\n", newpfx,
+ err_type < ARRAY_SIZE(cper_proc_error_type_strs) ?
+ cper_proc_error_type_strs[err_type] : "unknown");
+
+ if (err_type >= N_ERR_TYPES) {
+ printk("%sError Structure Type: %pUl\n", newpfx,
+ &err_info->err_type);
+ }
+
+ if (err_info->validation_bits & INFO_VALID_CHECK_INFO) {
+ printk("%sCheck Information: 0x%016llx\n", newpfx,
+ err_info->check_info);
+
+ if (err_type < N_ERR_TYPES) {
+ snprintf(infopfx, sizeof(infopfx), "%s ",
+ newpfx);
+
+ print_err_info(infopfx, err_type,
+ err_info->check_info);
+ }
+ }
+
+ if (err_info->validation_bits & INFO_VALID_TARGET_ID) {
+ printk("%sTarget Identifier: 0x%016llx\n",
+ newpfx, err_info->target_id);
+ }
+
+ if (err_info->validation_bits & INFO_VALID_REQUESTOR_ID) {
+ printk("%sRequestor Identifier: 0x%016llx\n",
+ newpfx, err_info->requestor_id);
+ }
+
+ if (err_info->validation_bits & INFO_VALID_RESPONDER_ID) {
+ printk("%sResponder Identifier: 0x%016llx\n",
+ newpfx, err_info->responder_id);
+ }
+
+ if (err_info->validation_bits & INFO_VALID_IP) {
+ printk("%sInstruction Pointer: 0x%016llx\n",
+ newpfx, err_info->ip);
+ }
+
+ err_info++;
+ }
+
+ ctx_info = (struct cper_ia_proc_ctx *)err_info;
+ for (i = 0; i < VALID_PROC_CXT_INFO_NUM(proc->validation_bits); i++) {
+ int size = sizeof(*ctx_info) + ctx_info->reg_arr_size;
+ int groupsize = 4;
+
+ printk("%sContext Information Structure %d:\n", pfx, i);
+
+ printk("%sRegister Context Type: %s\n", newpfx,
+ ctx_info->reg_ctx_type < ARRAY_SIZE(ia_reg_ctx_strs) ?
+ ia_reg_ctx_strs[ctx_info->reg_ctx_type] : "unknown");
+
+ printk("%sRegister Array Size: 0x%04x\n", newpfx,
+ ctx_info->reg_arr_size);
+
+ if (ctx_info->reg_ctx_type == CTX_TYPE_MSR) {
+ groupsize = 8; /* MSRs are 8 bytes wide. */
+ printk("%sMSR Address: 0x%08x\n", newpfx,
+ ctx_info->msr_addr);
+ }
+
+ if (ctx_info->reg_ctx_type == CTX_TYPE_MMREG) {
+ printk("%sMM Register Address: 0x%016llx\n", newpfx,
+ ctx_info->mm_reg_addr);
+ }
+
+ printk("%sRegister Array:\n", newpfx);
+ print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, groupsize,
+ (ctx_info + 1), ctx_info->reg_arr_size, 0);
+
+ ctx_info = (struct cper_ia_proc_ctx *)((long)ctx_info + size);
+ }
+}
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index c165933ebf38..3bf0dca378a6 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -37,8 +37,6 @@
#include <acpi/ghes.h>
#include <ras/ras_event.h>
-#define INDENT_SP " "
-
static char rcd_decode_str[CPER_REC_LEN];
/*
@@ -433,7 +431,7 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text);
- snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
+ snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) {
struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata);
@@ -470,6 +468,16 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
else
goto err_section_too_small;
#endif
+#if defined(CONFIG_UEFI_CPER_X86)
+ } else if (guid_equal(sec_type, &CPER_SEC_PROC_IA)) {
+ struct cper_sec_proc_ia *ia_err = acpi_hest_get_payload(gdata);
+
+ printk("%ssection_type: IA32/X64 processor error\n", newpfx);
+ if (gdata->error_data_length >= sizeof(*ia_err))
+ cper_print_proc_ia(newpfx, ia_err);
+ else
+ goto err_section_too_small;
+#endif
} else {
const void *err = acpi_hest_get_payload(gdata);
@@ -500,7 +508,7 @@ void cper_estatus_print(const char *pfx,
"It has been corrected by h/w "
"and requires no further action");
printk("%s""event severity: %s\n", pfx, cper_severity_str(severity));
- snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
+ snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
apei_estatus_for_each_section(estatus, gdata) {
cper_estatus_print_section(newpfx, gdata, sec_no);
diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c
index 8f07eb414c00..72d9dfbebf08 100644
--- a/drivers/firmware/efi/libstub/secureboot.c
+++ b/drivers/firmware/efi/libstub/secureboot.c
@@ -30,6 +30,9 @@ static const efi_char16_t shim_MokSBState_name[] = L"MokSBState";
/*
* Determine whether we're in secure boot mode.
+ *
+ * Please keep the logic in sync with
+ * arch/x86/xen/efi.c:xen_efi_get_secureboot().
*/
enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table_arg)
{
diff --git a/drivers/firmware/efi/libstub/tpm.c b/drivers/firmware/efi/libstub/tpm.c
index 9d08cea3f1b0..caa37a6dd9d4 100644
--- a/drivers/firmware/efi/libstub/tpm.c
+++ b/drivers/firmware/efi/libstub/tpm.c
@@ -59,7 +59,7 @@ void efi_enable_reset_attack_mitigation(efi_system_table_t *sys_table_arg)
#endif
-void efi_retrieve_tpm2_eventlog_1_2(efi_system_table_t *sys_table_arg)
+static void efi_retrieve_tpm2_eventlog_1_2(efi_system_table_t *sys_table_arg)
{
efi_guid_t tcg2_guid = EFI_TCG2_PROTOCOL_GUID;
efi_guid_t linux_eventlog_guid = LINUX_EFI_TPM_EVENT_LOG_GUID;
diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index 88a3558b7916..815bdb42e3f0 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -314,6 +314,11 @@ static int host1x_device_match(struct device *dev, struct device_driver *drv)
return strcmp(dev_name(dev), drv->name) == 0;
}
+static int host1x_dma_configure(struct device *dev)
+{
+ return of_dma_configure(dev, dev->of_node, true);
+}
+
static const struct dev_pm_ops host1x_device_pm_ops = {
.suspend = pm_generic_suspend,
.resume = pm_generic_resume,
@@ -326,8 +331,8 @@ static const struct dev_pm_ops host1x_device_pm_ops = {
struct bus_type host1x_bus_type = {
.name = "host1x",
.match = host1x_device_match,
+ .dma_configure = host1x_dma_configure,
.pm = &host1x_device_pm_ops,
- .force_dma = true,
};
static void __host1x_device_del(struct host1x_device *device)
@@ -416,7 +421,7 @@ static int host1x_device_add(struct host1x *host1x,
device->dev.bus = &host1x_bus_type;
device->dev.parent = host1x->dev;
- of_dma_configure(&device->dev, host1x->dev->of_node);
+ of_dma_configure(&device->dev, host1x->dev->of_node, true);
err = host1x_device_parse_dt(device, driver);
if (err < 0) {
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 6ec307c93ece..f10840ad465c 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -717,15 +717,12 @@ config SENSORS_LTC2945
be called ltc2945.
config SENSORS_LTC2990
- tristate "Linear Technology LTC2990 (current monitoring mode only)"
+ tristate "Linear Technology LTC2990"
depends on I2C
help
If you say yes here you get support for Linear Technology LTC2990
I2C System Monitor. The LTC2990 supports a combination of voltage,
- current and temperature monitoring, but in addition to the Vcc supply
- voltage and chip temperature, this driver currently only supports
- reading two currents by measuring two differential voltages across
- series resistors.
+ current and temperature monitoring.
This driver can also be built as a module. If so, the module will
be called ltc2990.
diff --git a/drivers/hwmon/asus_atk0110.c b/drivers/hwmon/asus_atk0110.c
index 975c43d446f8..a6636fe42189 100644
--- a/drivers/hwmon/asus_atk0110.c
+++ b/drivers/hwmon/asus_atk0110.c
@@ -125,6 +125,8 @@ struct atk_data {
int temperature_count;
int fan_count;
struct list_head sensor_list;
+ struct attribute_group attr_group;
+ const struct attribute_group *attr_groups[2];
struct {
struct dentry *root;
@@ -188,7 +190,6 @@ static int atk_add(struct acpi_device *device);
static int atk_remove(struct acpi_device *device);
static void atk_print_sensor(struct atk_data *data, union acpi_object *obj);
static int atk_read_value(struct atk_sensor_data *sensor, u64 *value);
-static void atk_free_sensors(struct atk_data *data);
static struct acpi_driver atk_driver = {
.name = ATK_HID,
@@ -262,14 +263,6 @@ static ssize_t atk_limit2_show(struct device *dev,
return sprintf(buf, "%lld\n", value);
}
-static ssize_t atk_name_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- return sprintf(buf, "atk0110\n");
-}
-static struct device_attribute atk_name_attr =
- __ATTR(name, 0444, atk_name_show, NULL);
-
static void atk_init_attribute(struct device_attribute *attr, char *name,
sysfs_show_func show)
{
@@ -912,15 +905,13 @@ static int atk_add_sensor(struct atk_data *data, union acpi_object *obj)
limit1 = atk_get_pack_member(data, obj, HWMON_PACK_LIMIT1);
limit2 = atk_get_pack_member(data, obj, HWMON_PACK_LIMIT2);
- sensor = kzalloc(sizeof(*sensor), GFP_KERNEL);
+ sensor = devm_kzalloc(dev, sizeof(*sensor), GFP_KERNEL);
if (!sensor)
return -ENOMEM;
- sensor->acpi_name = kstrdup(name->string.pointer, GFP_KERNEL);
- if (!sensor->acpi_name) {
- err = -ENOMEM;
- goto out;
- }
+ sensor->acpi_name = devm_kstrdup(dev, name->string.pointer, GFP_KERNEL);
+ if (!sensor->acpi_name)
+ return -ENOMEM;
INIT_LIST_HEAD(&sensor->list);
sensor->type = type;
@@ -961,9 +952,6 @@ static int atk_add_sensor(struct atk_data *data, union acpi_object *obj)
(*num)++;
return 1;
-out:
- kfree(sensor);
- return err;
}
static int atk_enumerate_old_hwmon(struct atk_data *data)
@@ -1004,8 +992,7 @@ static int atk_enumerate_old_hwmon(struct atk_data *data)
dev_warn(dev, METHOD_OLD_ENUM_TMP ": ACPI exception: %s\n",
acpi_format_exception(status));
- ret = -ENODEV;
- goto cleanup;
+ return -ENODEV;
}
pack = buf.pointer;
@@ -1026,8 +1013,7 @@ static int atk_enumerate_old_hwmon(struct atk_data *data)
dev_warn(dev, METHOD_OLD_ENUM_FAN ": ACPI exception: %s\n",
acpi_format_exception(status));
- ret = -ENODEV;
- goto cleanup;
+ return -ENODEV;
}
pack = buf.pointer;
@@ -1041,9 +1027,6 @@ static int atk_enumerate_old_hwmon(struct atk_data *data)
ACPI_FREE(buf.pointer);
return count;
-cleanup:
- atk_free_sensors(data);
- return ret;
}
static int atk_ec_present(struct atk_data *data)
@@ -1193,76 +1176,44 @@ static int atk_enumerate_new_hwmon(struct atk_data *data)
return err;
}
-static int atk_create_files(struct atk_data *data)
+static int atk_init_attribute_groups(struct atk_data *data)
{
+ struct device *dev = &data->acpi_dev->dev;
struct atk_sensor_data *s;
- int err;
+ struct attribute **attrs;
+ int i = 0;
+ int len = (data->voltage_count + data->temperature_count
+ + data->fan_count) * 4 + 1;
- list_for_each_entry(s, &data->sensor_list, list) {
- err = device_create_file(data->hwmon_dev, &s->input_attr);
- if (err)
- return err;
- err = device_create_file(data->hwmon_dev, &s->label_attr);
- if (err)
- return err;
- err = device_create_file(data->hwmon_dev, &s->limit1_attr);
- if (err)
- return err;
- err = device_create_file(data->hwmon_dev, &s->limit2_attr);
- if (err)
- return err;
- }
-
- err = device_create_file(data->hwmon_dev, &atk_name_attr);
-
- return err;
-}
-
-static void atk_remove_files(struct atk_data *data)
-{
- struct atk_sensor_data *s;
+ attrs = devm_kcalloc(dev, len, sizeof(struct attribute *), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
list_for_each_entry(s, &data->sensor_list, list) {
- device_remove_file(data->hwmon_dev, &s->input_attr);
- device_remove_file(data->hwmon_dev, &s->label_attr);
- device_remove_file(data->hwmon_dev, &s->limit1_attr);
- device_remove_file(data->hwmon_dev, &s->limit2_attr);
+ attrs[i++] = &s->input_attr.attr;
+ attrs[i++] = &s->label_attr.attr;
+ attrs[i++] = &s->limit1_attr.attr;
+ attrs[i++] = &s->limit2_attr.attr;
}
- device_remove_file(data->hwmon_dev, &atk_name_attr);
-}
-static void atk_free_sensors(struct atk_data *data)
-{
- struct list_head *head = &data->sensor_list;
- struct atk_sensor_data *s, *tmp;
+ data->attr_group.attrs = attrs;
+ data->attr_groups[0] = &data->attr_group;
- list_for_each_entry_safe(s, tmp, head, list) {
- kfree(s->acpi_name);
- kfree(s);
- }
+ return 0;
}
static int atk_register_hwmon(struct atk_data *data)
{
struct device *dev = &data->acpi_dev->dev;
- int err;
dev_dbg(dev, "registering hwmon device\n");
- data->hwmon_dev = hwmon_device_register(dev);
+ data->hwmon_dev = hwmon_device_register_with_groups(dev, "atk0110",
+ data,
+ data->attr_groups);
if (IS_ERR(data->hwmon_dev))
return PTR_ERR(data->hwmon_dev);
- dev_dbg(dev, "populating sysfs directory\n");
- err = atk_create_files(data);
- if (err)
- goto remove;
-
return 0;
-remove:
- /* Cleanup the registered files */
- atk_remove_files(data);
- hwmon_device_unregister(data->hwmon_dev);
- return err;
}
static int atk_probe_if(struct atk_data *data)
@@ -1350,7 +1301,7 @@ static int atk_add(struct acpi_device *device)
dev_dbg(&device->dev, "adding...\n");
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = devm_kzalloc(&device->dev, sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
@@ -1397,20 +1348,20 @@ static int atk_add(struct acpi_device *device)
goto out;
}
+ err = atk_init_attribute_groups(data);
+ if (err)
+ goto out;
err = atk_register_hwmon(data);
if (err)
- goto cleanup;
+ goto out;
atk_debugfs_init(data);
device->driver_data = data;
return 0;
-cleanup:
- atk_free_sensors(data);
out:
if (data->disable_ec)
atk_ec_ctl(data, 0);
- kfree(data);
return err;
}
@@ -1423,8 +1374,6 @@ static int atk_remove(struct acpi_device *device)
atk_debugfs_cleanup(data);
- atk_remove_files(data);
- atk_free_sensors(data);
hwmon_device_unregister(data->hwmon_dev);
if (data->disable_ec) {
@@ -1432,8 +1381,6 @@ static int atk_remove(struct acpi_device *device)
dev_err(&device->dev, "Failed to disable EC\n");
}
- kfree(data);
-
return 0;
}
diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c
index 5e78229ade04..22d3a84f13ef 100644
--- a/drivers/hwmon/fschmd.c
+++ b/drivers/hwmon/fschmd.c
@@ -105,7 +105,7 @@ static const u8 FSCHMD_REG_VOLT[7][6] = {
static const int FSCHMD_NO_VOLT_SENSORS[7] = { 3, 3, 3, 3, 3, 3, 6 };
/*
- * minimum pwm at which the fan is driven (pwm can by increased depending on
+ * minimum pwm at which the fan is driven (pwm can be increased depending on
* the temp. Notice that for the scy some fans share there minimum speed.
* Also notice that with the scy the sensor order is different than with the
* other chips, this order was in the 2.4 driver and kept for consistency.
diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index 32083e452cde..e88c01961948 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -698,6 +698,9 @@ hwmon_device_register_with_info(struct device *dev, const char *name,
if (chip && (!chip->ops || !chip->ops->is_visible || !chip->info))
return ERR_PTR(-EINVAL);
+ if (chip && !dev)
+ return ERR_PTR(-EINVAL);
+
return __hwmon_device_register(dev, name, drvdata, chip, extra_groups);
}
EXPORT_SYMBOL_GPL(hwmon_device_register_with_info);
diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index 3b73dee6fdc6..17c6460ae351 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -37,6 +37,10 @@ MODULE_PARM_DESC(force, "force loading on processors with erratum 319");
/* Provide lock for writing to NB_SMU_IND_ADDR */
static DEFINE_MUTEX(nb_smu_ind_mutex);
+#ifndef PCI_DEVICE_ID_AMD_15H_M70H_NB_F3
+#define PCI_DEVICE_ID_AMD_15H_M70H_NB_F3 0x15b3
+#endif
+
#ifndef PCI_DEVICE_ID_AMD_17H_DF_F3
#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
#endif
@@ -81,6 +85,7 @@ struct k10temp_data {
void (*read_tempreg)(struct pci_dev *pdev, u32 *regval);
int temp_offset;
u32 temp_adjust_mask;
+ bool show_tdie;
};
struct tctl_offset {
@@ -141,17 +146,24 @@ static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval)
F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
}
-static ssize_t temp1_input_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static unsigned int get_raw_temp(struct k10temp_data *data)
{
- struct k10temp_data *data = dev_get_drvdata(dev);
- u32 regval;
unsigned int temp;
+ u32 regval;
data->read_tempreg(data->pdev, &regval);
temp = (regval >> 21) * 125;
if (regval & data->temp_adjust_mask)
temp -= 49000;
+ return temp;
+}
+
+static ssize_t temp1_input_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct k10temp_data *data = dev_get_drvdata(dev);
+ unsigned int temp = get_raw_temp(data);
+
if (temp > data->temp_offset)
temp -= data->temp_offset;
else
@@ -160,6 +172,23 @@ static ssize_t temp1_input_show(struct device *dev,
return sprintf(buf, "%u\n", temp);
}
+static ssize_t temp2_input_show(struct device *dev,
+ struct device_attribute *devattr, char *buf)
+{
+ struct k10temp_data *data = dev_get_drvdata(dev);
+ unsigned int temp = get_raw_temp(data);
+
+ return sprintf(buf, "%u\n", temp);
+}
+
+static ssize_t temp_label_show(struct device *dev,
+ struct device_attribute *devattr, char *buf)
+{
+ struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
+
+ return sprintf(buf, "%s\n", attr->index ? "Tctl" : "Tdie");
+}
+
static ssize_t temp1_max_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -187,16 +216,23 @@ static DEVICE_ATTR_RO(temp1_max);
static SENSOR_DEVICE_ATTR(temp1_crit, S_IRUGO, show_temp_crit, NULL, 0);
static SENSOR_DEVICE_ATTR(temp1_crit_hyst, S_IRUGO, show_temp_crit, NULL, 1);
+static SENSOR_DEVICE_ATTR(temp1_label, 0444, temp_label_show, NULL, 0);
+static DEVICE_ATTR_RO(temp2_input);
+static SENSOR_DEVICE_ATTR(temp2_label, 0444, temp_label_show, NULL, 1);
+
static umode_t k10temp_is_visible(struct kobject *kobj,
struct attribute *attr, int index)
{
struct device *dev = container_of(kobj, struct device, kobj);
struct k10temp_data *data = dev_get_drvdata(dev);
struct pci_dev *pdev = data->pdev;
+ u32 reg;
- if (index >= 2) {
- u32 reg;
-
+ switch (index) {
+ case 0 ... 1: /* temp1_input, temp1_max */
+ default:
+ break;
+ case 2 ... 3: /* temp1_crit, temp1_crit_hyst */
if (!data->read_htcreg)
return 0;
@@ -208,6 +244,11 @@ static umode_t k10temp_is_visible(struct kobject *kobj,
data->read_htcreg(data->pdev, &reg);
if (!(reg & HTC_ENABLE))
return 0;
+ break;
+ case 4 ... 6: /* temp1_label, temp2_input, temp2_label */
+ if (!data->show_tdie)
+ return 0;
+ break;
}
return attr->mode;
}
@@ -217,6 +258,9 @@ static struct attribute *k10temp_attrs[] = {
&dev_attr_temp1_max.attr,
&sensor_dev_attr_temp1_crit.dev_attr.attr,
&sensor_dev_attr_temp1_crit_hyst.dev_attr.attr,
+ &sensor_dev_attr_temp1_label.dev_attr.attr,
+ &dev_attr_temp2_input.attr,
+ &sensor_dev_attr_temp2_label.dev_attr.attr,
NULL
};
@@ -292,6 +336,7 @@ static int k10temp_probe(struct pci_dev *pdev,
} else if (boot_cpu_data.x86 == 0x17) {
data->temp_adjust_mask = 0x80000;
data->read_tempreg = read_tempreg_nb_f17;
+ data->show_tdie = true;
} else {
data->read_htcreg = read_htcreg_pci;
data->read_tempreg = read_tempreg_pci;
@@ -320,6 +365,7 @@ static const struct pci_device_id k10temp_id_table[] = {
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M70H_NB_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
diff --git a/drivers/hwmon/ltc2990.c b/drivers/hwmon/ltc2990.c
index 8f8fe059ab48..2aefdc58b242 100644
--- a/drivers/hwmon/ltc2990.c
+++ b/drivers/hwmon/ltc2990.c
@@ -5,18 +5,16 @@
* Author: Mike Looijmans <mike.looijmans@topic.nl>
*
* License: GPLv2
- *
- * This driver assumes the chip is wired as a dual current monitor, and
- * reports the voltage drop across two series resistors. It also reports
- * the chip's internal temperature and Vcc power supply voltage.
*/
+#include <linux/bitops.h>
#include <linux/err.h>
#include <linux/hwmon.h>
#include <linux/hwmon-sysfs.h>
#include <linux/i2c.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/of.h>
#define LTC2990_STATUS 0x00
#define LTC2990_CONTROL 0x01
@@ -28,45 +26,108 @@
#define LTC2990_V4_MSB 0x0C
#define LTC2990_VCC_MSB 0x0E
-#define LTC2990_CONTROL_KELVIN BIT(7)
-#define LTC2990_CONTROL_SINGLE BIT(6)
-#define LTC2990_CONTROL_MEASURE_ALL (0x3 << 3)
-#define LTC2990_CONTROL_MODE_CURRENT 0x06
-#define LTC2990_CONTROL_MODE_VOLTAGE 0x07
+#define LTC2990_IN0 BIT(0)
+#define LTC2990_IN1 BIT(1)
+#define LTC2990_IN2 BIT(2)
+#define LTC2990_IN3 BIT(3)
+#define LTC2990_IN4 BIT(4)
+#define LTC2990_CURR1 BIT(5)
+#define LTC2990_CURR2 BIT(6)
+#define LTC2990_TEMP1 BIT(7)
+#define LTC2990_TEMP2 BIT(8)
+#define LTC2990_TEMP3 BIT(9)
+#define LTC2990_NONE 0
+#define LTC2990_ALL GENMASK(9, 0)
-/* convert raw register value to sign-extended integer in 16-bit range */
-static int ltc2990_voltage_to_int(int raw)
-{
- if (raw & BIT(14))
- return -(0x4000 - (raw & 0x3FFF)) << 2;
- else
- return (raw & 0x3FFF) << 2;
-}
+#define LTC2990_MODE0_SHIFT 0
+#define LTC2990_MODE0_MASK GENMASK(2, 0)
+#define LTC2990_MODE1_SHIFT 3
+#define LTC2990_MODE1_MASK GENMASK(1, 0)
+
+/* Enabled measurements for mode bits 2..0 */
+static const int ltc2990_attrs_ena_0[] = {
+ LTC2990_IN1 | LTC2990_IN2 | LTC2990_TEMP3,
+ LTC2990_CURR1 | LTC2990_TEMP3,
+ LTC2990_CURR1 | LTC2990_IN3 | LTC2990_IN4,
+ LTC2990_TEMP2 | LTC2990_IN3 | LTC2990_IN4,
+ LTC2990_TEMP2 | LTC2990_CURR2,
+ LTC2990_TEMP2 | LTC2990_TEMP3,
+ LTC2990_CURR1 | LTC2990_CURR2,
+ LTC2990_IN1 | LTC2990_IN2 | LTC2990_IN3 | LTC2990_IN4
+};
+
+/* Enabled measurements for mode bits 4..3 */
+static const int ltc2990_attrs_ena_1[] = {
+ LTC2990_NONE,
+ LTC2990_TEMP2 | LTC2990_IN1 | LTC2990_CURR1,
+ LTC2990_TEMP3 | LTC2990_IN3 | LTC2990_CURR2,
+ LTC2990_ALL
+};
+
+struct ltc2990_data {
+ struct i2c_client *i2c;
+ u32 mode[2];
+};
/* Return the converted value from the given register in uV or mC */
-static int ltc2990_get_value(struct i2c_client *i2c, u8 reg, int *result)
+static int ltc2990_get_value(struct i2c_client *i2c, int index, int *result)
{
int val;
+ u8 reg;
+
+ switch (index) {
+ case LTC2990_IN0:
+ reg = LTC2990_VCC_MSB;
+ break;
+ case LTC2990_IN1:
+ case LTC2990_CURR1:
+ case LTC2990_TEMP2:
+ reg = LTC2990_V1_MSB;
+ break;
+ case LTC2990_IN2:
+ reg = LTC2990_V2_MSB;
+ break;
+ case LTC2990_IN3:
+ case LTC2990_CURR2:
+ case LTC2990_TEMP3:
+ reg = LTC2990_V3_MSB;
+ break;
+ case LTC2990_IN4:
+ reg = LTC2990_V4_MSB;
+ break;
+ case LTC2990_TEMP1:
+ reg = LTC2990_TINT_MSB;
+ break;
+ default:
+ return -EINVAL;
+ }
val = i2c_smbus_read_word_swapped(i2c, reg);
if (unlikely(val < 0))
return val;
- switch (reg) {
- case LTC2990_TINT_MSB:
- /* internal temp, 0.0625 degrees/LSB, 13-bit */
- val = (val & 0x1FFF) << 3;
- *result = (val * 1000) >> 7;
+ switch (index) {
+ case LTC2990_TEMP1:
+ case LTC2990_TEMP2:
+ case LTC2990_TEMP3:
+ /* temp, 0.0625 degrees/LSB */
+ *result = sign_extend32(val, 12) * 1000 / 16;
break;
- case LTC2990_V1_MSB:
- case LTC2990_V3_MSB:
- /* Vx-Vy, 19.42uV/LSB. Depends on mode. */
- *result = ltc2990_voltage_to_int(val) * 1942 / (4 * 100);
+ case LTC2990_CURR1:
+ case LTC2990_CURR2:
+ /* Vx-Vy, 19.42uV/LSB */
+ *result = sign_extend32(val, 14) * 1942 / 100;
break;
- case LTC2990_VCC_MSB:
- /* Vcc, 305.18μV/LSB, 2.5V offset */
- *result = (ltc2990_voltage_to_int(val) * 30518 /
- (4 * 100 * 1000)) + 2500;
+ case LTC2990_IN0:
+ /* Vcc, 305.18uV/LSB, 2.5V offset */
+ *result = sign_extend32(val, 14) * 30518 / (100 * 1000) + 2500;
+ break;
+ case LTC2990_IN1:
+ case LTC2990_IN2:
+ case LTC2990_IN3:
+ case LTC2990_IN4:
+ /* Vx, 305.18uV/LSB */
+ *result = sign_extend32(val, 14) * 30518 / (100 * 1000);
break;
default:
return -EINVAL; /* won't happen, keep compiler happy */
@@ -79,48 +140,117 @@ static ssize_t ltc2990_show_value(struct device *dev,
struct device_attribute *da, char *buf)
{
struct sensor_device_attribute *attr = to_sensor_dev_attr(da);
+ struct ltc2990_data *data = dev_get_drvdata(dev);
int value;
int ret;
- ret = ltc2990_get_value(dev_get_drvdata(dev), attr->index, &value);
+ ret = ltc2990_get_value(data->i2c, attr->index, &value);
if (unlikely(ret < 0))
return ret;
return snprintf(buf, PAGE_SIZE, "%d\n", value);
}
+static umode_t ltc2990_attrs_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct ltc2990_data *data = dev_get_drvdata(dev);
+ struct device_attribute *da =
+ container_of(a, struct device_attribute, attr);
+ struct sensor_device_attribute *attr = to_sensor_dev_attr(da);
+
+ int attrs_mask = LTC2990_IN0 | LTC2990_TEMP1 |
+ (ltc2990_attrs_ena_0[data->mode[0]] &
+ ltc2990_attrs_ena_1[data->mode[1]]);
+
+ if (attr->index & attrs_mask)
+ return a->mode;
+
+ return 0;
+}
+
static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, ltc2990_show_value, NULL,
- LTC2990_TINT_MSB);
+ LTC2990_TEMP1);
+static SENSOR_DEVICE_ATTR(temp2_input, S_IRUGO, ltc2990_show_value, NULL,
+ LTC2990_TEMP2);
+static SENSOR_DEVICE_ATTR(temp3_input, S_IRUGO, ltc2990_show_value, NULL,
+ LTC2990_TEMP3);
static SENSOR_DEVICE_ATTR(curr1_input, S_IRUGO, ltc2990_show_value, NULL,
- LTC2990_V1_MSB);
+ LTC2990_CURR1);
static SENSOR_DEVICE_ATTR(curr2_input, S_IRUGO, ltc2990_show_value, NULL,
- LTC2990_V3_MSB);
+ LTC2990_CURR2);
static SENSOR_DEVICE_ATTR(in0_input, S_IRUGO, ltc2990_show_value, NULL,
- LTC2990_VCC_MSB);
+ LTC2990_IN0);
+static SENSOR_DEVICE_ATTR(in1_input, S_IRUGO, ltc2990_show_value, NULL,
+ LTC2990_IN1);
+static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, ltc2990_show_value, NULL,
+ LTC2990_IN2);
+static SENSOR_DEVICE_ATTR(in3_input, S_IRUGO, ltc2990_show_value, NULL,
+ LTC2990_IN3);
+static SENSOR_DEVICE_ATTR(in4_input, S_IRUGO, ltc2990_show_value, NULL,
+ LTC2990_IN4);
static struct attribute *ltc2990_attrs[] = {
&sensor_dev_attr_temp1_input.dev_attr.attr,
+ &sensor_dev_attr_temp2_input.dev_attr.attr,
+ &sensor_dev_attr_temp3_input.dev_attr.attr,
&sensor_dev_attr_curr1_input.dev_attr.attr,
&sensor_dev_attr_curr2_input.dev_attr.attr,
&sensor_dev_attr_in0_input.dev_attr.attr,
+ &sensor_dev_attr_in1_input.dev_attr.attr,
+ &sensor_dev_attr_in2_input.dev_attr.attr,
+ &sensor_dev_attr_in3_input.dev_attr.attr,
+ &sensor_dev_attr_in4_input.dev_attr.attr,
NULL,
};
-ATTRIBUTE_GROUPS(ltc2990);
+
+static const struct attribute_group ltc2990_group = {
+ .attrs = ltc2990_attrs,
+ .is_visible = ltc2990_attrs_visible,
+};
+__ATTRIBUTE_GROUPS(ltc2990);
static int ltc2990_i2c_probe(struct i2c_client *i2c,
const struct i2c_device_id *id)
{
int ret;
struct device *hwmon_dev;
+ struct ltc2990_data *data;
+ struct device_node *of_node = i2c->dev.of_node;
if (!i2c_check_functionality(i2c->adapter, I2C_FUNC_SMBUS_BYTE_DATA |
I2C_FUNC_SMBUS_WORD_DATA))
return -ENODEV;
- /* Setup continuous mode, current monitor */
+ data = devm_kzalloc(&i2c->dev, sizeof(struct ltc2990_data), GFP_KERNEL);
+ if (unlikely(!data))
+ return -ENOMEM;
+
+ data->i2c = i2c;
+
+ if (of_node) {
+ ret = of_property_read_u32_array(of_node, "lltc,meas-mode",
+ data->mode, 2);
+ if (ret < 0)
+ return ret;
+
+ if (data->mode[0] & ~LTC2990_MODE0_MASK ||
+ data->mode[1] & ~LTC2990_MODE1_MASK)
+ return -EINVAL;
+ } else {
+ ret = i2c_smbus_read_byte_data(i2c, LTC2990_CONTROL);
+ if (ret < 0)
+ return ret;
+
+ data->mode[0] = ret >> LTC2990_MODE0_SHIFT & LTC2990_MODE0_MASK;
+ data->mode[1] = ret >> LTC2990_MODE1_SHIFT & LTC2990_MODE1_MASK;
+ }
+
+ /* Setup continuous mode */
ret = i2c_smbus_write_byte_data(i2c, LTC2990_CONTROL,
- LTC2990_CONTROL_MEASURE_ALL |
- LTC2990_CONTROL_MODE_CURRENT);
+ data->mode[0] << LTC2990_MODE0_SHIFT |
+ data->mode[1] << LTC2990_MODE1_SHIFT);
if (ret < 0) {
dev_err(&i2c->dev, "Error: Failed to set control mode.\n");
return ret;
@@ -134,7 +264,7 @@ static int ltc2990_i2c_probe(struct i2c_client *i2c,
hwmon_dev = devm_hwmon_device_register_with_groups(&i2c->dev,
i2c->name,
- i2c,
+ data,
ltc2990_groups);
return PTR_ERR_OR_ZERO(hwmon_dev);
diff --git a/drivers/hwmon/mc13783-adc.c b/drivers/hwmon/mc13783-adc.c
index 960a1db6f269..67860ad2e3d9 100644
--- a/drivers/hwmon/mc13783-adc.c
+++ b/drivers/hwmon/mc13783-adc.c
@@ -63,6 +63,10 @@ static int mc13783_adc_read(struct device *dev,
if (ret)
return ret;
+ /* ADIN7 subchannels */
+ if (channel >= 16)
+ channel = 7;
+
channel &= 0x7;
*val = (sample[channel % 4] >> (channel > 3 ? 14 : 2)) & 0x3ff;
@@ -111,6 +115,57 @@ static ssize_t mc13783_adc_read_gp(struct device *dev,
return sprintf(buf, "%u\n", val);
}
+static ssize_t mc13783_adc_read_uid(struct device *dev,
+ struct device_attribute *devattr, char *buf)
+{
+ unsigned int val;
+ struct platform_device *pdev = to_platform_device(dev);
+ kernel_ulong_t driver_data = platform_get_device_id(pdev)->driver_data;
+ int ret = mc13783_adc_read(dev, devattr, &val);
+
+ if (ret)
+ return ret;
+
+ if (driver_data & MC13783_ADC_BPDIV2)
+ /* MC13892 have 1/2 divider, input range is [0, 4.800V] */
+ val = DIV_ROUND_CLOSEST(val * 4800, 1024);
+ else
+ /* MC13783 have 0.9 divider, input range is [0, 2.555V] */
+ val = DIV_ROUND_CLOSEST(val * 2555, 1024);
+
+ return sprintf(buf, "%u\n", val);
+}
+
+static ssize_t mc13783_adc_read_temp(struct device *dev,
+ struct device_attribute *devattr, char *buf)
+{
+ unsigned int val;
+ struct platform_device *pdev = to_platform_device(dev);
+ kernel_ulong_t driver_data = platform_get_device_id(pdev)->driver_data;
+ int ret = mc13783_adc_read(dev, devattr, &val);
+
+ if (ret)
+ return ret;
+
+ if (driver_data & MC13783_ADC_BPDIV2) {
+ /*
+ * MC13892:
+ * Die Temperature Read Out Code at 25C 680
+ * Temperature change per LSB +0.4244C
+ */
+ ret = DIV_ROUND_CLOSEST(-2635920 + val * 4244, 10);
+ } else {
+ /*
+ * MC13783:
+ * Die Temperature Read Out Code at 25C 282
+ * Temperature change per LSB -1.14C
+ */
+ ret = 346480 - 1140 * val;
+ }
+
+ return sprintf(buf, "%d\n", ret);
+}
+
static DEVICE_ATTR_RO(name);
static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, mc13783_adc_read_bp, NULL, 2);
static SENSOR_DEVICE_ATTR(in5_input, S_IRUGO, mc13783_adc_read_gp, NULL, 5);
@@ -124,6 +179,9 @@ static SENSOR_DEVICE_ATTR(in12_input, S_IRUGO, mc13783_adc_read_gp, NULL, 12);
static SENSOR_DEVICE_ATTR(in13_input, S_IRUGO, mc13783_adc_read_gp, NULL, 13);
static SENSOR_DEVICE_ATTR(in14_input, S_IRUGO, mc13783_adc_read_gp, NULL, 14);
static SENSOR_DEVICE_ATTR(in15_input, S_IRUGO, mc13783_adc_read_gp, NULL, 15);
+static SENSOR_DEVICE_ATTR(in16_input, S_IRUGO, mc13783_adc_read_uid, NULL, 16);
+static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO,
+ mc13783_adc_read_temp, NULL, 17);
static struct attribute *mc13783_attr_base[] = {
&dev_attr_name.attr,
@@ -131,6 +189,8 @@ static struct attribute *mc13783_attr_base[] = {
&sensor_dev_attr_in5_input.dev_attr.attr,
&sensor_dev_attr_in6_input.dev_attr.attr,
&sensor_dev_attr_in7_input.dev_attr.attr,
+ &sensor_dev_attr_in16_input.dev_attr.attr,
+ &sensor_dev_attr_temp1_input.dev_attr.attr,
NULL
};
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 54d4d78ca46a..6f344654ef22 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -180,7 +180,6 @@ EXPORT_SYMBOL_GPL(ide_dma_unmap_sg);
void ide_dma_off_quietly(ide_drive_t *drive)
{
drive->dev_flags &= ~IDE_DFLAG_USING_DMA;
- ide_toggle_bounce(drive, 0);
drive->hwif->dma_ops->dma_host_set(drive, 0);
}
@@ -211,7 +210,6 @@ EXPORT_SYMBOL(ide_dma_off);
void ide_dma_on(ide_drive_t *drive)
{
drive->dev_flags |= IDE_DFLAG_USING_DMA;
- ide_toggle_bounce(drive, 1);
drive->hwif->dma_ops->dma_host_set(drive, 1);
}
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index e1180fa46196..78cb79eddc8b 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -6,32 +6,6 @@
#include <linux/ide.h>
#include <linux/bitops.h>
-/**
- * ide_toggle_bounce - handle bounce buffering
- * @drive: drive to update
- * @on: on/off boolean
- *
- * Enable or disable bounce buffering for the device. Drives move
- * between PIO and DMA and that changes the rules we need.
- */
-
-void ide_toggle_bounce(ide_drive_t *drive, int on)
-{
- u64 addr = BLK_BOUNCE_HIGH; /* dma64_addr_t */
-
- if (!PCI_DMA_BUS_IS_PHYS) {
- addr = BLK_BOUNCE_ANY;
- } else if (on && drive->media == ide_disk) {
- struct device *dev = drive->hwif->dev;
-
- if (dev && dev->dma_mask)
- addr = *dev->dma_mask;
- }
-
- if (drive->queue)
- blk_queue_bounce_limit(drive->queue, addr);
-}
-
u64 ide_get_lba_addr(struct ide_cmd *cmd, int lba48)
{
struct ide_taskfile *tf = &cmd->tf;
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 2019e66eada7..56d7bc228cb3 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -796,8 +796,7 @@ static int ide_init_queue(ide_drive_t *drive)
* This will be fixed once we teach pci_map_sg() about our boundary
* requirements, hopefully soon. *FIXME*
*/
- if (!PCI_DMA_BUS_IS_PHYS)
- max_sg_entries >>= 1;
+ max_sg_entries >>= 1;
#endif /* CONFIG_PCI */
blk_queue_max_segments(q, max_sg_entries);
@@ -805,9 +804,6 @@ static int ide_init_queue(ide_drive_t *drive)
/* assign drive queue */
drive->queue = q;
- /* needs drive->queue to be set */
- ide_toggle_bounce(drive, 1);
-
return 0;
}
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index df171cb85822..5b714a062fa7 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -146,6 +146,7 @@ config INTEL_IOMMU
select DMA_DIRECT_OPS
select IOMMU_API
select IOMMU_IOVA
+ select NEED_DMA_MAP_STATE
select DMAR_TABLE
help
DMA remapping (DMAR) devices support enables independent address
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 5ed465ab1c76..15f268f646bf 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -27,7 +27,7 @@ obj-$(CONFIG_ARM_GIC) += irq-gic.o irq-gic-common.o
obj-$(CONFIG_ARM_GIC_PM) += irq-gic-pm.o
obj-$(CONFIG_ARCH_REALVIEW) += irq-gic-realview.o
obj-$(CONFIG_ARM_GIC_V2M) += irq-gic-v2m.o
-obj-$(CONFIG_ARM_GIC_V3) += irq-gic-v3.o irq-gic-common.o
+obj-$(CONFIG_ARM_GIC_V3) += irq-gic-v3.o irq-gic-v3-mbi.o irq-gic-common.o
obj-$(CONFIG_ARM_GIC_V3_ITS) += irq-gic-v3-its.o irq-gic-v3-its-platform-msi.o irq-gic-v4.o
obj-$(CONFIG_ARM_GIC_V3_ITS_PCI) += irq-gic-v3-its-pci-msi.o
obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC) += irq-gic-v3-its-fsl-mc-msi.o
diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c
new file mode 100644
index 000000000000..ad70e7c416e3
--- /dev/null
+++ b/drivers/irqchip/irq-gic-v3-mbi.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 ARM Limited, All Rights Reserved.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#define pr_fmt(fmt) "GICv3: " fmt
+
+#include <linux/dma-iommu.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/kernel.h>
+#include <linux/msi.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+struct mbi_range {
+ u32 spi_start;
+ u32 nr_spis;
+ unsigned long *bm;
+};
+
+static struct mutex mbi_lock;
+static phys_addr_t mbi_phys_base;
+static struct mbi_range *mbi_ranges;
+static unsigned int mbi_range_nr;
+
+static struct irq_chip mbi_irq_chip = {
+ .name = "MBI",
+ .irq_mask = irq_chip_mask_parent,
+ .irq_unmask = irq_chip_unmask_parent,
+ .irq_eoi = irq_chip_eoi_parent,
+ .irq_set_type = irq_chip_set_type_parent,
+ .irq_set_affinity = irq_chip_set_affinity_parent,
+};
+
+static int mbi_irq_gic_domain_alloc(struct irq_domain *domain,
+ unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ struct irq_fwspec fwspec;
+ struct irq_data *d;
+ int err;
+
+ /*
+ * Using ACPI? There is no MBI support in the spec, you
+ * shouldn't even be here.
+ */
+ if (!is_of_node(domain->parent->fwnode))
+ return -EINVAL;
+
+ /*
+ * Let's default to edge. This is consistent with traditional
+ * MSIs, and systems requiring level signaling will just
+ * enforce the trigger on their own.
+ */
+ fwspec.fwnode = domain->parent->fwnode;
+ fwspec.param_count = 3;
+ fwspec.param[0] = 0;
+ fwspec.param[1] = hwirq - 32;
+ fwspec.param[2] = IRQ_TYPE_EDGE_RISING;
+
+ err = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
+ if (err)
+ return err;
+
+ d = irq_domain_get_irq_data(domain->parent, virq);
+ return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
+}
+
+static void mbi_free_msi(struct mbi_range *mbi, unsigned int hwirq,
+ int nr_irqs)
+{
+ mutex_lock(&mbi_lock);
+ bitmap_release_region(mbi->bm, hwirq - mbi->spi_start,
+ get_count_order(nr_irqs));
+ mutex_unlock(&mbi_lock);
+}
+
+static int mbi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *args)
+{
+ struct mbi_range *mbi = NULL;
+ int hwirq, offset, i, err = 0;
+
+ mutex_lock(&mbi_lock);
+ for (i = 0; i < mbi_range_nr; i++) {
+ offset = bitmap_find_free_region(mbi_ranges[i].bm,
+ mbi_ranges[i].nr_spis,
+ get_count_order(nr_irqs));
+ if (offset >= 0) {
+ mbi = &mbi_ranges[i];
+ break;
+ }
+ }
+ mutex_unlock(&mbi_lock);
+
+ if (!mbi)
+ return -ENOSPC;
+
+ hwirq = mbi->spi_start + offset;
+
+ for (i = 0; i < nr_irqs; i++) {
+ err = mbi_irq_gic_domain_alloc(domain, virq + i, hwirq + i);
+ if (err)
+ goto fail;
+
+ irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+ &mbi_irq_chip, mbi);
+ }
+
+ return 0;
+
+fail:
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+ mbi_free_msi(mbi, hwirq, nr_irqs);
+ return err;
+}
+
+static void mbi_irq_domain_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+ struct mbi_range *mbi = irq_data_get_irq_chip_data(d);
+
+ mbi_free_msi(mbi, d->hwirq, nr_irqs);
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops mbi_domain_ops = {
+ .alloc = mbi_irq_domain_alloc,
+ .free = mbi_irq_domain_free,
+};
+
+static void mbi_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ msg[0].address_hi = upper_32_bits(mbi_phys_base + GICD_SETSPI_NSR);
+ msg[0].address_lo = lower_32_bits(mbi_phys_base + GICD_SETSPI_NSR);
+ msg[0].data = data->parent_data->hwirq;
+
+ iommu_dma_map_msi_msg(data->irq, msg);
+}
+
+#ifdef CONFIG_PCI_MSI
+/* PCI-specific irqchip */
+static void mbi_mask_msi_irq(struct irq_data *d)
+{
+ pci_msi_mask_irq(d);
+ irq_chip_mask_parent(d);
+}
+
+static void mbi_unmask_msi_irq(struct irq_data *d)
+{
+ pci_msi_unmask_irq(d);
+ irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip mbi_msi_irq_chip = {
+ .name = "MSI",
+ .irq_mask = mbi_mask_msi_irq,
+ .irq_unmask = mbi_unmask_msi_irq,
+ .irq_eoi = irq_chip_eoi_parent,
+ .irq_compose_msi_msg = mbi_compose_msi_msg,
+ .irq_write_msi_msg = pci_msi_domain_write_msg,
+};
+
+static struct msi_domain_info mbi_msi_domain_info = {
+ .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_PCI_MSIX | MSI_FLAG_MULTI_PCI_MSI),
+ .chip = &mbi_msi_irq_chip,
+};
+
+static int mbi_allocate_pci_domain(struct irq_domain *nexus_domain,
+ struct irq_domain **pci_domain)
+{
+ *pci_domain = pci_msi_create_irq_domain(nexus_domain->parent->fwnode,
+ &mbi_msi_domain_info,
+ nexus_domain);
+ if (!*pci_domain)
+ return -ENOMEM;
+
+ return 0;
+}
+#else
+static int mbi_allocate_pci_domain(struct irq_domain *nexus_domain,
+ struct irq_domain **pci_domain)
+{
+ *pci_domain = NULL;
+ return 0;
+}
+#endif
+
+static void mbi_compose_mbi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ mbi_compose_msi_msg(data, msg);
+
+ msg[1].address_hi = upper_32_bits(mbi_phys_base + GICD_CLRSPI_NSR);
+ msg[1].address_lo = lower_32_bits(mbi_phys_base + GICD_CLRSPI_NSR);
+ msg[1].data = data->parent_data->hwirq;
+
+ iommu_dma_map_msi_msg(data->irq, &msg[1]);
+}
+
+/* Platform-MSI specific irqchip */
+static struct irq_chip mbi_pmsi_irq_chip = {
+ .name = "pMSI",
+ .irq_set_type = irq_chip_set_type_parent,
+ .irq_compose_msi_msg = mbi_compose_mbi_msg,
+ .flags = IRQCHIP_SUPPORTS_LEVEL_MSI,
+};
+
+static struct msi_domain_ops mbi_pmsi_ops = {
+};
+
+static struct msi_domain_info mbi_pmsi_domain_info = {
+ .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_LEVEL_CAPABLE),
+ .ops = &mbi_pmsi_ops,
+ .chip = &mbi_pmsi_irq_chip,
+};
+
+static int mbi_allocate_domains(struct irq_domain *parent)
+{
+ struct irq_domain *nexus_domain, *pci_domain, *plat_domain;
+ int err;
+
+ nexus_domain = irq_domain_create_tree(parent->fwnode,
+ &mbi_domain_ops, NULL);
+ if (!nexus_domain)
+ return -ENOMEM;
+
+ irq_domain_update_bus_token(nexus_domain, DOMAIN_BUS_NEXUS);
+ nexus_domain->parent = parent;
+
+ err = mbi_allocate_pci_domain(nexus_domain, &pci_domain);
+
+ plat_domain = platform_msi_create_irq_domain(parent->fwnode,
+ &mbi_pmsi_domain_info,
+ nexus_domain);
+
+ if (err || !plat_domain) {
+ if (plat_domain)
+ irq_domain_remove(plat_domain);
+ if (pci_domain)
+ irq_domain_remove(pci_domain);
+ irq_domain_remove(nexus_domain);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int __init mbi_init(struct fwnode_handle *fwnode, struct irq_domain *parent)
+{
+ struct device_node *np;
+ const __be32 *reg;
+ int ret, n;
+
+ np = to_of_node(fwnode);
+
+ if (!of_property_read_bool(np, "msi-controller"))
+ return 0;
+
+ n = of_property_count_elems_of_size(np, "mbi-ranges", sizeof(u32));
+ if (n <= 0 || n % 2)
+ return -EINVAL;
+
+ mbi_range_nr = n / 2;
+ mbi_ranges = kcalloc(mbi_range_nr, sizeof(*mbi_ranges), GFP_KERNEL);
+ if (!mbi_ranges)
+ return -ENOMEM;
+
+ for (n = 0; n < mbi_range_nr; n++) {
+ ret = of_property_read_u32_index(np, "mbi-ranges", n * 2,
+ &mbi_ranges[n].spi_start);
+ if (ret)
+ goto err_free_mbi;
+ ret = of_property_read_u32_index(np, "mbi-ranges", n * 2 + 1,
+ &mbi_ranges[n].nr_spis);
+ if (ret)
+ goto err_free_mbi;
+
+ mbi_ranges[n].bm = kcalloc(BITS_TO_LONGS(mbi_ranges[n].nr_spis),
+ sizeof(long), GFP_KERNEL);
+ if (!mbi_ranges[n].bm) {
+ ret = -ENOMEM;
+ goto err_free_mbi;
+ }
+ pr_info("MBI range [%d:%d]\n", mbi_ranges[n].spi_start,
+ mbi_ranges[n].spi_start + mbi_ranges[n].nr_spis - 1);
+ }
+
+ reg = of_get_property(np, "mbi-alias", NULL);
+ if (reg) {
+ mbi_phys_base = of_translate_address(np, reg);
+ if (mbi_phys_base == OF_BAD_ADDR) {
+ ret = -ENXIO;
+ goto err_free_mbi;
+ }
+ } else {
+ struct resource res;
+
+ if (of_address_to_resource(np, 0, &res)) {
+ ret = -ENXIO;
+ goto err_free_mbi;
+ }
+
+ mbi_phys_base = res.start;
+ }
+
+ pr_info("Using MBI frame %pa\n", &mbi_phys_base);
+
+ ret = mbi_allocate_domains(parent);
+ if (ret)
+ goto err_free_mbi;
+
+ return 0;
+
+err_free_mbi:
+ if (mbi_ranges) {
+ for (n = 0; n < mbi_range_nr; n++)
+ kfree(mbi_ranges[n].bm);
+ kfree(mbi_ranges);
+ }
+
+ return ret;
+}
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index e5d101418390..5a67ec084588 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1099,6 +1099,7 @@ static int __init gic_init_bases(void __iomem *dist_base,
gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops,
&gic_data);
+ irq_domain_update_bus_token(gic_data.domain, DOMAIN_BUS_WIRED);
gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist));
gic_data.rdists.has_vlpis = true;
gic_data.rdists.has_direct_lpi = true;
@@ -1112,6 +1113,12 @@ static int __init gic_init_bases(void __iomem *dist_base,
pr_info("Distributor has %sRange Selector support\n",
gic_data.has_rss ? "" : "no ");
+ if (typer & GICD_TYPER_MBIS) {
+ err = mbi_init(handle, gic_data.domain);
+ if (err)
+ pr_err("Failed to initialize MBIs\n");
+ }
+
set_handle_irq(gic_handle_irq);
gic_update_vlpi_properties();
diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c
index a59bdbc0b9bb..7b531fd075b8 100644
--- a/drivers/irqchip/irq-meson-gpio.c
+++ b/drivers/irqchip/irq-meson-gpio.c
@@ -63,11 +63,16 @@ static const struct meson_gpio_irq_params gxl_params = {
.nr_hwirq = 110,
};
+static const struct meson_gpio_irq_params axg_params = {
+ .nr_hwirq = 100,
+};
+
static const struct of_device_id meson_irq_gpio_matches[] = {
{ .compatible = "amlogic,meson8-gpio-intc", .data = &meson8_params },
{ .compatible = "amlogic,meson8b-gpio-intc", .data = &meson8b_params },
{ .compatible = "amlogic,meson-gxbb-gpio-intc", .data = &gxbb_params },
{ .compatible = "amlogic,meson-gxl-gpio-intc", .data = &gxl_params },
+ { .compatible = "amlogic,meson-axg-gpio-intc", .data = &axg_params },
{ }
};
diff --git a/drivers/irqchip/irq-mvebu-gicp.c b/drivers/irqchip/irq-mvebu-gicp.c
index 17a4a7b6cdbb..4e17f7081efc 100644
--- a/drivers/irqchip/irq-mvebu-gicp.c
+++ b/drivers/irqchip/irq-mvebu-gicp.c
@@ -19,8 +19,6 @@
#include <dt-bindings/interrupt-controller/arm-gic.h>
-#include "irq-mvebu-gicp.h"
-
#define GICP_SETSPI_NSR_OFFSET 0x0
#define GICP_CLRSPI_NSR_OFFSET 0x8
@@ -55,34 +53,18 @@ static int gicp_idx_to_spi(struct mvebu_gicp *gicp, int idx)
return -EINVAL;
}
-int mvebu_gicp_get_doorbells(struct device_node *dn, phys_addr_t *setspi,
- phys_addr_t *clrspi)
-{
- struct platform_device *pdev;
- struct mvebu_gicp *gicp;
-
- pdev = of_find_device_by_node(dn);
- if (!pdev)
- return -ENODEV;
-
- gicp = platform_get_drvdata(pdev);
- if (!gicp)
- return -ENODEV;
-
- *setspi = gicp->res->start + GICP_SETSPI_NSR_OFFSET;
- *clrspi = gicp->res->start + GICP_CLRSPI_NSR_OFFSET;
-
- return 0;
-}
-
static void gicp_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
struct mvebu_gicp *gicp = data->chip_data;
phys_addr_t setspi = gicp->res->start + GICP_SETSPI_NSR_OFFSET;
-
- msg->data = data->hwirq;
- msg->address_lo = lower_32_bits(setspi);
- msg->address_hi = upper_32_bits(setspi);
+ phys_addr_t clrspi = gicp->res->start + GICP_CLRSPI_NSR_OFFSET;
+
+ msg[0].data = data->hwirq;
+ msg[0].address_lo = lower_32_bits(setspi);
+ msg[0].address_hi = upper_32_bits(setspi);
+ msg[1].data = data->hwirq;
+ msg[1].address_lo = lower_32_bits(clrspi);
+ msg[1].address_hi = upper_32_bits(clrspi);
}
static struct irq_chip gicp_irq_chip = {
@@ -170,13 +152,15 @@ static const struct irq_domain_ops gicp_domain_ops = {
static struct irq_chip gicp_msi_irq_chip = {
.name = "GICP",
.irq_set_type = irq_chip_set_type_parent,
+ .flags = IRQCHIP_SUPPORTS_LEVEL_MSI,
};
static struct msi_domain_ops gicp_msi_ops = {
};
static struct msi_domain_info gicp_msi_domain_info = {
- .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS),
+ .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_LEVEL_CAPABLE),
.ops = &gicp_msi_ops,
.chip = &gicp_msi_irq_chip,
};
diff --git a/drivers/irqchip/irq-mvebu-gicp.h b/drivers/irqchip/irq-mvebu-gicp.h
deleted file mode 100644
index eaa12fb72102..000000000000
--- a/drivers/irqchip/irq-mvebu-gicp.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __MVEBU_GICP_H__
-#define __MVEBU_GICP_H__
-
-#include <linux/types.h>
-
-struct device_node;
-
-int mvebu_gicp_get_doorbells(struct device_node *dn, phys_addr_t *setspi,
- phys_addr_t *clrspi);
-
-#endif /* __MVEBU_GICP_H__ */
diff --git a/drivers/irqchip/irq-mvebu-icu.c b/drivers/irqchip/irq-mvebu-icu.c
index e18c48d3a92e..13063339b416 100644
--- a/drivers/irqchip/irq-mvebu-icu.c
+++ b/drivers/irqchip/irq-mvebu-icu.c
@@ -21,8 +21,6 @@
#include <dt-bindings/interrupt-controller/mvebu-icu.h>
-#include "irq-mvebu-gicp.h"
-
/* ICU registers */
#define ICU_SETSPI_NSR_AL 0x10
#define ICU_SETSPI_NSR_AH 0x14
@@ -43,6 +41,7 @@ struct mvebu_icu {
void __iomem *base;
struct irq_domain *domain;
struct device *dev;
+ atomic_t initialized;
};
struct mvebu_icu_irq_data {
@@ -51,6 +50,18 @@ struct mvebu_icu_irq_data {
unsigned int type;
};
+static void mvebu_icu_init(struct mvebu_icu *icu, struct msi_msg *msg)
+{
+ if (atomic_cmpxchg(&icu->initialized, false, true))
+ return;
+
+ /* Set Clear/Set ICU SPI message address in AP */
+ writel_relaxed(msg[0].address_hi, icu->base + ICU_SETSPI_NSR_AH);
+ writel_relaxed(msg[0].address_lo, icu->base + ICU_SETSPI_NSR_AL);
+ writel_relaxed(msg[1].address_hi, icu->base + ICU_CLRSPI_NSR_AH);
+ writel_relaxed(msg[1].address_lo, icu->base + ICU_CLRSPI_NSR_AL);
+}
+
static void mvebu_icu_write_msg(struct msi_desc *desc, struct msi_msg *msg)
{
struct irq_data *d = irq_get_irq_data(desc->irq);
@@ -59,6 +70,8 @@ static void mvebu_icu_write_msg(struct msi_desc *desc, struct msi_msg *msg)
unsigned int icu_int;
if (msg->address_lo || msg->address_hi) {
+ /* One off initialization */
+ mvebu_icu_init(icu, msg);
/* Configure the ICU with irq number & type */
icu_int = msg->data | ICU_INT_ENABLE;
if (icu_irqd->type & IRQ_TYPE_EDGE_RISING)
@@ -197,9 +210,7 @@ static int mvebu_icu_probe(struct platform_device *pdev)
struct device_node *node = pdev->dev.of_node;
struct device_node *gicp_dn;
struct resource *res;
- phys_addr_t setspi, clrspi;
- u32 i, icu_int;
- int ret;
+ int i;
icu = devm_kzalloc(&pdev->dev, sizeof(struct mvebu_icu),
GFP_KERNEL);
@@ -242,22 +253,12 @@ static int mvebu_icu_probe(struct platform_device *pdev)
if (!gicp_dn)
return -ENODEV;
- ret = mvebu_gicp_get_doorbells(gicp_dn, &setspi, &clrspi);
- if (ret)
- return ret;
-
- /* Set Clear/Set ICU SPI message address in AP */
- writel_relaxed(upper_32_bits(setspi), icu->base + ICU_SETSPI_NSR_AH);
- writel_relaxed(lower_32_bits(setspi), icu->base + ICU_SETSPI_NSR_AL);
- writel_relaxed(upper_32_bits(clrspi), icu->base + ICU_CLRSPI_NSR_AH);
- writel_relaxed(lower_32_bits(clrspi), icu->base + ICU_CLRSPI_NSR_AL);
-
/*
* Clean all ICU interrupts with type SPI_NSR, required to
* avoid unpredictable SPI assignments done by firmware.
*/
for (i = 0 ; i < ICU_MAX_IRQS ; i++) {
- icu_int = readl(icu->base + ICU_INT_CFG(i));
+ u32 icu_int = readl_relaxed(icu->base + ICU_INT_CFG(i));
if ((icu_int >> ICU_GROUP_SHIFT) == ICU_GRP_NSR)
writel_relaxed(0x0, icu->base + ICU_INT_CFG(i));
}
diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c
index 36f0fbe36c35..5089c1e2838d 100644
--- a/drivers/irqchip/irq-stm32-exti.c
+++ b/drivers/irqchip/irq-stm32-exti.c
@@ -14,6 +14,9 @@
#include <linux/irqdomain.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
+#include <linux/syscore_ops.h>
+
+#include <dt-bindings/interrupt-controller/arm-gic.h>
#define IRQS_PER_BANK 32
@@ -23,29 +26,69 @@ struct stm32_exti_bank {
u32 rtsr_ofst;
u32 ftsr_ofst;
u32 swier_ofst;
- u32 pr_ofst;
+ u32 rpr_ofst;
+ u32 fpr_ofst;
+};
+
+#define UNDEF_REG ~0
+
+struct stm32_desc_irq {
+ u32 exti;
+ u32 irq_parent;
+};
+
+struct stm32_exti_drv_data {
+ const struct stm32_exti_bank **exti_banks;
+ const struct stm32_desc_irq *desc_irqs;
+ u32 bank_nr;
+ u32 irq_nr;
+};
+
+struct stm32_exti_chip_data {
+ struct stm32_exti_host_data *host_data;
+ const struct stm32_exti_bank *reg_bank;
+ struct raw_spinlock rlock;
+ u32 wake_active;
+ u32 mask_cache;
+ u32 rtsr_cache;
+ u32 ftsr_cache;
};
+struct stm32_exti_host_data {
+ void __iomem *base;
+ struct stm32_exti_chip_data *chips_data;
+ const struct stm32_exti_drv_data *drv_data;
+};
+
+static struct stm32_exti_host_data *stm32_host_data;
+
static const struct stm32_exti_bank stm32f4xx_exti_b1 = {
.imr_ofst = 0x00,
.emr_ofst = 0x04,
.rtsr_ofst = 0x08,
.ftsr_ofst = 0x0C,
.swier_ofst = 0x10,
- .pr_ofst = 0x14,
+ .rpr_ofst = 0x14,
+ .fpr_ofst = UNDEF_REG,
};
static const struct stm32_exti_bank *stm32f4xx_exti_banks[] = {
&stm32f4xx_exti_b1,
};
+static const struct stm32_exti_drv_data stm32f4xx_drv_data = {
+ .exti_banks = stm32f4xx_exti_banks,
+ .bank_nr = ARRAY_SIZE(stm32f4xx_exti_banks),
+};
+
static const struct stm32_exti_bank stm32h7xx_exti_b1 = {
.imr_ofst = 0x80,
.emr_ofst = 0x84,
.rtsr_ofst = 0x00,
.ftsr_ofst = 0x04,
.swier_ofst = 0x08,
- .pr_ofst = 0x88,
+ .rpr_ofst = 0x88,
+ .fpr_ofst = UNDEF_REG,
};
static const struct stm32_exti_bank stm32h7xx_exti_b2 = {
@@ -54,7 +97,8 @@ static const struct stm32_exti_bank stm32h7xx_exti_b2 = {
.rtsr_ofst = 0x20,
.ftsr_ofst = 0x24,
.swier_ofst = 0x28,
- .pr_ofst = 0x98,
+ .rpr_ofst = 0x98,
+ .fpr_ofst = UNDEF_REG,
};
static const struct stm32_exti_bank stm32h7xx_exti_b3 = {
@@ -63,7 +107,8 @@ static const struct stm32_exti_bank stm32h7xx_exti_b3 = {
.rtsr_ofst = 0x40,
.ftsr_ofst = 0x44,
.swier_ofst = 0x48,
- .pr_ofst = 0xA8,
+ .rpr_ofst = 0xA8,
+ .fpr_ofst = UNDEF_REG,
};
static const struct stm32_exti_bank *stm32h7xx_exti_banks[] = {
@@ -72,18 +117,105 @@ static const struct stm32_exti_bank *stm32h7xx_exti_banks[] = {
&stm32h7xx_exti_b3,
};
-static unsigned long stm32_exti_pending(struct irq_chip_generic *gc)
+static const struct stm32_exti_drv_data stm32h7xx_drv_data = {
+ .exti_banks = stm32h7xx_exti_banks,
+ .bank_nr = ARRAY_SIZE(stm32h7xx_exti_banks),
+};
+
+static const struct stm32_exti_bank stm32mp1_exti_b1 = {
+ .imr_ofst = 0x80,
+ .emr_ofst = 0x84,
+ .rtsr_ofst = 0x00,
+ .ftsr_ofst = 0x04,
+ .swier_ofst = 0x08,
+ .rpr_ofst = 0x0C,
+ .fpr_ofst = 0x10,
+};
+
+static const struct stm32_exti_bank stm32mp1_exti_b2 = {
+ .imr_ofst = 0x90,
+ .emr_ofst = 0x94,
+ .rtsr_ofst = 0x20,
+ .ftsr_ofst = 0x24,
+ .swier_ofst = 0x28,
+ .rpr_ofst = 0x2C,
+ .fpr_ofst = 0x30,
+};
+
+static const struct stm32_exti_bank stm32mp1_exti_b3 = {
+ .imr_ofst = 0xA0,
+ .emr_ofst = 0xA4,
+ .rtsr_ofst = 0x40,
+ .ftsr_ofst = 0x44,
+ .swier_ofst = 0x48,
+ .rpr_ofst = 0x4C,
+ .fpr_ofst = 0x50,
+};
+
+static const struct stm32_exti_bank *stm32mp1_exti_banks[] = {
+ &stm32mp1_exti_b1,
+ &stm32mp1_exti_b2,
+ &stm32mp1_exti_b3,
+};
+
+static const struct stm32_desc_irq stm32mp1_desc_irq[] = {
+ { .exti = 1, .irq_parent = 7 },
+ { .exti = 2, .irq_parent = 8 },
+ { .exti = 3, .irq_parent = 9 },
+ { .exti = 4, .irq_parent = 10 },
+ { .exti = 5, .irq_parent = 23 },
+ { .exti = 6, .irq_parent = 64 },
+ { .exti = 7, .irq_parent = 65 },
+ { .exti = 8, .irq_parent = 66 },
+ { .exti = 9, .irq_parent = 67 },
+ { .exti = 10, .irq_parent = 40 },
+ { .exti = 11, .irq_parent = 42 },
+ { .exti = 12, .irq_parent = 76 },
+ { .exti = 13, .irq_parent = 77 },
+ { .exti = 14, .irq_parent = 121 },
+ { .exti = 15, .irq_parent = 127 },
+ { .exti = 16, .irq_parent = 1 },
+ { .exti = 65, .irq_parent = 144 },
+ { .exti = 68, .irq_parent = 143 },
+ { .exti = 73, .irq_parent = 129 },
+};
+
+static const struct stm32_exti_drv_data stm32mp1_drv_data = {
+ .exti_banks = stm32mp1_exti_banks,
+ .bank_nr = ARRAY_SIZE(stm32mp1_exti_banks),
+ .desc_irqs = stm32mp1_desc_irq,
+ .irq_nr = ARRAY_SIZE(stm32mp1_desc_irq),
+};
+
+static int stm32_exti_to_irq(const struct stm32_exti_drv_data *drv_data,
+ irq_hw_number_t hwirq)
{
- const struct stm32_exti_bank *stm32_bank = gc->private;
+ const struct stm32_desc_irq *desc_irq;
+ int i;
- return irq_reg_readl(gc, stm32_bank->pr_ofst);
+ if (!drv_data->desc_irqs)
+ return -EINVAL;
+
+ for (i = 0; i < drv_data->irq_nr; i++) {
+ desc_irq = &drv_data->desc_irqs[i];
+ if (desc_irq->exti == hwirq)
+ return desc_irq->irq_parent;
+ }
+
+ return -EINVAL;
}
-static void stm32_exti_irq_ack(struct irq_chip_generic *gc, u32 mask)
+static unsigned long stm32_exti_pending(struct irq_chip_generic *gc)
{
- const struct stm32_exti_bank *stm32_bank = gc->private;
+ struct stm32_exti_chip_data *chip_data = gc->private;
+ const struct stm32_exti_bank *stm32_bank = chip_data->reg_bank;
+ unsigned long pending;
+
+ pending = irq_reg_readl(gc, stm32_bank->rpr_ofst);
+ if (stm32_bank->fpr_ofst != UNDEF_REG)
+ pending |= irq_reg_readl(gc, stm32_bank->fpr_ofst);
- irq_reg_writel(gc, mask, stm32_bank->pr_ofst);
+ return pending;
}
static void stm32_irq_handler(struct irq_desc *desc)
@@ -92,7 +224,6 @@ static void stm32_irq_handler(struct irq_desc *desc)
struct irq_chip *chip = irq_desc_get_chip(desc);
unsigned int virq, nbanks = domain->gc->num_chips;
struct irq_chip_generic *gc;
- const struct stm32_exti_bank *stm32_bank;
unsigned long pending;
int n, i, irq_base = 0;
@@ -100,13 +231,11 @@ static void stm32_irq_handler(struct irq_desc *desc)
for (i = 0; i < nbanks; i++, irq_base += IRQS_PER_BANK) {
gc = irq_get_domain_generic_chip(domain, irq_base);
- stm32_bank = gc->private;
while ((pending = stm32_exti_pending(gc))) {
for_each_set_bit(n, &pending, IRQS_PER_BANK) {
virq = irq_find_mapping(domain, irq_base + n);
generic_handle_irq(virq);
- stm32_exti_irq_ack(gc, BIT(n));
}
}
}
@@ -114,36 +243,50 @@ static void stm32_irq_handler(struct irq_desc *desc)
chained_irq_exit(chip, desc);
}
-static int stm32_irq_set_type(struct irq_data *data, unsigned int type)
+static int stm32_exti_set_type(struct irq_data *d,
+ unsigned int type, u32 *rtsr, u32 *ftsr)
{
- struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
- const struct stm32_exti_bank *stm32_bank = gc->private;
- int pin = data->hwirq % IRQS_PER_BANK;
- u32 rtsr, ftsr;
-
- irq_gc_lock(gc);
-
- rtsr = irq_reg_readl(gc, stm32_bank->rtsr_ofst);
- ftsr = irq_reg_readl(gc, stm32_bank->ftsr_ofst);
+ u32 mask = BIT(d->hwirq % IRQS_PER_BANK);
switch (type) {
case IRQ_TYPE_EDGE_RISING:
- rtsr |= BIT(pin);
- ftsr &= ~BIT(pin);
+ *rtsr |= mask;
+ *ftsr &= ~mask;
break;
case IRQ_TYPE_EDGE_FALLING:
- rtsr &= ~BIT(pin);
- ftsr |= BIT(pin);
+ *rtsr &= ~mask;
+ *ftsr |= mask;
break;
case IRQ_TYPE_EDGE_BOTH:
- rtsr |= BIT(pin);
- ftsr |= BIT(pin);
+ *rtsr |= mask;
+ *ftsr |= mask;
break;
default:
- irq_gc_unlock(gc);
return -EINVAL;
}
+ return 0;
+}
+
+static int stm32_irq_set_type(struct irq_data *d, unsigned int type)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct stm32_exti_chip_data *chip_data = gc->private;
+ const struct stm32_exti_bank *stm32_bank = chip_data->reg_bank;
+ u32 rtsr, ftsr;
+ int err;
+
+ irq_gc_lock(gc);
+
+ rtsr = irq_reg_readl(gc, stm32_bank->rtsr_ofst);
+ ftsr = irq_reg_readl(gc, stm32_bank->ftsr_ofst);
+
+ err = stm32_exti_set_type(d, type, &rtsr, &ftsr);
+ if (err) {
+ irq_gc_unlock(gc);
+ return err;
+ }
+
irq_reg_writel(gc, rtsr, stm32_bank->rtsr_ofst);
irq_reg_writel(gc, ftsr, stm32_bank->ftsr_ofst);
@@ -152,40 +295,59 @@ static int stm32_irq_set_type(struct irq_data *data, unsigned int type)
return 0;
}
-static int stm32_irq_set_wake(struct irq_data *data, unsigned int on)
+static void stm32_chip_suspend(struct stm32_exti_chip_data *chip_data,
+ u32 wake_active)
{
- struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
- const struct stm32_exti_bank *stm32_bank = gc->private;
- int pin = data->hwirq % IRQS_PER_BANK;
- u32 imr;
+ const struct stm32_exti_bank *stm32_bank = chip_data->reg_bank;
+ void __iomem *base = chip_data->host_data->base;
- irq_gc_lock(gc);
+ /* save rtsr, ftsr registers */
+ chip_data->rtsr_cache = readl_relaxed(base + stm32_bank->rtsr_ofst);
+ chip_data->ftsr_cache = readl_relaxed(base + stm32_bank->ftsr_ofst);
- imr = irq_reg_readl(gc, stm32_bank->imr_ofst);
- if (on)
- imr |= BIT(pin);
- else
- imr &= ~BIT(pin);
- irq_reg_writel(gc, imr, stm32_bank->imr_ofst);
+ writel_relaxed(wake_active, base + stm32_bank->imr_ofst);
+}
+
+static void stm32_chip_resume(struct stm32_exti_chip_data *chip_data,
+ u32 mask_cache)
+{
+ const struct stm32_exti_bank *stm32_bank = chip_data->reg_bank;
+ void __iomem *base = chip_data->host_data->base;
+
+ /* restore rtsr, ftsr, registers */
+ writel_relaxed(chip_data->rtsr_cache, base + stm32_bank->rtsr_ofst);
+ writel_relaxed(chip_data->ftsr_cache, base + stm32_bank->ftsr_ofst);
+ writel_relaxed(mask_cache, base + stm32_bank->imr_ofst);
+}
+
+static void stm32_irq_suspend(struct irq_chip_generic *gc)
+{
+ struct stm32_exti_chip_data *chip_data = gc->private;
+
+ irq_gc_lock(gc);
+ stm32_chip_suspend(chip_data, gc->wake_active);
irq_gc_unlock(gc);
+}
- return 0;
+static void stm32_irq_resume(struct irq_chip_generic *gc)
+{
+ struct stm32_exti_chip_data *chip_data = gc->private;
+
+ irq_gc_lock(gc);
+ stm32_chip_resume(chip_data, gc->mask_cache);
+ irq_gc_unlock(gc);
}
static int stm32_exti_alloc(struct irq_domain *d, unsigned int virq,
unsigned int nr_irqs, void *data)
{
- struct irq_chip_generic *gc;
struct irq_fwspec *fwspec = data;
irq_hw_number_t hwirq;
hwirq = fwspec->param[0];
- gc = irq_get_domain_generic_chip(d, hwirq);
irq_map_generic_chip(d, virq, hwirq);
- irq_domain_set_info(d, virq, hwirq, &gc->chip_types->chip, gc,
- handle_simple_irq, NULL, NULL);
return 0;
}
@@ -198,30 +360,318 @@ static void stm32_exti_free(struct irq_domain *d, unsigned int virq,
irq_domain_reset_irq_data(data);
}
-struct irq_domain_ops irq_exti_domain_ops = {
+static const struct irq_domain_ops irq_exti_domain_ops = {
.map = irq_map_generic_chip,
- .xlate = irq_domain_xlate_onetwocell,
.alloc = stm32_exti_alloc,
.free = stm32_exti_free,
};
-static int
-__init stm32_exti_init(const struct stm32_exti_bank **stm32_exti_banks,
- int bank_nr, struct device_node *node)
+static void stm32_irq_ack(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct stm32_exti_chip_data *chip_data = gc->private;
+ const struct stm32_exti_bank *stm32_bank = chip_data->reg_bank;
+
+ irq_gc_lock(gc);
+
+ irq_reg_writel(gc, d->mask, stm32_bank->rpr_ofst);
+ if (stm32_bank->fpr_ofst != UNDEF_REG)
+ irq_reg_writel(gc, d->mask, stm32_bank->fpr_ofst);
+
+ irq_gc_unlock(gc);
+}
+
+static inline u32 stm32_exti_set_bit(struct irq_data *d, u32 reg)
{
+ struct stm32_exti_chip_data *chip_data = irq_data_get_irq_chip_data(d);
+ void __iomem *base = chip_data->host_data->base;
+ u32 val;
+
+ val = readl_relaxed(base + reg);
+ val |= BIT(d->hwirq % IRQS_PER_BANK);
+ writel_relaxed(val, base + reg);
+
+ return val;
+}
+
+static inline u32 stm32_exti_clr_bit(struct irq_data *d, u32 reg)
+{
+ struct stm32_exti_chip_data *chip_data = irq_data_get_irq_chip_data(d);
+ void __iomem *base = chip_data->host_data->base;
+ u32 val;
+
+ val = readl_relaxed(base + reg);
+ val &= ~BIT(d->hwirq % IRQS_PER_BANK);
+ writel_relaxed(val, base + reg);
+
+ return val;
+}
+
+static void stm32_exti_h_eoi(struct irq_data *d)
+{
+ struct stm32_exti_chip_data *chip_data = irq_data_get_irq_chip_data(d);
+ const struct stm32_exti_bank *stm32_bank = chip_data->reg_bank;
+
+ raw_spin_lock(&chip_data->rlock);
+
+ stm32_exti_set_bit(d, stm32_bank->rpr_ofst);
+ if (stm32_bank->fpr_ofst != UNDEF_REG)
+ stm32_exti_set_bit(d, stm32_bank->fpr_ofst);
+
+ raw_spin_unlock(&chip_data->rlock);
+
+ if (d->parent_data->chip)
+ irq_chip_eoi_parent(d);
+}
+
+static void stm32_exti_h_mask(struct irq_data *d)
+{
+ struct stm32_exti_chip_data *chip_data = irq_data_get_irq_chip_data(d);
+ const struct stm32_exti_bank *stm32_bank = chip_data->reg_bank;
+
+ raw_spin_lock(&chip_data->rlock);
+ chip_data->mask_cache = stm32_exti_clr_bit(d, stm32_bank->imr_ofst);
+ raw_spin_unlock(&chip_data->rlock);
+
+ if (d->parent_data->chip)
+ irq_chip_mask_parent(d);
+}
+
+static void stm32_exti_h_unmask(struct irq_data *d)
+{
+ struct stm32_exti_chip_data *chip_data = irq_data_get_irq_chip_data(d);
+ const struct stm32_exti_bank *stm32_bank = chip_data->reg_bank;
+
+ raw_spin_lock(&chip_data->rlock);
+ chip_data->mask_cache = stm32_exti_set_bit(d, stm32_bank->imr_ofst);
+ raw_spin_unlock(&chip_data->rlock);
+
+ if (d->parent_data->chip)
+ irq_chip_unmask_parent(d);
+}
+
+static int stm32_exti_h_set_type(struct irq_data *d, unsigned int type)
+{
+ struct stm32_exti_chip_data *chip_data = irq_data_get_irq_chip_data(d);
+ const struct stm32_exti_bank *stm32_bank = chip_data->reg_bank;
+ void __iomem *base = chip_data->host_data->base;
+ u32 rtsr, ftsr;
+ int err;
+
+ raw_spin_lock(&chip_data->rlock);
+ rtsr = readl_relaxed(base + stm32_bank->rtsr_ofst);
+ ftsr = readl_relaxed(base + stm32_bank->ftsr_ofst);
+
+ err = stm32_exti_set_type(d, type, &rtsr, &ftsr);
+ if (err) {
+ raw_spin_unlock(&chip_data->rlock);
+ return err;
+ }
+
+ writel_relaxed(rtsr, base + stm32_bank->rtsr_ofst);
+ writel_relaxed(ftsr, base + stm32_bank->ftsr_ofst);
+ raw_spin_unlock(&chip_data->rlock);
+
+ return 0;
+}
+
+static int stm32_exti_h_set_wake(struct irq_data *d, unsigned int on)
+{
+ struct stm32_exti_chip_data *chip_data = irq_data_get_irq_chip_data(d);
+ u32 mask = BIT(d->hwirq % IRQS_PER_BANK);
+
+ raw_spin_lock(&chip_data->rlock);
+
+ if (on)
+ chip_data->wake_active |= mask;
+ else
+ chip_data->wake_active &= ~mask;
+
+ raw_spin_unlock(&chip_data->rlock);
+
+ return 0;
+}
+
+static int stm32_exti_h_set_affinity(struct irq_data *d,
+ const struct cpumask *dest, bool force)
+{
+ if (d->parent_data->chip)
+ return irq_chip_set_affinity_parent(d, dest, force);
+
+ return -EINVAL;
+}
+
+#ifdef CONFIG_PM
+static int stm32_exti_h_suspend(void)
+{
+ struct stm32_exti_chip_data *chip_data;
+ int i;
+
+ for (i = 0; i < stm32_host_data->drv_data->bank_nr; i++) {
+ chip_data = &stm32_host_data->chips_data[i];
+ raw_spin_lock(&chip_data->rlock);
+ stm32_chip_suspend(chip_data, chip_data->wake_active);
+ raw_spin_unlock(&chip_data->rlock);
+ }
+
+ return 0;
+}
+
+static void stm32_exti_h_resume(void)
+{
+ struct stm32_exti_chip_data *chip_data;
+ int i;
+
+ for (i = 0; i < stm32_host_data->drv_data->bank_nr; i++) {
+ chip_data = &stm32_host_data->chips_data[i];
+ raw_spin_lock(&chip_data->rlock);
+ stm32_chip_resume(chip_data, chip_data->mask_cache);
+ raw_spin_unlock(&chip_data->rlock);
+ }
+}
+
+static struct syscore_ops stm32_exti_h_syscore_ops = {
+ .suspend = stm32_exti_h_suspend,
+ .resume = stm32_exti_h_resume,
+};
+
+static void stm32_exti_h_syscore_init(void)
+{
+ register_syscore_ops(&stm32_exti_h_syscore_ops);
+}
+#else
+static inline void stm32_exti_h_syscore_init(void) {}
+#endif
+
+static struct irq_chip stm32_exti_h_chip = {
+ .name = "stm32-exti-h",
+ .irq_eoi = stm32_exti_h_eoi,
+ .irq_mask = stm32_exti_h_mask,
+ .irq_unmask = stm32_exti_h_unmask,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_type = stm32_exti_h_set_type,
+ .irq_set_wake = stm32_exti_h_set_wake,
+ .flags = IRQCHIP_MASK_ON_SUSPEND,
+#ifdef CONFIG_SMP
+ .irq_set_affinity = stm32_exti_h_set_affinity,
+#endif
+};
+
+static int stm32_exti_h_domain_alloc(struct irq_domain *dm,
+ unsigned int virq,
+ unsigned int nr_irqs, void *data)
+{
+ struct stm32_exti_host_data *host_data = dm->host_data;
+ struct stm32_exti_chip_data *chip_data;
+ struct irq_fwspec *fwspec = data;
+ struct irq_fwspec p_fwspec;
+ irq_hw_number_t hwirq;
+ int p_irq, bank;
+
+ hwirq = fwspec->param[0];
+ bank = hwirq / IRQS_PER_BANK;
+ chip_data = &host_data->chips_data[bank];
+
+ irq_domain_set_hwirq_and_chip(dm, virq, hwirq,
+ &stm32_exti_h_chip, chip_data);
+
+ p_irq = stm32_exti_to_irq(host_data->drv_data, hwirq);
+ if (p_irq >= 0) {
+ p_fwspec.fwnode = dm->parent->fwnode;
+ p_fwspec.param_count = 3;
+ p_fwspec.param[0] = GIC_SPI;
+ p_fwspec.param[1] = p_irq;
+ p_fwspec.param[2] = IRQ_TYPE_LEVEL_HIGH;
+
+ return irq_domain_alloc_irqs_parent(dm, virq, 1, &p_fwspec);
+ }
+
+ return 0;
+}
+
+static struct
+stm32_exti_host_data *stm32_exti_host_init(const struct stm32_exti_drv_data *dd,
+ struct device_node *node)
+{
+ struct stm32_exti_host_data *host_data;
+
+ host_data = kzalloc(sizeof(*host_data), GFP_KERNEL);
+ if (!host_data)
+ return NULL;
+
+ host_data->drv_data = dd;
+ host_data->chips_data = kcalloc(dd->bank_nr,
+ sizeof(struct stm32_exti_chip_data),
+ GFP_KERNEL);
+ if (!host_data->chips_data)
+ return NULL;
+
+ host_data->base = of_iomap(node, 0);
+ if (!host_data->base) {
+ pr_err("%pOF: Unable to map registers\n", node);
+ return NULL;
+ }
+
+ stm32_host_data = host_data;
+
+ return host_data;
+}
+
+static struct
+stm32_exti_chip_data *stm32_exti_chip_init(struct stm32_exti_host_data *h_data,
+ u32 bank_idx,
+ struct device_node *node)
+{
+ const struct stm32_exti_bank *stm32_bank;
+ struct stm32_exti_chip_data *chip_data;
+ void __iomem *base = h_data->base;
+ u32 irqs_mask;
+
+ stm32_bank = h_data->drv_data->exti_banks[bank_idx];
+ chip_data = &h_data->chips_data[bank_idx];
+ chip_data->host_data = h_data;
+ chip_data->reg_bank = stm32_bank;
+
+ raw_spin_lock_init(&chip_data->rlock);
+
+ /* Determine number of irqs supported */
+ writel_relaxed(~0UL, base + stm32_bank->rtsr_ofst);
+ irqs_mask = readl_relaxed(base + stm32_bank->rtsr_ofst);
+
+ /*
+ * This IP has no reset, so after hot reboot we should
+ * clear registers to avoid residue
+ */
+ writel_relaxed(0, base + stm32_bank->imr_ofst);
+ writel_relaxed(0, base + stm32_bank->emr_ofst);
+ writel_relaxed(0, base + stm32_bank->rtsr_ofst);
+ writel_relaxed(0, base + stm32_bank->ftsr_ofst);
+ writel_relaxed(~0UL, base + stm32_bank->rpr_ofst);
+ if (stm32_bank->fpr_ofst != UNDEF_REG)
+ writel_relaxed(~0UL, base + stm32_bank->fpr_ofst);
+
+ pr_info("%s: bank%d, External IRQs available:%#x\n",
+ node->full_name, bank_idx, irqs_mask);
+
+ return chip_data;
+}
+
+static int __init stm32_exti_init(const struct stm32_exti_drv_data *drv_data,
+ struct device_node *node)
+{
+ struct stm32_exti_host_data *host_data;
unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN;
- int nr_irqs, nr_exti, ret, i;
+ int nr_irqs, ret, i;
struct irq_chip_generic *gc;
struct irq_domain *domain;
- void *base;
- base = of_iomap(node, 0);
- if (!base) {
- pr_err("%pOF: Unable to map registers\n", node);
- return -ENOMEM;
+ host_data = stm32_exti_host_init(drv_data, node);
+ if (!host_data) {
+ ret = -ENOMEM;
+ goto out_free_mem;
}
- domain = irq_domain_add_linear(node, bank_nr * IRQS_PER_BANK,
+ domain = irq_domain_add_linear(node, drv_data->bank_nr * IRQS_PER_BANK,
&irq_exti_domain_ops, NULL);
if (!domain) {
pr_err("%s: Could not register interrupt domain.\n",
@@ -234,44 +684,32 @@ __init stm32_exti_init(const struct stm32_exti_bank **stm32_exti_banks,
handle_edge_irq, clr, 0, 0);
if (ret) {
pr_err("%pOF: Could not allocate generic interrupt chip.\n",
- node);
+ node);
goto out_free_domain;
}
- for (i = 0; i < bank_nr; i++) {
- const struct stm32_exti_bank *stm32_bank = stm32_exti_banks[i];
- u32 irqs_mask;
+ for (i = 0; i < drv_data->bank_nr; i++) {
+ const struct stm32_exti_bank *stm32_bank;
+ struct stm32_exti_chip_data *chip_data;
+
+ stm32_bank = drv_data->exti_banks[i];
+ chip_data = stm32_exti_chip_init(host_data, i, node);
gc = irq_get_domain_generic_chip(domain, i * IRQS_PER_BANK);
- gc->reg_base = base;
+ gc->reg_base = host_data->base;
gc->chip_types->type = IRQ_TYPE_EDGE_BOTH;
- gc->chip_types->chip.irq_ack = irq_gc_ack_set_bit;
+ gc->chip_types->chip.irq_ack = stm32_irq_ack;
gc->chip_types->chip.irq_mask = irq_gc_mask_clr_bit;
gc->chip_types->chip.irq_unmask = irq_gc_mask_set_bit;
gc->chip_types->chip.irq_set_type = stm32_irq_set_type;
- gc->chip_types->chip.irq_set_wake = stm32_irq_set_wake;
- gc->chip_types->regs.ack = stm32_bank->pr_ofst;
+ gc->chip_types->chip.irq_set_wake = irq_gc_set_wake;
+ gc->suspend = stm32_irq_suspend;
+ gc->resume = stm32_irq_resume;
+ gc->wake_enabled = IRQ_MSK(IRQS_PER_BANK);
+
gc->chip_types->regs.mask = stm32_bank->imr_ofst;
- gc->private = (void *)stm32_bank;
-
- /* Determine number of irqs supported */
- writel_relaxed(~0UL, base + stm32_bank->rtsr_ofst);
- irqs_mask = readl_relaxed(base + stm32_bank->rtsr_ofst);
- nr_exti = fls(readl_relaxed(base + stm32_bank->rtsr_ofst));
-
- /*
- * This IP has no reset, so after hot reboot we should
- * clear registers to avoid residue
- */
- writel_relaxed(0, base + stm32_bank->imr_ofst);
- writel_relaxed(0, base + stm32_bank->emr_ofst);
- writel_relaxed(0, base + stm32_bank->rtsr_ofst);
- writel_relaxed(0, base + stm32_bank->ftsr_ofst);
- writel_relaxed(~0UL, base + stm32_bank->pr_ofst);
-
- pr_info("%s: bank%d, External IRQs available:%#x\n",
- node->full_name, i, irqs_mask);
+ gc->private = (void *)chip_data;
}
nr_irqs = of_irq_count(node);
@@ -287,15 +725,69 @@ __init stm32_exti_init(const struct stm32_exti_bank **stm32_exti_banks,
out_free_domain:
irq_domain_remove(domain);
out_unmap:
- iounmap(base);
+ iounmap(host_data->base);
+out_free_mem:
+ kfree(host_data->chips_data);
+ kfree(host_data);
+ return ret;
+}
+
+static const struct irq_domain_ops stm32_exti_h_domain_ops = {
+ .alloc = stm32_exti_h_domain_alloc,
+ .free = irq_domain_free_irqs_common,
+};
+
+static int
+__init stm32_exti_hierarchy_init(const struct stm32_exti_drv_data *drv_data,
+ struct device_node *node,
+ struct device_node *parent)
+{
+ struct irq_domain *parent_domain, *domain;
+ struct stm32_exti_host_data *host_data;
+ int ret, i;
+
+ parent_domain = irq_find_host(parent);
+ if (!parent_domain) {
+ pr_err("interrupt-parent not found\n");
+ return -EINVAL;
+ }
+
+ host_data = stm32_exti_host_init(drv_data, node);
+ if (!host_data) {
+ ret = -ENOMEM;
+ goto out_free_mem;
+ }
+
+ for (i = 0; i < drv_data->bank_nr; i++)
+ stm32_exti_chip_init(host_data, i, node);
+
+ domain = irq_domain_add_hierarchy(parent_domain, 0,
+ drv_data->bank_nr * IRQS_PER_BANK,
+ node, &stm32_exti_h_domain_ops,
+ host_data);
+
+ if (!domain) {
+ pr_err("%s: Could not register exti domain.\n", node->name);
+ ret = -ENOMEM;
+ goto out_unmap;
+ }
+
+ stm32_exti_h_syscore_init();
+
+ return 0;
+
+out_unmap:
+ iounmap(host_data->base);
+out_free_mem:
+ kfree(host_data->chips_data);
+ kfree(host_data);
return ret;
}
static int __init stm32f4_exti_of_init(struct device_node *np,
struct device_node *parent)
{
- return stm32_exti_init(stm32f4xx_exti_banks,
- ARRAY_SIZE(stm32f4xx_exti_banks), np);
+ return stm32_exti_init(&stm32f4xx_drv_data, np);
}
IRQCHIP_DECLARE(stm32f4_exti, "st,stm32-exti", stm32f4_exti_of_init);
@@ -303,8 +795,15 @@ IRQCHIP_DECLARE(stm32f4_exti, "st,stm32-exti", stm32f4_exti_of_init);
static int __init stm32h7_exti_of_init(struct device_node *np,
struct device_node *parent)
{
- return stm32_exti_init(stm32h7xx_exti_banks,
- ARRAY_SIZE(stm32h7xx_exti_banks), np);
+ return stm32_exti_init(&stm32h7xx_drv_data, np);
}
IRQCHIP_DECLARE(stm32h7_exti, "st,stm32h7-exti", stm32h7_exti_of_init);
+
+static int __init stm32mp1_exti_of_init(struct device_node *np,
+ struct device_node *parent)
+{
+ return stm32_exti_hierarchy_init(&stm32mp1_drv_data, np, parent);
+}
+
+IRQCHIP_DECLARE(stm32mp1_exti, "st,stm32mp1-exti", stm32mp1_exti_of_init);
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 1f8f489b4167..98f90aadd141 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -588,7 +588,7 @@ static const struct proto_ops data_sock_ops = {
.getname = data_sock_getname,
.sendmsg = mISDN_sock_sendmsg,
.recvmsg = mISDN_sock_recvmsg,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = data_sock_setsockopt,
@@ -745,7 +745,6 @@ static const struct proto_ops base_sock_ops = {
.getname = sock_no_getname,
.sendmsg = sock_no_sendmsg,
.recvmsg = sock_no_recvmsg,
- .poll = sock_no_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/drivers/mfd/mc13xxx-core.c b/drivers/mfd/mc13xxx-core.c
index d7f54e492aa6..c63e331738c1 100644
--- a/drivers/mfd/mc13xxx-core.c
+++ b/drivers/mfd/mc13xxx-core.c
@@ -279,8 +279,21 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode,
adc0 = MC13XXX_ADC0_ADINC1 | MC13XXX_ADC0_ADINC2;
adc1 = MC13XXX_ADC1_ADEN | MC13XXX_ADC1_ADTRIGIGN | MC13XXX_ADC1_ASC;
- if (channel > 7)
+ /*
+ * Channels mapped through ADIN7:
+ * 7 - General purpose ADIN7
+ * 16 - UID
+ * 17 - Die temperature
+ */
+ if (channel > 7 && channel < 16) {
adc1 |= MC13XXX_ADC1_ADSEL;
+ } else if (channel == 16) {
+ adc0 |= MC13XXX_ADC0_ADIN7SEL_UID;
+ channel = 7;
+ } else if (channel == 17) {
+ adc0 |= MC13XXX_ADC0_ADIN7SEL_DIE;
+ channel = 7;
+ }
switch (mode) {
case MC13XXX_ADC_MODE_TS:
diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 6def5445e03e..57b02c4b3f63 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -81,6 +81,7 @@ config MTD_DATAFLASH_OTP
config MTD_M25P80
tristate "Support most SPI Flash chips (AT26DF, M25P, W25X, ...)"
depends on SPI_MASTER && MTD_SPI_NOR
+ select SPI_MEM
help
This enables access to most modern SPI flash chips, used for
program and data storage. Series supported include Atmel AT26DF,
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index a4e18f6aaa33..e84563d2067f 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -24,12 +24,13 @@
#include <linux/mtd/partitions.h>
#include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
#include <linux/spi/flash.h>
#include <linux/mtd/spi-nor.h>
#define MAX_CMD_SIZE 6
struct m25p {
- struct spi_device *spi;
+ struct spi_mem *spimem;
struct spi_nor spi_nor;
u8 command[MAX_CMD_SIZE];
};
@@ -37,97 +38,68 @@ struct m25p {
static int m25p80_read_reg(struct spi_nor *nor, u8 code, u8 *val, int len)
{
struct m25p *flash = nor->priv;
- struct spi_device *spi = flash->spi;
+ struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(code, 1),
+ SPI_MEM_OP_NO_ADDR,
+ SPI_MEM_OP_NO_DUMMY,
+ SPI_MEM_OP_DATA_IN(len, val, 1));
int ret;
- ret = spi_write_then_read(spi, &code, 1, val, len);
+ ret = spi_mem_exec_op(flash->spimem, &op);
if (ret < 0)
- dev_err(&spi->dev, "error %d reading %x\n", ret, code);
+ dev_err(&flash->spimem->spi->dev, "error %d reading %x\n", ret,
+ code);
return ret;
}
-static void m25p_addr2cmd(struct spi_nor *nor, unsigned int addr, u8 *cmd)
-{
- /* opcode is in cmd[0] */
- cmd[1] = addr >> (nor->addr_width * 8 - 8);
- cmd[2] = addr >> (nor->addr_width * 8 - 16);
- cmd[3] = addr >> (nor->addr_width * 8 - 24);
- cmd[4] = addr >> (nor->addr_width * 8 - 32);
-}
-
-static int m25p_cmdsz(struct spi_nor *nor)
-{
- return 1 + nor->addr_width;
-}
-
static int m25p80_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
{
struct m25p *flash = nor->priv;
- struct spi_device *spi = flash->spi;
-
- flash->command[0] = opcode;
- if (buf)
- memcpy(&flash->command[1], buf, len);
+ struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(opcode, 1),
+ SPI_MEM_OP_NO_ADDR,
+ SPI_MEM_OP_NO_DUMMY,
+ SPI_MEM_OP_DATA_OUT(len, buf, 1));
- return spi_write(spi, flash->command, len + 1);
+ return spi_mem_exec_op(flash->spimem, &op);
}
static ssize_t m25p80_write(struct spi_nor *nor, loff_t to, size_t len,
const u_char *buf)
{
struct m25p *flash = nor->priv;
- struct spi_device *spi = flash->spi;
- unsigned int inst_nbits, addr_nbits, data_nbits, data_idx;
- struct spi_transfer t[3] = {};
- struct spi_message m;
- int cmd_sz = m25p_cmdsz(nor);
- ssize_t ret;
+ struct spi_mem_op op =
+ SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 1),
+ SPI_MEM_OP_ADDR(nor->addr_width, to, 1),
+ SPI_MEM_OP_DUMMY(0, 1),
+ SPI_MEM_OP_DATA_OUT(len, buf, 1));
+ size_t remaining = len;
+ int ret;
/* get transfer protocols. */
- inst_nbits = spi_nor_get_protocol_inst_nbits(nor->write_proto);
- addr_nbits = spi_nor_get_protocol_addr_nbits(nor->write_proto);
- data_nbits = spi_nor_get_protocol_data_nbits(nor->write_proto);
-
- spi_message_init(&m);
+ op.cmd.buswidth = spi_nor_get_protocol_inst_nbits(nor->write_proto);
+ op.addr.buswidth = spi_nor_get_protocol_addr_nbits(nor->write_proto);
+ op.dummy.buswidth = op.addr.buswidth;
+ op.data.buswidth = spi_nor_get_protocol_data_nbits(nor->write_proto);
if (nor->program_opcode == SPINOR_OP_AAI_WP && nor->sst_write_second)
- cmd_sz = 1;
-
- flash->command[0] = nor->program_opcode;
- m25p_addr2cmd(nor, to, flash->command);
+ op.addr.nbytes = 0;
- t[0].tx_buf = flash->command;
- t[0].tx_nbits = inst_nbits;
- t[0].len = cmd_sz;
- spi_message_add_tail(&t[0], &m);
-
- /* split the op code and address bytes into two transfers if needed. */
- data_idx = 1;
- if (addr_nbits != inst_nbits) {
- t[0].len = 1;
+ while (remaining) {
+ op.data.nbytes = remaining < UINT_MAX ? remaining : UINT_MAX;
+ ret = spi_mem_adjust_op_size(flash->spimem, &op);
+ if (ret)
+ return ret;
- t[1].tx_buf = &flash->command[1];
- t[1].tx_nbits = addr_nbits;
- t[1].len = cmd_sz - 1;
- spi_message_add_tail(&t[1], &m);
+ ret = spi_mem_exec_op(flash->spimem, &op);
+ if (ret)
+ return ret;
- data_idx = 2;
+ op.addr.val += op.data.nbytes;
+ remaining -= op.data.nbytes;
+ op.data.buf.out += op.data.nbytes;
}
- t[data_idx].tx_buf = buf;
- t[data_idx].tx_nbits = data_nbits;
- t[data_idx].len = len;
- spi_message_add_tail(&t[data_idx], &m);
-
- ret = spi_sync(spi, &m);
- if (ret)
- return ret;
-
- ret = m.actual_length - cmd_sz;
- if (ret < 0)
- return -EIO;
- return ret;
+ return len;
}
/*
@@ -138,92 +110,39 @@ static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
u_char *buf)
{
struct m25p *flash = nor->priv;
- struct spi_device *spi = flash->spi;
- unsigned int inst_nbits, addr_nbits, data_nbits, data_idx;
- struct spi_transfer t[3];
- struct spi_message m;
- unsigned int dummy = nor->read_dummy;
- ssize_t ret;
- int cmd_sz;
+ struct spi_mem_op op =
+ SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 1),
+ SPI_MEM_OP_ADDR(nor->addr_width, from, 1),
+ SPI_MEM_OP_DUMMY(nor->read_dummy, 1),
+ SPI_MEM_OP_DATA_IN(len, buf, 1));
+ size_t remaining = len;
+ int ret;
/* get transfer protocols. */
- inst_nbits = spi_nor_get_protocol_inst_nbits(nor->read_proto);
- addr_nbits = spi_nor_get_protocol_addr_nbits(nor->read_proto);
- data_nbits = spi_nor_get_protocol_data_nbits(nor->read_proto);
+ op.cmd.buswidth = spi_nor_get_protocol_inst_nbits(nor->read_proto);
+ op.addr.buswidth = spi_nor_get_protocol_addr_nbits(nor->read_proto);
+ op.dummy.buswidth = op.addr.buswidth;
+ op.data.buswidth = spi_nor_get_protocol_data_nbits(nor->read_proto);
/* convert the dummy cycles to the number of bytes */
- dummy = (dummy * addr_nbits) / 8;
-
- if (spi_flash_read_supported(spi)) {
- struct spi_flash_read_message msg;
-
- memset(&msg, 0, sizeof(msg));
+ op.dummy.nbytes = (nor->read_dummy * op.dummy.buswidth) / 8;
- msg.buf = buf;
- msg.from = from;
- msg.len = len;
- msg.read_opcode = nor->read_opcode;
- msg.addr_width = nor->addr_width;
- msg.dummy_bytes = dummy;
- msg.opcode_nbits = inst_nbits;
- msg.addr_nbits = addr_nbits;
- msg.data_nbits = data_nbits;
-
- ret = spi_flash_read(spi, &msg);
- if (ret < 0)
+ while (remaining) {
+ op.data.nbytes = remaining < UINT_MAX ? remaining : UINT_MAX;
+ ret = spi_mem_adjust_op_size(flash->spimem, &op);
+ if (ret)
return ret;
- return msg.retlen;
- }
- spi_message_init(&m);
- memset(t, 0, (sizeof t));
-
- flash->command[0] = nor->read_opcode;
- m25p_addr2cmd(nor, from, flash->command);
-
- t[0].tx_buf = flash->command;
- t[0].tx_nbits = inst_nbits;
- t[0].len = m25p_cmdsz(nor) + dummy;
- spi_message_add_tail(&t[0], &m);
-
- /*
- * Set all dummy/mode cycle bits to avoid sending some manufacturer
- * specific pattern, which might make the memory enter its Continuous
- * Read mode by mistake.
- * Based on the different mode cycle bit patterns listed and described
- * in the JESD216B specification, the 0xff value works for all memories
- * and all manufacturers.
- */
- cmd_sz = t[0].len;
- memset(flash->command + cmd_sz - dummy, 0xff, dummy);
-
- /* split the op code and address bytes into two transfers if needed. */
- data_idx = 1;
- if (addr_nbits != inst_nbits) {
- t[0].len = 1;
-
- t[1].tx_buf = &flash->command[1];
- t[1].tx_nbits = addr_nbits;
- t[1].len = cmd_sz - 1;
- spi_message_add_tail(&t[1], &m);
+ ret = spi_mem_exec_op(flash->spimem, &op);
+ if (ret)
+ return ret;
- data_idx = 2;
+ op.addr.val += op.data.nbytes;
+ remaining -= op.data.nbytes;
+ op.data.buf.in += op.data.nbytes;
}
- t[data_idx].rx_buf = buf;
- t[data_idx].rx_nbits = data_nbits;
- t[data_idx].len = min3(len, spi_max_transfer_size(spi),
- spi_max_message_size(spi) - cmd_sz);
- spi_message_add_tail(&t[data_idx], &m);
-
- ret = spi_sync(spi, &m);
- if (ret)
- return ret;
-
- ret = m.actual_length - cmd_sz;
- if (ret < 0)
- return -EIO;
- return ret;
+ return len;
}
/*
@@ -231,8 +150,9 @@ static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
* matches what the READ command supports, at least until this driver
* understands FAST_READ (for clocks over 25 MHz).
*/
-static int m25p_probe(struct spi_device *spi)
+static int m25p_probe(struct spi_mem *spimem)
{
+ struct spi_device *spi = spimem->spi;
struct flash_platform_data *data;
struct m25p *flash;
struct spi_nor *nor;
@@ -244,9 +164,9 @@ static int m25p_probe(struct spi_device *spi)
char *flash_name;
int ret;
- data = dev_get_platdata(&spi->dev);
+ data = dev_get_platdata(&spimem->spi->dev);
- flash = devm_kzalloc(&spi->dev, sizeof(*flash), GFP_KERNEL);
+ flash = devm_kzalloc(&spimem->spi->dev, sizeof(*flash), GFP_KERNEL);
if (!flash)
return -ENOMEM;
@@ -258,12 +178,12 @@ static int m25p_probe(struct spi_device *spi)
nor->write_reg = m25p80_write_reg;
nor->read_reg = m25p80_read_reg;
- nor->dev = &spi->dev;
+ nor->dev = &spimem->spi->dev;
spi_nor_set_flash_node(nor, spi->dev.of_node);
nor->priv = flash;
- spi_set_drvdata(spi, flash);
- flash->spi = spi;
+ spi_mem_set_drvdata(spimem, flash);
+ flash->spimem = spimem;
if (spi->mode & SPI_RX_QUAD) {
hwcaps.mask |= SNOR_HWCAPS_READ_1_1_4;
@@ -303,9 +223,9 @@ static int m25p_probe(struct spi_device *spi)
}
-static int m25p_remove(struct spi_device *spi)
+static int m25p_remove(struct spi_mem *spimem)
{
- struct m25p *flash = spi_get_drvdata(spi);
+ struct m25p *flash = spi_mem_get_drvdata(spimem);
spi_nor_restore(&flash->spi_nor);
@@ -313,9 +233,9 @@ static int m25p_remove(struct spi_device *spi)
return mtd_device_unregister(&flash->spi_nor.mtd);
}
-static void m25p_shutdown(struct spi_device *spi)
+static void m25p_shutdown(struct spi_mem *spimem)
{
- struct m25p *flash = spi_get_drvdata(spi);
+ struct m25p *flash = spi_mem_get_drvdata(spimem);
spi_nor_restore(&flash->spi_nor);
}
@@ -386,12 +306,14 @@ static const struct of_device_id m25p_of_table[] = {
};
MODULE_DEVICE_TABLE(of, m25p_of_table);
-static struct spi_driver m25p80_driver = {
- .driver = {
- .name = "m25p80",
- .of_match_table = m25p_of_table,
+static struct spi_mem_driver m25p80_driver = {
+ .spidrv = {
+ .driver = {
+ .name = "m25p80",
+ .of_match_table = m25p_of_table,
+ },
+ .id_table = m25p_ids,
},
- .id_table = m25p_ids,
.probe = m25p_probe,
.remove = m25p_remove,
.shutdown = m25p_shutdown,
@@ -402,7 +324,7 @@ static struct spi_driver m25p80_driver = {
*/
};
-module_spi_driver(m25p80_driver);
+module_spi_mem_driver(m25p80_driver);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mike Lavender");
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index a4ebd8715494..661828e8fdcf 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1289,9 +1289,8 @@ static int efx_init_io(struct efx_nic *efx)
pci_set_master(pci_dev);
- /* Set the PCI DMA mask. Try all possibilities from our
- * genuine mask down to 32 bits, because some architectures
- * (e.g. x86_64 with iommu_sac_force set) will allow 40 bit
+ /* Set the PCI DMA mask. Try all possibilities from our genuine mask
+ * down to 32 bits, because some architectures will allow 40 bit
* masks event though they reject 46 bit masks.
*/
while (dma_mask > 0x7fffffffUL) {
diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c
index 3d6c91e96589..dd5530a4f8c8 100644
--- a/drivers/net/ethernet/sfc/falcon/efx.c
+++ b/drivers/net/ethernet/sfc/falcon/efx.c
@@ -1242,9 +1242,8 @@ static int ef4_init_io(struct ef4_nic *efx)
pci_set_master(pci_dev);
- /* Set the PCI DMA mask. Try all possibilities from our
- * genuine mask down to 32 bits, because some architectures
- * (e.g. x86_64 with iommu_sac_force set) will allow 40 bit
+ /* Set the PCI DMA mask. Try all possibilities from our genuine mask
+ * down to 32 bits, because some architectures will allow 40 bit
* masks event though they reject 46 bit masks.
*/
while (dma_mask > 0x7fffffffUL) {
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index ce61231e96ea..de51e8f70f44 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -1107,7 +1107,7 @@ static const struct proto_ops pppoe_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = pppoe_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index c4267ecefd85..157b67c1bf8e 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -624,7 +624,6 @@ static const struct proto_ops pptp_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = pptp_getname,
- .poll = sock_no_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 30852270484f..2e96b34bc936 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -276,7 +276,8 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
if (rw == READ) {
if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
return -EIO;
- return memcpy_mcsafe(buf, nsio->addr + offset, size);
+ if (memcpy_mcsafe(buf, nsio->addr + offset, size) != 0)
+ return -EIO;
}
if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9d714926ecf5..e023d6aa22b5 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -101,15 +101,15 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
void *pmem_addr, unsigned int len)
{
unsigned int chunk;
- int rc;
+ unsigned long rem;
void *mem;
while (len) {
mem = kmap_atomic(page);
chunk = min_t(unsigned int, len, PAGE_SIZE);
- rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+ rem = memcpy_mcsafe(mem + off, pmem_addr, chunk);
kunmap_atomic(mem);
- if (rc)
+ if (rem)
return BLK_STS_IOERR;
len -= chunk;
off = 0;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 04a20da76786..c8b30067b6ae 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -357,7 +357,7 @@ static void nvme_free_ns_head(struct kref *ref)
nvme_mpath_remove_disk(head);
ida_simple_remove(&head->subsys->ns_ida, head->instance);
list_del_init(&head->entry);
- cleanup_srcu_struct(&head->srcu);
+ cleanup_srcu_struct_quiesced(&head->srcu);
nvme_put_subsystem(head->subsys);
kfree(head);
}
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 064c818105bd..33d85511d790 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -76,6 +76,8 @@ int of_device_add(struct platform_device *ofdev)
* of_dma_configure - Setup DMA configuration
* @dev: Device to apply DMA configuration
* @np: Pointer to OF node having DMA configuration
+ * @force_dma: Whether device is to be set up by of_dma_configure() even if
+ * DMA capability is not explicitly described by firmware.
*
* Try to get devices's DMA configuration from DT and update it
* accordingly.
@@ -84,7 +86,7 @@ int of_device_add(struct platform_device *ofdev)
* can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events
* to fix up DMA configuration.
*/
-int of_dma_configure(struct device *dev, struct device_node *np)
+int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma)
{
u64 dma_addr, paddr, size = 0;
int ret;
@@ -100,7 +102,7 @@ int of_dma_configure(struct device *dev, struct device_node *np)
* DMA configuration regardless of whether "dma-ranges" is
* correctly specified or not.
*/
- if (!dev->bus->force_dma)
+ if (!force_dma)
return ret == -ENODEV ? 0 : ret;
dma_addr = offset = 0;
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 9a4f4246231d..895c83e0c7b6 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -353,7 +353,7 @@ int of_reserved_mem_device_init_by_idx(struct device *dev,
/* ensure that dma_ops is set for virtual devices
* using reserved memory
*/
- of_dma_configure(dev, np);
+ of_dma_configure(dev, np, true);
dev_info(dev, "assigned reserved memory node %s\n", rmem->name);
} else {
diff --git a/drivers/parisc/Kconfig b/drivers/parisc/Kconfig
index 3a102a84d637..5a48b5606110 100644
--- a/drivers/parisc/Kconfig
+++ b/drivers/parisc/Kconfig
@@ -103,11 +103,6 @@ config IOMMU_SBA
depends on PCI_LBA
default PCI_LBA
-config IOMMU_HELPER
- bool
- depends on IOMMU_SBA || IOMMU_CCIO
- default y
-
source "drivers/pcmcia/Kconfig"
endmenu
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index d29cedb3f23b..614823617b8b 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1570,8 +1570,6 @@ static int __init ccio_probe(struct parisc_device *dev)
}
#endif
ioc_count++;
-
- parisc_has_iommu();
return 0;
}
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index 0d33d1f86d10..11de0eccf968 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1989,8 +1989,6 @@ static int __init sba_driver_callback(struct parisc_device *dev)
proc_create_single("sba_iommu", 0, root, sba_proc_info);
proc_create_single("sba_iommu-bitmap", 0, root, sba_proc_bitmap_info);
#endif
-
- parisc_has_iommu();
return 0;
}
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 34b56a8f8480..29a487f31dae 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -5,10 +5,6 @@
source "drivers/pci/pcie/Kconfig"
-config PCI_BUS_ADDR_T_64BIT
- def_bool y if (ARCH_DMA_ADDR_T_64BIT || 64BIT)
- depends on PCI
-
config PCI_MSI
bool "Message Signaled Interrupts (MSI and MSI-X)"
depends on PCI
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index bc2ded4c451f..35b7fc87eac5 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -120,7 +120,7 @@ int devm_request_pci_bus_resources(struct device *dev,
EXPORT_SYMBOL_GPL(devm_request_pci_bus_resources);
static struct pci_bus_region pci_32_bit = {0, 0xffffffffULL};
-#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
static struct pci_bus_region pci_64_bit = {0,
(pci_bus_addr_t) 0xffffffffffffffffULL};
static struct pci_bus_region pci_high = {(pci_bus_addr_t) 0x100000000ULL,
@@ -230,7 +230,7 @@ int pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res,
resource_size_t),
void *alignf_data)
{
-#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
int rc;
if (res->flags & IORESOURCE_MEM_64) {
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 30250631efe7..f45b74fcc059 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1434,6 +1434,9 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
{
struct irq_domain *domain;
+ if (WARN_ON(info->flags & MSI_FLAG_LEVEL_CAPABLE))
+ info->flags &= ~MSI_FLAG_LEVEL_CAPABLE;
+
if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
pci_msi_domain_update_dom_ops(info);
if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index b9a131137e64..f8269a725667 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -16,6 +16,8 @@
#include <linux/pm_runtime.h>
#include <linux/suspend.h>
#include <linux/kexec.h>
+#include <linux/of_device.h>
+#include <linux/acpi.h>
#include "pci.h"
#include "pcie/portdrv.h"
@@ -1577,6 +1579,35 @@ static int pci_bus_num_vf(struct device *dev)
return pci_num_vf(to_pci_dev(dev));
}
+/**
+ * pci_dma_configure - Setup DMA configuration
+ * @dev: ptr to dev structure
+ *
+ * Function to update PCI devices's DMA configuration using the same
+ * info from the OF node or ACPI node of host bridge's parent (if any).
+ */
+static int pci_dma_configure(struct device *dev)
+{
+ struct device *bridge;
+ int ret = 0;
+
+ bridge = pci_get_host_bridge_device(to_pci_dev(dev));
+
+ if (IS_ENABLED(CONFIG_OF) && bridge->parent &&
+ bridge->parent->of_node) {
+ ret = of_dma_configure(dev, bridge->parent->of_node, true);
+ } else if (has_acpi_companion(bridge)) {
+ struct acpi_device *adev = to_acpi_device_node(bridge->fwnode);
+ enum dev_dma_attr attr = acpi_get_dma_attr(adev);
+
+ if (attr != DEV_DMA_NOT_SUPPORTED)
+ ret = acpi_dma_configure(dev, attr);
+ }
+
+ pci_put_host_bridge_device(bridge);
+ return ret;
+}
+
struct bus_type pci_bus_type = {
.name = "pci",
.match = pci_bus_match,
@@ -1589,7 +1620,7 @@ struct bus_type pci_bus_type = {
.drv_groups = pci_drv_groups,
.pm = PCI_PM_OPS_PTR,
.num_vf = pci_bus_num_vf,
- .force_dma = true,
+ .dma_configure = pci_dma_configure,
};
EXPORT_SYMBOL(pci_bus_type);
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index 6cbcff42ba47..dfed60982a8a 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -267,12 +267,13 @@ static void stm32_gpio_irq_release_resources(struct irq_data *irq_data)
}
static struct irq_chip stm32_gpio_irq_chip = {
- .name = "stm32gpio",
- .irq_ack = irq_chip_ack_parent,
- .irq_mask = irq_chip_mask_parent,
- .irq_unmask = irq_chip_unmask_parent,
- .irq_set_type = irq_chip_set_type_parent,
- .irq_set_wake = irq_chip_set_wake_parent,
+ .name = "stm32gpio",
+ .irq_eoi = irq_chip_eoi_parent,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_mask = irq_chip_mask_parent,
+ .irq_unmask = irq_chip_unmask_parent,
+ .irq_set_type = irq_chip_set_type_parent,
+ .irq_set_wake = irq_chip_set_wake_parent,
.irq_request_resources = stm32_gpio_irq_request_resources,
.irq_release_resources = stm32_gpio_irq_release_resources,
};
diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig
index e728a96cabfd..cb0df9eb3e0f 100644
--- a/drivers/platform/chrome/Kconfig
+++ b/drivers/platform/chrome/Kconfig
@@ -38,6 +38,17 @@ config CHROMEOS_PSTORE
If you have a supported Chromebook, choose Y or M here.
The module will be called chromeos_pstore.
+config CHROMEOS_TBMC
+ tristate "ChromeOS Tablet Switch Controller"
+ depends on ACPI
+ depends on INPUT
+ help
+ This option adds a driver for the tablet switch on
+ select Chrome OS systems.
+
+ To compile this driver as a module, choose M here: the
+ module will be called chromeos_tbmc.
+
config CROS_EC_CTL
tristate
diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile
index ff3b369911f0..e44c37a63fa9 100644
--- a/drivers/platform/chrome/Makefile
+++ b/drivers/platform/chrome/Makefile
@@ -2,6 +2,7 @@
obj-$(CONFIG_CHROMEOS_LAPTOP) += chromeos_laptop.o
obj-$(CONFIG_CHROMEOS_PSTORE) += chromeos_pstore.o
+obj-$(CONFIG_CHROMEOS_TBMC) += chromeos_tbmc.o
cros_ec_ctl-objs := cros_ec_sysfs.o cros_ec_lightbar.o \
cros_ec_vbc.o cros_ec_debugfs.o
obj-$(CONFIG_CROS_EC_CTL) += cros_ec_ctl.o
diff --git a/drivers/platform/chrome/chromeos_laptop.c b/drivers/platform/chrome/chromeos_laptop.c
index 5c47f451e43b..24326eecd787 100644
--- a/drivers/platform/chrome/chromeos_laptop.c
+++ b/drivers/platform/chrome/chromeos_laptop.c
@@ -6,6 +6,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/acpi.h>
#include <linux/dmi.h>
#include <linux/i2c.h>
#include <linux/input.h>
@@ -54,6 +55,11 @@ struct i2c_peripheral {
struct i2c_client *client;
};
+struct acpi_peripheral {
+ char hid[ACPI_ID_LEN];
+ const struct property_entry *properties;
+};
+
struct chromeos_laptop {
/*
* Note that we can't mark this pointer as const because
@@ -61,6 +67,9 @@ struct chromeos_laptop {
*/
struct i2c_peripheral *i2c_peripherals;
unsigned int num_i2c_peripherals;
+
+ const struct acpi_peripheral *acpi_peripherals;
+ unsigned int num_acpi_peripherals;
};
static const struct chromeos_laptop *cros_laptop;
@@ -148,6 +157,38 @@ static void chromeos_laptop_check_adapter(struct i2c_adapter *adapter)
}
}
+static bool chromeos_laptop_adjust_client(struct i2c_client *client)
+{
+ const struct acpi_peripheral *acpi_dev;
+ struct acpi_device_id acpi_ids[2] = { };
+ int i;
+ int error;
+
+ if (!has_acpi_companion(&client->dev))
+ return false;
+
+ for (i = 0; i < cros_laptop->num_acpi_peripherals; i++) {
+ acpi_dev = &cros_laptop->acpi_peripherals[i];
+
+ memcpy(acpi_ids[0].id, acpi_dev->hid, ACPI_ID_LEN);
+
+ if (acpi_match_device(acpi_ids, &client->dev)) {
+ error = device_add_properties(&client->dev,
+ acpi_dev->properties);
+ if (error) {
+ dev_err(&client->dev,
+ "failed to add properties: %d\n",
+ error);
+ break;
+ }
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
static void chromeos_laptop_detach_i2c_client(struct i2c_client *client)
{
struct i2c_peripheral *i2c_dev;
@@ -170,6 +211,8 @@ static int chromeos_laptop_i2c_notifier_call(struct notifier_block *nb,
case BUS_NOTIFY_ADD_DEVICE:
if (dev->type == &i2c_adapter_type)
chromeos_laptop_check_adapter(to_i2c_adapter(dev));
+ else if (dev->type == &i2c_client_type)
+ chromeos_laptop_adjust_client(to_i2c_client(dev));
break;
case BUS_NOTIFY_REMOVED_DEVICE:
@@ -191,6 +234,12 @@ static const struct chromeos_laptop _name __initconst = { \
.num_i2c_peripherals = ARRAY_SIZE(_name##_peripherals), \
}
+#define DECLARE_ACPI_CROS_LAPTOP(_name) \
+static const struct chromeos_laptop _name __initconst = { \
+ .acpi_peripherals = _name##_peripherals, \
+ .num_acpi_peripherals = ARRAY_SIZE(_name##_peripherals), \
+}
+
static struct i2c_peripheral samsung_series_5_550_peripherals[] __initdata = {
/* Touchpad. */
{
@@ -234,16 +283,25 @@ static const int chromebook_pixel_tp_keys[] __initconst = {
static const struct property_entry
chromebook_pixel_trackpad_props[] __initconst = {
+ PROPERTY_ENTRY_STRING("compatible", "atmel,maxtouch"),
PROPERTY_ENTRY_U32_ARRAY("linux,gpio-keymap", chromebook_pixel_tp_keys),
{ }
};
+static const struct property_entry
+chromebook_atmel_touchscreen_props[] __initconst = {
+ PROPERTY_ENTRY_STRING("compatible", "atmel,maxtouch"),
+ { }
+};
+
static struct i2c_peripheral chromebook_pixel_peripherals[] __initdata = {
/* Touch Screen. */
{
.board_info = {
I2C_BOARD_INFO("atmel_mxt_ts",
ATMEL_TS_I2C_ADDR),
+ .properties =
+ chromebook_atmel_touchscreen_props,
.flags = I2C_CLIENT_WAKE,
},
.dmi_name = "touchscreen",
@@ -354,6 +412,8 @@ static struct i2c_peripheral acer_c720_peripherals[] __initdata = {
.board_info = {
I2C_BOARD_INFO("atmel_mxt_ts",
ATMEL_TS_I2C_ADDR),
+ .properties =
+ chromebook_atmel_touchscreen_props,
.flags = I2C_CLIENT_WAKE,
},
.dmi_name = "touchscreen",
@@ -419,6 +479,47 @@ static struct i2c_peripheral cr48_peripherals[] __initdata = {
};
DECLARE_CROS_LAPTOP(cr48);
+static const u32 samus_touchpad_buttons[] __initconst = {
+ KEY_RESERVED,
+ KEY_RESERVED,
+ KEY_RESERVED,
+ BTN_LEFT
+};
+
+static const struct property_entry samus_trackpad_props[] __initconst = {
+ PROPERTY_ENTRY_STRING("compatible", "atmel,maxtouch"),
+ PROPERTY_ENTRY_U32_ARRAY("linux,gpio-keymap", samus_touchpad_buttons),
+ { }
+};
+
+static struct acpi_peripheral samus_peripherals[] __initdata = {
+ /* Touchpad */
+ {
+ .hid = "ATML0000",
+ .properties = samus_trackpad_props,
+ },
+ /* Touchsceen */
+ {
+ .hid = "ATML0001",
+ .properties = chromebook_atmel_touchscreen_props,
+ },
+};
+DECLARE_ACPI_CROS_LAPTOP(samus);
+
+static struct acpi_peripheral generic_atmel_peripherals[] __initdata = {
+ /* Touchpad */
+ {
+ .hid = "ATML0000",
+ .properties = chromebook_pixel_trackpad_props,
+ },
+ /* Touchsceen */
+ {
+ .hid = "ATML0001",
+ .properties = chromebook_atmel_touchscreen_props,
+ },
+};
+DECLARE_ACPI_CROS_LAPTOP(generic_atmel);
+
static const struct dmi_system_id chromeos_laptop_dmi_table[] __initconst = {
{
.ident = "Samsung Series 5 550",
@@ -502,17 +603,72 @@ static const struct dmi_system_id chromeos_laptop_dmi_table[] __initconst = {
},
.driver_data = (void *)&cr48,
},
+ /* Devices with peripherals incompletely described in ACPI */
+ {
+ .ident = "Chromebook Pro",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Google"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Caroline"),
+ },
+ .driver_data = (void *)&samus,
+ },
+ {
+ .ident = "Google Pixel 2 (2015)",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Samus"),
+ },
+ .driver_data = (void *)&samus,
+ },
+ {
+ .ident = "Samsung Chromebook 3",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Celes"),
+ },
+ .driver_data = (void *)&samus,
+ },
+ {
+ /*
+ * Other Chromebooks with Atmel touch controllers:
+ * - Winky (touchpad)
+ * - Clapper, Expresso, Rambi, Glimmer (touchscreen)
+ */
+ .ident = "Other Chromebook",
+ .matches = {
+ /*
+ * This will match all Google devices, not only devices
+ * with Atmel, but we will validate that the device
+ * actually has matching peripherals.
+ */
+ DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"),
+ },
+ .driver_data = (void *)&generic_atmel,
+ },
{ }
};
MODULE_DEVICE_TABLE(dmi, chromeos_laptop_dmi_table);
-static int __init chromeos_laptop_scan_adapter(struct device *dev, void *data)
+static int __init chromeos_laptop_scan_peripherals(struct device *dev, void *data)
{
- struct i2c_adapter *adapter;
+ int error;
- adapter = i2c_verify_adapter(dev);
- if (adapter)
- chromeos_laptop_check_adapter(adapter);
+ if (dev->type == &i2c_adapter_type) {
+ chromeos_laptop_check_adapter(to_i2c_adapter(dev));
+ } else if (dev->type == &i2c_client_type) {
+ if (chromeos_laptop_adjust_client(to_i2c_client(dev))) {
+ /*
+ * Now that we have needed properties re-trigger
+ * driver probe in case driver was initialized
+ * earlier and probe failed.
+ */
+ error = device_attach(dev);
+ if (error < 0)
+ dev_warn(dev,
+ "%s: device_attach() failed: %d\n",
+ __func__, error);
+ }
+ }
return 0;
}
@@ -556,27 +712,24 @@ static int __init chromeos_laptop_setup_irq(struct i2c_peripheral *i2c_dev)
return 0;
}
-static struct chromeos_laptop * __init
-chromeos_laptop_prepare(const struct chromeos_laptop *src)
+static int __init
+chromeos_laptop_prepare_i2c_peripherals(struct chromeos_laptop *cros_laptop,
+ const struct chromeos_laptop *src)
{
- struct chromeos_laptop *cros_laptop;
struct i2c_peripheral *i2c_dev;
struct i2c_board_info *info;
- int error;
int i;
+ int error;
- cros_laptop = kzalloc(sizeof(*cros_laptop), GFP_KERNEL);
- if (!cros_laptop)
- return ERR_PTR(-ENOMEM);
+ if (!src->num_i2c_peripherals)
+ return 0;
cros_laptop->i2c_peripherals = kmemdup(src->i2c_peripherals,
src->num_i2c_peripherals *
sizeof(*src->i2c_peripherals),
GFP_KERNEL);
- if (!cros_laptop->i2c_peripherals) {
- error = -ENOMEM;
- goto err_free_cros_laptop;
- }
+ if (!cros_laptop->i2c_peripherals)
+ return -ENOMEM;
cros_laptop->num_i2c_peripherals = src->num_i2c_peripherals;
@@ -586,7 +739,7 @@ chromeos_laptop_prepare(const struct chromeos_laptop *src)
error = chromeos_laptop_setup_irq(i2c_dev);
if (error)
- goto err_destroy_cros_peripherals;
+ goto err_out;
/* We need to deep-copy properties */
if (info->properties) {
@@ -594,14 +747,14 @@ chromeos_laptop_prepare(const struct chromeos_laptop *src)
property_entries_dup(info->properties);
if (IS_ERR(info->properties)) {
error = PTR_ERR(info->properties);
- goto err_destroy_cros_peripherals;
+ goto err_out;
}
}
}
- return cros_laptop;
+ return 0;
-err_destroy_cros_peripherals:
+err_out:
while (--i >= 0) {
i2c_dev = &cros_laptop->i2c_peripherals[i];
info = &i2c_dev->board_info;
@@ -609,13 +762,74 @@ err_destroy_cros_peripherals:
property_entries_free(info->properties);
}
kfree(cros_laptop->i2c_peripherals);
-err_free_cros_laptop:
- kfree(cros_laptop);
- return ERR_PTR(error);
+ return error;
+}
+
+static int __init
+chromeos_laptop_prepare_acpi_peripherals(struct chromeos_laptop *cros_laptop,
+ const struct chromeos_laptop *src)
+{
+ struct acpi_peripheral *acpi_peripherals;
+ struct acpi_peripheral *acpi_dev;
+ const struct acpi_peripheral *src_dev;
+ int n_peripherals = 0;
+ int i;
+ int error;
+
+ for (i = 0; i < src->num_acpi_peripherals; i++) {
+ if (acpi_dev_present(src->acpi_peripherals[i].hid, NULL, -1))
+ n_peripherals++;
+ }
+
+ if (!n_peripherals)
+ return 0;
+
+ acpi_peripherals = kcalloc(n_peripherals,
+ sizeof(*src->acpi_peripherals),
+ GFP_KERNEL);
+ if (!acpi_peripherals)
+ return -ENOMEM;
+
+ acpi_dev = acpi_peripherals;
+ for (i = 0; i < src->num_acpi_peripherals; i++) {
+ src_dev = &src->acpi_peripherals[i];
+ if (!acpi_dev_present(src_dev->hid, NULL, -1))
+ continue;
+
+ *acpi_dev = *src_dev;
+
+ /* We need to deep-copy properties */
+ if (src_dev->properties) {
+ acpi_dev->properties =
+ property_entries_dup(src_dev->properties);
+ if (IS_ERR(acpi_dev->properties)) {
+ error = PTR_ERR(acpi_dev->properties);
+ goto err_out;
+ }
+ }
+
+ acpi_dev++;
+ }
+
+ cros_laptop->acpi_peripherals = acpi_peripherals;
+ cros_laptop->num_acpi_peripherals = n_peripherals;
+
+ return 0;
+
+err_out:
+ while (--i >= 0) {
+ acpi_dev = &acpi_peripherals[i];
+ if (acpi_dev->properties)
+ property_entries_free(acpi_dev->properties);
+ }
+
+ kfree(acpi_peripherals);
+ return error;
}
static void chromeos_laptop_destroy(const struct chromeos_laptop *cros_laptop)
{
+ const struct acpi_peripheral *acpi_dev;
struct i2c_peripheral *i2c_dev;
struct i2c_board_info *info;
int i;
@@ -631,10 +845,41 @@ static void chromeos_laptop_destroy(const struct chromeos_laptop *cros_laptop)
property_entries_free(info->properties);
}
+ for (i = 0; i < cros_laptop->num_acpi_peripherals; i++) {
+ acpi_dev = &cros_laptop->acpi_peripherals[i];
+
+ if (acpi_dev->properties)
+ property_entries_free(acpi_dev->properties);
+ }
+
kfree(cros_laptop->i2c_peripherals);
+ kfree(cros_laptop->acpi_peripherals);
kfree(cros_laptop);
}
+static struct chromeos_laptop * __init
+chromeos_laptop_prepare(const struct chromeos_laptop *src)
+{
+ struct chromeos_laptop *cros_laptop;
+ int error;
+
+ cros_laptop = kzalloc(sizeof(*cros_laptop), GFP_KERNEL);
+ if (!cros_laptop)
+ return ERR_PTR(-ENOMEM);
+
+ error = chromeos_laptop_prepare_i2c_peripherals(cros_laptop, src);
+ if (!error)
+ error = chromeos_laptop_prepare_acpi_peripherals(cros_laptop,
+ src);
+
+ if (error) {
+ chromeos_laptop_destroy(cros_laptop);
+ return ERR_PTR(error);
+ }
+
+ return cros_laptop;
+}
+
static int __init chromeos_laptop_init(void)
{
const struct dmi_system_id *dmi_id;
@@ -652,21 +897,33 @@ static int __init chromeos_laptop_init(void)
if (IS_ERR(cros_laptop))
return PTR_ERR(cros_laptop);
+ if (!cros_laptop->num_i2c_peripherals &&
+ !cros_laptop->num_acpi_peripherals) {
+ pr_debug("no relevant devices detected\n");
+ error = -ENODEV;
+ goto err_destroy_cros_laptop;
+ }
+
error = bus_register_notifier(&i2c_bus_type,
&chromeos_laptop_i2c_notifier);
if (error) {
- pr_err("failed to register i2c bus notifier: %d\n", error);
- chromeos_laptop_destroy(cros_laptop);
- return error;
+ pr_err("failed to register i2c bus notifier: %d\n",
+ error);
+ goto err_destroy_cros_laptop;
}
/*
- * Scan adapters that have been registered before we installed
- * the notifier to make sure we do not miss any devices.
+ * Scan adapters that have been registered and clients that have
+ * been created before we installed the notifier to make sure
+ * we do not miss any devices.
*/
- i2c_for_each_dev(NULL, chromeos_laptop_scan_adapter);
+ i2c_for_each_dev(NULL, chromeos_laptop_scan_peripherals);
return 0;
+
+err_destroy_cros_laptop:
+ chromeos_laptop_destroy(cros_laptop);
+ return error;
}
static void __exit chromeos_laptop_exit(void)
diff --git a/drivers/platform/chrome/chromeos_tbmc.c b/drivers/platform/chrome/chromeos_tbmc.c
new file mode 100644
index 000000000000..b935df6a9694
--- /dev/null
+++ b/drivers/platform/chrome/chromeos_tbmc.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Driver to detect Tablet Mode for ChromeOS convertible.
+//
+// Copyright (C) 2017 Google, Inc.
+// Author: Gwendal Grignou <gwendal@chromium.org>
+
+#include <linux/acpi.h>
+#include <linux/input.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+
+#define DRV_NAME "chromeos_tbmc"
+#define ACPI_DRV_NAME "GOOG0006"
+
+static int chromeos_tbmc_query_switch(struct acpi_device *adev,
+ struct input_dev *idev)
+{
+ unsigned long long state;
+ acpi_status status;
+
+ status = acpi_evaluate_integer(adev->handle, "TBMC", NULL, &state);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ /* input layer checks if event is redundant */
+ input_report_switch(idev, SW_TABLET_MODE, state);
+ input_sync(idev);
+
+ return 0;
+}
+
+static __maybe_unused int chromeos_tbmc_resume(struct device *dev)
+{
+ struct acpi_device *adev = to_acpi_device(dev);
+
+ return chromeos_tbmc_query_switch(adev, adev->driver_data);
+}
+
+static void chromeos_tbmc_notify(struct acpi_device *adev, u32 event)
+{
+ switch (event) {
+ case 0x80:
+ chromeos_tbmc_query_switch(adev, adev->driver_data);
+ break;
+ default:
+ dev_err(&adev->dev, "Unexpected event: 0x%08X\n", event);
+ }
+}
+
+static int chromeos_tbmc_open(struct input_dev *idev)
+{
+ struct acpi_device *adev = input_get_drvdata(idev);
+
+ return chromeos_tbmc_query_switch(adev, idev);
+}
+
+static int chromeos_tbmc_add(struct acpi_device *adev)
+{
+ struct input_dev *idev;
+ struct device *dev = &adev->dev;
+ int ret;
+
+ idev = devm_input_allocate_device(dev);
+ if (!idev)
+ return -ENOMEM;
+
+ idev->name = "Tablet Mode Switch";
+ idev->phys = acpi_device_hid(adev);
+
+ idev->id.bustype = BUS_HOST;
+ idev->id.version = 1;
+ idev->id.product = 0;
+ idev->open = chromeos_tbmc_open;
+
+ input_set_drvdata(idev, adev);
+ adev->driver_data = idev;
+
+ input_set_capability(idev, EV_SW, SW_TABLET_MODE);
+ ret = input_register_device(idev);
+ if (ret) {
+ dev_err(dev, "cannot register input device\n");
+ return ret;
+ }
+ return 0;
+}
+
+static const struct acpi_device_id chromeos_tbmc_acpi_device_ids[] = {
+ { ACPI_DRV_NAME, 0 },
+ { }
+};
+MODULE_DEVICE_TABLE(acpi, chromeos_tbmc_acpi_device_ids);
+
+static const SIMPLE_DEV_PM_OPS(chromeos_tbmc_pm_ops, NULL,
+ chromeos_tbmc_resume);
+
+static struct acpi_driver chromeos_tbmc_driver = {
+ .name = DRV_NAME,
+ .class = DRV_NAME,
+ .ids = chromeos_tbmc_acpi_device_ids,
+ .ops = {
+ .add = chromeos_tbmc_add,
+ .notify = chromeos_tbmc_notify,
+ },
+ .drv.pm = &chromeos_tbmc_pm_ops,
+};
+
+module_acpi_driver(chromeos_tbmc_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("ChromeOS ACPI tablet switch driver");
diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c
index 6ea79d495aa2..68193bb53383 100644
--- a/drivers/platform/chrome/cros_ec_lightbar.c
+++ b/drivers/platform/chrome/cros_ec_lightbar.c
@@ -170,8 +170,7 @@ static ssize_t version_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
uint32_t version = 0, flags = 0;
- struct cros_ec_dev *ec = container_of(dev,
- struct cros_ec_dev, class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
int ret;
ret = lb_throttle();
@@ -193,8 +192,7 @@ static ssize_t brightness_store(struct device *dev,
struct cros_ec_command *msg;
int ret;
unsigned int val;
- struct cros_ec_dev *ec = container_of(dev,
- struct cros_ec_dev, class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
if (kstrtouint(buf, 0, &val))
return -EINVAL;
@@ -238,8 +236,7 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
{
struct ec_params_lightbar *param;
struct cros_ec_command *msg;
- struct cros_ec_dev *ec = container_of(dev,
- struct cros_ec_dev, class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
unsigned int val[4];
int ret, i = 0, j = 0, ok = 0;
@@ -311,8 +308,7 @@ static ssize_t sequence_show(struct device *dev,
struct ec_response_lightbar *resp;
struct cros_ec_command *msg;
int ret;
- struct cros_ec_dev *ec = container_of(dev,
- struct cros_ec_dev, class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
msg = alloc_lightbar_cmd_msg(ec);
if (!msg)
@@ -439,8 +435,7 @@ static ssize_t sequence_store(struct device *dev, struct device_attribute *attr,
struct cros_ec_command *msg;
unsigned int num;
int ret, len;
- struct cros_ec_dev *ec = container_of(dev,
- struct cros_ec_dev, class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
for (len = 0; len < count; len++)
if (!isalnum(buf[len]))
@@ -488,8 +483,7 @@ static ssize_t program_store(struct device *dev, struct device_attribute *attr,
int extra_bytes, max_size, ret;
struct ec_params_lightbar *param;
struct cros_ec_command *msg;
- struct cros_ec_dev *ec = container_of(dev, struct cros_ec_dev,
- class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
/*
* We might need to reject the program for size reasons. The EC
@@ -599,8 +593,7 @@ static umode_t cros_ec_lightbar_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
- struct cros_ec_dev *ec = container_of(dev,
- struct cros_ec_dev, class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
struct platform_device *pdev = to_platform_device(ec->dev);
struct cros_ec_platform *pdata = pdev->dev.platform_data;
int is_cros_ec;
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 3682e1539251..31c8b8c49e45 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -435,7 +435,13 @@ static int __init cros_ec_lpc_init(void)
int ret;
acpi_status status;
- if (!dmi_check_system(cros_ec_lpc_dmi_table)) {
+ status = acpi_get_devices(ACPI_DRV_NAME, cros_ec_lpc_parse_device,
+ &cros_ec_lpc_acpi_device_found, NULL);
+ if (ACPI_FAILURE(status))
+ pr_warn(DRV_NAME ": Looking for %s failed\n", ACPI_DRV_NAME);
+
+ if (!cros_ec_lpc_acpi_device_found &&
+ !dmi_check_system(cros_ec_lpc_dmi_table)) {
pr_err(DRV_NAME ": unsupported system.\n");
return -ENODEV;
}
@@ -450,11 +456,6 @@ static int __init cros_ec_lpc_init(void)
return ret;
}
- status = acpi_get_devices(ACPI_DRV_NAME, cros_ec_lpc_parse_device,
- &cros_ec_lpc_acpi_device_found, NULL);
- if (ACPI_FAILURE(status))
- pr_warn(DRV_NAME ": Looking for %s failed\n", ACPI_DRV_NAME);
-
if (!cros_ec_lpc_acpi_device_found) {
/* Register the device, and it'll get hooked up automatically */
ret = platform_device_register(&cros_ec_lpc_device);
diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c
index 5a6db3fe213a..f34a50121064 100644
--- a/drivers/platform/chrome/cros_ec_sysfs.c
+++ b/drivers/platform/chrome/cros_ec_sysfs.c
@@ -34,8 +34,6 @@
#include <linux/types.h>
#include <linux/uaccess.h>
-#define to_cros_ec_dev(dev) container_of(dev, struct cros_ec_dev, class_dev)
-
/* Accessor functions */
static ssize_t reboot_show(struct device *dev,
diff --git a/drivers/platform/chrome/cros_ec_vbc.c b/drivers/platform/chrome/cros_ec_vbc.c
index 6d38e6b08334..5356f26bc022 100644
--- a/drivers/platform/chrome/cros_ec_vbc.c
+++ b/drivers/platform/chrome/cros_ec_vbc.c
@@ -29,8 +29,7 @@ static ssize_t vboot_context_read(struct file *filp, struct kobject *kobj,
loff_t pos, size_t count)
{
struct device *dev = container_of(kobj, struct device, kobj);
- struct cros_ec_dev *ec = container_of(dev, struct cros_ec_dev,
- class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
struct cros_ec_device *ecdev = ec->ec_dev;
struct ec_params_vbnvcontext *params;
struct cros_ec_command *msg;
@@ -70,8 +69,7 @@ static ssize_t vboot_context_write(struct file *filp, struct kobject *kobj,
loff_t pos, size_t count)
{
struct device *dev = container_of(kobj, struct device, kobj);
- struct cros_ec_dev *ec = container_of(dev, struct cros_ec_dev,
- class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
struct cros_ec_device *ecdev = ec->ec_dev;
struct ec_params_vbnvcontext *params;
struct cros_ec_command *msg;
@@ -111,8 +109,7 @@ static umode_t cros_ec_vbc_is_visible(struct kobject *kobj,
struct bin_attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
- struct cros_ec_dev *ec = container_of(dev, struct cros_ec_dev,
- class_dev);
+ struct cros_ec_dev *ec = to_cros_ec_dev(dev);
struct device_node *np = ec->ec_dev->dev->of_node;
if (IS_ENABLED(CONFIG_OF) && np) {
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index 7bdc6aaa0ba3..2016e0ed5865 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -18,7 +18,6 @@
#include <linux/fs.h>
#include <linux/blkpg.h>
#include <linux/slab.h>
-#include <asm/compat.h>
#include <asm/ccwdev.h>
#include <asm/schid.h>
#include <asm/cmb.h>
diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c
index 61822480a2a0..16a4e8528bbc 100644
--- a/drivers/s390/char/fs3270.c
+++ b/drivers/s390/char/fs3270.c
@@ -19,7 +19,6 @@
#include <linux/slab.h>
#include <linux/types.h>
-#include <asm/compat.h>
#include <asm/ccwdev.h>
#include <asm/cio.h>
#include <asm/ebcdic.h>
diff --git a/drivers/s390/char/sclp_ctl.c b/drivers/s390/char/sclp_ctl.c
index a78cea0c3a09..248b5db3eaa8 100644
--- a/drivers/s390/char/sclp_ctl.c
+++ b/drivers/s390/char/sclp_ctl.c
@@ -14,7 +14,6 @@
#include <linux/init.h>
#include <linux/ioctl.h>
#include <linux/fs.h>
-#include <asm/compat.h>
#include <asm/sclp_ctl.h>
#include <asm/sclp.h>
diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c
index 17e411c57576..948ce82a7725 100644
--- a/drivers/s390/char/vmcp.c
+++ b/drivers/s390/char/vmcp.c
@@ -23,7 +23,6 @@
#include <linux/mutex.h>
#include <linux/cma.h>
#include <linux/mm.h>
-#include <asm/compat.h>
#include <asm/cpcmd.h>
#include <asm/debug.h>
#include <asm/vmcp.h>
diff --git a/drivers/s390/cio/chsc_sch.c b/drivers/s390/cio/chsc_sch.c
index 0015729d917d..8d9f36625ba5 100644
--- a/drivers/s390/cio/chsc_sch.c
+++ b/drivers/s390/cio/chsc_sch.c
@@ -16,7 +16,6 @@
#include <linux/miscdevice.h>
#include <linux/kernel_stat.h>
-#include <asm/compat.h>
#include <asm/cio.h>
#include <asm/chsc.h>
#include <asm/isc.h>
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index dffd820731f2..f5a0d894d9ad 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -10,6 +10,7 @@
#define KMSG_COMPONENT "qeth"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#include <linux/compat.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/string.h>
@@ -32,7 +33,6 @@
#include <asm/chpid.h>
#include <asm/io.h>
#include <asm/sysinfo.h>
-#include <asm/compat.h>
#include <asm/diag.h>
#include <asm/cio.h>
#include <asm/ccwdev.h>
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index f125fd71c0f2..fb38aeff9dbd 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2149,27 +2149,6 @@ static int scsi_map_queues(struct blk_mq_tag_set *set)
return blk_mq_map_queues(set);
}
-static u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
-{
- struct device *host_dev;
- u64 bounce_limit = 0xffffffff;
-
- if (shost->unchecked_isa_dma)
- return BLK_BOUNCE_ISA;
- /*
- * Platforms with virtual-DMA translation
- * hardware have no practical limit.
- */
- if (!PCI_DMA_BUS_IS_PHYS)
- return BLK_BOUNCE_ANY;
-
- host_dev = scsi_get_device(shost);
- if (host_dev && host_dev->dma_mask)
- bounce_limit = (u64)dma_max_pfn(host_dev) << PAGE_SHIFT;
-
- return bounce_limit;
-}
-
void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
{
struct device *dev = shost->dma_dev;
@@ -2189,7 +2168,8 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
}
blk_queue_max_hw_sectors(q, shost->max_sectors);
- blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
+ if (shost->unchecked_isa_dma)
+ blk_queue_bounce_limit(q, BLK_BOUNCE_ISA);
blk_queue_segment_boundary(q, shost->dma_boundary);
dma_set_seg_boundary(dev, shost->dma_boundary);
diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index 2d4146ce2f1b..ad5d68e1dab7 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -47,6 +47,13 @@ config SPI_MASTER
if SPI_MASTER
+config SPI_MEM
+ bool "SPI memory extension"
+ help
+ Enable this option if you want to enable the SPI memory extension.
+ This extension is meant to simplify interaction with SPI memories
+ by providing a high-level interface to send memory-like commands.
+
comment "SPI Master Controller Drivers"
config SPI_ALTERA
@@ -71,7 +78,6 @@ config SPI_ARMADA_3700
config SPI_ATMEL
tristate "Atmel SPI Controller"
- depends on HAS_DMA
depends on ARCH_AT91 || COMPILE_TEST
help
This selects a driver for the Atmel SPI Controller, present on
@@ -115,14 +121,6 @@ config SPI_BCM2835AUX
"universal SPI master", and the regular SPI controller.
This driver is for the universal/auxiliary SPI controller.
-config SPI_BCM53XX
- tristate "Broadcom BCM53xx SPI controller"
- depends on ARCH_BCM_5301X
- depends on BCMA_POSSIBLE
- select BCMA
- help
- Enable support for the SPI controller on Broadcom BCM53xx ARM SoCs.
-
config SPI_BCM63XX
tristate "Broadcom BCM63xx SPI controller"
depends on BCM63XX || COMPILE_TEST
@@ -233,7 +231,6 @@ config SPI_EFM32
config SPI_EP93XX
tristate "Cirrus Logic EP93xx SPI controller"
- depends on HAS_DMA
depends on ARCH_EP93XX || COMPILE_TEST
help
This enables using the Cirrus EP93xx SPI controller in master
@@ -355,7 +352,6 @@ config SPI_FSL_SPI
config SPI_FSL_DSPI
tristate "Freescale DSPI controller"
select REGMAP_MMIO
- depends on HAS_DMA
depends on SOC_VF610 || SOC_LS1021A || ARCH_LAYERSCAPE || M5441x || COMPILE_TEST
help
This enables support for the Freescale DSPI controller in master
@@ -431,7 +427,6 @@ config SPI_OMAP_UWIRE
config SPI_OMAP24XX
tristate "McSPI driver for OMAP"
- depends on HAS_DMA
depends on ARCH_OMAP2PLUS || COMPILE_TEST
select SG_SPLIT
help
@@ -440,7 +435,6 @@ config SPI_OMAP24XX
config SPI_TI_QSPI
tristate "DRA7xxx QSPI controller support"
- depends on HAS_DMA
depends on ARCH_OMAP2PLUS || COMPILE_TEST
help
QSPI master controller for DRA7xxx used for flash devices.
@@ -469,7 +463,6 @@ config SPI_PIC32
config SPI_PIC32_SQI
tristate "Microchip PIC32 Quad SPI driver"
depends on MACH_PIC32 || COMPILE_TEST
- depends on HAS_DMA
help
SPI driver for PIC32 Quad SPI controller.
@@ -572,7 +565,7 @@ config SPI_SC18IS602
config SPI_SH_MSIOF
tristate "SuperH MSIOF SPI controller"
- depends on HAVE_CLK && HAS_DMA
+ depends on HAVE_CLK
depends on ARCH_SHMOBILE || ARCH_RENESAS || COMPILE_TEST
help
SPI driver for SuperH and SH Mobile MSIOF blocks.
@@ -650,7 +643,7 @@ config SPI_MXS
config SPI_TEGRA114
tristate "NVIDIA Tegra114 SPI Controller"
depends on (ARCH_TEGRA && TEGRA20_APB_DMA) || COMPILE_TEST
- depends on RESET_CONTROLLER && HAS_DMA
+ depends on RESET_CONTROLLER
help
SPI driver for NVIDIA Tegra114 SPI Controller interface. This controller
is different than the older SoCs SPI controller and also register interface
@@ -668,7 +661,7 @@ config SPI_TEGRA20_SFLASH
config SPI_TEGRA20_SLINK
tristate "Nvidia Tegra20/Tegra30 SLINK Controller"
depends on (ARCH_TEGRA && TEGRA20_APB_DMA) || COMPILE_TEST
- depends on RESET_CONTROLLER && HAS_DMA
+ depends on RESET_CONTROLLER
help
SPI driver for Nvidia Tegra20/Tegra30 SLINK Controller interface.
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile
index b935f10eb961..cb1f4378b87c 100644
--- a/drivers/spi/Makefile
+++ b/drivers/spi/Makefile
@@ -8,6 +8,7 @@ ccflags-$(CONFIG_SPI_DEBUG) := -DDEBUG
# small core, mostly translating board-specific
# config declarations into driver model code
obj-$(CONFIG_SPI_MASTER) += spi.o
+obj-$(CONFIG_SPI_MEM) += spi-mem.o
obj-$(CONFIG_SPI_SPIDEV) += spidev.o
obj-$(CONFIG_SPI_LOOPBACK_TEST) += spi-loopback-test.o
@@ -20,7 +21,6 @@ obj-$(CONFIG_SPI_AU1550) += spi-au1550.o
obj-$(CONFIG_SPI_AXI_SPI_ENGINE) += spi-axi-spi-engine.o
obj-$(CONFIG_SPI_BCM2835) += spi-bcm2835.o
obj-$(CONFIG_SPI_BCM2835AUX) += spi-bcm2835aux.o
-obj-$(CONFIG_SPI_BCM53XX) += spi-bcm53xx.o
obj-$(CONFIG_SPI_BCM63XX) += spi-bcm63xx.o
obj-$(CONFIG_SPI_BCM63XX_HSSPI) += spi-bcm63xx-hsspi.o
obj-$(CONFIG_SPI_BCM_QSPI) += spi-iproc-qspi.o spi-brcmstb-qspi.o spi-bcm-qspi.o
diff --git a/drivers/spi/internals.h b/drivers/spi/internals.h
new file mode 100644
index 000000000000..4a28a8395552
--- /dev/null
+++ b/drivers/spi/internals.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2018 Exceet Electronics GmbH
+ * Copyright (C) 2018 Bootlin
+ *
+ * Author: Boris Brezillon <boris.brezillon@bootlin.com>
+ *
+ * Helpers needed by the spi or spi-mem logic. Should not be used outside of
+ * spi-mem.c and spi.c.
+ */
+
+#ifndef __LINUX_SPI_INTERNALS_H
+#define __LINUX_SPI_INTERNALS_H
+
+#include <linux/device.h>
+#include <linux/dma-direction.h>
+#include <linux/scatterlist.h>
+#include <linux/spi/spi.h>
+
+void spi_flush_queue(struct spi_controller *ctrl);
+
+#ifdef CONFIG_HAS_DMA
+int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
+ struct sg_table *sgt, void *buf, size_t len,
+ enum dma_data_direction dir);
+void spi_unmap_buf(struct spi_controller *ctlr, struct device *dev,
+ struct sg_table *sgt, enum dma_data_direction dir);
+#else /* !CONFIG_HAS_DMA */
+static inline int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
+ struct sg_table *sgt, void *buf, size_t len,
+ enum dma_data_direction dir)
+{
+ return -EINVAL;
+}
+
+static inline void spi_unmap_buf(struct spi_controller *ctlr,
+ struct device *dev, struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+}
+#endif /* CONFIG_HAS_DMA */
+
+#endif /* __LINUX_SPI_INTERNALS_H */
diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c
index 6573152ce893..8612525fa4e3 100644
--- a/drivers/spi/spi-bcm-qspi.c
+++ b/drivers/spi/spi-bcm-qspi.c
@@ -30,6 +30,7 @@
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include "spi-bcm-qspi.h"
@@ -215,10 +216,10 @@ struct bcm_qspi {
int bspi_maj_rev;
int bspi_min_rev;
int bspi_enabled;
- struct spi_flash_read_message *bspi_rf_msg;
- u32 bspi_rf_msg_idx;
- u32 bspi_rf_msg_len;
- u32 bspi_rf_msg_status;
+ const struct spi_mem_op *bspi_rf_op;
+ u32 bspi_rf_op_idx;
+ u32 bspi_rf_op_len;
+ u32 bspi_rf_op_status;
struct bcm_xfer_mode xfer_mode;
u32 s3_strap_override_ctrl;
bool bspi_mode;
@@ -313,26 +314,26 @@ static inline void bcm_qspi_bspi_lr_clear(struct bcm_qspi *qspi)
static void bcm_qspi_bspi_lr_data_read(struct bcm_qspi *qspi)
{
- u32 *buf = (u32 *)qspi->bspi_rf_msg->buf;
+ u32 *buf = (u32 *)qspi->bspi_rf_op->data.buf.in;
u32 data = 0;
- dev_dbg(&qspi->pdev->dev, "xfer %p rx %p rxlen %d\n", qspi->bspi_rf_msg,
- qspi->bspi_rf_msg->buf, qspi->bspi_rf_msg_len);
+ dev_dbg(&qspi->pdev->dev, "xfer %p rx %p rxlen %d\n", qspi->bspi_rf_op,
+ qspi->bspi_rf_op->data.buf.in, qspi->bspi_rf_op_len);
while (!bcm_qspi_bspi_lr_is_fifo_empty(qspi)) {
data = bcm_qspi_bspi_lr_read_fifo(qspi);
- if (likely(qspi->bspi_rf_msg_len >= 4) &&
+ if (likely(qspi->bspi_rf_op_len >= 4) &&
IS_ALIGNED((uintptr_t)buf, 4)) {
- buf[qspi->bspi_rf_msg_idx++] = data;
- qspi->bspi_rf_msg_len -= 4;
+ buf[qspi->bspi_rf_op_idx++] = data;
+ qspi->bspi_rf_op_len -= 4;
} else {
/* Read out remaining bytes, make sure*/
- u8 *cbuf = (u8 *)&buf[qspi->bspi_rf_msg_idx];
+ u8 *cbuf = (u8 *)&buf[qspi->bspi_rf_op_idx];
data = cpu_to_le32(data);
- while (qspi->bspi_rf_msg_len) {
+ while (qspi->bspi_rf_op_len) {
*cbuf++ = (u8)data;
data >>= 8;
- qspi->bspi_rf_msg_len--;
+ qspi->bspi_rf_op_len--;
}
}
}
@@ -349,14 +350,12 @@ static void bcm_qspi_bspi_set_xfer_params(struct bcm_qspi *qspi, u8 cmd_byte,
}
static int bcm_qspi_bspi_set_flex_mode(struct bcm_qspi *qspi,
- struct spi_flash_read_message *msg,
- int hp)
+ const struct spi_mem_op *op, int hp)
{
int bpc = 0, bpp = 0;
- u8 command = msg->read_opcode;
- int width = msg->data_nbits ? msg->data_nbits : SPI_NBITS_SINGLE;
- int addrlen = msg->addr_width;
- int addr_nbits = msg->addr_nbits ? msg->addr_nbits : SPI_NBITS_SINGLE;
+ u8 command = op->cmd.opcode;
+ int width = op->cmd.buswidth ? op->cmd.buswidth : SPI_NBITS_SINGLE;
+ int addrlen = op->addr.nbytes * 8;
int flex_mode = 1;
dev_dbg(&qspi->pdev->dev, "set flex mode w %x addrlen %x hp %d\n",
@@ -365,7 +364,7 @@ static int bcm_qspi_bspi_set_flex_mode(struct bcm_qspi *qspi,
if (addrlen == BSPI_ADDRLEN_4BYTES)
bpp = BSPI_BPP_ADDR_SELECT_MASK;
- bpp |= msg->dummy_bytes * (8/addr_nbits);
+ bpp |= (op->dummy.nbytes * 8) / op->dummy.buswidth;
switch (width) {
case SPI_NBITS_SINGLE:
@@ -397,11 +396,10 @@ static int bcm_qspi_bspi_set_flex_mode(struct bcm_qspi *qspi,
}
static int bcm_qspi_bspi_set_override(struct bcm_qspi *qspi,
- struct spi_flash_read_message *msg,
- int hp)
+ const struct spi_mem_op *op, int hp)
{
- int width = msg->data_nbits ? msg->data_nbits : SPI_NBITS_SINGLE;
- int addrlen = msg->addr_width;
+ int width = op->data.buswidth ? op->data.buswidth : SPI_NBITS_SINGLE;
+ int addrlen = op->addr.nbytes;
u32 data = bcm_qspi_read(qspi, BSPI, BSPI_STRAP_OVERRIDE_CTRL);
dev_dbg(&qspi->pdev->dev, "set override mode w %x addrlen %x hp %d\n",
@@ -437,17 +435,17 @@ static int bcm_qspi_bspi_set_override(struct bcm_qspi *qspi,
/* set the override mode */
data |= BSPI_STRAP_OVERRIDE_CTRL_OVERRIDE;
bcm_qspi_write(qspi, BSPI, BSPI_STRAP_OVERRIDE_CTRL, data);
- bcm_qspi_bspi_set_xfer_params(qspi, msg->read_opcode, 0, 0, 0);
+ bcm_qspi_bspi_set_xfer_params(qspi, op->cmd.opcode, 0, 0, 0);
return 0;
}
static int bcm_qspi_bspi_set_mode(struct bcm_qspi *qspi,
- struct spi_flash_read_message *msg, int hp)
+ const struct spi_mem_op *op, int hp)
{
int error = 0;
- int width = msg->data_nbits ? msg->data_nbits : SPI_NBITS_SINGLE;
- int addrlen = msg->addr_width;
+ int width = op->data.buswidth ? op->data.buswidth : SPI_NBITS_SINGLE;
+ int addrlen = op->addr.nbytes;
/* default mode */
qspi->xfer_mode.flex_mode = true;
@@ -460,12 +458,12 @@ static int bcm_qspi_bspi_set_mode(struct bcm_qspi *qspi,
if (val & mask || qspi->s3_strap_override_ctrl & mask) {
qspi->xfer_mode.flex_mode = false;
bcm_qspi_write(qspi, BSPI, BSPI_FLEX_MODE_ENABLE, 0);
- error = bcm_qspi_bspi_set_override(qspi, msg, hp);
+ error = bcm_qspi_bspi_set_override(qspi, op, hp);
}
}
if (qspi->xfer_mode.flex_mode)
- error = bcm_qspi_bspi_set_flex_mode(qspi, msg, hp);
+ error = bcm_qspi_bspi_set_flex_mode(qspi, op, hp);
if (error) {
dev_warn(&qspi->pdev->dev,
@@ -802,19 +800,20 @@ done:
return slot;
}
-static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
- struct spi_flash_read_message *msg)
+static int bcm_qspi_bspi_exec_mem_op(struct spi_device *spi,
+ const struct spi_mem_op *op)
{
struct bcm_qspi *qspi = spi_master_get_devdata(spi->master);
- u32 addr = 0, len, rdlen, len_words;
+ u32 addr = 0, len, rdlen, len_words, from = 0;
int ret = 0;
unsigned long timeo = msecs_to_jiffies(100);
struct bcm_qspi_soc_intc *soc_intc = qspi->soc_intc;
if (bcm_qspi_bspi_ver_three(qspi))
- if (msg->addr_width == BSPI_ADDRLEN_4BYTES)
+ if (op->addr.nbytes == BSPI_ADDRLEN_4BYTES)
return -EIO;
+ from = op->addr.val;
bcm_qspi_chip_select(qspi, spi->chip_select);
bcm_qspi_write(qspi, MSPI, MSPI_WRITE_LOCK, 0);
@@ -823,15 +822,15 @@ static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
* the upper address byte to bspi
*/
if (bcm_qspi_bspi_ver_three(qspi) == false) {
- addr = msg->from & 0xff000000;
+ addr = from & 0xff000000;
bcm_qspi_write(qspi, BSPI,
BSPI_BSPI_FLASH_UPPER_ADDR_BYTE, addr);
}
if (!qspi->xfer_mode.flex_mode)
- addr = msg->from;
+ addr = from;
else
- addr = msg->from & 0x00ffffff;
+ addr = from & 0x00ffffff;
if (bcm_qspi_bspi_ver_three(qspi) == true)
addr = (addr + 0xc00000) & 0xffffff;
@@ -840,8 +839,8 @@ static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
* read into the entire buffer by breaking the reads
* into RAF buffer read lengths
*/
- len = msg->len;
- qspi->bspi_rf_msg_idx = 0;
+ len = op->data.nbytes;
+ qspi->bspi_rf_op_idx = 0;
do {
if (len > BSPI_READ_LENGTH)
@@ -852,9 +851,9 @@ static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
reinit_completion(&qspi->bspi_done);
bcm_qspi_enable_bspi(qspi);
len_words = (rdlen + 3) >> 2;
- qspi->bspi_rf_msg = msg;
- qspi->bspi_rf_msg_status = 0;
- qspi->bspi_rf_msg_len = rdlen;
+ qspi->bspi_rf_op = op;
+ qspi->bspi_rf_op_status = 0;
+ qspi->bspi_rf_op_len = rdlen;
dev_dbg(&qspi->pdev->dev,
"bspi xfr addr 0x%x len 0x%x", addr, rdlen);
bcm_qspi_write(qspi, BSPI, BSPI_RAF_START_ADDR, addr);
@@ -879,7 +878,6 @@ static int bcm_qspi_bspi_flash_read(struct spi_device *spi,
}
/* set msg return length */
- msg->retlen += rdlen;
addr += rdlen;
len -= rdlen;
} while (len);
@@ -914,61 +912,63 @@ static int bcm_qspi_transfer_one(struct spi_master *master,
return 0;
}
-static int bcm_qspi_mspi_flash_read(struct spi_device *spi,
- struct spi_flash_read_message *msg)
+static int bcm_qspi_mspi_exec_mem_op(struct spi_device *spi,
+ const struct spi_mem_op *op)
{
- struct bcm_qspi *qspi = spi_master_get_devdata(spi->master);
+ struct spi_master *master = spi->master;
+ struct bcm_qspi *qspi = spi_master_get_devdata(master);
struct spi_transfer t[2];
- u8 cmd[6];
- int ret;
+ u8 cmd[6] = { };
+ int ret, i;
memset(cmd, 0, sizeof(cmd));
memset(t, 0, sizeof(t));
/* tx */
/* opcode is in cmd[0] */
- cmd[0] = msg->read_opcode;
- cmd[1] = msg->from >> (msg->addr_width * 8 - 8);
- cmd[2] = msg->from >> (msg->addr_width * 8 - 16);
- cmd[3] = msg->from >> (msg->addr_width * 8 - 24);
- cmd[4] = msg->from >> (msg->addr_width * 8 - 32);
+ cmd[0] = op->cmd.opcode;
+ for (i = 0; i < op->addr.nbytes; i++)
+ cmd[1 + i] = op->addr.val >> (8 * (op->addr.nbytes - i - 1));
+
t[0].tx_buf = cmd;
- t[0].len = msg->addr_width + msg->dummy_bytes + 1;
+ t[0].len = op->addr.nbytes + op->dummy.nbytes + 1;
t[0].bits_per_word = spi->bits_per_word;
- t[0].tx_nbits = msg->opcode_nbits;
+ t[0].tx_nbits = op->cmd.buswidth;
/* lets mspi know that this is not last transfer */
qspi->trans_pos.mspi_last_trans = false;
- ret = bcm_qspi_transfer_one(spi->master, spi, &t[0]);
+ ret = bcm_qspi_transfer_one(master, spi, &t[0]);
/* rx */
qspi->trans_pos.mspi_last_trans = true;
if (!ret) {
/* rx */
- t[1].rx_buf = msg->buf;
- t[1].len = msg->len;
- t[1].rx_nbits = msg->data_nbits;
+ t[1].rx_buf = op->data.buf.in;
+ t[1].len = op->data.nbytes;
+ t[1].rx_nbits = op->data.buswidth;
t[1].bits_per_word = spi->bits_per_word;
- ret = bcm_qspi_transfer_one(spi->master, spi, &t[1]);
+ ret = bcm_qspi_transfer_one(master, spi, &t[1]);
}
- if (!ret)
- msg->retlen = msg->len;
-
return ret;
}
-static int bcm_qspi_flash_read(struct spi_device *spi,
- struct spi_flash_read_message *msg)
+static int bcm_qspi_exec_mem_op(struct spi_mem *mem,
+ const struct spi_mem_op *op)
{
+ struct spi_device *spi = mem->spi;
struct bcm_qspi *qspi = spi_master_get_devdata(spi->master);
int ret = 0;
bool mspi_read = false;
- u32 addr, len;
+ u32 addr = 0, len;
u_char *buf;
- buf = msg->buf;
- addr = msg->from;
- len = msg->len;
+ if (!op->data.nbytes || !op->addr.nbytes || op->addr.nbytes > 4 ||
+ op->data.dir != SPI_MEM_DATA_IN)
+ return -ENOTSUPP;
+
+ buf = op->data.buf.in;
+ addr = op->addr.val;
+ len = op->data.nbytes;
if (bcm_qspi_bspi_ver_three(qspi) == true) {
/*
@@ -990,12 +990,12 @@ static int bcm_qspi_flash_read(struct spi_device *spi,
mspi_read = true;
if (mspi_read)
- return bcm_qspi_mspi_flash_read(spi, msg);
+ return bcm_qspi_mspi_exec_mem_op(spi, op);
- ret = bcm_qspi_bspi_set_mode(qspi, msg, -1);
+ ret = bcm_qspi_bspi_set_mode(qspi, op, -1);
if (!ret)
- ret = bcm_qspi_bspi_flash_read(spi, msg);
+ ret = bcm_qspi_bspi_exec_mem_op(spi, op);
return ret;
}
@@ -1034,10 +1034,10 @@ static irqreturn_t bcm_qspi_bspi_lr_l2_isr(int irq, void *dev_id)
struct bcm_qspi_soc_intc *soc_intc = qspi->soc_intc;
u32 status = qspi_dev_id->irqp->mask;
- if (qspi->bspi_enabled && qspi->bspi_rf_msg) {
+ if (qspi->bspi_enabled && qspi->bspi_rf_op) {
bcm_qspi_bspi_lr_data_read(qspi);
- if (qspi->bspi_rf_msg_len == 0) {
- qspi->bspi_rf_msg = NULL;
+ if (qspi->bspi_rf_op_len == 0) {
+ qspi->bspi_rf_op = NULL;
if (qspi->soc_intc) {
/* disable soc BSPI interrupt */
soc_intc->bcm_qspi_int_set(soc_intc, BSPI_DONE,
@@ -1046,7 +1046,7 @@ static irqreturn_t bcm_qspi_bspi_lr_l2_isr(int irq, void *dev_id)
status = INTR_BSPI_LR_SESSION_DONE_MASK;
}
- if (qspi->bspi_rf_msg_status)
+ if (qspi->bspi_rf_op_status)
bcm_qspi_bspi_lr_clear(qspi);
else
bcm_qspi_bspi_flush_prefetch_buffers(qspi);
@@ -1058,7 +1058,7 @@ static irqreturn_t bcm_qspi_bspi_lr_l2_isr(int irq, void *dev_id)
}
status &= INTR_BSPI_LR_SESSION_DONE_MASK;
- if (qspi->bspi_enabled && status && qspi->bspi_rf_msg_len == 0)
+ if (qspi->bspi_enabled && status && qspi->bspi_rf_op_len == 0)
complete(&qspi->bspi_done);
return IRQ_HANDLED;
@@ -1071,7 +1071,7 @@ static irqreturn_t bcm_qspi_bspi_lr_err_l2_isr(int irq, void *dev_id)
struct bcm_qspi_soc_intc *soc_intc = qspi->soc_intc;
dev_err(&qspi->pdev->dev, "BSPI INT error\n");
- qspi->bspi_rf_msg_status = -EIO;
+ qspi->bspi_rf_op_status = -EIO;
if (qspi->soc_intc)
/* clear soc interrupt */
soc_intc->bcm_qspi_int_ack(soc_intc, BSPI_ERR);
@@ -1194,6 +1194,10 @@ static void bcm_qspi_hw_uninit(struct bcm_qspi *qspi)
}
+static const struct spi_controller_mem_ops bcm_qspi_mem_ops = {
+ .exec_op = bcm_qspi_exec_mem_op,
+};
+
static const struct of_device_id bcm_qspi_of_match[] = {
{ .compatible = "brcm,spi-bcm-qspi" },
{},
@@ -1236,7 +1240,7 @@ int bcm_qspi_probe(struct platform_device *pdev,
master->mode_bits = SPI_CPHA | SPI_CPOL | SPI_RX_DUAL | SPI_RX_QUAD;
master->setup = bcm_qspi_setup;
master->transfer_one = bcm_qspi_transfer_one;
- master->spi_flash_read = bcm_qspi_flash_read;
+ master->mem_ops = &bcm_qspi_mem_ops;
master->cleanup = bcm_qspi_cleanup;
master->dev.of_node = dev->of_node;
master->num_chipselect = NUM_CHIPSELECT;
diff --git a/drivers/spi/spi-bcm53xx.c b/drivers/spi/spi-bcm53xx.c
deleted file mode 100644
index d02ceb7a29d1..000000000000
--- a/drivers/spi/spi-bcm53xx.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright (C) 2014-2016 Rafał Miłecki <rafal@milecki.pl>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/bcma/bcma.h>
-#include <linux/spi/spi.h>
-
-#include "spi-bcm53xx.h"
-
-#define BCM53XXSPI_MAX_SPI_BAUD 13500000 /* 216 MHz? */
-#define BCM53XXSPI_FLASH_WINDOW SZ_32M
-
-/* The longest observed required wait was 19 ms */
-#define BCM53XXSPI_SPE_TIMEOUT_MS 80
-
-struct bcm53xxspi {
- struct bcma_device *core;
- struct spi_master *master;
- void __iomem *mmio_base;
- bool bspi; /* Boot SPI mode with memory mapping */
-};
-
-static inline u32 bcm53xxspi_read(struct bcm53xxspi *b53spi, u16 offset)
-{
- return bcma_read32(b53spi->core, offset);
-}
-
-static inline void bcm53xxspi_write(struct bcm53xxspi *b53spi, u16 offset,
- u32 value)
-{
- bcma_write32(b53spi->core, offset, value);
-}
-
-static void bcm53xxspi_disable_bspi(struct bcm53xxspi *b53spi)
-{
- struct device *dev = &b53spi->core->dev;
- unsigned long deadline;
- u32 tmp;
-
- if (!b53spi->bspi)
- return;
-
- tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL);
- if (tmp & 0x1)
- return;
-
- deadline = jiffies + usecs_to_jiffies(200);
- do {
- tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_BUSY_STATUS);
- if (!(tmp & 0x1)) {
- bcm53xxspi_write(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL,
- 0x1);
- ndelay(200);
- b53spi->bspi = false;
- return;
- }
- udelay(1);
- } while (!time_after_eq(jiffies, deadline));
-
- dev_warn(dev, "Timeout disabling BSPI\n");
-}
-
-static void bcm53xxspi_enable_bspi(struct bcm53xxspi *b53spi)
-{
- u32 tmp;
-
- if (b53spi->bspi)
- return;
-
- tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL);
- if (!(tmp & 0x1))
- return;
-
- bcm53xxspi_write(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL, 0x0);
- b53spi->bspi = true;
-}
-
-static inline unsigned int bcm53xxspi_calc_timeout(size_t len)
-{
- /* Do some magic calculation based on length and buad. Add 10% and 1. */
- return (len * 9000 / BCM53XXSPI_MAX_SPI_BAUD * 110 / 100) + 1;
-}
-
-static int bcm53xxspi_wait(struct bcm53xxspi *b53spi, unsigned int timeout_ms)
-{
- unsigned long deadline;
- u32 tmp;
-
- /* SPE bit has to be 0 before we read MSPI STATUS */
- deadline = jiffies + msecs_to_jiffies(BCM53XXSPI_SPE_TIMEOUT_MS);
- do {
- tmp = bcm53xxspi_read(b53spi, B53SPI_MSPI_SPCR2);
- if (!(tmp & B53SPI_MSPI_SPCR2_SPE))
- break;
- udelay(5);
- } while (!time_after_eq(jiffies, deadline));
-
- if (tmp & B53SPI_MSPI_SPCR2_SPE)
- goto spi_timeout;
-
- /* Check status */
- deadline = jiffies + msecs_to_jiffies(timeout_ms);
- do {
- tmp = bcm53xxspi_read(b53spi, B53SPI_MSPI_MSPI_STATUS);
- if (tmp & B53SPI_MSPI_MSPI_STATUS_SPIF) {
- bcm53xxspi_write(b53spi, B53SPI_MSPI_MSPI_STATUS, 0);
- return 0;
- }
-
- cpu_relax();
- udelay(100);
- } while (!time_after_eq(jiffies, deadline));
-
-spi_timeout:
- bcm53xxspi_write(b53spi, B53SPI_MSPI_MSPI_STATUS, 0);
-
- pr_err("Timeout waiting for SPI to be ready!\n");
-
- return -EBUSY;
-}
-
-static void bcm53xxspi_buf_write(struct bcm53xxspi *b53spi, u8 *w_buf,
- size_t len, bool cont)
-{
- u32 tmp;
- int i;
-
- for (i = 0; i < len; i++) {
- /* Transmit Register File MSB */
- bcm53xxspi_write(b53spi, B53SPI_MSPI_TXRAM + 4 * (i * 2),
- (unsigned int)w_buf[i]);
- }
-
- for (i = 0; i < len; i++) {
- tmp = B53SPI_CDRAM_CONT | B53SPI_CDRAM_PCS_DISABLE_ALL |
- B53SPI_CDRAM_PCS_DSCK;
- if (!cont && i == len - 1)
- tmp &= ~B53SPI_CDRAM_CONT;
- tmp &= ~0x1;
- /* Command Register File */
- bcm53xxspi_write(b53spi, B53SPI_MSPI_CDRAM + 4 * i, tmp);
- }
-
- /* Set queue pointers */
- bcm53xxspi_write(b53spi, B53SPI_MSPI_NEWQP, 0);
- bcm53xxspi_write(b53spi, B53SPI_MSPI_ENDQP, len - 1);
-
- if (cont)
- bcm53xxspi_write(b53spi, B53SPI_MSPI_WRITE_LOCK, 1);
-
- /* Start SPI transfer */
- tmp = bcm53xxspi_read(b53spi, B53SPI_MSPI_SPCR2);
- tmp |= B53SPI_MSPI_SPCR2_SPE;
- if (cont)
- tmp |= B53SPI_MSPI_SPCR2_CONT_AFTER_CMD;
- bcm53xxspi_write(b53spi, B53SPI_MSPI_SPCR2, tmp);
-
- /* Wait for SPI to finish */
- bcm53xxspi_wait(b53spi, bcm53xxspi_calc_timeout(len));
-
- if (!cont)
- bcm53xxspi_write(b53spi, B53SPI_MSPI_WRITE_LOCK, 0);
-}
-
-static void bcm53xxspi_buf_read(struct bcm53xxspi *b53spi, u8 *r_buf,
- size_t len, bool cont)
-{
- u32 tmp;
- int i;
-
- for (i = 0; i < len; i++) {
- tmp = B53SPI_CDRAM_CONT | B53SPI_CDRAM_PCS_DISABLE_ALL |
- B53SPI_CDRAM_PCS_DSCK;
- if (!cont && i == len - 1)
- tmp &= ~B53SPI_CDRAM_CONT;
- tmp &= ~0x1;
- /* Command Register File */
- bcm53xxspi_write(b53spi, B53SPI_MSPI_CDRAM + 4 * i, tmp);
- }
-
- /* Set queue pointers */
- bcm53xxspi_write(b53spi, B53SPI_MSPI_NEWQP, 0);
- bcm53xxspi_write(b53spi, B53SPI_MSPI_ENDQP, len - 1);
-
- if (cont)
- bcm53xxspi_write(b53spi, B53SPI_MSPI_WRITE_LOCK, 1);
-
- /* Start SPI transfer */
- tmp = bcm53xxspi_read(b53spi, B53SPI_MSPI_SPCR2);
- tmp |= B53SPI_MSPI_SPCR2_SPE;
- if (cont)
- tmp |= B53SPI_MSPI_SPCR2_CONT_AFTER_CMD;
- bcm53xxspi_write(b53spi, B53SPI_MSPI_SPCR2, tmp);
-
- /* Wait for SPI to finish */
- bcm53xxspi_wait(b53spi, bcm53xxspi_calc_timeout(len));
-
- if (!cont)
- bcm53xxspi_write(b53spi, B53SPI_MSPI_WRITE_LOCK, 0);
-
- for (i = 0; i < len; ++i) {
- u16 reg = B53SPI_MSPI_RXRAM + 4 * (1 + i * 2);
-
- /* Data stored in the transmit register file LSB */
- r_buf[i] = (u8)bcm53xxspi_read(b53spi, reg);
- }
-}
-
-static int bcm53xxspi_transfer_one(struct spi_master *master,
- struct spi_device *spi,
- struct spi_transfer *t)
-{
- struct bcm53xxspi *b53spi = spi_master_get_devdata(master);
- u8 *buf;
- size_t left;
-
- bcm53xxspi_disable_bspi(b53spi);
-
- if (t->tx_buf) {
- buf = (u8 *)t->tx_buf;
- left = t->len;
- while (left) {
- size_t to_write = min_t(size_t, 16, left);
- bool cont = !spi_transfer_is_last(master, t) ||
- left - to_write > 0;
-
- bcm53xxspi_buf_write(b53spi, buf, to_write, cont);
- left -= to_write;
- buf += to_write;
- }
- }
-
- if (t->rx_buf) {
- buf = (u8 *)t->rx_buf;
- left = t->len;
- while (left) {
- size_t to_read = min_t(size_t, 16, left);
- bool cont = !spi_transfer_is_last(master, t) ||
- left - to_read > 0;
-
- bcm53xxspi_buf_read(b53spi, buf, to_read, cont);
- left -= to_read;
- buf += to_read;
- }
- }
-
- return 0;
-}
-
-static int bcm53xxspi_flash_read(struct spi_device *spi,
- struct spi_flash_read_message *msg)
-{
- struct bcm53xxspi *b53spi = spi_master_get_devdata(spi->master);
- int ret = 0;
-
- if (msg->from + msg->len > BCM53XXSPI_FLASH_WINDOW)
- return -EINVAL;
-
- bcm53xxspi_enable_bspi(b53spi);
- memcpy_fromio(msg->buf, b53spi->mmio_base + msg->from, msg->len);
- msg->retlen = msg->len;
-
- return ret;
-}
-
-/**************************************************
- * BCMA
- **************************************************/
-
-static const struct bcma_device_id bcm53xxspi_bcma_tbl[] = {
- BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_NS_QSPI, BCMA_ANY_REV, BCMA_ANY_CLASS),
- {},
-};
-MODULE_DEVICE_TABLE(bcma, bcm53xxspi_bcma_tbl);
-
-static int bcm53xxspi_bcma_probe(struct bcma_device *core)
-{
- struct device *dev = &core->dev;
- struct bcm53xxspi *b53spi;
- struct spi_master *master;
- int err;
-
- if (core->bus->drv_cc.core->id.rev != 42) {
- pr_err("SPI on SoC with unsupported ChipCommon rev\n");
- return -ENOTSUPP;
- }
-
- master = spi_alloc_master(dev, sizeof(*b53spi));
- if (!master)
- return -ENOMEM;
-
- b53spi = spi_master_get_devdata(master);
- b53spi->master = master;
- b53spi->core = core;
-
- if (core->addr_s[0])
- b53spi->mmio_base = devm_ioremap(dev, core->addr_s[0],
- BCM53XXSPI_FLASH_WINDOW);
- b53spi->bspi = true;
- bcm53xxspi_disable_bspi(b53spi);
-
- master->dev.of_node = dev->of_node;
- master->transfer_one = bcm53xxspi_transfer_one;
- if (b53spi->mmio_base)
- master->spi_flash_read = bcm53xxspi_flash_read;
-
- bcma_set_drvdata(core, b53spi);
-
- err = devm_spi_register_master(dev, master);
- if (err) {
- spi_master_put(master);
- bcma_set_drvdata(core, NULL);
- return err;
- }
-
- return 0;
-}
-
-static struct bcma_driver bcm53xxspi_bcma_driver = {
- .name = KBUILD_MODNAME,
- .id_table = bcm53xxspi_bcma_tbl,
- .probe = bcm53xxspi_bcma_probe,
-};
-
-/**************************************************
- * Init & exit
- **************************************************/
-
-static int __init bcm53xxspi_module_init(void)
-{
- int err = 0;
-
- err = bcma_driver_register(&bcm53xxspi_bcma_driver);
- if (err)
- pr_err("Failed to register bcma driver: %d\n", err);
-
- return err;
-}
-
-static void __exit bcm53xxspi_module_exit(void)
-{
- bcma_driver_unregister(&bcm53xxspi_bcma_driver);
-}
-
-module_init(bcm53xxspi_module_init);
-module_exit(bcm53xxspi_module_exit);
-
-MODULE_DESCRIPTION("Broadcom BCM53xx SPI Controller driver");
-MODULE_AUTHOR("Rafał Miłecki <zajec5@gmail.com>");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/spi/spi-bcm53xx.h b/drivers/spi/spi-bcm53xx.h
deleted file mode 100644
index 03e3442086ec..000000000000
--- a/drivers/spi/spi-bcm53xx.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef SPI_BCM53XX_H
-#define SPI_BCM53XX_H
-
-#define B53SPI_BSPI_REVISION_ID 0x000
-#define B53SPI_BSPI_SCRATCH 0x004
-#define B53SPI_BSPI_MAST_N_BOOT_CTRL 0x008
-#define B53SPI_BSPI_BUSY_STATUS 0x00c
-#define B53SPI_BSPI_INTR_STATUS 0x010
-#define B53SPI_BSPI_B0_STATUS 0x014
-#define B53SPI_BSPI_B0_CTRL 0x018
-#define B53SPI_BSPI_B1_STATUS 0x01c
-#define B53SPI_BSPI_B1_CTRL 0x020
-#define B53SPI_BSPI_STRAP_OVERRIDE_CTRL 0x024
-#define B53SPI_BSPI_FLEX_MODE_ENABLE 0x028
-#define B53SPI_BSPI_BITS_PER_CYCLE 0x02c
-#define B53SPI_BSPI_BITS_PER_PHASE 0x030
-#define B53SPI_BSPI_CMD_AND_MODE_BYTE 0x034
-#define B53SPI_BSPI_BSPI_FLASH_UPPER_ADDR_BYTE 0x038
-#define B53SPI_BSPI_BSPI_XOR_VALUE 0x03c
-#define B53SPI_BSPI_BSPI_XOR_ENABLE 0x040
-#define B53SPI_BSPI_BSPI_PIO_MODE_ENABLE 0x044
-#define B53SPI_BSPI_BSPI_PIO_IODIR 0x048
-#define B53SPI_BSPI_BSPI_PIO_DATA 0x04c
-
-/* RAF */
-#define B53SPI_RAF_START_ADDR 0x100
-#define B53SPI_RAF_NUM_WORDS 0x104
-#define B53SPI_RAF_CTRL 0x108
-#define B53SPI_RAF_FULLNESS 0x10c
-#define B53SPI_RAF_WATERMARK 0x110
-#define B53SPI_RAF_STATUS 0x114
-#define B53SPI_RAF_READ_DATA 0x118
-#define B53SPI_RAF_WORD_CNT 0x11c
-#define B53SPI_RAF_CURR_ADDR 0x120
-
-/* MSPI */
-#define B53SPI_MSPI_SPCR0_LSB 0x200
-#define B53SPI_MSPI_SPCR0_MSB 0x204
-#define B53SPI_MSPI_SPCR1_LSB 0x208
-#define B53SPI_MSPI_SPCR1_MSB 0x20c
-#define B53SPI_MSPI_NEWQP 0x210
-#define B53SPI_MSPI_ENDQP 0x214
-#define B53SPI_MSPI_SPCR2 0x218
-#define B53SPI_MSPI_SPCR2_SPE 0x00000040
-#define B53SPI_MSPI_SPCR2_CONT_AFTER_CMD 0x00000080
-#define B53SPI_MSPI_MSPI_STATUS 0x220
-#define B53SPI_MSPI_MSPI_STATUS_SPIF 0x00000001
-#define B53SPI_MSPI_CPTQP 0x224
-#define B53SPI_MSPI_TXRAM 0x240 /* 32 registers, up to 0x2b8 */
-#define B53SPI_MSPI_RXRAM 0x2c0 /* 32 registers, up to 0x33c */
-#define B53SPI_MSPI_CDRAM 0x340 /* 16 registers, up to 0x37c */
-#define B53SPI_CDRAM_PCS_PCS0 0x00000001
-#define B53SPI_CDRAM_PCS_PCS1 0x00000002
-#define B53SPI_CDRAM_PCS_PCS2 0x00000004
-#define B53SPI_CDRAM_PCS_PCS3 0x00000008
-#define B53SPI_CDRAM_PCS_DISABLE_ALL 0x0000000f
-#define B53SPI_CDRAM_PCS_DSCK 0x00000010
-#define B53SPI_CDRAM_BITSE 0x00000040
-#define B53SPI_CDRAM_CONT 0x00000080
-#define B53SPI_MSPI_WRITE_LOCK 0x380
-#define B53SPI_MSPI_DISABLE_FLUSH_GEN 0x384
-
-/* Interrupt */
-#define B53SPI_INTR_RAF_LR_FULLNESS_REACHED 0x3a0
-#define B53SPI_INTR_RAF_LR_TRUNCATED 0x3a4
-#define B53SPI_INTR_RAF_LR_IMPATIENT 0x3a8
-#define B53SPI_INTR_RAF_LR_SESSION_DONE 0x3ac
-#define B53SPI_INTR_RAF_LR_OVERREAD 0x3b0
-#define B53SPI_INTR_MSPI_DONE 0x3b4
-#define B53SPI_INTR_MSPI_HALT_SET_TRANSACTION_DONE 0x3b8
-
-#endif /* SPI_BCM53XX_H */
diff --git a/drivers/spi/spi-bcm63xx-hsspi.c b/drivers/spi/spi-bcm63xx-hsspi.c
index cbcba614b253..c23849f7aa7b 100644
--- a/drivers/spi/spi-bcm63xx-hsspi.c
+++ b/drivers/spi/spi-bcm63xx-hsspi.c
@@ -352,22 +352,31 @@ static int bcm63xx_hsspi_probe(struct platform_device *pdev)
if (IS_ERR(clk))
return PTR_ERR(clk);
+ ret = clk_prepare_enable(clk);
+ if (ret)
+ return ret;
+
rate = clk_get_rate(clk);
if (!rate) {
struct clk *pll_clk = devm_clk_get(dev, "pll");
- if (IS_ERR(pll_clk))
- return PTR_ERR(pll_clk);
+ if (IS_ERR(pll_clk)) {
+ ret = PTR_ERR(pll_clk);
+ goto out_disable_clk;
+ }
+
+ ret = clk_prepare_enable(pll_clk);
+ if (ret)
+ goto out_disable_clk;
rate = clk_get_rate(pll_clk);
- if (!rate)
- return -EINVAL;
+ clk_disable_unprepare(pll_clk);
+ if (!rate) {
+ ret = -EINVAL;
+ goto out_disable_clk;
+ }
}
- ret = clk_prepare_enable(clk);
- if (ret)
- return ret;
-
master = spi_alloc_master(&pdev->dev, sizeof(*bs));
if (!master) {
ret = -ENOMEM;
diff --git a/drivers/spi/spi-cadence.c b/drivers/spi/spi-cadence.c
index 4a001634023e..f3dad6fcdc35 100644
--- a/drivers/spi/spi-cadence.c
+++ b/drivers/spi/spi-cadence.c
@@ -694,8 +694,7 @@ static int cdns_spi_remove(struct platform_device *pdev)
*/
static int __maybe_unused cdns_spi_suspend(struct device *dev)
{
- struct platform_device *pdev = to_platform_device(dev);
- struct spi_master *master = platform_get_drvdata(pdev);
+ struct spi_master *master = dev_get_drvdata(dev);
return spi_master_suspend(master);
}
@@ -710,8 +709,7 @@ static int __maybe_unused cdns_spi_suspend(struct device *dev)
*/
static int __maybe_unused cdns_spi_resume(struct device *dev)
{
- struct platform_device *pdev = to_platform_device(dev);
- struct spi_master *master = platform_get_drvdata(pdev);
+ struct spi_master *master = dev_get_drvdata(dev);
struct cdns_spi *xspi = spi_master_get_devdata(master);
cdns_spi_init_hw(xspi);
diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c
index cb3c73007ca1..e6d5cc6ab108 100644
--- a/drivers/spi/spi-fsl-lpspi.c
+++ b/drivers/spi/spi-fsl-lpspi.c
@@ -1,19 +1,8 @@
-/*
- * Freescale i.MX7ULP LPSPI driver
- *
- * Copyright 2016 Freescale Semiconductor, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Freescale i.MX7ULP LPSPI driver
+//
+// Copyright 2016 Freescale Semiconductor, Inc.
#include <linux/clk.h>
#include <linux/completion.h>
diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
index a056ee88a960..866246f21041 100644
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -1,22 +1,6 @@
-/*
- * Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved.
- * Copyright (C) 2008 Juergen Beisert
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02110-1301, USA.
- */
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved.
+// Copyright (C) 2008 Juergen Beisert
#include <linux/clk.h>
#include <linux/completion.h>
diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
new file mode 100644
index 000000000000..990770dfa5cf
--- /dev/null
+++ b/drivers/spi/spi-mem.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2018 Exceet Electronics GmbH
+ * Copyright (C) 2018 Bootlin
+ *
+ * Author: Boris Brezillon <boris.brezillon@bootlin.com>
+ */
+#include <linux/dmaengine.h>
+#include <linux/pm_runtime.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
+
+#include "internals.h"
+
+/**
+ * spi_controller_dma_map_mem_op_data() - DMA-map the buffer attached to a
+ * memory operation
+ * @ctlr: the SPI controller requesting this dma_map()
+ * @op: the memory operation containing the buffer to map
+ * @sgt: a pointer to a non-initialized sg_table that will be filled by this
+ * function
+ *
+ * Some controllers might want to do DMA on the data buffer embedded in @op.
+ * This helper prepares everything for you and provides a ready-to-use
+ * sg_table. This function is not intended to be called from spi drivers.
+ * Only SPI controller drivers should use it.
+ * Note that the caller must ensure the memory region pointed by
+ * op->data.buf.{in,out} is DMA-able before calling this function.
+ *
+ * Return: 0 in case of success, a negative error code otherwise.
+ */
+int spi_controller_dma_map_mem_op_data(struct spi_controller *ctlr,
+ const struct spi_mem_op *op,
+ struct sg_table *sgt)
+{
+ struct device *dmadev;
+
+ if (!op->data.nbytes)
+ return -EINVAL;
+
+ if (op->data.dir == SPI_MEM_DATA_OUT && ctlr->dma_tx)
+ dmadev = ctlr->dma_tx->device->dev;
+ else if (op->data.dir == SPI_MEM_DATA_IN && ctlr->dma_rx)
+ dmadev = ctlr->dma_rx->device->dev;
+ else
+ dmadev = ctlr->dev.parent;
+
+ if (!dmadev)
+ return -EINVAL;
+
+ return spi_map_buf(ctlr, dmadev, sgt, op->data.buf.in, op->data.nbytes,
+ op->data.dir == SPI_MEM_DATA_IN ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE);
+}
+EXPORT_SYMBOL_GPL(spi_controller_dma_map_mem_op_data);
+
+/**
+ * spi_controller_dma_unmap_mem_op_data() - DMA-unmap the buffer attached to a
+ * memory operation
+ * @ctlr: the SPI controller requesting this dma_unmap()
+ * @op: the memory operation containing the buffer to unmap
+ * @sgt: a pointer to an sg_table previously initialized by
+ * spi_controller_dma_map_mem_op_data()
+ *
+ * Some controllers might want to do DMA on the data buffer embedded in @op.
+ * This helper prepares things so that the CPU can access the
+ * op->data.buf.{in,out} buffer again.
+ *
+ * This function is not intended to be called from SPI drivers. Only SPI
+ * controller drivers should use it.
+ *
+ * This function should be called after the DMA operation has finished and is
+ * only valid if the previous spi_controller_dma_map_mem_op_data() call
+ * returned 0.
+ *
+ * Return: 0 in case of success, a negative error code otherwise.
+ */
+void spi_controller_dma_unmap_mem_op_data(struct spi_controller *ctlr,
+ const struct spi_mem_op *op,
+ struct sg_table *sgt)
+{
+ struct device *dmadev;
+
+ if (!op->data.nbytes)
+ return;
+
+ if (op->data.dir == SPI_MEM_DATA_OUT && ctlr->dma_tx)
+ dmadev = ctlr->dma_tx->device->dev;
+ else if (op->data.dir == SPI_MEM_DATA_IN && ctlr->dma_rx)
+ dmadev = ctlr->dma_rx->device->dev;
+ else
+ dmadev = ctlr->dev.parent;
+
+ spi_unmap_buf(ctlr, dmadev, sgt,
+ op->data.dir == SPI_MEM_DATA_IN ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE);
+}
+EXPORT_SYMBOL_GPL(spi_controller_dma_unmap_mem_op_data);
+
+static int spi_check_buswidth_req(struct spi_mem *mem, u8 buswidth, bool tx)
+{
+ u32 mode = mem->spi->mode;
+
+ switch (buswidth) {
+ case 1:
+ return 0;
+
+ case 2:
+ if ((tx && (mode & (SPI_TX_DUAL | SPI_TX_QUAD))) ||
+ (!tx && (mode & (SPI_RX_DUAL | SPI_RX_QUAD))))
+ return 0;
+
+ break;
+
+ case 4:
+ if ((tx && (mode & SPI_TX_QUAD)) ||
+ (!tx && (mode & SPI_RX_QUAD)))
+ return 0;
+
+ break;
+
+ default:
+ break;
+ }
+
+ return -ENOTSUPP;
+}
+
+static bool spi_mem_default_supports_op(struct spi_mem *mem,
+ const struct spi_mem_op *op)
+{
+ if (spi_check_buswidth_req(mem, op->cmd.buswidth, true))
+ return false;
+
+ if (op->addr.nbytes &&
+ spi_check_buswidth_req(mem, op->addr.buswidth, true))
+ return false;
+
+ if (op->dummy.nbytes &&
+ spi_check_buswidth_req(mem, op->dummy.buswidth, true))
+ return false;
+
+ if (op->data.nbytes &&
+ spi_check_buswidth_req(mem, op->data.buswidth,
+ op->data.dir == SPI_MEM_DATA_OUT))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(spi_mem_default_supports_op);
+
+/**
+ * spi_mem_supports_op() - Check if a memory device and the controller it is
+ * connected to support a specific memory operation
+ * @mem: the SPI memory
+ * @op: the memory operation to check
+ *
+ * Some controllers are only supporting Single or Dual IOs, others might only
+ * support specific opcodes, or it can even be that the controller and device
+ * both support Quad IOs but the hardware prevents you from using it because
+ * only 2 IO lines are connected.
+ *
+ * This function checks whether a specific operation is supported.
+ *
+ * Return: true if @op is supported, false otherwise.
+ */
+bool spi_mem_supports_op(struct spi_mem *mem, const struct spi_mem_op *op)
+{
+ struct spi_controller *ctlr = mem->spi->controller;
+
+ if (ctlr->mem_ops && ctlr->mem_ops->supports_op)
+ return ctlr->mem_ops->supports_op(mem, op);
+
+ return spi_mem_default_supports_op(mem, op);
+}
+EXPORT_SYMBOL_GPL(spi_mem_supports_op);
+
+/**
+ * spi_mem_exec_op() - Execute a memory operation
+ * @mem: the SPI memory
+ * @op: the memory operation to execute
+ *
+ * Executes a memory operation.
+ *
+ * This function first checks that @op is supported and then tries to execute
+ * it.
+ *
+ * Return: 0 in case of success, a negative error code otherwise.
+ */
+int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
+{
+ unsigned int tmpbufsize, xferpos = 0, totalxferlen = 0;
+ struct spi_controller *ctlr = mem->spi->controller;
+ struct spi_transfer xfers[4] = { };
+ struct spi_message msg;
+ u8 *tmpbuf;
+ int ret;
+
+ if (!spi_mem_supports_op(mem, op))
+ return -ENOTSUPP;
+
+ if (ctlr->mem_ops) {
+ /*
+ * Flush the message queue before executing our SPI memory
+ * operation to prevent preemption of regular SPI transfers.
+ */
+ spi_flush_queue(ctlr);
+
+ if (ctlr->auto_runtime_pm) {
+ ret = pm_runtime_get_sync(ctlr->dev.parent);
+ if (ret < 0) {
+ dev_err(&ctlr->dev,
+ "Failed to power device: %d\n",
+ ret);
+ return ret;
+ }
+ }
+
+ mutex_lock(&ctlr->bus_lock_mutex);
+ mutex_lock(&ctlr->io_mutex);
+ ret = ctlr->mem_ops->exec_op(mem, op);
+ mutex_unlock(&ctlr->io_mutex);
+ mutex_unlock(&ctlr->bus_lock_mutex);
+
+ if (ctlr->auto_runtime_pm)
+ pm_runtime_put(ctlr->dev.parent);
+
+ /*
+ * Some controllers only optimize specific paths (typically the
+ * read path) and expect the core to use the regular SPI
+ * interface in other cases.
+ */
+ if (!ret || ret != -ENOTSUPP)
+ return ret;
+ }
+
+ tmpbufsize = sizeof(op->cmd.opcode) + op->addr.nbytes +
+ op->dummy.nbytes;
+
+ /*
+ * Allocate a buffer to transmit the CMD, ADDR cycles with kmalloc() so
+ * we're guaranteed that this buffer is DMA-able, as required by the
+ * SPI layer.
+ */
+ tmpbuf = kzalloc(tmpbufsize, GFP_KERNEL | GFP_DMA);
+ if (!tmpbuf)
+ return -ENOMEM;
+
+ spi_message_init(&msg);
+
+ tmpbuf[0] = op->cmd.opcode;
+ xfers[xferpos].tx_buf = tmpbuf;
+ xfers[xferpos].len = sizeof(op->cmd.opcode);
+ xfers[xferpos].tx_nbits = op->cmd.buswidth;
+ spi_message_add_tail(&xfers[xferpos], &msg);
+ xferpos++;
+ totalxferlen++;
+
+ if (op->addr.nbytes) {
+ int i;
+
+ for (i = 0; i < op->addr.nbytes; i++)
+ tmpbuf[i + 1] = op->addr.val >>
+ (8 * (op->addr.nbytes - i - 1));
+
+ xfers[xferpos].tx_buf = tmpbuf + 1;
+ xfers[xferpos].len = op->addr.nbytes;
+ xfers[xferpos].tx_nbits = op->addr.buswidth;
+ spi_message_add_tail(&xfers[xferpos], &msg);
+ xferpos++;
+ totalxferlen += op->addr.nbytes;
+ }
+
+ if (op->dummy.nbytes) {
+ memset(tmpbuf + op->addr.nbytes + 1, 0xff, op->dummy.nbytes);
+ xfers[xferpos].tx_buf = tmpbuf + op->addr.nbytes + 1;
+ xfers[xferpos].len = op->dummy.nbytes;
+ xfers[xferpos].tx_nbits = op->dummy.buswidth;
+ spi_message_add_tail(&xfers[xferpos], &msg);
+ xferpos++;
+ totalxferlen += op->dummy.nbytes;
+ }
+
+ if (op->data.nbytes) {
+ if (op->data.dir == SPI_MEM_DATA_IN) {
+ xfers[xferpos].rx_buf = op->data.buf.in;
+ xfers[xferpos].rx_nbits = op->data.buswidth;
+ } else {
+ xfers[xferpos].tx_buf = op->data.buf.out;
+ xfers[xferpos].tx_nbits = op->data.buswidth;
+ }
+
+ xfers[xferpos].len = op->data.nbytes;
+ spi_message_add_tail(&xfers[xferpos], &msg);
+ xferpos++;
+ totalxferlen += op->data.nbytes;
+ }
+
+ ret = spi_sync(mem->spi, &msg);
+
+ kfree(tmpbuf);
+
+ if (ret)
+ return ret;
+
+ if (msg.actual_length != totalxferlen)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(spi_mem_exec_op);
+
+/**
+ * spi_mem_adjust_op_size() - Adjust the data size of a SPI mem operation to
+ * match controller limitations
+ * @mem: the SPI memory
+ * @op: the operation to adjust
+ *
+ * Some controllers have FIFO limitations and must split a data transfer
+ * operation into multiple ones, others require a specific alignment for
+ * optimized accesses. This function allows SPI mem drivers to split a single
+ * operation into multiple sub-operations when required.
+ *
+ * Return: a negative error code if the controller can't properly adjust @op,
+ * 0 otherwise. Note that @op->data.nbytes will be updated if @op
+ * can't be handled in a single step.
+ */
+int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op)
+{
+ struct spi_controller *ctlr = mem->spi->controller;
+
+ if (ctlr->mem_ops && ctlr->mem_ops->adjust_op_size)
+ return ctlr->mem_ops->adjust_op_size(mem, op);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(spi_mem_adjust_op_size);
+
+static inline struct spi_mem_driver *to_spi_mem_drv(struct device_driver *drv)
+{
+ return container_of(drv, struct spi_mem_driver, spidrv.driver);
+}
+
+static int spi_mem_probe(struct spi_device *spi)
+{
+ struct spi_mem_driver *memdrv = to_spi_mem_drv(spi->dev.driver);
+ struct spi_mem *mem;
+
+ mem = devm_kzalloc(&spi->dev, sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ mem->spi = spi;
+ spi_set_drvdata(spi, mem);
+
+ return memdrv->probe(mem);
+}
+
+static int spi_mem_remove(struct spi_device *spi)
+{
+ struct spi_mem_driver *memdrv = to_spi_mem_drv(spi->dev.driver);
+ struct spi_mem *mem = spi_get_drvdata(spi);
+
+ if (memdrv->remove)
+ return memdrv->remove(mem);
+
+ return 0;
+}
+
+static void spi_mem_shutdown(struct spi_device *spi)
+{
+ struct spi_mem_driver *memdrv = to_spi_mem_drv(spi->dev.driver);
+ struct spi_mem *mem = spi_get_drvdata(spi);
+
+ if (memdrv->shutdown)
+ memdrv->shutdown(mem);
+}
+
+/**
+ * spi_mem_driver_register_with_owner() - Register a SPI memory driver
+ * @memdrv: the SPI memory driver to register
+ * @owner: the owner of this driver
+ *
+ * Registers a SPI memory driver.
+ *
+ * Return: 0 in case of success, a negative error core otherwise.
+ */
+
+int spi_mem_driver_register_with_owner(struct spi_mem_driver *memdrv,
+ struct module *owner)
+{
+ memdrv->spidrv.probe = spi_mem_probe;
+ memdrv->spidrv.remove = spi_mem_remove;
+ memdrv->spidrv.shutdown = spi_mem_shutdown;
+
+ return __spi_register_driver(owner, &memdrv->spidrv);
+}
+EXPORT_SYMBOL_GPL(spi_mem_driver_register_with_owner);
+
+/**
+ * spi_mem_driver_unregister_with_owner() - Unregister a SPI memory driver
+ * @memdrv: the SPI memory driver to unregister
+ *
+ * Unregisters a SPI memory driver.
+ */
+void spi_mem_driver_unregister(struct spi_mem_driver *memdrv)
+{
+ spi_unregister_driver(&memdrv->spidrv);
+}
+EXPORT_SYMBOL_GPL(spi_mem_driver_unregister);
diff --git a/drivers/spi/spi-meson-spicc.c b/drivers/spi/spi-meson-spicc.c
index 5c82910e3480..7fe4488ace57 100644
--- a/drivers/spi/spi-meson-spicc.c
+++ b/drivers/spi/spi-meson-spicc.c
@@ -574,10 +574,15 @@ static int meson_spicc_probe(struct platform_device *pdev)
master->max_speed_hz = rate >> 2;
ret = devm_spi_register_master(&pdev->dev, master);
- if (!ret)
- return 0;
+ if (ret) {
+ dev_err(&pdev->dev, "spi master registration failed\n");
+ goto out_clk;
+ }
- dev_err(&pdev->dev, "spi master registration failed\n");
+ return 0;
+
+out_clk:
+ clk_disable_unprepare(spicc->core);
out_master:
spi_master_put(master);
diff --git a/drivers/spi/spi-mpc52xx.c b/drivers/spi/spi-mpc52xx.c
index e8b59ce4dc3a..0e55784a3ad9 100644
--- a/drivers/spi/spi-mpc52xx.c
+++ b/drivers/spi/spi-mpc52xx.c
@@ -447,7 +447,7 @@ static int mpc52xx_spi_probe(struct platform_device *op)
for (i = 0; i < ms->gpio_cs_count; i++) {
gpio_cs = of_get_gpio(op->dev.of_node, i);
- if (gpio_cs < 0) {
+ if (!gpio_is_valid(gpio_cs)) {
dev_err(&op->dev,
"could not parse the gpio field in oftree\n");
rc = -ENODEV;
diff --git a/drivers/spi/spi-mxs.c b/drivers/spi/spi-mxs.c
index 3d216b950b41..6ac95a2a21ce 100644
--- a/drivers/spi/spi-mxs.c
+++ b/drivers/spi/spi-mxs.c
@@ -1,32 +1,22 @@
-/*
- * Freescale MXS SPI master driver
- *
- * Copyright 2012 DENX Software Engineering, GmbH.
- * Copyright 2012 Freescale Semiconductor, Inc.
- * Copyright 2008 Embedded Alley Solutions, Inc All Rights Reserved.
- *
- * Rework and transition to new API by:
- * Marek Vasut <marex@denx.de>
- *
- * Based on previous attempt by:
- * Fabio Estevam <fabio.estevam@freescale.com>
- *
- * Based on code from U-Boot bootloader by:
- * Marek Vasut <marex@denx.de>
- *
- * Based on spi-stmp.c, which is:
- * Author: Dmitry Pervushin <dimka@embeddedalley.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Freescale MXS SPI master driver
+//
+// Copyright 2012 DENX Software Engineering, GmbH.
+// Copyright 2012 Freescale Semiconductor, Inc.
+// Copyright 2008 Embedded Alley Solutions, Inc All Rights Reserved.
+//
+// Rework and transition to new API by:
+// Marek Vasut <marex@denx.de>
+//
+// Based on previous attempt by:
+// Fabio Estevam <fabio.estevam@freescale.com>
+//
+// Based on code from U-Boot bootloader by:
+// Marek Vasut <marex@denx.de>
+//
+// Based on spi-stmp.c, which is:
+// Author: Dmitry Pervushin <dimka@embeddedalley.com>
#include <linux/kernel.h>
#include <linux/ioport.h>
diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c
index 9bf64e6eca9b..6c628a54e946 100644
--- a/drivers/spi/spi-omap2-mcspi.c
+++ b/drivers/spi/spi-omap2-mcspi.c
@@ -255,6 +255,7 @@ static void omap2_mcspi_set_cs(struct spi_device *spi, bool enable)
if (spi->controller_state) {
int err = pm_runtime_get_sync(mcspi->dev);
if (err < 0) {
+ pm_runtime_put_noidle(mcspi->dev);
dev_err(mcspi->dev, "failed to get sync: %d\n", err);
return;
}
@@ -350,20 +351,6 @@ disable_fifo:
mcspi->fifo_depth = 0;
}
-static void omap2_mcspi_restore_ctx(struct omap2_mcspi *mcspi)
-{
- struct spi_master *spi_cntrl = mcspi->master;
- struct omap2_mcspi_regs *ctx = &mcspi->ctx;
- struct omap2_mcspi_cs *cs;
-
- /* McSPI: context restore */
- mcspi_write_reg(spi_cntrl, OMAP2_MCSPI_MODULCTRL, ctx->modulctrl);
- mcspi_write_reg(spi_cntrl, OMAP2_MCSPI_WAKEUPENABLE, ctx->wakeupenable);
-
- list_for_each_entry(cs, &ctx->cs, node)
- writel_relaxed(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
-}
-
static int mcspi_wait_for_reg_bit(void __iomem *reg, unsigned long bit)
{
unsigned long timeout;
@@ -1065,8 +1052,11 @@ static int omap2_mcspi_setup(struct spi_device *spi)
}
ret = pm_runtime_get_sync(mcspi->dev);
- if (ret < 0)
+ if (ret < 0) {
+ pm_runtime_put_noidle(mcspi->dev);
+
return ret;
+ }
ret = omap2_mcspi_setup_transfer(spi, NULL);
pm_runtime_mark_last_busy(mcspi->dev);
@@ -1284,8 +1274,11 @@ static int omap2_mcspi_master_setup(struct omap2_mcspi *mcspi)
int ret = 0;
ret = pm_runtime_get_sync(mcspi->dev);
- if (ret < 0)
+ if (ret < 0) {
+ pm_runtime_put_noidle(mcspi->dev);
+
return ret;
+ }
mcspi_write_reg(master, OMAP2_MCSPI_WAKEUPENABLE,
OMAP2_MCSPI_WAKEUPENABLE_WKEN);
@@ -1297,14 +1290,39 @@ static int omap2_mcspi_master_setup(struct omap2_mcspi *mcspi)
return 0;
}
+/*
+ * When SPI wake up from off-mode, CS is in activate state. If it was in
+ * inactive state when driver was suspend, then force it to inactive state at
+ * wake up.
+ */
static int omap_mcspi_runtime_resume(struct device *dev)
{
- struct omap2_mcspi *mcspi;
- struct spi_master *master;
+ struct spi_master *master = dev_get_drvdata(dev);
+ struct omap2_mcspi *mcspi = spi_master_get_devdata(master);
+ struct omap2_mcspi_regs *ctx = &mcspi->ctx;
+ struct omap2_mcspi_cs *cs;
- master = dev_get_drvdata(dev);
- mcspi = spi_master_get_devdata(master);
- omap2_mcspi_restore_ctx(mcspi);
+ /* McSPI: context restore */
+ mcspi_write_reg(master, OMAP2_MCSPI_MODULCTRL, ctx->modulctrl);
+ mcspi_write_reg(master, OMAP2_MCSPI_WAKEUPENABLE, ctx->wakeupenable);
+
+ list_for_each_entry(cs, &ctx->cs, node) {
+ /*
+ * We need to toggle CS state for OMAP take this
+ * change in account.
+ */
+ if ((cs->chconf0 & OMAP2_MCSPI_CHCONF_FORCE) == 0) {
+ cs->chconf0 |= OMAP2_MCSPI_CHCONF_FORCE;
+ writel_relaxed(cs->chconf0,
+ cs->base + OMAP2_MCSPI_CHCONF0);
+ cs->chconf0 &= ~OMAP2_MCSPI_CHCONF_FORCE;
+ writel_relaxed(cs->chconf0,
+ cs->base + OMAP2_MCSPI_CHCONF0);
+ } else {
+ writel_relaxed(cs->chconf0,
+ cs->base + OMAP2_MCSPI_CHCONF0);
+ }
+ }
return 0;
}
@@ -1447,50 +1465,33 @@ static int omap2_mcspi_remove(struct platform_device *pdev)
MODULE_ALIAS("platform:omap2_mcspi");
#ifdef CONFIG_SUSPEND
-/*
- * When SPI wake up from off-mode, CS is in activate state. If it was in
- * unactive state when driver was suspend, then force it to unactive state at
- * wake up.
- */
-static int omap2_mcspi_resume(struct device *dev)
+static int omap2_mcspi_suspend_noirq(struct device *dev)
{
- struct spi_master *master = dev_get_drvdata(dev);
- struct omap2_mcspi *mcspi = spi_master_get_devdata(master);
- struct omap2_mcspi_regs *ctx = &mcspi->ctx;
- struct omap2_mcspi_cs *cs;
-
- pm_runtime_get_sync(mcspi->dev);
- list_for_each_entry(cs, &ctx->cs, node) {
- if ((cs->chconf0 & OMAP2_MCSPI_CHCONF_FORCE) == 0) {
- /*
- * We need to toggle CS state for OMAP take this
- * change in account.
- */
- cs->chconf0 |= OMAP2_MCSPI_CHCONF_FORCE;
- writel_relaxed(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
- cs->chconf0 &= ~OMAP2_MCSPI_CHCONF_FORCE;
- writel_relaxed(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
- }
- }
- pm_runtime_mark_last_busy(mcspi->dev);
- pm_runtime_put_autosuspend(mcspi->dev);
-
- return pinctrl_pm_select_default_state(dev);
+ return pinctrl_pm_select_sleep_state(dev);
}
-static int omap2_mcspi_suspend(struct device *dev)
+static int omap2_mcspi_resume_noirq(struct device *dev)
{
- return pinctrl_pm_select_sleep_state(dev);
+ struct spi_master *master = dev_get_drvdata(dev);
+ struct omap2_mcspi *mcspi = spi_master_get_devdata(master);
+ int error;
+
+ error = pinctrl_pm_select_default_state(dev);
+ if (error)
+ dev_warn(mcspi->dev, "%s: failed to set pins: %i\n",
+ __func__, error);
+
+ return 0;
}
#else
-#define omap2_mcspi_suspend NULL
-#define omap2_mcspi_resume NULL
+#define omap2_mcspi_suspend_noirq NULL
+#define omap2_mcspi_resume_noirq NULL
#endif
static const struct dev_pm_ops omap2_mcspi_pm_ops = {
- .resume = omap2_mcspi_resume,
- .suspend = omap2_mcspi_suspend,
+ .suspend_noirq = omap2_mcspi_suspend_noirq,
+ .resume_noirq = omap2_mcspi_resume_noirq,
.runtime_resume = omap_mcspi_runtime_resume,
};
diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c
index 3d7f66080c57..2fa7f4b43492 100644
--- a/drivers/spi/spi-pxa2xx-dma.c
+++ b/drivers/spi/spi-pxa2xx-dma.c
@@ -51,19 +51,15 @@ static void pxa2xx_spi_dma_transfer_complete(struct driver_data *drv_data,
if (!pxa25x_ssp_comp(drv_data))
pxa2xx_spi_write(drv_data, SSTO, 0);
- if (!error) {
- msg->actual_length += drv_data->len;
- msg->state = pxa2xx_spi_next_transfer(drv_data);
- } else {
+ if (error) {
/* In case we got an error we disable the SSP now */
pxa2xx_spi_write(drv_data, SSCR0,
pxa2xx_spi_read(drv_data, SSCR0)
& ~SSCR0_SSE);
-
- msg->state = ERROR_STATE;
+ msg->status = -EIO;
}
- tasklet_schedule(&drv_data->pump_transfers);
+ spi_finalize_current_transfer(drv_data->master);
}
}
@@ -74,11 +70,11 @@ static void pxa2xx_spi_dma_callback(void *data)
static struct dma_async_tx_descriptor *
pxa2xx_spi_dma_prepare_one(struct driver_data *drv_data,
- enum dma_transfer_direction dir)
+ enum dma_transfer_direction dir,
+ struct spi_transfer *xfer)
{
struct chip_data *chip =
spi_get_ctldata(drv_data->master->cur_msg->spi);
- struct spi_transfer *xfer = drv_data->cur_transfer;
enum dma_slave_buswidth width;
struct dma_slave_config cfg;
struct dma_chan *chan;
@@ -144,12 +140,13 @@ irqreturn_t pxa2xx_spi_dma_transfer(struct driver_data *drv_data)
return IRQ_NONE;
}
-int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst)
+int pxa2xx_spi_dma_prepare(struct driver_data *drv_data,
+ struct spi_transfer *xfer)
{
struct dma_async_tx_descriptor *tx_desc, *rx_desc;
int err;
- tx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_MEM_TO_DEV);
+ tx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_MEM_TO_DEV, xfer);
if (!tx_desc) {
dev_err(&drv_data->pdev->dev,
"failed to get DMA TX descriptor\n");
@@ -157,7 +154,7 @@ int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst)
goto err_tx;
}
- rx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_DEV_TO_MEM);
+ rx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_DEV_TO_MEM, xfer);
if (!rx_desc) {
dev_err(&drv_data->pdev->dev,
"failed to get DMA RX descriptor\n");
@@ -187,6 +184,13 @@ void pxa2xx_spi_dma_start(struct driver_data *drv_data)
atomic_set(&drv_data->dma_running, 1);
}
+void pxa2xx_spi_dma_stop(struct driver_data *drv_data)
+{
+ atomic_set(&drv_data->dma_running, 0);
+ dmaengine_terminate_sync(drv_data->master->dma_rx);
+ dmaengine_terminate_sync(drv_data->master->dma_tx);
+}
+
int pxa2xx_spi_dma_setup(struct driver_data *drv_data)
{
struct pxa2xx_spi_master *pdata = drv_data->master_info;
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index 82dcb88fcfba..0b2d60d30f69 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -340,9 +340,11 @@ static void lpss_ssp_setup(struct driver_data *drv_data)
}
}
-static void lpss_ssp_select_cs(struct driver_data *drv_data,
+static void lpss_ssp_select_cs(struct spi_device *spi,
const struct lpss_config *config)
{
+ struct driver_data *drv_data =
+ spi_controller_get_devdata(spi->controller);
u32 value, cs;
if (!config->cs_sel_mask)
@@ -350,7 +352,7 @@ static void lpss_ssp_select_cs(struct driver_data *drv_data,
value = __lpss_ssp_read_priv(drv_data, config->reg_cs_ctrl);
- cs = drv_data->master->cur_msg->spi->chip_select;
+ cs = spi->chip_select;
cs <<= config->cs_sel_shift;
if (cs != (value & config->cs_sel_mask)) {
/*
@@ -369,15 +371,17 @@ static void lpss_ssp_select_cs(struct driver_data *drv_data,
}
}
-static void lpss_ssp_cs_control(struct driver_data *drv_data, bool enable)
+static void lpss_ssp_cs_control(struct spi_device *spi, bool enable)
{
+ struct driver_data *drv_data =
+ spi_controller_get_devdata(spi->controller);
const struct lpss_config *config;
u32 value;
config = lpss_get_config(drv_data);
if (enable)
- lpss_ssp_select_cs(drv_data, config);
+ lpss_ssp_select_cs(spi, config);
value = __lpss_ssp_read_priv(drv_data, config->reg_cs_ctrl);
if (enable)
@@ -387,10 +391,11 @@ static void lpss_ssp_cs_control(struct driver_data *drv_data, bool enable)
__lpss_ssp_write_priv(drv_data, config->reg_cs_ctrl, value);
}
-static void cs_assert(struct driver_data *drv_data)
+static void cs_assert(struct spi_device *spi)
{
- struct chip_data *chip =
- spi_get_ctldata(drv_data->master->cur_msg->spi);
+ struct chip_data *chip = spi_get_ctldata(spi);
+ struct driver_data *drv_data =
+ spi_controller_get_devdata(spi->controller);
if (drv_data->ssp_type == CE4100_SSP) {
pxa2xx_spi_write(drv_data, SSSR, chip->frm);
@@ -408,13 +413,14 @@ static void cs_assert(struct driver_data *drv_data)
}
if (is_lpss_ssp(drv_data))
- lpss_ssp_cs_control(drv_data, true);
+ lpss_ssp_cs_control(spi, true);
}
-static void cs_deassert(struct driver_data *drv_data)
+static void cs_deassert(struct spi_device *spi)
{
- struct chip_data *chip =
- spi_get_ctldata(drv_data->master->cur_msg->spi);
+ struct chip_data *chip = spi_get_ctldata(spi);
+ struct driver_data *drv_data =
+ spi_controller_get_devdata(spi->controller);
unsigned long timeout;
if (drv_data->ssp_type == CE4100_SSP)
@@ -437,7 +443,15 @@ static void cs_deassert(struct driver_data *drv_data)
}
if (is_lpss_ssp(drv_data))
- lpss_ssp_cs_control(drv_data, false);
+ lpss_ssp_cs_control(spi, false);
+}
+
+static void pxa2xx_spi_set_cs(struct spi_device *spi, bool level)
+{
+ if (level)
+ cs_deassert(spi);
+ else
+ cs_assert(spi);
}
int pxa2xx_spi_flush(struct driver_data *drv_data)
@@ -549,70 +563,6 @@ static int u32_reader(struct driver_data *drv_data)
return drv_data->rx == drv_data->rx_end;
}
-void *pxa2xx_spi_next_transfer(struct driver_data *drv_data)
-{
- struct spi_message *msg = drv_data->master->cur_msg;
- struct spi_transfer *trans = drv_data->cur_transfer;
-
- /* Move to next transfer */
- if (trans->transfer_list.next != &msg->transfers) {
- drv_data->cur_transfer =
- list_entry(trans->transfer_list.next,
- struct spi_transfer,
- transfer_list);
- return RUNNING_STATE;
- } else
- return DONE_STATE;
-}
-
-/* caller already set message->status; dma and pio irqs are blocked */
-static void giveback(struct driver_data *drv_data)
-{
- struct spi_transfer* last_transfer;
- struct spi_message *msg;
-
- msg = drv_data->master->cur_msg;
- drv_data->cur_transfer = NULL;
-
- last_transfer = list_last_entry(&msg->transfers, struct spi_transfer,
- transfer_list);
-
- /* Delay if requested before any change in chip select */
- if (last_transfer->delay_usecs)
- udelay(last_transfer->delay_usecs);
-
- /* Drop chip select UNLESS cs_change is true or we are returning
- * a message with an error, or next message is for another chip
- */
- if (!last_transfer->cs_change)
- cs_deassert(drv_data);
- else {
- struct spi_message *next_msg;
-
- /* Holding of cs was hinted, but we need to make sure
- * the next message is for the same chip. Don't waste
- * time with the following tests unless this was hinted.
- *
- * We cannot postpone this until pump_messages, because
- * after calling msg->complete (below) the driver that
- * sent the current message could be unloaded, which
- * could invalidate the cs_control() callback...
- */
-
- /* get a pointer to the next message, if any */
- next_msg = spi_get_next_queued_message(drv_data->master);
-
- /* see if the next and current messages point
- * to the same chip
- */
- if ((next_msg && next_msg->spi != msg->spi) ||
- msg->state == ERROR_STATE)
- cs_deassert(drv_data);
- }
-
- spi_finalize_current_message(drv_data->master);
-}
-
static void reset_sccr1(struct driver_data *drv_data)
{
struct chip_data *chip =
@@ -648,8 +598,8 @@ static void int_error_stop(struct driver_data *drv_data, const char* msg)
dev_err(&drv_data->pdev->dev, "%s\n", msg);
- drv_data->master->cur_msg->state = ERROR_STATE;
- tasklet_schedule(&drv_data->pump_transfers);
+ drv_data->master->cur_msg->status = -EIO;
+ spi_finalize_current_transfer(drv_data->master);
}
static void int_transfer_complete(struct driver_data *drv_data)
@@ -660,19 +610,7 @@ static void int_transfer_complete(struct driver_data *drv_data)
if (!pxa25x_ssp_comp(drv_data))
pxa2xx_spi_write(drv_data, SSTO, 0);
- /* Update total byte transferred return count actual bytes read */
- drv_data->master->cur_msg->actual_length += drv_data->len -
- (drv_data->rx_end - drv_data->rx);
-
- /* Transfer delays and chip select release are
- * handled in pump_transfers or giveback
- */
-
- /* Move to next transfer */
- drv_data->master->cur_msg->state = pxa2xx_spi_next_transfer(drv_data);
-
- /* Schedule transfer tasklet */
- tasklet_schedule(&drv_data->pump_transfers);
+ spi_finalize_current_transfer(drv_data->master);
}
static irqreturn_t interrupt_transfer(struct driver_data *drv_data)
@@ -973,17 +911,16 @@ static bool pxa2xx_spi_can_dma(struct spi_controller *master,
xfer->len >= chip->dma_burst_size;
}
-static void pump_transfers(unsigned long data)
+static int pxa2xx_spi_transfer_one(struct spi_controller *master,
+ struct spi_device *spi,
+ struct spi_transfer *transfer)
{
- struct driver_data *drv_data = (struct driver_data *)data;
- struct spi_controller *master = drv_data->master;
+ struct driver_data *drv_data = spi_controller_get_devdata(master);
struct spi_message *message = master->cur_msg;
struct chip_data *chip = spi_get_ctldata(message->spi);
u32 dma_thresh = chip->dma_threshold;
u32 dma_burst = chip->dma_burst_size;
u32 change_mask = pxa2xx_spi_get_ssrc1_change_mask(drv_data);
- struct spi_transfer *transfer;
- struct spi_transfer *previous;
u32 clk_div;
u8 bits;
u32 speed;
@@ -992,36 +929,6 @@ static void pump_transfers(unsigned long data)
int err;
int dma_mapped;
- /* Get current state information */
- transfer = drv_data->cur_transfer;
-
- /* Handle for abort */
- if (message->state == ERROR_STATE) {
- message->status = -EIO;
- giveback(drv_data);
- return;
- }
-
- /* Handle end of message */
- if (message->state == DONE_STATE) {
- message->status = 0;
- giveback(drv_data);
- return;
- }
-
- /* Delay if requested at end of transfer before CS change */
- if (message->state == RUNNING_STATE) {
- previous = list_entry(transfer->transfer_list.prev,
- struct spi_transfer,
- transfer_list);
- if (previous->delay_usecs)
- udelay(previous->delay_usecs);
-
- /* Drop chip select only if cs_change is requested */
- if (previous->cs_change)
- cs_deassert(drv_data);
- }
-
/* Check if we can DMA this transfer */
if (transfer->len > MAX_DMA_LEN && chip->enable_dma) {
@@ -1029,34 +936,27 @@ static void pump_transfers(unsigned long data)
if (message->is_dma_mapped
|| transfer->rx_dma || transfer->tx_dma) {
dev_err(&drv_data->pdev->dev,
- "pump_transfers: mapped transfer length of "
- "%u is greater than %d\n",
+ "Mapped transfer length of %u is greater than %d\n",
transfer->len, MAX_DMA_LEN);
- message->status = -EINVAL;
- giveback(drv_data);
- return;
+ return -EINVAL;
}
/* warn ... we force this to PIO mode */
dev_warn_ratelimited(&message->spi->dev,
- "pump_transfers: DMA disabled for transfer length %ld "
- "greater than %d\n",
- (long)drv_data->len, MAX_DMA_LEN);
+ "DMA disabled for transfer length %ld greater than %d\n",
+ (long)transfer->len, MAX_DMA_LEN);
}
/* Setup the transfer state based on the type of transfer */
if (pxa2xx_spi_flush(drv_data) == 0) {
- dev_err(&drv_data->pdev->dev, "pump_transfers: flush failed\n");
- message->status = -EIO;
- giveback(drv_data);
- return;
+ dev_err(&drv_data->pdev->dev, "Flush failed\n");
+ return -EIO;
}
drv_data->n_bytes = chip->n_bytes;
drv_data->tx = (void *)transfer->tx_buf;
drv_data->tx_end = drv_data->tx + transfer->len;
drv_data->rx = transfer->rx_buf;
drv_data->rx_end = drv_data->rx + transfer->len;
- drv_data->len = transfer->len;
drv_data->write = drv_data->tx ? chip->write : null_writer;
drv_data->read = drv_data->rx ? chip->read : null_reader;
@@ -1095,11 +995,9 @@ static void pump_transfers(unsigned long data)
bits, &dma_burst,
&dma_thresh))
dev_warn_ratelimited(&message->spi->dev,
- "pump_transfers: DMA burst size reduced to match bits_per_word\n");
+ "DMA burst size reduced to match bits_per_word\n");
}
- message->state = RUNNING_STATE;
-
dma_mapped = master->can_dma &&
master->can_dma(master, message->spi, transfer) &&
master->cur_msg_mapped;
@@ -1108,12 +1006,9 @@ static void pump_transfers(unsigned long data)
/* Ensure we have the correct interrupt handler */
drv_data->transfer_handler = pxa2xx_spi_dma_transfer;
- err = pxa2xx_spi_dma_prepare(drv_data, dma_burst);
- if (err) {
- message->status = err;
- giveback(drv_data);
- return;
- }
+ err = pxa2xx_spi_dma_prepare(drv_data, transfer);
+ if (err)
+ return err;
/* Clear status and start DMA engine */
cr1 = chip->cr1 | dma_thresh | drv_data->dma_cr1;
@@ -1175,27 +1070,40 @@ static void pump_transfers(unsigned long data)
pxa2xx_spi_write(drv_data, SSTO, chip->timeout);
}
- cs_assert(drv_data);
-
- /* after chip select, release the data by enabling service
- * requests and interrupts, without changing any mode bits */
+ /*
+ * Release the data by enabling service requests and interrupts,
+ * without changing any mode bits
+ */
pxa2xx_spi_write(drv_data, SSCR1, cr1);
+
+ return 1;
}
-static int pxa2xx_spi_transfer_one_message(struct spi_controller *master,
- struct spi_message *msg)
+static void pxa2xx_spi_handle_err(struct spi_controller *master,
+ struct spi_message *msg)
{
struct driver_data *drv_data = spi_controller_get_devdata(master);
- /* Initial message state*/
- msg->state = START_STATE;
- drv_data->cur_transfer = list_entry(msg->transfers.next,
- struct spi_transfer,
- transfer_list);
+ /* Disable the SSP */
+ pxa2xx_spi_write(drv_data, SSCR0,
+ pxa2xx_spi_read(drv_data, SSCR0) & ~SSCR0_SSE);
+ /* Clear and disable interrupts and service requests */
+ write_SSSR_CS(drv_data, drv_data->clear_sr);
+ pxa2xx_spi_write(drv_data, SSCR1,
+ pxa2xx_spi_read(drv_data, SSCR1)
+ & ~(drv_data->int_cr1 | drv_data->dma_cr1));
+ if (!pxa25x_ssp_comp(drv_data))
+ pxa2xx_spi_write(drv_data, SSTO, 0);
- /* Mark as busy and launch transfers */
- tasklet_schedule(&drv_data->pump_transfers);
- return 0;
+ /*
+ * Stop the DMA if running. Note DMA callback handler may have unset
+ * the dma_running already, which is fine as stopping is not needed
+ * then but we shouldn't rely this flag for anything else than
+ * stopping. For instance to differentiate between PIO and DMA
+ * transfers.
+ */
+ if (atomic_read(&drv_data->dma_running))
+ pxa2xx_spi_dma_stop(drv_data);
}
static int pxa2xx_spi_unprepare_transfer(struct spi_controller *master)
@@ -1651,7 +1559,9 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
master->dma_alignment = DMA_ALIGNMENT;
master->cleanup = cleanup;
master->setup = setup;
- master->transfer_one_message = pxa2xx_spi_transfer_one_message;
+ master->set_cs = pxa2xx_spi_set_cs;
+ master->transfer_one = pxa2xx_spi_transfer_one;
+ master->handle_err = pxa2xx_spi_handle_err;
master->unprepare_transfer_hardware = pxa2xx_spi_unprepare_transfer;
master->fw_translate_cs = pxa2xx_spi_fw_translate_cs;
master->auto_runtime_pm = true;
@@ -1702,7 +1612,9 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
}
/* Enable SOC clock */
- clk_prepare_enable(ssp->clk);
+ status = clk_prepare_enable(ssp->clk);
+ if (status)
+ goto out_error_dma_irq_alloc;
master->max_speed_hz = clk_get_rate(ssp->clk);
@@ -1787,9 +1699,6 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
}
}
- tasklet_init(&drv_data->pump_transfers, pump_transfers,
- (unsigned long)drv_data);
-
pm_runtime_set_autosuspend_delay(&pdev->dev, 50);
pm_runtime_use_autosuspend(&pdev->dev);
pm_runtime_set_active(&pdev->dev);
@@ -1809,6 +1718,8 @@ out_error_clock_enabled:
pm_runtime_put_noidle(&pdev->dev);
pm_runtime_disable(&pdev->dev);
clk_disable_unprepare(ssp->clk);
+
+out_error_dma_irq_alloc:
pxa2xx_spi_dma_release(drv_data);
free_irq(ssp->irq, drv_data);
@@ -1882,8 +1793,11 @@ static int pxa2xx_spi_resume(struct device *dev)
int status;
/* Enable the SSP clock */
- if (!pm_runtime_suspended(dev))
- clk_prepare_enable(ssp->clk);
+ if (!pm_runtime_suspended(dev)) {
+ status = clk_prepare_enable(ssp->clk);
+ if (status)
+ return status;
+ }
/* Restore LPSS private register bits */
if (is_lpss_ssp(drv_data))
@@ -1912,9 +1826,10 @@ static int pxa2xx_spi_runtime_suspend(struct device *dev)
static int pxa2xx_spi_runtime_resume(struct device *dev)
{
struct driver_data *drv_data = dev_get_drvdata(dev);
+ int status;
- clk_prepare_enable(drv_data->ssp->clk);
- return 0;
+ status = clk_prepare_enable(drv_data->ssp->clk);
+ return status;
}
#endif
diff --git a/drivers/spi/spi-pxa2xx.h b/drivers/spi/spi-pxa2xx.h
index 0ae7defd3492..513c53aaeab2 100644
--- a/drivers/spi/spi-pxa2xx.h
+++ b/drivers/spi/spi-pxa2xx.h
@@ -46,15 +46,10 @@ struct driver_data {
u32 clear_sr;
u32 mask_sr;
- /* Message Transfer pump */
- struct tasklet_struct pump_transfers;
-
/* DMA engine support */
atomic_t dma_running;
- /* Current message transfer state info */
- struct spi_transfer *cur_transfer;
- size_t len;
+ /* Current transfer state info */
void *tx;
void *tx_end;
void *rx;
@@ -104,11 +99,6 @@ static inline void pxa2xx_spi_write(const struct driver_data *drv_data,
__raw_writel(val, drv_data->ioaddr + reg);
}
-#define START_STATE ((void *)0)
-#define RUNNING_STATE ((void *)1)
-#define DONE_STATE ((void *)2)
-#define ERROR_STATE ((void *)-1)
-
#define DMA_ALIGNMENT 8
static inline int pxa25x_ssp_comp(struct driver_data *drv_data)
@@ -133,14 +123,15 @@ static inline void write_SSSR_CS(struct driver_data *drv_data, u32 val)
}
extern int pxa2xx_spi_flush(struct driver_data *drv_data);
-extern void *pxa2xx_spi_next_transfer(struct driver_data *drv_data);
#define MAX_DMA_LEN SZ_64K
#define DEFAULT_DMA_CR1 (SSCR1_TSRE | SSCR1_RSRE | SSCR1_TRAIL)
extern irqreturn_t pxa2xx_spi_dma_transfer(struct driver_data *drv_data);
-extern int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst);
+extern int pxa2xx_spi_dma_prepare(struct driver_data *drv_data,
+ struct spi_transfer *xfer);
extern void pxa2xx_spi_dma_start(struct driver_data *drv_data);
+extern void pxa2xx_spi_dma_stop(struct driver_data *drv_data);
extern int pxa2xx_spi_dma_setup(struct driver_data *drv_data);
extern void pxa2xx_spi_dma_release(struct driver_data *drv_data);
extern int pxa2xx_spi_set_dma_burst_and_threshold(struct chip_data *chip,
diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c
index baa3a9fa2638..7b7151ec14c8 100644
--- a/drivers/spi/spi-s3c64xx.c
+++ b/drivers/spi/spi-s3c64xx.c
@@ -28,15 +28,15 @@
#define S3C64XX_SPI_CH_CFG 0x00
#define S3C64XX_SPI_CLK_CFG 0x04
-#define S3C64XX_SPI_MODE_CFG 0x08
-#define S3C64XX_SPI_SLAVE_SEL 0x0C
+#define S3C64XX_SPI_MODE_CFG 0x08
+#define S3C64XX_SPI_SLAVE_SEL 0x0C
#define S3C64XX_SPI_INT_EN 0x10
#define S3C64XX_SPI_STATUS 0x14
#define S3C64XX_SPI_TX_DATA 0x18
#define S3C64XX_SPI_RX_DATA 0x1C
-#define S3C64XX_SPI_PACKET_CNT 0x20
-#define S3C64XX_SPI_PENDING_CLR 0x24
-#define S3C64XX_SPI_SWAP_CFG 0x28
+#define S3C64XX_SPI_PACKET_CNT 0x20
+#define S3C64XX_SPI_PENDING_CLR 0x24
+#define S3C64XX_SPI_SWAP_CFG 0x28
#define S3C64XX_SPI_FB_CLK 0x2C
#define S3C64XX_SPI_CH_HS_EN (1<<6) /* High Speed Enable */
@@ -77,9 +77,9 @@
#define S3C64XX_SPI_INT_TX_FIFORDY_EN (1<<0)
#define S3C64XX_SPI_ST_RX_OVERRUN_ERR (1<<5)
-#define S3C64XX_SPI_ST_RX_UNDERRUN_ERR (1<<4)
+#define S3C64XX_SPI_ST_RX_UNDERRUN_ERR (1<<4)
#define S3C64XX_SPI_ST_TX_OVERRUN_ERR (1<<3)
-#define S3C64XX_SPI_ST_TX_UNDERRUN_ERR (1<<2)
+#define S3C64XX_SPI_ST_TX_UNDERRUN_ERR (1<<2)
#define S3C64XX_SPI_ST_RX_FIFORDY (1<<1)
#define S3C64XX_SPI_ST_TX_FIFORDY (1<<0)
@@ -100,7 +100,7 @@
#define S3C64XX_SPI_SWAP_TX_BIT (1<<1)
#define S3C64XX_SPI_SWAP_TX_EN (1<<0)
-#define S3C64XX_SPI_FBCLK_MSK (3<<0)
+#define S3C64XX_SPI_FBCLK_MSK (3<<0)
#define FIFO_LVL_MASK(i) ((i)->port_conf->fifo_lvl_mask[i->port_id])
#define S3C64XX_SPI_ST_TX_DONE(v, i) (((v) & \
@@ -156,7 +156,6 @@ struct s3c64xx_spi_port_config {
* @ioclk: Pointer to the i/o clock between master and slave
* @master: Pointer to the SPI Protocol master.
* @cntrlr_info: Platform specific data for the controller this driver manages.
- * @tgl_spi: Pointer to the last CS left untoggled by the cs_change hint.
* @lock: Controller specific lock.
* @state: Set of FLAGS to indicate status.
* @rx_dmach: Controller's DMA channel for Rx.
@@ -177,7 +176,6 @@ struct s3c64xx_spi_driver_data {
struct platform_device *pdev;
struct spi_master *master;
struct s3c64xx_spi_info *cntrlr_info;
- struct spi_device *tgl_spi;
spinlock_t lock;
unsigned long sfr_start;
struct completion xfer_completion;
@@ -190,7 +188,7 @@ struct s3c64xx_spi_driver_data {
unsigned int port_id;
};
-static void flush_fifo(struct s3c64xx_spi_driver_data *sdd)
+static void s3c64xx_flush_fifo(struct s3c64xx_spi_driver_data *sdd)
{
void __iomem *regs = sdd->regs;
unsigned long loops;
@@ -350,9 +348,8 @@ static bool s3c64xx_spi_can_dma(struct spi_master *master,
return xfer->len > (FIFO_LVL_MASK(sdd) >> 1) + 1;
}
-static void enable_datapath(struct s3c64xx_spi_driver_data *sdd,
- struct spi_device *spi,
- struct spi_transfer *xfer, int dma_mode)
+static void s3c64xx_enable_datapath(struct s3c64xx_spi_driver_data *sdd,
+ struct spi_transfer *xfer, int dma_mode)
{
void __iomem *regs = sdd->regs;
u32 modecfg, chcfg;
@@ -442,8 +439,8 @@ static u32 s3c64xx_spi_wait_for_timeout(struct s3c64xx_spi_driver_data *sdd,
return RX_FIFO_LVL(status, sdd);
}
-static int wait_for_dma(struct s3c64xx_spi_driver_data *sdd,
- struct spi_transfer *xfer)
+static int s3c64xx_wait_for_dma(struct s3c64xx_spi_driver_data *sdd,
+ struct spi_transfer *xfer)
{
void __iomem *regs = sdd->regs;
unsigned long val;
@@ -485,8 +482,8 @@ static int wait_for_dma(struct s3c64xx_spi_driver_data *sdd,
return 0;
}
-static int wait_for_pio(struct s3c64xx_spi_driver_data *sdd,
- struct spi_transfer *xfer)
+static int s3c64xx_wait_for_pio(struct s3c64xx_spi_driver_data *sdd,
+ struct spi_transfer *xfer)
{
void __iomem *regs = sdd->regs;
unsigned long val;
@@ -505,6 +502,8 @@ static int wait_for_pio(struct s3c64xx_spi_driver_data *sdd,
status = readl(regs + S3C64XX_SPI_STATUS);
} while (RX_FIFO_LVL(status, sdd) < xfer->len && --val);
+ if (!val)
+ return -EIO;
/* If it was only Tx */
if (!xfer->rx_buf) {
@@ -635,11 +634,15 @@ static int s3c64xx_spi_transfer_one(struct spi_master *master,
struct spi_transfer *xfer)
{
struct s3c64xx_spi_driver_data *sdd = spi_master_get_devdata(master);
+ const unsigned int fifo_len = (FIFO_LVL_MASK(sdd) >> 1) + 1;
+ const void *tx_buf = NULL;
+ void *rx_buf = NULL;
+ int target_len = 0, origin_len = 0;
+ int use_dma = 0;
int status;
u32 speed;
u8 bpw;
unsigned long flags;
- int use_dma;
reinit_completion(&sdd->xfer_completion);
@@ -654,48 +657,77 @@ static int s3c64xx_spi_transfer_one(struct spi_master *master,
s3c64xx_spi_config(sdd);
}
- /* Polling method for xfers not bigger than FIFO capacity */
- use_dma = 0;
- if (!is_polling(sdd) &&
- (sdd->rx_dma.ch && sdd->tx_dma.ch &&
- (xfer->len > ((FIFO_LVL_MASK(sdd) >> 1) + 1))))
+ if (!is_polling(sdd) && (xfer->len > fifo_len) &&
+ sdd->rx_dma.ch && sdd->tx_dma.ch) {
use_dma = 1;
- spin_lock_irqsave(&sdd->lock, flags);
+ } else if (is_polling(sdd) && xfer->len > fifo_len) {
+ tx_buf = xfer->tx_buf;
+ rx_buf = xfer->rx_buf;
+ origin_len = xfer->len;
- /* Pending only which is to be done */
- sdd->state &= ~RXBUSY;
- sdd->state &= ~TXBUSY;
+ target_len = xfer->len;
+ if (xfer->len > fifo_len)
+ xfer->len = fifo_len;
+ }
- enable_datapath(sdd, spi, xfer, use_dma);
+ do {
+ spin_lock_irqsave(&sdd->lock, flags);
- /* Start the signals */
- s3c64xx_spi_set_cs(spi, true);
+ /* Pending only which is to be done */
+ sdd->state &= ~RXBUSY;
+ sdd->state &= ~TXBUSY;
- spin_unlock_irqrestore(&sdd->lock, flags);
+ s3c64xx_enable_datapath(sdd, xfer, use_dma);
- if (use_dma)
- status = wait_for_dma(sdd, xfer);
- else
- status = wait_for_pio(sdd, xfer);
-
- if (status) {
- dev_err(&spi->dev, "I/O Error: rx-%d tx-%d res:rx-%c tx-%c len-%d\n",
- xfer->rx_buf ? 1 : 0, xfer->tx_buf ? 1 : 0,
- (sdd->state & RXBUSY) ? 'f' : 'p',
- (sdd->state & TXBUSY) ? 'f' : 'p',
- xfer->len);
-
- if (use_dma) {
- if (xfer->tx_buf != NULL
- && (sdd->state & TXBUSY))
- dmaengine_terminate_all(sdd->tx_dma.ch);
- if (xfer->rx_buf != NULL
- && (sdd->state & RXBUSY))
- dmaengine_terminate_all(sdd->rx_dma.ch);
+ /* Start the signals */
+ s3c64xx_spi_set_cs(spi, true);
+
+ spin_unlock_irqrestore(&sdd->lock, flags);
+
+ if (use_dma)
+ status = s3c64xx_wait_for_dma(sdd, xfer);
+ else
+ status = s3c64xx_wait_for_pio(sdd, xfer);
+
+ if (status) {
+ dev_err(&spi->dev,
+ "I/O Error: rx-%d tx-%d res:rx-%c tx-%c len-%d\n",
+ xfer->rx_buf ? 1 : 0, xfer->tx_buf ? 1 : 0,
+ (sdd->state & RXBUSY) ? 'f' : 'p',
+ (sdd->state & TXBUSY) ? 'f' : 'p',
+ xfer->len);
+
+ if (use_dma) {
+ if (xfer->tx_buf && (sdd->state & TXBUSY))
+ dmaengine_terminate_all(sdd->tx_dma.ch);
+ if (xfer->rx_buf && (sdd->state & RXBUSY))
+ dmaengine_terminate_all(sdd->rx_dma.ch);
+ }
+ } else {
+ s3c64xx_flush_fifo(sdd);
}
- } else {
- flush_fifo(sdd);
+ if (target_len > 0) {
+ target_len -= xfer->len;
+
+ if (xfer->tx_buf)
+ xfer->tx_buf += xfer->len;
+
+ if (xfer->rx_buf)
+ xfer->rx_buf += xfer->len;
+
+ if (target_len > fifo_len)
+ xfer->len = fifo_len;
+ else
+ xfer->len = target_len;
+ }
+ } while (target_len > 0);
+
+ if (origin_len) {
+ /* Restore original xfer buffers and length */
+ xfer->tx_buf = tx_buf;
+ xfer->rx_buf = rx_buf;
+ xfer->len = origin_len;
}
return status;
@@ -891,7 +923,7 @@ static irqreturn_t s3c64xx_spi_irq(int irq, void *data)
return IRQ_HANDLED;
}
-static void s3c64xx_spi_hwinit(struct s3c64xx_spi_driver_data *sdd, int channel)
+static void s3c64xx_spi_hwinit(struct s3c64xx_spi_driver_data *sdd)
{
struct s3c64xx_spi_info *sci = sdd->cntrlr_info;
void __iomem *regs = sdd->regs;
@@ -929,7 +961,7 @@ static void s3c64xx_spi_hwinit(struct s3c64xx_spi_driver_data *sdd, int channel)
val |= (S3C64XX_SPI_TRAILCNT << S3C64XX_SPI_TRAILCNT_OFF);
writel(val, regs + S3C64XX_SPI_MODE_CFG);
- flush_fifo(sdd);
+ s3c64xx_flush_fifo(sdd);
}
#ifdef CONFIG_OF
@@ -1145,7 +1177,7 @@ static int s3c64xx_spi_probe(struct platform_device *pdev)
pm_runtime_get_sync(&pdev->dev);
/* Setup Deufult Mode */
- s3c64xx_spi_hwinit(sdd, sdd->port_id);
+ s3c64xx_spi_hwinit(sdd);
spin_lock_init(&sdd->lock);
init_completion(&sdd->xfer_completion);
@@ -1260,8 +1292,6 @@ static int s3c64xx_spi_resume(struct device *dev)
if (ret < 0)
return ret;
- s3c64xx_spi_hwinit(sdd, sdd->port_id);
-
return spi_master_resume(master);
}
#endif /* CONFIG_PM_SLEEP */
@@ -1299,6 +1329,8 @@ static int s3c64xx_spi_runtime_resume(struct device *dev)
if (ret != 0)
goto err_disable_src_clk;
+ s3c64xx_spi_hwinit(sdd);
+
return 0;
err_disable_src_clk:
@@ -1344,15 +1376,6 @@ static struct s3c64xx_spi_port_config exynos4_spi_port_config = {
.clk_from_cmu = true,
};
-static struct s3c64xx_spi_port_config exynos5440_spi_port_config = {
- .fifo_lvl_mask = { 0x1ff },
- .rx_lvl_offset = 15,
- .tx_st_done = 25,
- .high_speed = true,
- .clk_from_cmu = true,
- .quirks = S3C64XX_SPI_QUIRK_POLL,
-};
-
static struct s3c64xx_spi_port_config exynos7_spi_port_config = {
.fifo_lvl_mask = { 0x1ff, 0x7F, 0x7F, 0x7F, 0x7F, 0x1ff},
.rx_lvl_offset = 15,
@@ -1396,9 +1419,6 @@ static const struct of_device_id s3c64xx_spi_dt_match[] = {
{ .compatible = "samsung,exynos4210-spi",
.data = (void *)&exynos4_spi_port_config,
},
- { .compatible = "samsung,exynos5440-spi",
- .data = (void *)&exynos5440_spi_port_config,
- },
{ .compatible = "samsung,exynos7-spi",
.data = (void *)&exynos7_spi_port_config,
},
diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c
index 8171eedbfc90..0e74cbf9929d 100644
--- a/drivers/spi/spi-sh-msiof.c
+++ b/drivers/spi/spi-sh-msiof.c
@@ -39,7 +39,7 @@ struct sh_msiof_chipdata {
u16 tx_fifo_size;
u16 rx_fifo_size;
u16 master_flags;
- u16 min_div;
+ u16 min_div_pow;
};
struct sh_msiof_spi_priv {
@@ -51,7 +51,7 @@ struct sh_msiof_spi_priv {
struct completion done;
unsigned int tx_fifo_size;
unsigned int rx_fifo_size;
- unsigned int min_div;
+ unsigned int min_div_pow;
void *tx_dma_page;
void *rx_dma_page;
dma_addr_t tx_dma_addr;
@@ -249,43 +249,46 @@ static irqreturn_t sh_msiof_spi_irq(int irq, void *data)
return IRQ_HANDLED;
}
-static struct {
- unsigned short div;
- unsigned short brdv;
-} const sh_msiof_spi_div_table[] = {
- { 1, SCR_BRDV_DIV_1 },
- { 2, SCR_BRDV_DIV_2 },
- { 4, SCR_BRDV_DIV_4 },
- { 8, SCR_BRDV_DIV_8 },
- { 16, SCR_BRDV_DIV_16 },
- { 32, SCR_BRDV_DIV_32 },
+static const u32 sh_msiof_spi_div_array[] = {
+ SCR_BRDV_DIV_1, SCR_BRDV_DIV_2, SCR_BRDV_DIV_4,
+ SCR_BRDV_DIV_8, SCR_BRDV_DIV_16, SCR_BRDV_DIV_32,
};
static void sh_msiof_spi_set_clk_regs(struct sh_msiof_spi_priv *p,
unsigned long parent_rate, u32 spi_hz)
{
- unsigned long div = 1024;
+ unsigned long div;
u32 brps, scr;
- size_t k;
+ unsigned int div_pow = p->min_div_pow;
- if (!WARN_ON(!spi_hz || !parent_rate))
- div = DIV_ROUND_UP(parent_rate, spi_hz);
-
- div = max_t(unsigned long, div, p->min_div);
+ if (!spi_hz || !parent_rate) {
+ WARN(1, "Invalid clock rate parameters %lu and %u\n",
+ parent_rate, spi_hz);
+ return;
+ }
- for (k = 0; k < ARRAY_SIZE(sh_msiof_spi_div_table); k++) {
- brps = DIV_ROUND_UP(div, sh_msiof_spi_div_table[k].div);
+ div = DIV_ROUND_UP(parent_rate, spi_hz);
+ if (div <= 1024) {
/* SCR_BRDV_DIV_1 is valid only if BRPS is x 1/1 or x 1/2 */
- if (sh_msiof_spi_div_table[k].div == 1 && brps > 2)
- continue;
- if (brps <= 32) /* max of brdv is 32 */
- break;
- }
+ if (!div_pow && div <= 32 && div > 2)
+ div_pow = 1;
+
+ if (div_pow)
+ brps = (div + 1) >> div_pow;
+ else
+ brps = div;
- k = min_t(int, k, ARRAY_SIZE(sh_msiof_spi_div_table) - 1);
- brps = min_t(int, brps, 32);
+ for (; brps > 32; div_pow++)
+ brps = (brps + 1) >> 1;
+ } else {
+ /* Set transfer rate composite divisor to 2^5 * 32 = 1024 */
+ dev_err(&p->pdev->dev,
+ "Requested SPI transfer rate %d is too low\n", spi_hz);
+ div_pow = 5;
+ brps = 32;
+ }
- scr = sh_msiof_spi_div_table[k].brdv | SCR_BRPS(brps);
+ scr = sh_msiof_spi_div_array[div_pow] | SCR_BRPS(brps);
sh_msiof_write(p, TSCR, scr);
if (!(p->master->flags & SPI_MASTER_MUST_TX))
sh_msiof_write(p, RSCR, scr);
@@ -564,14 +567,16 @@ static int sh_msiof_spi_setup(struct spi_device *spi)
/* Configure native chip select mode/polarity early */
clr = MDR1_SYNCMD_MASK;
- set = MDR1_TRMD | TMDR1_PCON | MDR1_SYNCMD_SPI;
+ set = MDR1_SYNCMD_SPI;
if (spi->mode & SPI_CS_HIGH)
clr |= BIT(MDR1_SYNCAC_SHIFT);
else
set |= BIT(MDR1_SYNCAC_SHIFT);
pm_runtime_get_sync(&p->pdev->dev);
tmp = sh_msiof_read(p, TMDR1) & ~clr;
- sh_msiof_write(p, TMDR1, tmp | set);
+ sh_msiof_write(p, TMDR1, tmp | set | MDR1_TRMD | TMDR1_PCON);
+ tmp = sh_msiof_read(p, RMDR1) & ~clr;
+ sh_msiof_write(p, RMDR1, tmp | set);
pm_runtime_put(&p->pdev->dev);
p->native_cs_high = spi->mode & SPI_CS_HIGH;
p->native_cs_inited = true;
@@ -1041,21 +1046,21 @@ static const struct sh_msiof_chipdata sh_data = {
.tx_fifo_size = 64,
.rx_fifo_size = 64,
.master_flags = 0,
- .min_div = 1,
+ .min_div_pow = 0,
};
static const struct sh_msiof_chipdata rcar_gen2_data = {
.tx_fifo_size = 64,
.rx_fifo_size = 64,
.master_flags = SPI_MASTER_MUST_TX,
- .min_div = 1,
+ .min_div_pow = 0,
};
static const struct sh_msiof_chipdata rcar_gen3_data = {
.tx_fifo_size = 64,
.rx_fifo_size = 64,
.master_flags = SPI_MASTER_MUST_TX,
- .min_div = 2,
+ .min_div_pow = 1,
};
static const struct of_device_id sh_msiof_match[] = {
@@ -1319,7 +1324,7 @@ static int sh_msiof_spi_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, p);
p->master = master;
p->info = info;
- p->min_div = chipdata->min_div;
+ p->min_div_pow = chipdata->min_div_pow;
init_completion(&p->done);
diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c
index ba9743fa2326..ad1e55d3d5d5 100644
--- a/drivers/spi/spi-stm32.c
+++ b/drivers/spi/spi-stm32.c
@@ -1129,7 +1129,7 @@ static int stm32_spi_probe(struct platform_device *pdev)
if (!spi->clk_rate) {
dev_err(&pdev->dev, "clk rate = 0\n");
ret = -EINVAL;
- goto err_master_put;
+ goto err_clk_disable;
}
spi->rst = devm_reset_control_get_exclusive(&pdev->dev, NULL);
diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c
index c24d9b45a27c..5f19016bbf10 100644
--- a/drivers/spi/spi-ti-qspi.c
+++ b/drivers/spi/spi-ti-qspi.c
@@ -36,6 +36,7 @@
#include <linux/sizes.h>
#include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
struct ti_qspi_regs {
u32 clkctrl;
@@ -50,6 +51,7 @@ struct ti_qspi {
struct spi_master *master;
void __iomem *base;
void __iomem *mmap_base;
+ size_t mmap_size;
struct regmap *ctrl_base;
unsigned int ctrl_reg;
struct clk *fclk;
@@ -434,12 +436,10 @@ static int ti_qspi_dma_xfer(struct ti_qspi *qspi, dma_addr_t dma_dst,
return 0;
}
-static int ti_qspi_dma_bounce_buffer(struct ti_qspi *qspi,
- struct spi_flash_read_message *msg)
+static int ti_qspi_dma_bounce_buffer(struct ti_qspi *qspi, loff_t offs,
+ void *to, size_t readsize)
{
- size_t readsize = msg->len;
- void *to = msg->buf;
- dma_addr_t dma_src = qspi->mmap_phys_base + msg->from;
+ dma_addr_t dma_src = qspi->mmap_phys_base + offs;
int ret = 0;
/*
@@ -507,13 +507,14 @@ static void ti_qspi_disable_memory_map(struct spi_device *spi)
qspi->mmap_enabled = false;
}
-static void ti_qspi_setup_mmap_read(struct spi_device *spi,
- struct spi_flash_read_message *msg)
+static void ti_qspi_setup_mmap_read(struct spi_device *spi, u8 opcode,
+ u8 data_nbits, u8 addr_width,
+ u8 dummy_bytes)
{
struct ti_qspi *qspi = spi_master_get_devdata(spi->master);
- u32 memval = msg->read_opcode;
+ u32 memval = opcode;
- switch (msg->data_nbits) {
+ switch (data_nbits) {
case SPI_NBITS_QUAD:
memval |= QSPI_SETUP_RD_QUAD;
break;
@@ -524,48 +525,64 @@ static void ti_qspi_setup_mmap_read(struct spi_device *spi,
memval |= QSPI_SETUP_RD_NORMAL;
break;
}
- memval |= ((msg->addr_width - 1) << QSPI_SETUP_ADDR_SHIFT |
- msg->dummy_bytes << QSPI_SETUP_DUMMY_SHIFT);
+ memval |= ((addr_width - 1) << QSPI_SETUP_ADDR_SHIFT |
+ dummy_bytes << QSPI_SETUP_DUMMY_SHIFT);
ti_qspi_write(qspi, memval,
QSPI_SPI_SETUP_REG(spi->chip_select));
}
-static bool ti_qspi_spi_flash_can_dma(struct spi_device *spi,
- struct spi_flash_read_message *msg)
+static int ti_qspi_exec_mem_op(struct spi_mem *mem,
+ const struct spi_mem_op *op)
{
- return virt_addr_valid(msg->buf);
-}
-
-static int ti_qspi_spi_flash_read(struct spi_device *spi,
- struct spi_flash_read_message *msg)
-{
- struct ti_qspi *qspi = spi_master_get_devdata(spi->master);
+ struct ti_qspi *qspi = spi_master_get_devdata(mem->spi->master);
+ u32 from = 0;
int ret = 0;
+ /* Only optimize read path. */
+ if (!op->data.nbytes || op->data.dir != SPI_MEM_DATA_IN ||
+ !op->addr.nbytes || op->addr.nbytes > 4)
+ return -ENOTSUPP;
+
+ /* Address exceeds MMIO window size, fall back to regular mode. */
+ from = op->addr.val;
+ if (from + op->data.nbytes > qspi->mmap_size)
+ return -ENOTSUPP;
+
mutex_lock(&qspi->list_lock);
if (!qspi->mmap_enabled)
- ti_qspi_enable_memory_map(spi);
- ti_qspi_setup_mmap_read(spi, msg);
+ ti_qspi_enable_memory_map(mem->spi);
+ ti_qspi_setup_mmap_read(mem->spi, op->cmd.opcode, op->data.buswidth,
+ op->addr.nbytes, op->dummy.nbytes);
if (qspi->rx_chan) {
- if (msg->cur_msg_mapped)
- ret = ti_qspi_dma_xfer_sg(qspi, msg->rx_sg, msg->from);
- else
- ret = ti_qspi_dma_bounce_buffer(qspi, msg);
- if (ret)
- goto err_unlock;
+ struct sg_table sgt;
+
+ if (virt_addr_valid(op->data.buf.in) &&
+ !spi_controller_dma_map_mem_op_data(mem->spi->master, op,
+ &sgt)) {
+ ret = ti_qspi_dma_xfer_sg(qspi, sgt, from);
+ spi_controller_dma_unmap_mem_op_data(mem->spi->master,
+ op, &sgt);
+ } else {
+ ret = ti_qspi_dma_bounce_buffer(qspi, from,
+ op->data.buf.in,
+ op->data.nbytes);
+ }
} else {
- memcpy_fromio(msg->buf, qspi->mmap_base + msg->from, msg->len);
+ memcpy_fromio(op->data.buf.in, qspi->mmap_base + from,
+ op->data.nbytes);
}
- msg->retlen = msg->len;
-err_unlock:
mutex_unlock(&qspi->list_lock);
return ret;
}
+static const struct spi_controller_mem_ops ti_qspi_mem_ops = {
+ .exec_op = ti_qspi_exec_mem_op,
+};
+
static int ti_qspi_start_transfer_one(struct spi_master *master,
struct spi_message *m)
{
@@ -672,7 +689,7 @@ static int ti_qspi_probe(struct platform_device *pdev)
master->dev.of_node = pdev->dev.of_node;
master->bits_per_word_mask = SPI_BPW_MASK(32) | SPI_BPW_MASK(16) |
SPI_BPW_MASK(8);
- master->spi_flash_read = ti_qspi_spi_flash_read;
+ master->mem_ops = &ti_qspi_mem_ops;
if (!of_property_read_u32(np, "num-cs", &num_cs))
master->num_chipselect = num_cs;
@@ -702,6 +719,9 @@ static int ti_qspi_probe(struct platform_device *pdev)
}
}
+ if (res_mmap)
+ qspi->mmap_size = resource_size(res_mmap);
+
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
dev_err(&pdev->dev, "no irq resource?\n");
@@ -770,7 +790,6 @@ static int ti_qspi_probe(struct platform_device *pdev)
dma_release_channel(qspi->rx_chan);
goto no_dma;
}
- master->spi_flash_can_dma = ti_qspi_spi_flash_can_dma;
master->dma_rx = qspi->rx_chan;
init_completion(&qspi->transfer_complete);
if (res_mmap)
@@ -784,7 +803,7 @@ no_dma:
"mmap failed with error %ld using PIO mode\n",
PTR_ERR(qspi->mmap_base));
qspi->mmap_base = NULL;
- master->spi_flash_read = NULL;
+ master->mem_ops = NULL;
}
}
qspi->mmap_enabled = false;
diff --git a/drivers/spi/spi-zynqmp-gqspi.c b/drivers/spi/spi-zynqmp-gqspi.c
index 18aeaceee286..cc4d31033494 100644
--- a/drivers/spi/spi-zynqmp-gqspi.c
+++ b/drivers/spi/spi-zynqmp-gqspi.c
@@ -20,6 +20,7 @@
#include <linux/of_irq.h>
#include <linux/of_address.h>
#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
#include <linux/spi/spi.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
@@ -135,6 +136,7 @@
#define GQSPI_DMA_UNALIGN 0x3
#define GQSPI_DEFAULT_NUM_CS 1 /* Default number of chip selects */
+#define SPI_AUTOSUSPEND_TIMEOUT 3000
enum mode_type {GQSPI_MODE_IO, GQSPI_MODE_DMA};
/**
@@ -356,21 +358,9 @@ static void zynqmp_qspi_copy_read_data(struct zynqmp_qspi *xqspi,
static int zynqmp_prepare_transfer_hardware(struct spi_master *master)
{
struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
- int ret;
-
- ret = clk_enable(xqspi->refclk);
- if (ret)
- return ret;
-
- ret = clk_enable(xqspi->pclk);
- if (ret)
- goto clk_err;
zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, GQSPI_EN_MASK);
return 0;
-clk_err:
- clk_disable(xqspi->refclk);
- return ret;
}
/**
@@ -387,8 +377,6 @@ static int zynqmp_unprepare_transfer_hardware(struct spi_master *master)
struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, 0x0);
- clk_disable(xqspi->refclk);
- clk_disable(xqspi->pclk);
return 0;
}
@@ -918,8 +906,7 @@ static int zynqmp_qspi_start_transfer(struct spi_master *master,
*/
static int __maybe_unused zynqmp_qspi_suspend(struct device *dev)
{
- struct platform_device *pdev = to_platform_device(dev);
- struct spi_master *master = platform_get_drvdata(pdev);
+ struct spi_master *master = dev_get_drvdata(dev);
spi_master_suspend(master);
@@ -939,8 +926,7 @@ static int __maybe_unused zynqmp_qspi_suspend(struct device *dev)
*/
static int __maybe_unused zynqmp_qspi_resume(struct device *dev)
{
- struct platform_device *pdev = to_platform_device(dev);
- struct spi_master *master = platform_get_drvdata(pdev);
+ struct spi_master *master = dev_get_drvdata(dev);
struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
int ret = 0;
@@ -959,11 +945,67 @@ static int __maybe_unused zynqmp_qspi_resume(struct device *dev)
spi_master_resume(master);
+ clk_disable(xqspi->refclk);
+ clk_disable(xqspi->pclk);
return 0;
}
-static SIMPLE_DEV_PM_OPS(zynqmp_qspi_dev_pm_ops, zynqmp_qspi_suspend,
- zynqmp_qspi_resume);
+/**
+ * zynqmp_runtime_suspend - Runtime suspend method for the SPI driver
+ * @dev: Address of the platform_device structure
+ *
+ * This function disables the clocks
+ *
+ * Return: Always 0
+ */
+static int __maybe_unused zynqmp_runtime_suspend(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct spi_master *master = platform_get_drvdata(pdev);
+ struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
+
+ clk_disable(xqspi->refclk);
+ clk_disable(xqspi->pclk);
+
+ return 0;
+}
+
+/**
+ * zynqmp_runtime_resume - Runtime resume method for the SPI driver
+ * @dev: Address of the platform_device structure
+ *
+ * This function enables the clocks
+ *
+ * Return: 0 on success and error value on error
+ */
+static int __maybe_unused zynqmp_runtime_resume(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct spi_master *master = platform_get_drvdata(pdev);
+ struct zynqmp_qspi *xqspi = spi_master_get_devdata(master);
+ int ret;
+
+ ret = clk_enable(xqspi->pclk);
+ if (ret) {
+ dev_err(dev, "Cannot enable APB clock.\n");
+ return ret;
+ }
+
+ ret = clk_enable(xqspi->refclk);
+ if (ret) {
+ dev_err(dev, "Cannot enable device clock.\n");
+ clk_disable(xqspi->pclk);
+ return ret;
+ }
+
+ return 0;
+}
+
+static const struct dev_pm_ops zynqmp_qspi_dev_pm_ops = {
+ SET_RUNTIME_PM_OPS(zynqmp_runtime_suspend,
+ zynqmp_runtime_resume, NULL)
+ SET_SYSTEM_SLEEP_PM_OPS(zynqmp_qspi_suspend, zynqmp_qspi_resume)
+};
/**
* zynqmp_qspi_probe: Probe method for the QSPI driver
@@ -1023,9 +1065,15 @@ static int zynqmp_qspi_probe(struct platform_device *pdev)
goto clk_dis_pclk;
}
+ pm_runtime_use_autosuspend(&pdev->dev);
+ pm_runtime_set_autosuspend_delay(&pdev->dev, SPI_AUTOSUSPEND_TIMEOUT);
+ pm_runtime_set_active(&pdev->dev);
+ pm_runtime_enable(&pdev->dev);
/* QSPI controller initializations */
zynqmp_qspi_init_hw(xqspi);
+ pm_runtime_mark_last_busy(&pdev->dev);
+ pm_runtime_put_autosuspend(&pdev->dev);
xqspi->irq = platform_get_irq(pdev, 0);
if (xqspi->irq <= 0) {
ret = -ENXIO;
@@ -1063,6 +1111,8 @@ static int zynqmp_qspi_probe(struct platform_device *pdev)
return 0;
clk_dis_all:
+ pm_runtime_set_suspended(&pdev->dev);
+ pm_runtime_disable(&pdev->dev);
clk_disable_unprepare(xqspi->refclk);
clk_dis_pclk:
clk_disable_unprepare(xqspi->pclk);
@@ -1090,6 +1140,8 @@ static int zynqmp_qspi_remove(struct platform_device *pdev)
zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, 0x0);
clk_disable_unprepare(xqspi->refclk);
clk_disable_unprepare(xqspi->pclk);
+ pm_runtime_set_suspended(&pdev->dev);
+ pm_runtime_disable(&pdev->dev);
spi_unregister_master(master);
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 7b213faa0a2b..20b5b2754830 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/mod_devicetable.h>
#include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
#include <linux/of_gpio.h>
#include <linux/pm_runtime.h>
#include <linux/pm_domain.h>
@@ -46,6 +47,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/spi.h>
+#include "internals.h"
+
static DEFINE_IDR(spi_master_idr);
static void spidev_release(struct device *dev)
@@ -740,9 +743,9 @@ static void spi_set_cs(struct spi_device *spi, bool enable)
}
#ifdef CONFIG_HAS_DMA
-static int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
- struct sg_table *sgt, void *buf, size_t len,
- enum dma_data_direction dir)
+int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
+ struct sg_table *sgt, void *buf, size_t len,
+ enum dma_data_direction dir)
{
const bool vmalloced_buf = is_vmalloc_addr(buf);
unsigned int max_seg_size = dma_get_max_seg_size(dev);
@@ -821,8 +824,8 @@ static int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
return 0;
}
-static void spi_unmap_buf(struct spi_controller *ctlr, struct device *dev,
- struct sg_table *sgt, enum dma_data_direction dir)
+void spi_unmap_buf(struct spi_controller *ctlr, struct device *dev,
+ struct sg_table *sgt, enum dma_data_direction dir)
{
if (sgt->orig_nents) {
dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
@@ -907,19 +910,6 @@ static int __spi_unmap_msg(struct spi_controller *ctlr, struct spi_message *msg)
return 0;
}
#else /* !CONFIG_HAS_DMA */
-static inline int spi_map_buf(struct spi_controller *ctlr, struct device *dev,
- struct sg_table *sgt, void *buf, size_t len,
- enum dma_data_direction dir)
-{
- return -EINVAL;
-}
-
-static inline void spi_unmap_buf(struct spi_controller *ctlr,
- struct device *dev, struct sg_table *sgt,
- enum dma_data_direction dir)
-{
-}
-
static inline int __spi_map_msg(struct spi_controller *ctlr,
struct spi_message *msg)
{
@@ -1222,6 +1212,7 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
if (!was_busy && ctlr->auto_runtime_pm) {
ret = pm_runtime_get_sync(ctlr->dev.parent);
if (ret < 0) {
+ pm_runtime_put_noidle(ctlr->dev.parent);
dev_err(&ctlr->dev, "Failed to power device: %d\n",
ret);
mutex_unlock(&ctlr->io_mutex);
@@ -1533,6 +1524,22 @@ err_init_queue:
return ret;
}
+/**
+ * spi_flush_queue - Send all pending messages in the queue from the callers'
+ * context
+ * @ctlr: controller to process queue for
+ *
+ * This should be used when one wants to ensure all pending messages have been
+ * sent before doing something. Is used by the spi-mem code to make sure SPI
+ * memory operations do not preempt regular SPI transfers that have been queued
+ * before the spi-mem operation.
+ */
+void spi_flush_queue(struct spi_controller *ctlr)
+{
+ if (ctlr->transfer == spi_queued_transfer)
+ __spi_pump_messages(ctlr, false);
+}
+
/*-------------------------------------------------------------------------*/
#if defined(CONFIG_OF)
@@ -2063,6 +2070,26 @@ static int of_spi_register_master(struct spi_controller *ctlr)
}
#endif
+static int spi_controller_check_ops(struct spi_controller *ctlr)
+{
+ /*
+ * The controller may implement only the high-level SPI-memory like
+ * operations if it does not support regular SPI transfers, and this is
+ * valid use case.
+ * If ->mem_ops is NULL, we request that at least one of the
+ * ->transfer_xxx() method be implemented.
+ */
+ if (ctlr->mem_ops) {
+ if (!ctlr->mem_ops->exec_op)
+ return -EINVAL;
+ } else if (!ctlr->transfer && !ctlr->transfer_one &&
+ !ctlr->transfer_one_message) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/**
* spi_register_controller - register SPI master or slave controller
* @ctlr: initialized master, originally from spi_alloc_master() or
@@ -2096,6 +2123,14 @@ int spi_register_controller(struct spi_controller *ctlr)
if (!dev)
return -ENODEV;
+ /*
+ * Make sure all necessary hooks are implemented before registering
+ * the SPI controller.
+ */
+ status = spi_controller_check_ops(ctlr);
+ if (status)
+ return status;
+
if (!spi_controller_is_slave(ctlr)) {
status = of_spi_register_master(ctlr);
if (status)
@@ -2161,10 +2196,14 @@ int spi_register_controller(struct spi_controller *ctlr)
spi_controller_is_slave(ctlr) ? "slave" : "master",
dev_name(&ctlr->dev));
- /* If we're using a queued driver, start the queue */
- if (ctlr->transfer)
+ /*
+ * If we're using a queued driver, start the queue. Note that we don't
+ * need the queueing logic if the driver is only supporting high-level
+ * memory operations.
+ */
+ if (ctlr->transfer) {
dev_info(dev, "controller is unqueued, this is deprecated\n");
- else {
+ } else if (ctlr->transfer_one || ctlr->transfer_one_message) {
status = spi_controller_initialize_queue(ctlr);
if (status) {
device_del(&ctlr->dev);
@@ -2894,6 +2933,13 @@ static int __spi_async(struct spi_device *spi, struct spi_message *message)
{
struct spi_controller *ctlr = spi->controller;
+ /*
+ * Some controllers do not support doing regular SPI transfers. Return
+ * ENOTSUPP when this is the case.
+ */
+ if (!ctlr->transfer)
+ return -ENOTSUPP;
+
message->spi = spi;
SPI_STATISTICS_INCREMENT_FIELD(&ctlr->statistics, spi_async);
@@ -3010,63 +3056,6 @@ int spi_async_locked(struct spi_device *spi, struct spi_message *message)
}
EXPORT_SYMBOL_GPL(spi_async_locked);
-
-int spi_flash_read(struct spi_device *spi,
- struct spi_flash_read_message *msg)
-
-{
- struct spi_controller *master = spi->controller;
- struct device *rx_dev = NULL;
- int ret;
-
- if ((msg->opcode_nbits == SPI_NBITS_DUAL ||
- msg->addr_nbits == SPI_NBITS_DUAL) &&
- !(spi->mode & (SPI_TX_DUAL | SPI_TX_QUAD)))
- return -EINVAL;
- if ((msg->opcode_nbits == SPI_NBITS_QUAD ||
- msg->addr_nbits == SPI_NBITS_QUAD) &&
- !(spi->mode & SPI_TX_QUAD))
- return -EINVAL;
- if (msg->data_nbits == SPI_NBITS_DUAL &&
- !(spi->mode & (SPI_RX_DUAL | SPI_RX_QUAD)))
- return -EINVAL;
- if (msg->data_nbits == SPI_NBITS_QUAD &&
- !(spi->mode & SPI_RX_QUAD))
- return -EINVAL;
-
- if (master->auto_runtime_pm) {
- ret = pm_runtime_get_sync(master->dev.parent);
- if (ret < 0) {
- dev_err(&master->dev, "Failed to power device: %d\n",
- ret);
- return ret;
- }
- }
-
- mutex_lock(&master->bus_lock_mutex);
- mutex_lock(&master->io_mutex);
- if (master->dma_rx && master->spi_flash_can_dma(spi, msg)) {
- rx_dev = master->dma_rx->device->dev;
- ret = spi_map_buf(master, rx_dev, &msg->rx_sg,
- msg->buf, msg->len,
- DMA_FROM_DEVICE);
- if (!ret)
- msg->cur_msg_mapped = true;
- }
- ret = master->spi_flash_read(spi, msg);
- if (msg->cur_msg_mapped)
- spi_unmap_buf(master, rx_dev, &msg->rx_sg,
- DMA_FROM_DEVICE);
- mutex_unlock(&master->io_mutex);
- mutex_unlock(&master->bus_lock_mutex);
-
- if (master->auto_runtime_pm)
- pm_runtime_put(master->dev.parent);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(spi_flash_read);
-
/*-------------------------------------------------------------------------*/
/* Utility methods for SPI protocol drivers, layered on
diff --git a/drivers/staging/comedi/drivers/serial2002.c b/drivers/staging/comedi/drivers/serial2002.c
index b3f3b4a201af..5471b2212a62 100644
--- a/drivers/staging/comedi/drivers/serial2002.c
+++ b/drivers/staging/comedi/drivers/serial2002.c
@@ -113,7 +113,7 @@ static void serial2002_tty_read_poll_wait(struct file *f, int timeout)
long elapsed;
__poll_t mask;
- mask = f->f_op->poll(f, &table.pt);
+ mask = vfs_poll(f, &table.pt);
if (mask & (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN |
EPOLLHUP | EPOLLERR)) {
break;
@@ -136,7 +136,7 @@ static int serial2002_tty_read(struct file *f, int timeout)
result = -1;
if (!IS_ERR(f)) {
- if (f->f_op->poll) {
+ if (file_can_poll(f)) {
serial2002_tty_read_poll_wait(f, timeout);
if (kernel_read(f, &ch, 1, &pos) == 1)
diff --git a/drivers/staging/ipx/af_ipx.c b/drivers/staging/ipx/af_ipx.c
index 5703dd176787..208b5c161631 100644
--- a/drivers/staging/ipx/af_ipx.c
+++ b/drivers/staging/ipx/af_ipx.c
@@ -1965,7 +1965,7 @@ static const struct proto_ops ipx_dgram_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = ipx_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = ipx_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ipx_compat_ioctl,
diff --git a/drivers/staging/ncpfs/dir.c b/drivers/staging/ncpfs/dir.c
index 0c57c5c5d40a..072bcb12898f 100644
--- a/drivers/staging/ncpfs/dir.c
+++ b/drivers/staging/ncpfs/dir.c
@@ -823,12 +823,11 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
struct ncp_server *server = NCP_SERVER(dir);
struct inode *inode = NULL;
struct ncp_entry_info finfo;
- int error, res, len;
+ int res, len;
__u8 __name[NCP_MAXPATHLEN + 1];
- error = -EIO;
if (!ncp_conn_valid(server))
- goto finished;
+ return ERR_PTR(-EIO);
ncp_vdbg("server lookup for %pd2\n", dentry);
@@ -847,31 +846,20 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
res = ncp_obtain_info(server, dir, __name, &(finfo.i));
}
ncp_vdbg("looked for %pd2, res=%d\n", dentry, res);
- /*
- * If we didn't find an entry, make a negative dentry.
- */
- if (res)
- goto add_entry;
-
- /*
- * Create an inode for the entry.
- */
- finfo.opened = 0;
- finfo.ino = iunique(dir->i_sb, 2);
- finfo.volume = finfo.i.volNumber;
- error = -EACCES;
- inode = ncp_iget(dir->i_sb, &finfo);
-
- if (inode) {
- ncp_new_dentry(dentry);
-add_entry:
- d_add(dentry, inode);
- error = 0;
+ if (!res) {
+ /*
+ * Entry found; create an inode for it.
+ */
+ finfo.opened = 0;
+ finfo.ino = iunique(dir->i_sb, 2);
+ finfo.volume = finfo.i.volNumber;
+ inode = ncp_iget(dir->i_sb, &finfo);
+ if (unlikely(!inode))
+ inode = ERR_PTR(-EACCES);
+ else
+ ncp_new_dentry(dentry);
}
-
-finished:
- ncp_vdbg("result=%d\n", error);
- return ERR_PTR(error);
+ return d_splice_alias(inode, dentry);
}
/*
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 085700f1be10..2a1be859ee71 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -166,7 +166,7 @@ int vfio_virqfd_enable(void *opaque,
init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
- events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);
+ events = vfs_poll(irqfd.file, &virqfd->pt);
/*
* Check if there was an event already pending on the eventfd
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f0be5f35ab28..895eaa25807c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -208,7 +208,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
if (poll->wqh)
return 0;
- mask = file->f_op->poll(file, &poll->table);
+ mask = vfs_poll(file, &poll->table);
if (mask)
vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
if (mask & EPOLLERR) {
diff --git a/drivers/w1/w1_io.c b/drivers/w1/w1_io.c
index 075d120e7b88..0364d3329c52 100644
--- a/drivers/w1/w1_io.c
+++ b/drivers/w1/w1_io.c
@@ -194,6 +194,7 @@ static u8 w1_read_bit(struct w1_master *dev)
* bit 0 = id_bit
* bit 1 = comp_bit
* bit 2 = dir_taken
+ *
* If both bits 0 & 1 are set, the search should be restarted.
*
* Return: bit fields - see above
diff --git a/drivers/zorro/zorro.c b/drivers/zorro/zorro.c
index 47728477297e..67fa900572a9 100644
--- a/drivers/zorro/zorro.c
+++ b/drivers/zorro/zorro.c
@@ -101,6 +101,7 @@ static void __init mark_region(unsigned long start, unsigned long end,
end = end > Z2RAM_END ? Z2RAM_SIZE : end-Z2RAM_START;
while (start < end) {
u32 chunk = start>>Z2RAM_CHUNKSHIFT;
+
if (flag)
set_bit(chunk, zorro_unused_z2ram);
else
@@ -117,6 +118,7 @@ static struct resource __init *zorro_find_parent_resource(
for (i = 0; i < bridge->num_resources; i++) {
struct resource *r = &bridge->resource[i];
+
if (zorro_resource_start(z) >= r->start &&
zorro_resource_end(z) <= r->end)
return r;
@@ -168,6 +170,7 @@ static int __init amiga_zorro_probe(struct platform_device *pdev)
if (z->id == ZORRO_PROD_GVP_EPC_BASE) {
/* GVP quirk */
unsigned long magic = zi->boardaddr + 0x8000;
+
z->id |= *(u16 *)ZTWO_VADDR(magic) & GVP_PRODMASK;
}
z->slotaddr = zi->slotaddr;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9ee534159cc6..42e102e2e74a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -823,28 +823,21 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
if (IS_ERR(dfid))
return ERR_CAST(dfid);
- name = dentry->d_name.name;
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- if (fid == ERR_PTR(-ENOENT)) {
- d_add(dentry, NULL);
- return NULL;
- }
- return ERR_CAST(fid);
- }
/*
* Make sure we don't use a wrong inode due to parallel
* unlink. For cached mode create calls request for new
* inode. But with cache disabled, lookup should do this.
*/
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+ name = dentry->d_name.name;
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (fid == ERR_PTR(-ENOENT))
+ inode = NULL;
+ else if (IS_ERR(fid))
+ inode = ERR_CAST(fid);
+ else if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
else
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- p9_client_clunk(fid);
- return ERR_CAST(inode);
- }
/*
* If we had a rename on the server and a parallel lookup
* for the new name, then make sure we instantiate with
@@ -853,12 +846,14 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
* k/b.
*/
res = d_splice_alias(inode, dentry);
- if (!res)
- v9fs_fid_add(dentry, fid);
- else if (!IS_ERR(res))
- v9fs_fid_add(res, fid);
- else
- p9_client_clunk(fid);
+ if (!IS_ERR(fid)) {
+ if (!res)
+ v9fs_fid_add(dentry, fid);
+ else if (!IS_ERR(res))
+ v9fs_fid_add(res, fid);
+ else
+ p9_client_clunk(fid);
+ }
return res;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index bc821a86d965..ac4ac908f001 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -196,7 +196,7 @@ config HUGETLBFS
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
- <file:Documentation/vm/hugetlbpage.txt> for details.
+ <file:Documentation/admin-guide/mm/hugetlbpage.rst> for details.
If unsure, say N.
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 29444c83da48..e18eff854e1a 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -146,20 +146,6 @@ adfs_dir_lookup_byname(struct inode *inode, const struct qstr *name, struct obje
obj->parent_id = inode->i_ino;
- /*
- * '.' is handled by reserved_lookup() in fs/namei.c
- */
- if (name->len == 2 && name->name[0] == '.' && name->name[1] == '.') {
- /*
- * Currently unable to fill in the rest of 'obj',
- * but this is better than nothing. We need to
- * ascend one level to find it's parent.
- */
- obj->name_len = 0;
- obj->file_id = obj->parent_id;
- goto free_out;
- }
-
read_lock(&adfs_dir_lock);
ret = ops->setpos(&dir, 0);
@@ -266,17 +252,17 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
if (error == 0) {
- error = -EACCES;
/*
* This only returns NULL if get_empty_inode
* fails.
*/
inode = adfs_iget(dir->i_sb, &obj);
- if (inode)
- error = 0;
+ if (!inode)
+ inode = ERR_PTR(-EACCES);
+ } else if (error != -ENOENT) {
+ inode = ERR_PTR(error);
}
- d_add(dentry, inode);
- return ERR_PTR(error);
+ return d_splice_alias(inode, dentry);
}
/*
diff --git a/fs/affs/super.c b/fs/affs/super.c
index e602619aed9d..d1ad11a8a4a5 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -241,6 +241,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
break;
case Opt_prefix:
+ kfree(*prefix);
*prefix = match_strdup(&args[0]);
if (!*prefix)
return 0;
diff --git a/fs/aio.c b/fs/aio.c
index 49f53516eef0..b850e92ee0d5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,6 +5,7 @@
* Implements an efficient asynchronous io interface.
*
* Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright 2018 Christoph Hellwig.
*
* See ../COPYING for licensing terms.
*/
@@ -46,6 +47,8 @@
#include "internal.h"
+#define KIOCB_KEY 0
+
#define AIO_RING_MAGIC 0xa10a10a1
#define AIO_RING_COMPAT_FEATURES 1
#define AIO_RING_INCOMPAT_FEATURES 0
@@ -156,21 +159,29 @@ struct kioctx {
unsigned id;
};
-/*
- * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
- * cancelled or completed (this makes a certain amount of sense because
- * successful cancellation - io_cancel() - does deliver the completion to
- * userspace).
- *
- * And since most things don't implement kiocb cancellation and we'd really like
- * kiocb completion to be lockless when possible, we use ki_cancel to
- * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
- * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
- */
-#define KIOCB_CANCELLED ((void *) (~0ULL))
+struct fsync_iocb {
+ struct work_struct work;
+ struct file *file;
+ bool datasync;
+};
+
+struct poll_iocb {
+ struct file *file;
+ __poll_t events;
+ struct wait_queue_head *head;
+
+ union {
+ struct wait_queue_entry wait;
+ struct work_struct work;
+ };
+};
struct aio_kiocb {
- struct kiocb common;
+ union {
+ struct kiocb rw;
+ struct fsync_iocb fsync;
+ struct poll_iocb poll;
+ };
struct kioctx *ki_ctx;
kiocb_cancel_fn *ki_cancel;
@@ -264,9 +275,6 @@ static int __init aio_setup(void)
kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
-
- pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
-
return 0;
}
__initcall(aio_setup);
@@ -552,42 +560,20 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
- struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
+ struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
- spin_lock_irqsave(&ctx->ctx_lock, flags);
-
- if (!req->ki_list.next)
- list_add(&req->ki_list, &ctx->active_reqs);
+ if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
+ return;
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_add_tail(&req->ki_list, &ctx->active_reqs);
req->ki_cancel = cancel;
-
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);
-static int kiocb_cancel(struct aio_kiocb *kiocb)
-{
- kiocb_cancel_fn *old, *cancel;
-
- /*
- * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
- * actually has a cancel function, hence the cmpxchg()
- */
-
- cancel = READ_ONCE(kiocb->ki_cancel);
- do {
- if (!cancel || cancel == KIOCB_CANCELLED)
- return -EINVAL;
-
- old = cancel;
- cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
- } while (cancel != old);
-
- return cancel(&kiocb->common);
-}
-
/*
* free_ioctx() should be RCU delayed to synchronize against the RCU
* protected lookup_ioctx() and also needs process context to call
@@ -634,7 +620,7 @@ static void free_ioctx_users(struct percpu_ref *ref)
while (!list_empty(&ctx->active_reqs)) {
req = list_first_entry(&ctx->active_reqs,
struct aio_kiocb, ki_list);
- kiocb_cancel(req);
+ req->ki_cancel(&req->rw);
list_del_init(&req->ki_list);
}
@@ -1041,7 +1027,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
goto out_put;
percpu_ref_get(&ctx->reqs);
-
+ INIT_LIST_HEAD(&req->ki_list);
req->ki_ctx = ctx;
return req;
out_put:
@@ -1049,15 +1035,6 @@ out_put:
return NULL;
}
-static void kiocb_free(struct aio_kiocb *req)
-{
- if (req->common.ki_filp)
- fput(req->common.ki_filp);
- if (req->ki_eventfd != NULL)
- eventfd_ctx_put(req->ki_eventfd);
- kmem_cache_free(kiocb_cachep, req);
-}
-
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
struct aio_ring __user *ring = (void __user *)ctx_id;
@@ -1088,44 +1065,14 @@ out:
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
-static void aio_complete(struct kiocb *kiocb, long res, long res2)
+static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
{
- struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
unsigned tail, pos, head;
unsigned long flags;
- if (kiocb->ki_flags & IOCB_WRITE) {
- struct file *file = kiocb->ki_filp;
-
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
- if (S_ISREG(file_inode(file)->i_mode))
- __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
- file_end_write(file);
- }
-
- /*
- * Special case handling for sync iocbs:
- * - events go directly into the iocb for fast handling
- * - the sync task with the iocb in its stack holds the single iocb
- * ref, no other paths have a way to get another ref
- * - the sync task helpfully left a reference to itself in the iocb
- */
- BUG_ON(is_sync_kiocb(kiocb));
-
- if (iocb->ki_list.next) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->ctx_lock, flags);
- list_del(&iocb->ki_list);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- }
-
/*
* Add a completion event to the ring buffer. Must be done holding
* ctx->completion_lock to prevent other code from messing with the tail
@@ -1179,11 +1126,12 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
* eventfd. The eventfd_signal() function is safe to be called
* from IRQ context.
*/
- if (iocb->ki_eventfd != NULL)
+ if (iocb->ki_eventfd) {
eventfd_signal(iocb->ki_eventfd, 1);
+ eventfd_ctx_put(iocb->ki_eventfd);
+ }
- /* everything turned out well, dispose of the aiocb. */
- kiocb_free(iocb);
+ kmem_cache_free(kiocb_cachep, iocb);
/*
* We have to order our ring_info tail store above and test
@@ -1249,14 +1197,13 @@ static long aio_read_events_ring(struct kioctx *ctx,
if (head == tail)
break;
- avail = min(avail, nr - ret);
- avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
- ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
-
pos = head + AIO_EVENTS_OFFSET;
page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
pos %= AIO_EVENTS_PER_PAGE;
+ avail = min(avail, nr - ret);
+ avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
+
ev = kmap(page);
copy_ret = copy_to_user(event + ret, ev + pos,
sizeof(*ev) * avail);
@@ -1327,10 +1274,6 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
wait_event_interruptible_hrtimeout(ctx->wait,
aio_read_events(ctx, min_nr, nr, event, &ret),
until);
-
- if (!ret && signal_pending(current))
- ret = -EINTR;
-
return ret;
}
@@ -1446,6 +1389,58 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
return -EINVAL;
}
+static void aio_remove_iocb(struct aio_kiocb *iocb)
+{
+ struct kioctx *ctx = iocb->ki_ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_del(&iocb->ki_list);
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+
+static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+ struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
+
+ if (!list_empty_careful(&iocb->ki_list))
+ aio_remove_iocb(iocb);
+
+ if (kiocb->ki_flags & IOCB_WRITE) {
+ struct inode *inode = file_inode(kiocb->ki_filp);
+
+ /*
+ * Tell lockdep we inherited freeze protection from submission
+ * thread.
+ */
+ if (S_ISREG(inode->i_mode))
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ file_end_write(kiocb->ki_filp);
+ }
+
+ fput(kiocb->ki_filp);
+ aio_complete(iocb, res, res2);
+}
+
+static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+{
+ int ret;
+
+ req->ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->ki_filp))
+ return -EBADF;
+ req->ki_complete = aio_complete_rw;
+ req->ki_pos = iocb->aio_offset;
+ req->ki_flags = iocb_flags(req->ki_filp);
+ if (iocb->aio_flags & IOCB_FLAG_RESFD)
+ req->ki_flags |= IOCB_EVENTFD;
+ req->ki_hint = file_write_hint(req->ki_filp);
+ ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+ if (unlikely(ret))
+ fput(req->ki_filp);
+ return ret;
+}
+
static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
bool vectored, bool compat, struct iov_iter *iter)
{
@@ -1465,11 +1460,11 @@ static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
}
-static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
+static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
{
switch (ret) {
case -EIOCBQUEUED:
- return ret;
+ break;
case -ERESTARTSYS:
case -ERESTARTNOINTR:
case -ERESTARTNOHAND:
@@ -1481,85 +1476,270 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
ret = -EINTR;
/*FALLTHRU*/
default:
- aio_complete(req, ret, 0);
- return 0;
+ aio_complete_rw(req, ret, 0);
}
}
static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
bool compat)
{
- struct file *file = req->ki_filp;
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
+ struct file *file;
ssize_t ret;
+ ret = aio_prep_rw(req, iocb);
+ if (ret)
+ return ret;
+ file = req->ki_filp;
+
+ ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- return -EBADF;
+ goto out_fput;
+ ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
- return -EINVAL;
+ goto out_fput;
ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
if (ret)
- return ret;
+ goto out_fput;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
- ret = aio_ret(req, call_read_iter(file, req, &iter));
+ aio_rw_done(req, call_read_iter(file, req, &iter));
kfree(iovec);
+out_fput:
+ if (unlikely(ret))
+ fput(file);
return ret;
}
static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
bool compat)
{
- struct file *file = req->ki_filp;
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
+ struct file *file;
ssize_t ret;
+ ret = aio_prep_rw(req, iocb);
+ if (ret)
+ return ret;
+ file = req->ki_filp;
+
+ ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- return -EBADF;
+ goto out_fput;
+ ret = -EINVAL;
if (unlikely(!file->f_op->write_iter))
- return -EINVAL;
+ goto out_fput;
ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
if (ret)
- return ret;
+ goto out_fput;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
- req->ki_flags |= IOCB_WRITE;
- file_start_write(file);
- ret = aio_ret(req, call_write_iter(file, req, &iter));
/*
- * We release freeze protection in aio_complete(). Fool lockdep
- * by telling it the lock got released so that it doesn't
- * complain about held lock when we return to userspace.
+ * Open-code file_start_write here to grab freeze protection,
+ * which will be released by another thread in
+ * aio_complete_rw(). Fool lockdep by telling it the lock got
+ * released so that it doesn't complain about the held lock when
+ * we return to userspace.
*/
- if (S_ISREG(file_inode(file)->i_mode))
+ if (S_ISREG(file_inode(file)->i_mode)) {
+ __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+ }
+ req->ki_flags |= IOCB_WRITE;
+ aio_rw_done(req, call_write_iter(file, req, &iter));
}
kfree(iovec);
+out_fput:
+ if (unlikely(ret))
+ fput(file);
return ret;
}
+static void aio_fsync_work(struct work_struct *work)
+{
+ struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
+ int ret;
+
+ ret = vfs_fsync(req->file, req->datasync);
+ fput(req->file);
+ aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+}
+
+static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+{
+ if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
+ iocb->aio_rw_flags))
+ return -EINVAL;
+ req->file = fget(iocb->aio_fildes);
+ if (unlikely(!req->file))
+ return -EBADF;
+ if (unlikely(!req->file->f_op->fsync)) {
+ fput(req->file);
+ return -EINVAL;
+ }
+
+ req->datasync = datasync;
+ INIT_WORK(&req->work, aio_fsync_work);
+ schedule_work(&req->work);
+ return 0;
+}
+
+/* need to use list_del_init so we can check if item was present */
+static inline bool __aio_poll_remove(struct poll_iocb *req)
+{
+ if (list_empty(&req->wait.entry))
+ return false;
+ list_del_init(&req->wait.entry);
+ return true;
+}
+
+static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+{
+ fput(iocb->poll.file);
+ aio_complete(iocb, mangle_poll(mask), 0);
+}
+
+static void aio_poll_work(struct work_struct *work)
+{
+ struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work);
+
+ if (!list_empty_careful(&iocb->ki_list))
+ aio_remove_iocb(iocb);
+ __aio_poll_complete(iocb, iocb->poll.events);
+}
+
+static int aio_poll_cancel(struct kiocb *iocb)
+{
+ struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
+ struct poll_iocb *req = &aiocb->poll;
+ struct wait_queue_head *head = req->head;
+ bool found = false;
+
+ spin_lock(&head->lock);
+ found = __aio_poll_remove(req);
+ spin_unlock(&head->lock);
+
+ if (found) {
+ req->events = 0;
+ INIT_WORK(&req->work, aio_poll_work);
+ schedule_work(&req->work);
+ }
+ return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+ struct file *file = req->file;
+ __poll_t mask = key_to_poll(key);
+
+ assert_spin_locked(&req->head->lock);
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & req->events))
+ return 0;
+
+ mask = file->f_op->poll_mask(file, req->events);
+ if (!mask)
+ return 0;
+
+ __aio_poll_remove(req);
+
+ /*
+ * Try completing without a context switch if we can acquire ctx_lock
+ * without spinning. Otherwise we need to defer to a workqueue to
+ * avoid a deadlock due to the lock order.
+ */
+ if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+ list_del_init(&iocb->ki_list);
+ spin_unlock(&iocb->ki_ctx->ctx_lock);
+
+ __aio_poll_complete(iocb, mask);
+ } else {
+ req->events = mask;
+ INIT_WORK(&req->work, aio_poll_work);
+ schedule_work(&req->work);
+ }
+
+ return 1;
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+ struct kioctx *ctx = aiocb->ki_ctx;
+ struct poll_iocb *req = &aiocb->poll;
+ __poll_t mask;
+
+ /* reject any unknown events outside the normal event mask. */
+ if ((u16)iocb->aio_buf != iocb->aio_buf)
+ return -EINVAL;
+ /* reject fields that are not defined for poll */
+ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+ return -EINVAL;
+
+ req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
+ req->file = fget(iocb->aio_fildes);
+ if (unlikely(!req->file))
+ return -EBADF;
+ if (!file_has_poll_mask(req->file))
+ goto out_fail;
+
+ req->head = req->file->f_op->get_poll_head(req->file, req->events);
+ if (!req->head)
+ goto out_fail;
+ if (IS_ERR(req->head)) {
+ mask = EPOLLERR;
+ goto done;
+ }
+
+ init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+ aiocb->ki_cancel = aio_poll_cancel;
+
+ spin_lock_irq(&ctx->ctx_lock);
+ spin_lock(&req->head->lock);
+ mask = req->file->f_op->poll_mask(req->file, req->events);
+ if (!mask) {
+ __add_wait_queue(req->head, &req->wait);
+ list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+ }
+ spin_unlock(&req->head->lock);
+ spin_unlock_irq(&ctx->ctx_lock);
+done:
+ if (mask)
+ __aio_poll_complete(aiocb, mask);
+ return 0;
+out_fail:
+ fput(req->file);
+ return -EINVAL; /* same as no support for IOCB_CMD_POLL */
+}
+
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, bool compat)
+ bool compat)
{
struct aio_kiocb *req;
- struct file *file;
+ struct iocb iocb;
ssize_t ret;
+ if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
+ return -EFAULT;
+
/* enforce forwards compatibility on users */
- if (unlikely(iocb->aio_reserved2)) {
+ if (unlikely(iocb.aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
/* prevent overflows */
if (unlikely(
- (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
- (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
- ((ssize_t)iocb->aio_nbytes < 0)
+ (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
+ (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
+ ((ssize_t)iocb.aio_nbytes < 0)
)) {
pr_debug("EINVAL: overflow check\n");
return -EINVAL;
@@ -1569,37 +1749,19 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
if (unlikely(!req))
return -EAGAIN;
- req->common.ki_filp = file = fget(iocb->aio_fildes);
- if (unlikely(!req->common.ki_filp)) {
- ret = -EBADF;
- goto out_put_req;
- }
- req->common.ki_pos = iocb->aio_offset;
- req->common.ki_complete = aio_complete;
- req->common.ki_flags = iocb_flags(req->common.ki_filp);
- req->common.ki_hint = file_write_hint(file);
-
- if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+ if (iocb.aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
* instance of the file* now. The file descriptor must be
* an eventfd() fd, and will be signaled for each completed
* event using the eventfd_signal() function.
*/
- req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
+ req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
if (IS_ERR(req->ki_eventfd)) {
ret = PTR_ERR(req->ki_eventfd);
req->ki_eventfd = NULL;
goto out_put_req;
}
-
- req->common.ki_flags |= IOCB_EVENTFD;
- }
-
- ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
- if (unlikely(ret)) {
- pr_debug("EINVAL: aio_rw_flags\n");
- goto out_put_req;
}
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@ -1609,41 +1771,67 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
}
req->ki_user_iocb = user_iocb;
- req->ki_user_data = iocb->aio_data;
+ req->ki_user_data = iocb.aio_data;
- get_file(file);
- switch (iocb->aio_lio_opcode) {
+ switch (iocb.aio_lio_opcode) {
case IOCB_CMD_PREAD:
- ret = aio_read(&req->common, iocb, false, compat);
+ ret = aio_read(&req->rw, &iocb, false, compat);
break;
case IOCB_CMD_PWRITE:
- ret = aio_write(&req->common, iocb, false, compat);
+ ret = aio_write(&req->rw, &iocb, false, compat);
break;
case IOCB_CMD_PREADV:
- ret = aio_read(&req->common, iocb, true, compat);
+ ret = aio_read(&req->rw, &iocb, true, compat);
break;
case IOCB_CMD_PWRITEV:
- ret = aio_write(&req->common, iocb, true, compat);
+ ret = aio_write(&req->rw, &iocb, true, compat);
+ break;
+ case IOCB_CMD_FSYNC:
+ ret = aio_fsync(&req->fsync, &iocb, false);
+ break;
+ case IOCB_CMD_FDSYNC:
+ ret = aio_fsync(&req->fsync, &iocb, true);
+ break;
+ case IOCB_CMD_POLL:
+ ret = aio_poll(req, &iocb);
break;
default:
- pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
+ pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
ret = -EINVAL;
break;
}
- fput(file);
- if (ret && ret != -EIOCBQUEUED)
+ /*
+ * If ret is 0, we'd either done aio_complete() ourselves or have
+ * arranged for that to be done asynchronously. Anything non-zero
+ * means that we need to destroy req ourselves.
+ */
+ if (ret)
goto out_put_req;
return 0;
out_put_req:
put_reqs_available(ctx, 1);
percpu_ref_put(&ctx->reqs);
- kiocb_free(req);
+ if (req->ki_eventfd)
+ eventfd_ctx_put(req->ki_eventfd);
+ kmem_cache_free(kiocb_cachep, req);
return ret;
}
-static long do_io_submit(aio_context_t ctx_id, long nr,
- struct iocb __user *__user *iocbpp, bool compat)
+/* sys_io_submit:
+ * Queue the nr iocbs pointed to by iocbpp for processing. Returns
+ * the number of iocbs queued. May return -EINVAL if the aio_context
+ * specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ * *iocbpp[0] is not properly initialized, if the operation specified
+ * is invalid for the file descriptor in the iocb. May fail with
+ * -EFAULT if any of the data structures point to invalid data. May
+ * fail with -EBADF if the file descriptor specified in the first
+ * iocb is invalid. May fail with -EAGAIN if insufficient resources
+ * are available to queue any iocbs. Will return 0 if nr is 0. Will
+ * fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+ struct iocb __user * __user *, iocbpp)
{
struct kioctx *ctx;
long ret = 0;
@@ -1653,39 +1841,25 @@ static long do_io_submit(aio_context_t ctx_id, long nr,
if (unlikely(nr < 0))
return -EINVAL;
- if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
- nr = LONG_MAX/sizeof(*iocbpp);
-
- if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
- return -EFAULT;
-
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx)) {
pr_debug("EINVAL: invalid context id\n");
return -EINVAL;
}
- blk_start_plug(&plug);
+ if (nr > ctx->nr_events)
+ nr = ctx->nr_events;
- /*
- * AKPM: should this return a partial result if some of the IOs were
- * successfully submitted?
- */
- for (i=0; i<nr; i++) {
+ blk_start_plug(&plug);
+ for (i = 0; i < nr; i++) {
struct iocb __user *user_iocb;
- struct iocb tmp;
- if (unlikely(__get_user(user_iocb, iocbpp + i))) {
+ if (unlikely(get_user(user_iocb, iocbpp + i))) {
ret = -EFAULT;
break;
}
- if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
- ret = -EFAULT;
- break;
- }
-
- ret = io_submit_one(ctx, user_iocb, &tmp, compat);
+ ret = io_submit_one(ctx, user_iocb, false);
if (ret)
break;
}
@@ -1695,59 +1869,44 @@ static long do_io_submit(aio_context_t ctx_id, long nr,
return i ? i : ret;
}
-/* sys_io_submit:
- * Queue the nr iocbs pointed to by iocbpp for processing. Returns
- * the number of iocbs queued. May return -EINVAL if the aio_context
- * specified by ctx_id is invalid, if nr is < 0, if the iocb at
- * *iocbpp[0] is not properly initialized, if the operation specified
- * is invalid for the file descriptor in the iocb. May fail with
- * -EFAULT if any of the data structures point to invalid data. May
- * fail with -EBADF if the file descriptor specified in the first
- * iocb is invalid. May fail with -EAGAIN if insufficient resources
- * are available to queue any iocbs. Will return 0 if nr is 0. Will
- * fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
- struct iocb __user * __user *, iocbpp)
-{
- return do_io_submit(ctx_id, nr, iocbpp, 0);
-}
-
#ifdef CONFIG_COMPAT
-static inline long
-copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+ int, nr, compat_uptr_t __user *, iocbpp)
{
- compat_uptr_t uptr;
- int i;
+ struct kioctx *ctx;
+ long ret = 0;
+ int i = 0;
+ struct blk_plug plug;
- for (i = 0; i < nr; ++i) {
- if (get_user(uptr, ptr32 + i))
- return -EFAULT;
- if (put_user(compat_ptr(uptr), ptr64 + i))
- return -EFAULT;
+ if (unlikely(nr < 0))
+ return -EINVAL;
+
+ ctx = lookup_ioctx(ctx_id);
+ if (unlikely(!ctx)) {
+ pr_debug("EINVAL: invalid context id\n");
+ return -EINVAL;
}
- return 0;
-}
-#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *))
+ if (nr > ctx->nr_events)
+ nr = ctx->nr_events;
-COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
- int, nr, u32 __user *, iocb)
-{
- struct iocb __user * __user *iocb64;
- long ret;
+ blk_start_plug(&plug);
+ for (i = 0; i < nr; i++) {
+ compat_uptr_t user_iocb;
- if (unlikely(nr < 0))
- return -EINVAL;
+ if (unlikely(get_user(user_iocb, iocbpp + i))) {
+ ret = -EFAULT;
+ break;
+ }
- if (nr > MAX_AIO_SUBMITS)
- nr = MAX_AIO_SUBMITS;
+ ret = io_submit_one(ctx, compat_ptr(user_iocb), true);
+ if (ret)
+ break;
+ }
+ blk_finish_plug(&plug);
- iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
- ret = copy_iocb(nr, iocb, iocb64);
- if (!ret)
- ret = do_io_submit(ctx_id, nr, iocb64, 1);
- return ret;
+ percpu_ref_put(&ctx->users);
+ return i ? i : ret;
}
#endif
@@ -1755,15 +1914,12 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
* Finds a given iocb for cancellation.
*/
static struct aio_kiocb *
-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
{
struct aio_kiocb *kiocb;
assert_spin_locked(&ctx->ctx_lock);
- if (key != KIOCB_KEY)
- return NULL;
-
/* TODO: use a hash or array, this sucks. */
list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
if (kiocb->ki_user_iocb == iocb)
@@ -1787,25 +1943,24 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
{
struct kioctx *ctx;
struct aio_kiocb *kiocb;
+ int ret = -EINVAL;
u32 key;
- int ret;
- ret = get_user(key, &iocb->aio_key);
- if (unlikely(ret))
+ if (unlikely(get_user(key, &iocb->aio_key)))
return -EFAULT;
+ if (unlikely(key != KIOCB_KEY))
+ return -EINVAL;
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx))
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
-
- kiocb = lookup_kiocb(ctx, iocb, key);
- if (kiocb)
- ret = kiocb_cancel(kiocb);
- else
- ret = -EINVAL;
-
+ kiocb = lookup_kiocb(ctx, iocb);
+ if (kiocb) {
+ ret = kiocb->ki_cancel(&kiocb->rw);
+ list_del_init(&kiocb->ki_list);
+ }
spin_unlock_irq(&ctx->ctx_lock);
if (!ret) {
@@ -1860,13 +2015,60 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
struct timespec __user *, timeout)
{
struct timespec64 ts;
+ int ret;
+
+ if (timeout && unlikely(get_timespec64(&ts, timeout)))
+ return -EFAULT;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+ return ret;
+}
+
+SYSCALL_DEFINE6(io_pgetevents,
+ aio_context_t, ctx_id,
+ long, min_nr,
+ long, nr,
+ struct io_event __user *, events,
+ struct timespec __user *, timeout,
+ const struct __aio_sigset __user *, usig)
+{
+ struct __aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 ts;
+ int ret;
+
+ if (timeout && unlikely(get_timespec64(&ts, timeout)))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
- if (timeout) {
- if (unlikely(get_timespec64(&ts, timeout)))
+ if (ksig.sigmask) {
+ if (ksig.sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
return -EFAULT;
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ if (signal_pending(current)) {
+ if (ksig.sigmask) {
+ current->saved_sigmask = sigsaved;
+ set_restore_sigmask();
+ }
+
+ if (!ret)
+ ret = -ERESTARTNOHAND;
+ } else {
+ if (ksig.sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
}
- return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -1877,13 +2079,64 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
struct compat_timespec __user *, timeout)
{
struct timespec64 t;
+ int ret;
+
+ if (timeout && compat_get_timespec64(&t, timeout))
+ return -EFAULT;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+ return ret;
+}
+
+
+struct __compat_aio_sigset {
+ compat_sigset_t __user *sigmask;
+ compat_size_t sigsetsize;
+};
+
+COMPAT_SYSCALL_DEFINE6(io_pgetevents,
+ compat_aio_context_t, ctx_id,
+ compat_long_t, min_nr,
+ compat_long_t, nr,
+ struct io_event __user *, events,
+ struct compat_timespec __user *, timeout,
+ const struct __compat_aio_sigset __user *, usig)
+{
+ struct __compat_aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 t;
+ int ret;
+
+ if (timeout && compat_get_timespec64(&t, timeout))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
- if (timeout) {
- if (compat_get_timespec64(&t, timeout))
+ if (ksig.sigmask) {
+ if (ksig.sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (get_compat_sigset(&ksigmask, ksig.sigmask))
return -EFAULT;
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ if (signal_pending(current)) {
+ if (ksig.sigmask) {
+ current->saved_sigmask = sigsaved;
+ set_restore_sigmask();
+ }
+ if (!ret)
+ ret = -ERESTARTNOHAND;
+ } else {
+ if (ksig.sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
}
- return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ return ret;
}
#endif
diff --git a/fs/attr.c b/fs/attr.c
index 12ffdb6fb63c..d0b4d34878fb 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,6 +18,32 @@
#include <linux/evm.h>
#include <linux/ima.h>
+static bool chown_ok(const struct inode *inode, kuid_t uid)
+{
+ if (uid_eq(current_fsuid(), inode->i_uid) &&
+ uid_eq(uid, inode->i_uid))
+ return true;
+ if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+ return true;
+ if (uid_eq(inode->i_uid, INVALID_UID) &&
+ ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
+ return true;
+ return false;
+}
+
+static bool chgrp_ok(const struct inode *inode, kgid_t gid)
+{
+ if (uid_eq(current_fsuid(), inode->i_uid) &&
+ (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
+ return true;
+ if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+ return true;
+ if (gid_eq(inode->i_gid, INVALID_GID) &&
+ ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
+ return true;
+ return false;
+}
+
/**
* setattr_prepare - check if attribute changes to a dentry are allowed
* @dentry: dentry to check
@@ -52,17 +78,11 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
goto kill_priv;
/* Make sure a caller can chown. */
- if ((ia_valid & ATTR_UID) &&
- (!uid_eq(current_fsuid(), inode->i_uid) ||
- !uid_eq(attr->ia_uid, inode->i_uid)) &&
- !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+ if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid))
return -EPERM;
/* Make sure caller can chgrp. */
- if ((ia_valid & ATTR_GID) &&
- (!uid_eq(current_fsuid(), inode->i_uid) ||
- (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
- !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+ if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid))
return -EPERM;
/* Make sure a caller can chmod. */
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index ee832ca5f734..f32f21c3bbc7 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -21,10 +21,9 @@
#define dprintf(x...)
#endif
-static int bfs_add_entry(struct inode *dir, const unsigned char *name,
- int namelen, int ino);
+static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino);
static struct buffer_head *bfs_find_entry(struct inode *dir,
- const unsigned char *name, int namelen,
+ const struct qstr *child,
struct bfs_dirent **res_dir);
static int bfs_readdir(struct file *f, struct dir_context *ctx)
@@ -111,8 +110,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
mark_inode_dirty(inode);
bfs_dump_imap("create", s);
- err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len,
- inode->i_ino);
+ err = bfs_add_entry(dir, &dentry->d_name, inode->i_ino);
if (err) {
inode_dec_link_count(inode);
mutex_unlock(&info->bfs_lock);
@@ -136,19 +134,14 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENAMETOOLONG);
mutex_lock(&info->bfs_lock);
- bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
+ bh = bfs_find_entry(dir, &dentry->d_name, &de);
if (bh) {
unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
brelse(bh);
inode = bfs_iget(dir->i_sb, ino);
- if (IS_ERR(inode)) {
- mutex_unlock(&info->bfs_lock);
- return ERR_CAST(inode);
- }
}
mutex_unlock(&info->bfs_lock);
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int bfs_link(struct dentry *old, struct inode *dir,
@@ -159,8 +152,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
int err;
mutex_lock(&info->bfs_lock);
- err = bfs_add_entry(dir, new->d_name.name, new->d_name.len,
- inode->i_ino);
+ err = bfs_add_entry(dir, &new->d_name, inode->i_ino);
if (err) {
mutex_unlock(&info->bfs_lock);
return err;
@@ -183,7 +175,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
struct bfs_sb_info *info = BFS_SB(inode->i_sb);
mutex_lock(&info->bfs_lock);
- bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
+ bh = bfs_find_entry(dir, &dentry->d_name, &de);
if (!bh || (le16_to_cpu(de->ino) != inode->i_ino))
goto out_brelse;
@@ -228,27 +220,21 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
info = BFS_SB(old_inode->i_sb);
mutex_lock(&info->bfs_lock);
- old_bh = bfs_find_entry(old_dir,
- old_dentry->d_name.name,
- old_dentry->d_name.len, &old_de);
+ old_bh = bfs_find_entry(old_dir, &old_dentry->d_name, &old_de);
if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino))
goto end_rename;
error = -EPERM;
new_inode = d_inode(new_dentry);
- new_bh = bfs_find_entry(new_dir,
- new_dentry->d_name.name,
- new_dentry->d_name.len, &new_de);
+ new_bh = bfs_find_entry(new_dir, &new_dentry->d_name, &new_de);
if (new_bh && !new_inode) {
brelse(new_bh);
new_bh = NULL;
}
if (!new_bh) {
- error = bfs_add_entry(new_dir,
- new_dentry->d_name.name,
- new_dentry->d_name.len,
+ error = bfs_add_entry(new_dir, &new_dentry->d_name,
old_inode->i_ino);
if (error)
goto end_rename;
@@ -278,9 +264,10 @@ const struct inode_operations bfs_dir_inops = {
.rename = bfs_rename,
};
-static int bfs_add_entry(struct inode *dir, const unsigned char *name,
- int namelen, int ino)
+static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
{
+ const unsigned char *name = child->name;
+ int namelen = child->len;
struct buffer_head *bh;
struct bfs_dirent *de;
int block, sblock, eblock, off, pos;
@@ -332,12 +319,14 @@ static inline int bfs_namecmp(int len, const unsigned char *name,
}
static struct buffer_head *bfs_find_entry(struct inode *dir,
- const unsigned char *name, int namelen,
+ const struct qstr *child,
struct bfs_dirent **res_dir)
{
unsigned long block = 0, offset = 0;
struct buffer_head *bh = NULL;
struct bfs_dirent *de;
+ const unsigned char *name = child->name;
+ int namelen = child->len;
*res_dir = NULL;
if (namelen > BFS_NAMELEN)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 234bae55b85d..7e075343daa5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,17 +19,17 @@
* ordered operations list so that we make sure to flush out any
* new data the application may have written before commit.
*/
-#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
-#define BTRFS_INODE_ORPHAN_META_RESERVED 1
-#define BTRFS_INODE_DUMMY 2
-#define BTRFS_INODE_IN_DEFRAG 3
-#define BTRFS_INODE_HAS_ORPHAN_ITEM 4
-#define BTRFS_INODE_HAS_ASYNC_EXTENT 5
-#define BTRFS_INODE_NEEDS_FULL_SYNC 6
-#define BTRFS_INODE_COPY_EVERYTHING 7
-#define BTRFS_INODE_IN_DELALLOC_LIST 8
-#define BTRFS_INODE_READDIO_NEED_LOCK 9
-#define BTRFS_INODE_HAS_PROPS 10
+enum {
+ BTRFS_INODE_ORDERED_DATA_CLOSE = 0,
+ BTRFS_INODE_DUMMY,
+ BTRFS_INODE_IN_DEFRAG,
+ BTRFS_INODE_HAS_ASYNC_EXTENT,
+ BTRFS_INODE_NEEDS_FULL_SYNC,
+ BTRFS_INODE_COPY_EVERYTHING,
+ BTRFS_INODE_IN_DELALLOC_LIST,
+ BTRFS_INODE_READDIO_NEED_LOCK,
+ BTRFS_INODE_HAS_PROPS,
+};
/* in memory btrfs inode */
struct btrfs_inode {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1061575a7d25..d3e447b45bf7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -990,12 +990,7 @@ static void __free_workspace(int type, struct list_head *workspace,
btrfs_compress_op[idx]->free_workspace(workspace);
atomic_dec(total_ws);
wake:
- /*
- * Make sure counter is updated before we wake up waiters.
- */
- smp_mb();
- if (waitqueue_active(ws_wait))
- wake_up(ws_wait);
+ cond_wake_up(ws_wait);
}
static void free_workspace(int type, struct list_head *ws)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index cc605f7b23fb..ddda9b80bf20 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_COMPRESSION_H
#define BTRFS_COMPRESSION_H
+#include <linux/sizes.h>
+
/*
* We want to make sure that amount of RAM required to uncompress an extent is
* reasonable, so we limit the total size in ram of a compressed extent to
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8c68961925b1..4bc326df472e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2330,7 +2330,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
no_skips = 1;
t = path->nodes[i];
- if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+ if (i >= lowest_unlock && i > skip_level) {
btrfs_tree_unlock_rw(t, path->locks[i]);
path->locks[i] = 0;
if (write_lock_level &&
@@ -2432,7 +2432,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_unlock_up_safe(p, level + 1);
btrfs_set_path_blocking(p);
- free_extent_buffer(tmp);
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
@@ -2446,7 +2445,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!btrfs_buffer_uptodate(tmp, 0, 0))
+ if (!extent_buffer_uptodate(tmp))
ret = -EIO;
free_extent_buffer(tmp);
} else {
@@ -2599,6 +2598,78 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
return 0;
}
+static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
+ struct btrfs_path *p,
+ int write_lock_level)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *b;
+ int root_lock;
+ int level = 0;
+
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
+ if (p->search_commit_root) {
+ /* The commit roots are read only so we always do read locks */
+ if (p->need_commit_sem)
+ down_read(&fs_info->commit_root_sem);
+ b = root->commit_root;
+ extent_buffer_get(b);
+ level = btrfs_header_level(b);
+ if (p->need_commit_sem)
+ up_read(&fs_info->commit_root_sem);
+ /*
+ * Ensure that all callers have set skip_locking when
+ * p->search_commit_root = 1.
+ */
+ ASSERT(p->skip_locking == 1);
+
+ goto out;
+ }
+
+ if (p->skip_locking) {
+ b = btrfs_root_node(root);
+ level = btrfs_header_level(b);
+ goto out;
+ }
+
+ /*
+ * If the level is set to maximum, we can skip trying to get the read
+ * lock.
+ */
+ if (write_lock_level < BTRFS_MAX_LEVEL) {
+ /*
+ * We don't know the level of the root node until we actually
+ * have it read locked
+ */
+ b = btrfs_read_lock_root_node(root);
+ level = btrfs_header_level(b);
+ if (level > write_lock_level)
+ goto out;
+
+ /* Whoops, must trade for write lock */
+ btrfs_tree_read_unlock(b);
+ free_extent_buffer(b);
+ }
+
+ b = btrfs_lock_root_node(root);
+ root_lock = BTRFS_WRITE_LOCK;
+
+ /* The level might have changed, check again */
+ level = btrfs_header_level(b);
+
+out:
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = root_lock;
+ /*
+ * Callers are responsible for dropping b's references.
+ */
+ return b;
+}
+
+
/*
* btrfs_search_slot - look for a key in a tree and perform necessary
* modifications to preserve tree invariants.
@@ -2635,7 +2706,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int err;
int level;
int lowest_unlock = 1;
- int root_lock;
/* everything at write_lock_level or lower must be write locked */
int write_lock_level = 0;
u8 lowest_level = 0;
@@ -2673,50 +2743,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
again:
prev_cmp = -1;
- /*
- * we try very hard to do read locks on the root
- */
- root_lock = BTRFS_READ_LOCK;
- level = 0;
- if (p->search_commit_root) {
- /*
- * the commit roots are read only
- * so we always do read locks
- */
- if (p->need_commit_sem)
- down_read(&fs_info->commit_root_sem);
- b = root->commit_root;
- extent_buffer_get(b);
- level = btrfs_header_level(b);
- if (p->need_commit_sem)
- up_read(&fs_info->commit_root_sem);
- if (!p->skip_locking)
- btrfs_tree_read_lock(b);
- } else {
- if (p->skip_locking) {
- b = btrfs_root_node(root);
- level = btrfs_header_level(b);
- } else {
- /* we don't know the level of the root node
- * until we actually have it read locked
- */
- b = btrfs_read_lock_root_node(root);
- level = btrfs_header_level(b);
- if (level <= write_lock_level) {
- /* whoops, must trade for write lock */
- btrfs_tree_read_unlock(b);
- free_extent_buffer(b);
- b = btrfs_lock_root_node(root);
- root_lock = BTRFS_WRITE_LOCK;
-
- /* the level might have changed, check again */
- level = btrfs_header_level(b);
- }
- }
- }
- p->nodes[level] = b;
- if (!p->skip_locking)
- p->locks[level] = root_lock;
+ b = btrfs_search_slot_get_root(root, p, write_lock_level);
while (b) {
level = btrfs_header_level(b);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0d422c9908b8..f4bf7874c24a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -739,6 +739,12 @@ struct btrfs_delayed_root;
*/
#define BTRFS_FS_NEED_ASYNC_COMMIT 17
+/*
+ * Indicate that balance has been set up from the ioctl and is in the main
+ * phase. The fs_info::balance_ctl is initialized.
+ */
+#define BTRFS_FS_BALANCE_RUNNING 18
+
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
@@ -838,7 +844,6 @@ struct btrfs_fs_info {
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
- struct mutex volume_mutex;
/*
* this is taken to make sure we don't set block groups ro after
@@ -1004,7 +1009,6 @@ struct btrfs_fs_info {
/* restriper state */
spinlock_t balance_lock;
struct mutex balance_mutex;
- atomic_t balance_running;
atomic_t balance_pause_req;
atomic_t balance_cancel_req;
struct btrfs_balance_control *balance_ctl;
@@ -1219,9 +1223,6 @@ struct btrfs_root {
spinlock_t log_extents_lock[2];
struct list_head logged_list[2];
- spinlock_t orphan_lock;
- atomic_t orphan_inodes;
- struct btrfs_block_rsv *orphan_block_rsv;
int orphan_cleanup_state;
spinlock_t inode_lock;
@@ -2764,13 +2765,9 @@ void btrfs_delalloc_release_space(struct inode *inode,
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
-int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
-void btrfs_orphan_release_metadata(struct btrfs_inode *inode);
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
- int nitems,
- u64 *qgroup_reserved, bool use_global_rsv);
+ int nitems, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
@@ -2828,7 +2825,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
void check_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, const u64 type);
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- struct btrfs_fs_info *info, u64 start, u64 end);
+ u64 start, u64 end);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
@@ -3042,11 +3039,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
/* uuid-tree.c */
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
-int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
@@ -3163,18 +3158,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
struct extent_map *em);
/* inode.c */
-struct btrfs_delalloc_work {
- struct inode *inode;
- int delay_iput;
- struct completion completion;
- struct list_head list;
- struct btrfs_work work;
-};
-
-struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int delay_iput);
-void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
-
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start,
u64 len, int create);
@@ -3193,10 +3176,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
const char *name, int name_len, int add_backref, u64 index);
-int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *dir, u64 objectid,
- const char *name, int name_len);
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
@@ -3204,9 +3184,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct inode *inode, u64 new_size,
u32 min_type);
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
- int nr);
+int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state, int dedupe);
@@ -3240,10 +3219,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
-void btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -3262,14 +3238,14 @@ void btrfs_test_inode_set_ops(struct inode *inode);
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_update_iflags(struct inode *inode);
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int btrfs_is_empty_uuid(u8 *uuid);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
struct file *dst_file, u64 dst_loff);
@@ -3767,4 +3743,26 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
return 0;
}
+static inline void cond_wake_up(struct wait_queue_head *wq)
+{
+ /*
+ * This implies a full smp_mb barrier, see comments for
+ * waitqueue_active why.
+ */
+ if (wq_has_sleeper(wq))
+ wake_up(wq);
+}
+
+static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
+{
+ /*
+ * Special case for conditional wakeup where the barrier required for
+ * waitqueue_active is implied by some of the preceding code. Eg. one
+ * of such atomic operations (atomic_dec_and_return, ...), or a
+ * unlock/lock sequence, etc.
+ */
+ if (waitqueue_active(wq))
+ wake_up(wq);
+}
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a8d492dbd3e7..fe6caa7e698b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -460,13 +460,10 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
- /*
- * atomic_dec_return implies a barrier for waitqueue_active
- */
+ /* atomic_dec_return implies a barrier */
if ((atomic_dec_return(&delayed_root->items) <
- BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
- waitqueue_active(&delayed_root->wait))
- wake_up(&delayed_root->wait);
+ BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
+ cond_wake_up_nomb(&delayed_root->wait);
}
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e1b0651686f7..03dec673d12a 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -286,10 +286,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
}
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_node *ref;
struct rb_node *node;
u64 seq = 0;
@@ -323,9 +323,7 @@ again:
}
}
-int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- u64 seq)
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
struct seq_list *elem;
int ret = 0;
@@ -336,10 +334,9 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct seq_list, list);
if (seq >= elem->seq) {
btrfs_debug(fs_info,
- "holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)",
+ "holding back delayed_ref %#x.%x, lowest is %#x.%x",
(u32)(seq >> 32), (u32)seq,
- (u32)(elem->seq >> 32), (u32)elem->seq,
- delayed_refs);
+ (u32)(elem->seq >> 32), (u32)elem->seq);
ret = 1;
}
}
@@ -529,33 +526,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
spin_unlock(&existing->lock);
}
-/*
- * helper function to actually insert a head node into the rbtree.
- * this does all the dirty work in terms of maintaining the correct
- * overall modification count.
- */
-static noinline struct btrfs_delayed_ref_head *
-add_delayed_ref_head(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head_ref,
- struct btrfs_qgroup_extent_record *qrecord,
- u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
- int action, int is_data, int is_system,
- int *qrecord_inserted_ret,
- int *old_ref_mod, int *new_ref_mod)
-
+static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr, u64 num_bytes, u64 ref_root,
+ u64 reserved, int action, bool is_data,
+ bool is_system)
{
- struct btrfs_delayed_ref_head *existing;
- struct btrfs_delayed_ref_root *delayed_refs;
int count_mod = 1;
int must_insert_reserved = 0;
- int qrecord_inserted = 0;
/* If reserved is provided, it must be a data extent. */
BUG_ON(!is_data && reserved);
/*
- * the head node stores the sum of all the mods, so dropping a ref
+ * The head node stores the sum of all the mods, so dropping a ref
* should drop the sum in the head node by one.
*/
if (action == BTRFS_UPDATE_DELAYED_HEAD)
@@ -564,12 +548,11 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
count_mod = -1;
/*
- * BTRFS_ADD_DELAYED_EXTENT means that we need to update
- * the reserved accounting when the extent is finally added, or
- * if a later modification deletes the delayed ref without ever
- * inserting the extent into the extent allocation tree.
- * ref->must_insert_reserved is the flag used to record
- * that accounting mods are required.
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
+ * accounting when the extent is finally added, or if a later
+ * modification deletes the delayed ref without ever inserting the
+ * extent into the extent allocation tree. ref->must_insert_reserved
+ * is the flag used to record that accounting mods are required.
*
* Once we record must_insert_reserved, switch the action to
* BTRFS_ADD_DELAYED_REF because other special casing is not required.
@@ -579,8 +562,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
else
must_insert_reserved = 0;
- delayed_refs = &trans->transaction->delayed_refs;
-
refcount_set(&head_ref->refs, 1);
head_ref->bytenr = bytenr;
head_ref->num_bytes = num_bytes;
@@ -598,7 +579,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
- /* Record qgroup extent info if provided */
if (qrecord) {
if (ref_root && reserved) {
head_ref->qgroup_ref_root = ref_root;
@@ -608,20 +588,44 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
+ }
+}
+
+/*
+ * helper function to actually insert a head node into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count.
+ */
+static noinline struct btrfs_delayed_ref_head *
+add_delayed_ref_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ int action, int *qrecord_inserted_ret,
+ int *old_ref_mod, int *new_ref_mod)
+{
+ struct btrfs_delayed_ref_head *existing;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int qrecord_inserted = 0;
- if(btrfs_qgroup_trace_extent_nolock(fs_info,
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* Record qgroup extent info if provided */
+ if (qrecord) {
+ if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
delayed_refs, qrecord))
kfree(qrecord);
else
qrecord_inserted = 1;
}
- trace_add_delayed_ref_head(fs_info, head_ref, action);
+ trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+ WARN_ON(qrecord && head_ref->qgroup_ref_root
+ && head_ref->qgroup_reserved
+ && existing->qgroup_ref_root
&& existing->qgroup_reserved);
update_existing_head_ref(delayed_refs, existing, head_ref,
old_ref_mod);
@@ -634,8 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
} else {
if (old_ref_mod)
*old_ref_mod = 0;
- if (is_data && count_mod < 0)
- delayed_refs->pending_csums += num_bytes;
+ if (head_ref->is_data && head_ref->ref_mod < 0)
+ delayed_refs->pending_csums += head_ref->num_bytes;
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
@@ -645,90 +649,48 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
*qrecord_inserted_ret = qrecord_inserted;
if (new_ref_mod)
*new_ref_mod = head_ref->total_ref_mod;
- return head_ref;
-}
-
-/*
- * helper to insert a delayed tree ref into the rbtree.
- */
-static noinline void
-add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head_ref,
- struct btrfs_delayed_ref_node *ref, u64 bytenr,
- u64 num_bytes, u64 parent, u64 ref_root, int level,
- int action)
-{
- struct btrfs_delayed_tree_ref *full_ref;
- struct btrfs_delayed_ref_root *delayed_refs;
- u64 seq = 0;
- int ret;
-
- if (action == BTRFS_ADD_DELAYED_EXTENT)
- action = BTRFS_ADD_DELAYED_REF;
- if (is_fstree(ref_root))
- seq = atomic64_read(&fs_info->tree_mod_seq);
- delayed_refs = &trans->transaction->delayed_refs;
-
- /* first set the basic ref node struct up */
- refcount_set(&ref->refs, 1);
- ref->bytenr = bytenr;
- ref->num_bytes = num_bytes;
- ref->ref_mod = 1;
- ref->action = action;
- ref->is_head = 0;
- ref->in_tree = 1;
- ref->seq = seq;
- RB_CLEAR_NODE(&ref->ref_node);
- INIT_LIST_HEAD(&ref->add_list);
-
- full_ref = btrfs_delayed_node_to_tree_ref(ref);
- full_ref->parent = parent;
- full_ref->root = ref_root;
- if (parent)
- ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
- else
- ref->type = BTRFS_TREE_BLOCK_REF_KEY;
- full_ref->level = level;
-
- trace_add_delayed_tree_ref(fs_info, ref, full_ref, action);
-
- ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
-
- /*
- * XXX: memory should be freed at the same level allocated.
- * But bad practice is anywhere... Follow it now. Need cleanup.
- */
- if (ret > 0)
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
+ return head_ref;
}
/*
- * helper to insert a delayed data ref into the rbtree.
+ * init_delayed_ref_common - Initialize the structure which represents a
+ * modification to a an extent.
+ *
+ * @fs_info: Internal to the mounted filesystem mount structure.
+ *
+ * @ref: The structure which is going to be initialized.
+ *
+ * @bytenr: The logical address of the extent for which a modification is
+ * going to be recorded.
+ *
+ * @num_bytes: Size of the extent whose modification is being recorded.
+ *
+ * @ref_root: The id of the root where this modification has originated, this
+ * can be either one of the well-known metadata trees or the
+ * subvolume id which references this extent.
+ *
+ * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
+ * BTRFS_ADD_DELAYED_EXTENT
+ *
+ * @ref_type: Holds the type of the extent which is being recorded, can be
+ * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
+ * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
+ * BTRFS_EXTENT_DATA_REF_KEY when recording data extent
*/
-static noinline void
-add_delayed_data_ref(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head_ref,
- struct btrfs_delayed_ref_node *ref, u64 bytenr,
- u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
- u64 offset, int action)
+static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 num_bytes, u64 ref_root,
+ int action, u8 ref_type)
{
- struct btrfs_delayed_data_ref *full_ref;
- struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
- int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
- delayed_refs = &trans->transaction->delayed_refs;
-
if (is_fstree(ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
- /* first set the basic ref node struct up */
refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
@@ -737,25 +699,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
ref->seq = seq;
+ ref->type = ref_type;
RB_CLEAR_NODE(&ref->ref_node);
INIT_LIST_HEAD(&ref->add_list);
-
- full_ref = btrfs_delayed_node_to_data_ref(ref);
- full_ref->parent = parent;
- full_ref->root = ref_root;
- if (parent)
- ref->type = BTRFS_SHARED_DATA_REF_KEY;
- else
- ref->type = BTRFS_EXTENT_DATA_REF_KEY;
-
- full_ref->objectid = owner;
- full_ref->offset = offset;
-
- trace_add_delayed_data_ref(fs_info, ref, full_ref, action);
-
- ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
- if (ret > 0)
- kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
}
/*
@@ -775,13 +721,25 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
int qrecord_inserted;
- int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+ bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+ int ret;
+ u8 ref_type;
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
+ if (parent)
+ ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ ref_type = BTRFS_TREE_BLOCK_REF_KEY;
+ init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+ ref_root, action, ref_type);
+ ref->root = ref_root;
+ ref->parent = parent;
+ ref->level = level;
+
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
goto free_ref;
@@ -793,6 +751,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
goto free_head_ref;
}
+ init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
+ ref_root, 0, action, false, is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -802,15 +762,19 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
- bytenr, num_bytes, 0, 0, action, 0,
- is_system, &qrecord_inserted,
+ head_ref = add_delayed_ref_head(trans, head_ref, record,
+ action, &qrecord_inserted,
old_ref_mod, new_ref_mod);
- add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
- num_bytes, parent, ref_root, level, action);
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
+ trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
+ action == BTRFS_ADD_DELAYED_EXTENT ?
+ BTRFS_ADD_DELAYED_REF : action);
+ if (ret > 0)
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
if (qrecord_inserted)
btrfs_qgroup_trace_extent_post(fs_info, record);
@@ -839,11 +803,25 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
int qrecord_inserted;
+ int ret;
+ u8 ref_type;
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
+ if (parent)
+ ref_type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ ref_type = BTRFS_EXTENT_DATA_REF_KEY;
+ init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+ ref_root, action, ref_type);
+ ref->root = ref_root;
+ ref->parent = parent;
+ ref->objectid = owner;
+ ref->offset = offset;
+
+
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -861,6 +839,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
}
}
+ init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
+ reserved, action, true, false);
head_ref->extent_op = NULL;
delayed_refs = &trans->transaction->delayed_refs;
@@ -870,16 +850,20 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
- bytenr, num_bytes, ref_root, reserved,
- action, 1, 0, &qrecord_inserted,
+ head_ref = add_delayed_ref_head(trans, head_ref, record,
+ action, &qrecord_inserted,
old_ref_mod, new_ref_mod);
- add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
- num_bytes, parent, ref_root, owner, offset,
- action);
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
+ trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
+ action == BTRFS_ADD_DELAYED_EXTENT ?
+ BTRFS_ADD_DELAYED_REF : action);
+ if (ret > 0)
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+
+
if (qrecord_inserted)
return btrfs_qgroup_trace_extent_post(fs_info, record);
return 0;
@@ -897,19 +881,16 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
if (!head_ref)
return -ENOMEM;
+ init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
+ BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
+ false);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- /*
- * extent_ops just modify the flags of an extent and they don't result
- * in ref count changes, hence it's safe to pass false/0 for is_system
- * argument
- */
- add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr,
- num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
- extent_op->is_data, 0, NULL, NULL, NULL);
+ add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
+ NULL, NULL, NULL);
spin_unlock(&delayed_refs->lock);
return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 7f00db50bd24..ea1aecb6a50d 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -251,7 +251,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
@@ -269,9 +268,7 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
struct btrfs_delayed_ref_head *
btrfs_select_ref_head(struct btrfs_trans_handle *trans);
-int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- u64 seq);
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
/*
* helper functions to cast a node into its container
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f82be266ba4b..e2ba0419297a 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -33,8 +33,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
-static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
-
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
@@ -179,6 +177,105 @@ out:
}
/*
+ * Initialize a new device for device replace target from a given source dev
+ * and path.
+ *
+ * Return 0 and new device in @device_out, otherwise return < 0
+ */
+static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ const char *device_path,
+ struct btrfs_device *srcdev,
+ struct btrfs_device **device_out)
+{
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct list_head *devices;
+ struct rcu_string *name;
+ u64 devid = BTRFS_DEV_REPLACE_DEVID;
+ int ret = 0;
+
+ *device_out = NULL;
+ if (fs_info->fs_devices->seeding) {
+ btrfs_err(fs_info, "the filesystem is a seed filesystem!");
+ return -EINVAL;
+ }
+
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+ fs_info->bdev_holder);
+ if (IS_ERR(bdev)) {
+ btrfs_err(fs_info, "target device %s is invalid!", device_path);
+ return PTR_ERR(bdev);
+ }
+
+ filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
+ if (device->bdev == bdev) {
+ btrfs_err(fs_info,
+ "target device is in the filesystem!");
+ ret = -EEXIST;
+ goto error;
+ }
+ }
+
+
+ if (i_size_read(bdev->bd_inode) <
+ btrfs_device_get_total_bytes(srcdev)) {
+ btrfs_err(fs_info,
+ "target device is smaller than source device!");
+ ret = -EINVAL;
+ goto error;
+ }
+
+
+ device = btrfs_alloc_device(NULL, &devid, NULL);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ name = rcu_string_strdup(device_path, GFP_KERNEL);
+ if (!name) {
+ btrfs_free_device(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ device->generation = 0;
+ device->io_width = fs_info->sectorsize;
+ device->io_align = fs_info->sectorsize;
+ device->sector_size = fs_info->sectorsize;
+ device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+ device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+ device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+ device->commit_total_bytes = srcdev->commit_total_bytes;
+ device->commit_bytes_used = device->bytes_used;
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+ device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
+ set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ device->fs_devices = fs_info->fs_devices;
+ list_add(&device->dev_list, &fs_info->fs_devices->devices);
+ fs_info->fs_devices->num_devices++;
+ fs_info->fs_devices->open_devices++;
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ *device_out = device;
+ return 0;
+
+error:
+ blkdev_put(bdev, FMODE_EXCL);
+ return ret;
+}
+
+/*
* called from commit_transaction. Writes changed device replace state to
* disk.
*/
@@ -317,18 +414,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
- /* the disk copy procedure reuses the scrub code */
- mutex_lock(&fs_info->volume_mutex);
ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
srcdev_name, &src_device);
- if (ret) {
- mutex_unlock(&fs_info->volume_mutex);
+ if (ret)
return ret;
- }
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
- mutex_unlock(&fs_info->volume_mutex);
if (ret)
return ret;
@@ -360,7 +452,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->cont_reading_from_srcdev_mode = read_src;
WARN_ON(!src_device);
dev_replace->srcdev = src_device;
- WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
btrfs_info_in_rcu(fs_info,
@@ -503,7 +594,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, -1);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -518,7 +609,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
- mutex_lock(&uuid_mutex);
/* keep away write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
@@ -545,7 +635,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_replace_write_unlock(dev_replace);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- mutex_unlock(&uuid_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
@@ -596,7 +685,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
*/
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
@@ -800,7 +888,17 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
}
btrfs_dev_replace_write_unlock(dev_replace);
- WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+ /*
+ * This could collide with a paused balance, but the exclusive op logic
+ * should never allow both to start and pause. We don't want to allow
+ * dev-replace to start anyway.
+ */
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ btrfs_info(fs_info,
+ "cannot resume dev-replace, other exclusive operation running");
+ return 0;
+ }
+
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
return PTR_ERR_OR_ZERO(task);
}
@@ -810,6 +908,7 @@ static int btrfs_dev_replace_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
u64 progress;
+ int ret;
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
@@ -820,23 +919,14 @@ static int btrfs_dev_replace_kthread(void *data)
btrfs_dev_name(dev_replace->tgtdev),
(unsigned int)progress);
- btrfs_dev_replace_continue_on_mount(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-
- return 0;
-}
-
-static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int ret;
-
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
dev_replace->committed_cursor_left,
btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret);
+
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
}
@@ -916,9 +1006,9 @@ void btrfs_dev_replace_clear_lock_blocking(
ASSERT(atomic_read(&dev_replace->read_locks) > 0);
ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
read_lock(&dev_replace->lock);
- if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
- waitqueue_active(&dev_replace->read_lock_wq))
- wake_up(&dev_replace->read_lock_wq);
+ /* Barrier implied by atomic_dec_and_test */
+ if (atomic_dec_and_test(&dev_replace->blocking_readers))
+ cond_wake_up_nomb(&dev_replace->read_lock_wq);
}
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
@@ -929,9 +1019,7 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->bio_counter, amount);
-
- if (waitqueue_active(&fs_info->replace_wait))
- wake_up(&fs_info->replace_wait);
+ cond_wake_up_nomb(&fs_info->replace_wait);
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c3504b4d281b..205092dc9390 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -55,7 +55,6 @@
static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
-static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
@@ -416,7 +415,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
static int verify_level_key(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int level,
- struct btrfs_key *first_key)
+ struct btrfs_key *first_key, u64 parent_transid)
{
int found_level;
struct btrfs_key found_key;
@@ -454,10 +453,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info,
if (ret) {
WARN_ON(1);
btrfs_err(fs_info,
-"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)",
- eb->start, first_key->objectid, first_key->type,
- first_key->offset, found_key.objectid,
- found_key.type, found_key.offset);
+"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
+ eb->start, parent_transid, first_key->objectid,
+ first_key->type, first_key->offset,
+ found_key.objectid, found_key.type,
+ found_key.offset);
}
#endif
return ret;
@@ -493,7 +493,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
parent_transid, 0))
ret = -EIO;
else if (verify_level_key(fs_info, eb, level,
- first_key))
+ first_key, parent_transid))
ret = -EUCLEAN;
else
break;
@@ -1185,7 +1185,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
root->block_rsv = NULL;
- root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
@@ -1195,7 +1194,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
- spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
@@ -1216,7 +1214,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
- atomic_set(&root->orphan_inodes, 0);
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
root->log_transid = 0;
@@ -2164,7 +2161,6 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
{
spin_lock_init(&fs_info->balance_lock);
mutex_init(&fs_info->balance_mutex);
- atomic_set(&fs_info->balance_running, 0);
atomic_set(&fs_info->balance_pause_req, 0);
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
@@ -2442,6 +2438,211 @@ out:
return ret;
}
+/*
+ * Real super block validation
+ * NOTE: super csum type and incompat features will not be checked here.
+ *
+ * @sb: super block to check
+ * @mirror_num: the super block number to check its bytenr:
+ * 0 the primary (1st) sb
+ * 1, 2 2nd and 3rd backup copy
+ * -1 skip bytenr check
+ */
+static int validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num)
+{
+ u64 nodesize = btrfs_super_nodesize(sb);
+ u64 sectorsize = btrfs_super_sectorsize(sb);
+ int ret = 0;
+
+ if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
+ btrfs_err(fs_info, "no valid FS found");
+ ret = -EINVAL;
+ }
+ if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
+ btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info, "tree_root level too big: %d >= %d",
+ btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
+ btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info, "log_root level too big: %d >= %d",
+ btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Check sectorsize and nodesize first, other check will need it.
+ * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
+ */
+ if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
+ ret = -EINVAL;
+ }
+ /* Only PAGE SIZE is supported yet */
+ if (sectorsize != PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "sectorsize %llu not supported yet, only support %lu",
+ sectorsize, PAGE_SIZE);
+ ret = -EINVAL;
+ }
+ if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
+ nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
+ ret = -EINVAL;
+ }
+ if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
+ btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
+ le32_to_cpu(sb->__unused_leafsize), nodesize);
+ ret = -EINVAL;
+ }
+
+ /* Root alignment check */
+ if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
+ btrfs_warn(fs_info, "tree_root block unaligned: %llu",
+ btrfs_super_root(sb));
+ ret = -EINVAL;
+ }
+ if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
+ btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
+ btrfs_super_chunk_root(sb));
+ ret = -EINVAL;
+ }
+ if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
+ btrfs_warn(fs_info, "log_root block unaligned: %llu",
+ btrfs_super_log_root(sb));
+ ret = -EINVAL;
+ }
+
+ if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
+ btrfs_err(fs_info,
+ "dev_item UUID does not match fsid: %pU != %pU",
+ fs_info->fsid, sb->dev_item.fsid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+ * done later
+ */
+ if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
+ btrfs_err(fs_info, "bytes_used is too small %llu",
+ btrfs_super_bytes_used(sb));
+ ret = -EINVAL;
+ }
+ if (!is_power_of_2(btrfs_super_stripesize(sb))) {
+ btrfs_err(fs_info, "invalid stripesize %u",
+ btrfs_super_stripesize(sb));
+ ret = -EINVAL;
+ }
+ if (btrfs_super_num_devices(sb) > (1UL << 31))
+ btrfs_warn(fs_info, "suspicious number of devices: %llu",
+ btrfs_super_num_devices(sb));
+ if (btrfs_super_num_devices(sb) == 0) {
+ btrfs_err(fs_info, "number of devices is 0");
+ ret = -EINVAL;
+ }
+
+ if (mirror_num >= 0 &&
+ btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
+ btrfs_err(fs_info, "super offset mismatch %llu != %u",
+ btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Obvious sys_chunk_array corruptions, it must hold at least one key
+ * and one chunk
+ */
+ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ btrfs_err(fs_info, "system chunk array too big %u > %u",
+ btrfs_super_sys_array_size(sb),
+ BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk)) {
+ btrfs_err(fs_info, "system chunk array too small %u < %zu",
+ btrfs_super_sys_array_size(sb),
+ sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk));
+ ret = -EINVAL;
+ }
+
+ /*
+ * The generation is a global counter, we'll trust it more than the others
+ * but it's still possible that it's the one that's wrong.
+ */
+ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
+ btrfs_warn(fs_info,
+ "suspicious: generation < chunk_root_generation: %llu < %llu",
+ btrfs_super_generation(sb),
+ btrfs_super_chunk_root_generation(sb));
+ if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+ && btrfs_super_cache_generation(sb) != (u64)-1)
+ btrfs_warn(fs_info,
+ "suspicious: generation < cache_generation: %llu < %llu",
+ btrfs_super_generation(sb),
+ btrfs_super_cache_generation(sb));
+
+ return ret;
+}
+
+/*
+ * Validation of super block at mount time.
+ * Some checks already done early at mount time, like csum type and incompat
+ * flags will be skipped.
+ */
+static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
+{
+ return validate_super(fs_info, fs_info->super_copy, 0);
+}
+
+/*
+ * Validation of super block at write time.
+ * Some checks like bytenr check will be skipped as their values will be
+ * overwritten soon.
+ * Extra checks like csum type and incompat flags will be done here.
+ */
+static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb)
+{
+ int ret;
+
+ ret = validate_super(fs_info, sb, -1);
+ if (ret < 0)
+ goto out;
+ if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info, "invalid csum type, has %u want %u",
+ btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
+ goto out;
+ }
+ if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "invalid incompat flags, has 0x%llx valid mask 0x%llx",
+ btrfs_super_incompat_flags(sb),
+ (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
+ goto out;
+ }
+out:
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "super block corruption detected before writing it to disk");
+ return ret;
+}
+
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
@@ -2601,7 +2802,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
- mutex_init(&fs_info->volume_mutex);
mutex_init(&fs_info->ro_block_group_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
@@ -2668,7 +2868,7 @@ int open_ctree(struct super_block *sb,
memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
- ret = btrfs_check_super_valid(fs_info);
+ ret = btrfs_validate_mount_super(fs_info);
if (ret) {
btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
@@ -3523,7 +3723,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
if (raid_type == BTRFS_RAID_SINGLE)
continue;
- if (!(flags & btrfs_raid_group[raid_type]))
+ if (!(flags & btrfs_raid_array[raid_type].bg_flag))
continue;
min_tolerated = min(min_tolerated,
btrfs_raid_array[raid_type].
@@ -3603,6 +3803,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
flags = btrfs_super_flags(sb);
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
+ ret = btrfs_validate_write_super(fs_info, sb);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ btrfs_handle_fs_error(fs_info, -EUCLEAN,
+ "unexpected superblock corruption detected");
+ return -EUCLEAN;
+ }
+
ret = write_dev_supers(dev, sb, max_mirrors);
if (ret)
total_errors++;
@@ -3674,8 +3882,6 @@ static void free_fs_root(struct btrfs_root *root)
{
iput(root->ino_cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
- btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv);
- root->orphan_block_rsv = NULL;
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
if (root->subv_writers)
@@ -3766,7 +3972,6 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
void close_ctree(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
@@ -3862,9 +4067,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
btrfs_free_stripe_hash_table(fs_info);
btrfs_free_ref_cache(fs_info);
- __btrfs_free_block_rsv(root->orphan_block_rsv);
- root->orphan_block_rsv = NULL;
-
while (!list_empty(&fs_info->pinned_chunks)) {
struct extent_map *em;
@@ -3975,155 +4177,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
level, first_key);
}
-static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_super_block *sb = fs_info->super_copy;
- u64 nodesize = btrfs_super_nodesize(sb);
- u64 sectorsize = btrfs_super_sectorsize(sb);
- int ret = 0;
-
- if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
- btrfs_err(fs_info, "no valid FS found");
- ret = -EINVAL;
- }
- if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
- btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
- btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
- ret = -EINVAL;
- }
- if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "tree_root level too big: %d >= %d",
- btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
- ret = -EINVAL;
- }
- if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
- btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
- ret = -EINVAL;
- }
- if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "log_root level too big: %d >= %d",
- btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
- ret = -EINVAL;
- }
-
- /*
- * Check sectorsize and nodesize first, other check will need it.
- * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
- */
- if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
- sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
- btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
- ret = -EINVAL;
- }
- /* Only PAGE SIZE is supported yet */
- if (sectorsize != PAGE_SIZE) {
- btrfs_err(fs_info,
- "sectorsize %llu not supported yet, only support %lu",
- sectorsize, PAGE_SIZE);
- ret = -EINVAL;
- }
- if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
- nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
- btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
- ret = -EINVAL;
- }
- if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
- btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
- le32_to_cpu(sb->__unused_leafsize), nodesize);
- ret = -EINVAL;
- }
-
- /* Root alignment check */
- if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
- btrfs_warn(fs_info, "tree_root block unaligned: %llu",
- btrfs_super_root(sb));
- ret = -EINVAL;
- }
- if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
- btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
- btrfs_super_chunk_root(sb));
- ret = -EINVAL;
- }
- if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
- btrfs_warn(fs_info, "log_root block unaligned: %llu",
- btrfs_super_log_root(sb));
- ret = -EINVAL;
- }
-
- if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
- btrfs_err(fs_info,
- "dev_item UUID does not match fsid: %pU != %pU",
- fs_info->fsid, sb->dev_item.fsid);
- ret = -EINVAL;
- }
-
- /*
- * Hint to catch really bogus numbers, bitflips or so, more exact checks are
- * done later
- */
- if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
- btrfs_err(fs_info, "bytes_used is too small %llu",
- btrfs_super_bytes_used(sb));
- ret = -EINVAL;
- }
- if (!is_power_of_2(btrfs_super_stripesize(sb))) {
- btrfs_err(fs_info, "invalid stripesize %u",
- btrfs_super_stripesize(sb));
- ret = -EINVAL;
- }
- if (btrfs_super_num_devices(sb) > (1UL << 31))
- btrfs_warn(fs_info, "suspicious number of devices: %llu",
- btrfs_super_num_devices(sb));
- if (btrfs_super_num_devices(sb) == 0) {
- btrfs_err(fs_info, "number of devices is 0");
- ret = -EINVAL;
- }
-
- if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
- btrfs_err(fs_info, "super offset mismatch %llu != %u",
- btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
- ret = -EINVAL;
- }
-
- /*
- * Obvious sys_chunk_array corruptions, it must hold at least one key
- * and one chunk
- */
- if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
- btrfs_err(fs_info, "system chunk array too big %u > %u",
- btrfs_super_sys_array_size(sb),
- BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
- ret = -EINVAL;
- }
- if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
- + sizeof(struct btrfs_chunk)) {
- btrfs_err(fs_info, "system chunk array too small %u < %zu",
- btrfs_super_sys_array_size(sb),
- sizeof(struct btrfs_disk_key)
- + sizeof(struct btrfs_chunk));
- ret = -EINVAL;
- }
-
- /*
- * The generation is a global counter, we'll trust it more than the others
- * but it's still possible that it's the one that's wrong.
- */
- if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
- btrfs_warn(fs_info,
- "suspicious: generation < chunk_root_generation: %llu < %llu",
- btrfs_super_generation(sb),
- btrfs_super_chunk_root_generation(sb));
- if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
- && btrfs_super_cache_generation(sb) != (u64)-1)
- btrfs_warn(fs_info,
- "suspicious: generation < cache_generation: %llu < %llu",
- btrfs_super_generation(sb),
- btrfs_super_cache_generation(sb));
-
- return ret;
-}
-
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
/* cleanup FS via transaction */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 51b5e2da708c..3d9fe58c0080 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -66,10 +66,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- u64 parent, u64 root_objectid,
- u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins);
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 flags,
int force);
@@ -256,7 +254,7 @@ static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
ret = btrfs_rmap_block(fs_info, cache->key.objectid,
- bytenr, 0, &logical, &nr, &stripe_len);
+ bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
@@ -343,8 +341,9 @@ static void fragment_free_space(struct btrfs_block_group_cache *block_group)
* since their free space will be released as soon as the transaction commits.
*/
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- struct btrfs_fs_info *info, u64 start, u64 end)
+ u64 start, u64 end)
{
+ struct btrfs_fs_info *info = block_group->fs_info;
u64 extent_start, extent_end, size, total_added = 0;
int ret;
@@ -489,8 +488,7 @@ next:
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY) {
- total_found += add_new_free_space(block_group,
- fs_info, last,
+ total_found += add_new_free_space(block_group, last,
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
@@ -508,7 +506,7 @@ next:
}
ret = 0;
- total_found += add_new_free_space(block_group, fs_info, last,
+ total_found += add_new_free_space(block_group, last,
block_group->key.objectid +
block_group->key.offset);
caching_ctl->progress = (u64)-1;
@@ -744,12 +742,12 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
}
static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
- u64 owner, u64 root_objectid)
+ bool metadata, u64 root_objectid)
{
struct btrfs_space_info *space_info;
u64 flags;
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (metadata) {
if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
flags = BTRFS_BLOCK_GROUP_SYSTEM;
else
@@ -2200,8 +2198,11 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
&old_ref_mod, &new_ref_mod);
}
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
- add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
+ if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
+ bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+ add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
+ }
return ret;
}
@@ -2428,10 +2429,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
struct btrfs_delayed_tree_ref *ref;
- struct btrfs_key ins;
u64 parent = 0;
u64 ref_root = 0;
- bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
ref = btrfs_delayed_node_to_tree_ref(node);
trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
@@ -2440,15 +2439,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent = ref->parent;
ref_root = ref->root;
- ins.objectid = node->bytenr;
- if (skinny_metadata) {
- ins.offset = ref->level;
- ins.type = BTRFS_METADATA_ITEM_KEY;
- } else {
- ins.offset = node->num_bytes;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
- }
-
if (node->ref_mod != 1) {
btrfs_err(fs_info,
"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
@@ -2458,11 +2448,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
BUG_ON(!extent_op || !extent_op->update_flags);
- ret = alloc_reserved_tree_block(trans, fs_info,
- parent, ref_root,
- extent_op->flags_to_set,
- &extent_op->key,
- ref->level, &ins);
+ ret = alloc_reserved_tree_block(trans, node, extent_op);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, fs_info, node,
parent, ref_root,
@@ -2594,8 +2580,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
delayed_refs->num_heads--;
rb_erase(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
- spin_unlock(&delayed_refs->lock);
spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
atomic_dec(&delayed_refs->num_entries);
trace_run_delayed_ref_head(fs_info, head, 0);
@@ -2700,17 +2686,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* insert_inline_extent_backref()).
*/
spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
- locked_ref);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
- /*
- * locked_ref is the head node, so we have to go one
- * node back for any delayed ref updates
- */
ref = select_delayed_ref(locked_ref);
if (ref && ref->seq &&
- btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
+ btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
unselect_delayed_ref_head(delayed_refs, locked_ref);
locked_ref = NULL;
@@ -3291,7 +3272,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
path = btrfs_alloc_path();
if (!path)
- return -ENOENT;
+ return -ENOMEM;
do {
ret = check_committed_ref(root, path, objectid,
@@ -4026,8 +4007,7 @@ static const char *alloc_name(u64 flags)
};
}
-static int create_space_info(struct btrfs_fs_info *info, u64 flags,
- struct btrfs_space_info **new)
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
struct btrfs_space_info *space_info;
@@ -4065,7 +4045,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags,
return ret;
}
- *new = space_info;
list_add_rcu(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
@@ -4122,7 +4101,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
* returns target flags in extended format or 0 if restripe for this
* chunk_type is not in progress
*
- * should be called with either volume_mutex or balance_lock held
+ * should be called with balance_lock held
*/
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
@@ -4178,7 +4157,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
/* First, mask out the RAID levels which aren't possible */
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
if (num_devices >= btrfs_raid_array[raid_type].devs_min)
- allowed |= btrfs_raid_group[raid_type];
+ allowed |= btrfs_raid_array[raid_type].bg_flag;
}
allowed &= flags;
@@ -4341,7 +4320,7 @@ commit_trans:
need_commit--;
if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, 0, -1);
+ btrfs_start_delalloc_roots(fs_info, -1);
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
(u64)-1);
}
@@ -4678,12 +4657,14 @@ again:
trans->allocating_chunk = false;
spin_lock(&space_info->lock);
- if (ret < 0 && ret != -ENOSPC)
- goto out;
- if (ret)
- space_info->full = 1;
- else
+ if (ret < 0) {
+ if (ret == -ENOSPC)
+ space_info->full = 1;
+ else
+ goto out;
+ } else {
ret = 1;
+ }
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
@@ -4792,7 +4773,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
* the filesystem is readonly(all dirty pages are written to
* the disk).
*/
- btrfs_start_delalloc_roots(fs_info, 0, nr_items);
+ btrfs_start_delalloc_roots(fs_info, nr_items);
if (!current->journal_info)
btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
}
@@ -5949,44 +5930,6 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
trans->chunk_bytes_reserved = 0;
}
-/* Can only return 0 or -ENOSPC */
-int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
- /*
- * We always use trans->block_rsv here as we will have reserved space
- * for our orphan when starting the transaction, using get_block_rsv()
- * here will sometimes make us choose the wrong block rsv as we could be
- * doing a reloc inode for a non refcounted root.
- */
- struct btrfs_block_rsv *src_rsv = trans->block_rsv;
- struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
-
- /*
- * We need to hold space in order to delete our orphan item once we've
- * added it, so this takes the reservation so we can release it later
- * when we are truly done with the orphan item.
- */
- u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-
- trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
- num_bytes, 1);
- return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-}
-
-void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
- u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-
- trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
- num_bytes, 0);
- btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
-}
-
/*
* btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
* root: the root of the parent directory
@@ -6004,7 +5947,6 @@ void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int items,
- u64 *qgroup_reserved,
bool use_global_rsv)
{
u64 num_bytes;
@@ -6022,8 +5964,6 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
num_bytes = 0;
}
- *qgroup_reserved = num_bytes;
-
num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
rsv->space_info = __find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
@@ -6033,8 +5973,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
- if (ret && *qgroup_reserved)
- btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
+ if (ret && num_bytes)
+ btrfs_qgroup_free_meta_prealloc(root, num_bytes);
return ret;
}
@@ -6354,6 +6294,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_lock(&info->unused_bgs_lock);
if (list_empty(&cache->bg_list)) {
btrfs_get_block_group(cache);
+ trace_btrfs_add_unused_block_group(cache);
list_add_tail(&cache->bg_list,
&info->unused_bgs);
}
@@ -6511,6 +6452,7 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
int found_type;
int i;
+ int ret = 0;
if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
return 0;
@@ -6527,10 +6469,12 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
continue;
key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
- __exclude_logged_extent(fs_info, key.objectid, key.offset);
+ ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void
@@ -7122,7 +7066,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
+ ret = add_to_free_space_tree(trans, bytenr, num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -7266,7 +7210,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
}
out:
if (pin)
- add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
+ add_pinned_bytes(fs_info, buf->len, true,
root->root_key.objectid);
if (last_ref) {
@@ -7320,8 +7264,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
&old_ref_mod, &new_ref_mod);
}
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
- add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
+ if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
+ bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+ add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
+ }
return ret;
}
@@ -7373,24 +7320,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return ret;
}
-static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = "raid10",
- [BTRFS_RAID_RAID1] = "raid1",
- [BTRFS_RAID_DUP] = "dup",
- [BTRFS_RAID_RAID0] = "raid0",
- [BTRFS_RAID_SINGLE] = "single",
- [BTRFS_RAID_RAID5] = "raid5",
- [BTRFS_RAID_RAID6] = "raid6",
-};
-
-static const char *get_raid_name(enum btrfs_raid_types type)
-{
- if (type >= BTRFS_NR_RAID_TYPES)
- return NULL;
-
- return btrfs_raid_type_names[type];
-}
-
enum btrfs_loop_type {
LOOP_CACHING_NOWAIT = 0,
LOOP_CACHING_WAIT = 1,
@@ -7662,7 +7591,7 @@ have_block_group:
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(fs_info,
+ trace_btrfs_reserve_extent_cluster(
used_block_group,
search_start, num_bytes);
if (used_block_group != block_group) {
@@ -7735,7 +7664,7 @@ refill_cluster:
if (offset) {
/* we found one, proceed */
spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(fs_info,
+ trace_btrfs_reserve_extent_cluster(
block_group, search_start,
num_bytes);
goto checks;
@@ -7835,8 +7764,7 @@ checks:
ins->objectid = search_start;
ins->offset = num_bytes;
- trace_btrfs_reserve_extent(fs_info, block_group,
- search_start, num_bytes);
+ trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
btrfs_release_block_group(block_group, delalloc);
break;
loop:
@@ -8184,8 +8112,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
- ins->offset);
+ ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
if (ret)
return ret;
@@ -8200,37 +8127,52 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- u64 parent, u64 root_objectid,
- u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins)
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_extent_item *extent_item;
+ struct btrfs_key extent_key;
struct btrfs_tree_block_info *block_info;
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
+ struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 num_bytes = ins->offset;
+ u64 num_bytes;
+ u64 flags = extent_op->flags_to_set;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
- if (!skinny_metadata)
+ ref = btrfs_delayed_node_to_tree_ref(node);
+
+ extent_key.objectid = node->bytenr;
+ if (skinny_metadata) {
+ extent_key.offset = ref->level;
+ extent_key.type = BTRFS_METADATA_ITEM_KEY;
+ num_bytes = fs_info->nodesize;
+ } else {
+ extent_key.offset = node->num_bytes;
+ extent_key.type = BTRFS_EXTENT_ITEM_KEY;
size += sizeof(*block_info);
+ num_bytes = node->num_bytes;
+ }
path = btrfs_alloc_path();
if (!path) {
- btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+ btrfs_free_and_pin_reserved_extent(fs_info,
+ extent_key.objectid,
fs_info->nodesize);
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
- ins, size);
+ &extent_key, size);
if (ret) {
btrfs_free_path(path);
- btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+ btrfs_free_and_pin_reserved_extent(fs_info,
+ extent_key.objectid,
fs_info->nodesize);
return ret;
}
@@ -8245,42 +8187,41 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
- num_bytes = fs_info->nodesize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
- btrfs_set_tree_block_key(leaf, block_info, key);
- btrfs_set_tree_block_level(leaf, block_info, level);
+ btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
+ btrfs_set_tree_block_level(leaf, block_info, ref->level);
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
}
- if (parent > 0) {
+ if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
} else {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_TREE_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
}
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ ret = remove_from_free_space_tree(trans, extent_key.objectid,
num_bytes);
if (ret)
return ret;
- ret = update_block_group(trans, fs_info, ins->objectid,
+ ret = update_block_group(trans, fs_info, extent_key.objectid,
fs_info->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
- ins->objectid, ins->offset);
+ extent_key.objectid, extent_key.offset);
BUG();
}
- trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
+ trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
fs_info->nodesize);
return ret;
}
@@ -10173,8 +10114,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
} else if (btrfs_block_group_used(&cache->item) == 0) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- add_new_free_space(cache, info,
- found_key.objectid,
+ add_new_free_space(cache, found_key.objectid,
found_key.objectid +
found_key.offset);
free_excluded_extents(info, cache);
@@ -10204,6 +10144,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
/* Should always be true but just in case. */
if (list_empty(&cache->bg_list)) {
btrfs_get_block_group(cache);
+ trace_btrfs_add_unused_block_group(cache);
list_add_tail(&cache->bg_list,
&info->unused_bgs);
}
@@ -10269,7 +10210,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
key.offset);
if (ret)
btrfs_abort_transaction(trans, ret);
- add_block_group_free_space(trans, fs_info, block_group);
+ add_block_group_free_space(trans, block_group);
/* already aborted the transaction if it failed. */
next:
list_del_init(&block_group->bg_list);
@@ -10310,7 +10251,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return ret;
}
- add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
+ add_new_free_space(cache, chunk_offset, chunk_offset + size);
free_excluded_extents(fs_info, cache);
@@ -10391,6 +10332,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
BUG_ON(!block_group);
BUG_ON(!block_group->ro);
+ trace_btrfs_remove_block_group(block_group);
/*
* Free the reserved super bytes from this block group before
* remove it.
@@ -10648,7 +10590,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
mutex_unlock(&fs_info->chunk_mutex);
- ret = remove_block_group_free_space(trans, fs_info, block_group);
+ ret = remove_block_group_free_space(trans, block_group);
if (ret)
goto out;
@@ -10755,6 +10697,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* the ro check in case balance is currently acting on
* this block group.
*/
+ trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
up_write(&space_info->groups_sem);
goto next;
@@ -10877,7 +10820,6 @@ next:
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
- struct btrfs_space_info *space_info;
struct btrfs_super_block *disk_super;
u64 features;
u64 flags;
@@ -10893,21 +10835,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
mixed = 1;
flags = BTRFS_BLOCK_GROUP_SYSTEM;
- ret = create_space_info(fs_info, flags, &space_info);
+ ret = create_space_info(fs_info, flags);
if (ret)
goto out;
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
- ret = create_space_info(fs_info, flags, &space_info);
+ ret = create_space_info(fs_info, flags);
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = create_space_info(fs_info, flags, &space_info);
+ ret = create_space_info(fs_info, flags);
if (ret)
goto out;
flags = BTRFS_BLOCK_GROUP_DATA;
- ret = create_space_info(fs_info, flags, &space_info);
+ ret = create_space_info(fs_info, flags);
}
out:
return ret;
@@ -11092,12 +11034,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
- /*
- * Make sure counter is updated before we wake up waiters.
- */
- smp_mb();
- if (waitqueue_active(&root->subv_writers->wait))
- wake_up(&root->subv_writers->wait);
+ cond_wake_up(&root->subv_writers->wait);
}
int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 56d32bb462f9..51fc015c7d2c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4106,14 +4106,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
return ret;
}
-int extent_writepages(struct extent_io_tree *tree,
- struct address_space *mapping,
+int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
+ .tree = &BTRFS_I(mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4123,9 +4122,8 @@ int extent_writepages(struct extent_io_tree *tree,
return ret;
}
-int extent_readpages(struct extent_io_tree *tree,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+int extent_readpages(struct address_space *mapping, struct list_head *pages,
+ unsigned nr_pages)
{
struct bio *bio = NULL;
unsigned page_idx;
@@ -4133,6 +4131,7 @@ int extent_readpages(struct extent_io_tree *tree,
struct page *pagepool[16];
struct page *page;
struct extent_map *em_cached = NULL;
+ struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
int nr = 0;
u64 prev_em_start = (u64)-1;
@@ -4199,8 +4198,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
-static int try_release_extent_state(struct extent_map_tree *map,
- struct extent_io_tree *tree,
+static int try_release_extent_state(struct extent_io_tree *tree,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
@@ -4235,13 +4233,13 @@ static int try_release_extent_state(struct extent_map_tree *map,
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
-int try_release_extent_mapping(struct extent_map_tree *map,
- struct extent_io_tree *tree, struct page *page,
- gfp_t mask)
+int try_release_extent_mapping(struct page *page, gfp_t mask)
{
struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
+ struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+ struct extent_map_tree *map = &BTRFS_I(page->mapping->host)->extent_tree;
if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > SZ_16M) {
@@ -4275,7 +4273,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
free_extent_map(em);
}
}
- return try_release_extent_state(map, tree, page, mask);
+ return try_release_extent_state(tree, page, mask);
}
/*
@@ -5617,46 +5615,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
}
}
-void le_bitmap_set(u8 *map, unsigned int start, int len)
-{
- u8 *p = map + BIT_BYTE(start);
- const unsigned int size = start + len;
- int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
- u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
-
- while (len - bits_to_set >= 0) {
- *p |= mask_to_set;
- len -= bits_to_set;
- bits_to_set = BITS_PER_BYTE;
- mask_to_set = ~0;
- p++;
- }
- if (len) {
- mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
- *p |= mask_to_set;
- }
-}
-
-void le_bitmap_clear(u8 *map, unsigned int start, int len)
-{
- u8 *p = map + BIT_BYTE(start);
- const unsigned int size = start + len;
- int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE);
- u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start);
-
- while (len - bits_to_clear >= 0) {
- *p &= ~mask_to_clear;
- len -= bits_to_clear;
- bits_to_clear = BITS_PER_BYTE;
- mask_to_clear = ~0;
- p++;
- }
- if (len) {
- mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
- *p &= ~mask_to_clear;
- }
-}
-
/*
* eb_bitmap_offset() - calculate the page and offset of the byte containing the
* given bit number
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a53009694b16..0bfd4aeb822d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -79,14 +79,6 @@
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
-static inline int le_test_bit(int nr, const u8 *addr)
-{
- return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1)));
-}
-
-void le_bitmap_set(u8 *map, unsigned int start, int len);
-void le_bitmap_clear(u8 *map, unsigned int start, int len);
-
struct extent_state;
struct btrfs_root;
struct btrfs_inode;
@@ -278,9 +270,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int create);
void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
-int try_release_extent_mapping(struct extent_map_tree *map,
- struct extent_io_tree *tree, struct page *page,
- gfp_t mask);
+int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
@@ -421,14 +411,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
-int extent_writepages(struct extent_io_tree *tree,
- struct address_space *mapping,
+int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
-int extent_readpages(struct extent_io_tree *tree,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages);
+int extent_readpages(struct address_space *mapping, struct list_head *pages,
+ unsigned nr_pages);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
void set_page_extent_mapped(struct page *page);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1b8a078f92eb..6648d55e5339 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -518,6 +518,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
/**
* btrfs_add_extent_mapping - add extent mapping into em_tree
+ * @fs_info - used for tracepoint
* @em_tree - the extent tree into which we want to insert the extent mapping
* @em_in - extent we are inserting
* @start - start of the logical range btrfs_get_extent() is requesting
@@ -535,7 +536,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Return 0 on success, otherwise -EEXIST.
*
*/
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len)
{
int ret;
@@ -553,7 +555,7 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
existing = search_extent_mapping(em_tree, start, len);
- trace_btrfs_handle_em_exist(existing, em, start, len);
+ trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
/*
* existing will always be non-NULL, since there must be
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 5fcb80a6ce37..25d985e7532a 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -92,7 +92,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len);
#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e5b569bebc73..d5f80cb300be 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -253,10 +253,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
truncate_pagecache(inode, 0);
/*
- * We don't need an orphan item because truncating the free space cache
- * will never be split across transactions.
- * We don't need to check for -EAGAIN because we're a free space
- * cache inode
+ * We skip the throttling logic for free space cache inodes, so we don't
+ * need to check for -EAGAIN.
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 32a0f6cb5594..b5950aacd697 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -12,7 +12,6 @@
#include "transaction.h"
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
@@ -45,11 +44,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
}
static int add_new_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_free_space_info *info;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -138,10 +136,11 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
}
-static u8 *alloc_bitmap(u32 bitmap_size)
+static unsigned long *alloc_bitmap(u32 bitmap_size)
{
- u8 *ret;
+ unsigned long *ret;
unsigned int nofs_flag;
+ u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long));
/*
* GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
@@ -152,21 +151,42 @@ static u8 *alloc_bitmap(u32 bitmap_size)
* know that recursion is unsafe.
*/
nofs_flag = memalloc_nofs_save();
- ret = kvzalloc(bitmap_size, GFP_KERNEL);
+ ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
return ret;
}
+static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
+{
+ u8 *p = ((u8 *)map) + BIT_BYTE(start);
+ const unsigned int size = start + len;
+ int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
+ u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
+
+ while (len - bits_to_set >= 0) {
+ *p |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_BYTE;
+ mask_to_set = ~0;
+ p++;
+ }
+ if (len) {
+ mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+ *p |= mask_to_set;
+ }
+}
+
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->free_space_root;
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
- u8 *bitmap, *bitmap_cursor;
+ unsigned long *bitmap;
+ char *bitmap_cursor;
u64 start, end;
u64 bitmap_range, i;
u32 bitmap_size, flags, expected_extent_count;
@@ -255,7 +275,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
goto out;
}
- bitmap_cursor = bitmap;
+ bitmap_cursor = (char *)bitmap;
bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
i = start;
while (i < end) {
@@ -296,21 +316,18 @@ out:
}
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->free_space_root;
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
- u8 *bitmap;
+ unsigned long *bitmap;
u64 start, end;
- /* Initialize to silence GCC. */
- u64 extent_start = 0;
- u64 offset;
u32 bitmap_size, flags, expected_extent_count;
- int prev_bit = 0, bit, bitnr;
+ unsigned long nrbits, start_bit, end_bit;
u32 extent_count = 0;
int done = 0, nr;
int ret;
@@ -348,7 +365,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
break;
} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
unsigned long ptr;
- u8 *bitmap_cursor;
+ char *bitmap_cursor;
u32 bitmap_pos, data_size;
ASSERT(found_key.objectid >= start);
@@ -358,7 +375,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
bitmap_pos = div_u64(found_key.objectid - start,
fs_info->sectorsize *
BITS_PER_BYTE);
- bitmap_cursor = bitmap + bitmap_pos;
+ bitmap_cursor = ((char *)bitmap) + bitmap_pos;
data_size = free_space_bitmap_size(found_key.offset,
fs_info->sectorsize);
@@ -392,32 +409,16 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- offset = start;
- bitnr = 0;
- while (offset < end) {
- bit = !!le_test_bit(bitnr, bitmap);
- if (prev_bit == 0 && bit == 1) {
- extent_start = offset;
- } else if (prev_bit == 1 && bit == 0) {
- key.objectid = extent_start;
- key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
- key.offset = offset - extent_start;
-
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- if (ret)
- goto out;
- btrfs_release_path(path);
+ nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize);
+ start_bit = find_next_bit_le(bitmap, nrbits, 0);
- extent_count++;
- }
- prev_bit = bit;
- offset += fs_info->sectorsize;
- bitnr++;
- }
- if (prev_bit == 1) {
- key.objectid = extent_start;
+ while (start_bit < nrbits) {
+ end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
+ ASSERT(start_bit < end_bit);
+
+ key.objectid = start + start_bit * block_group->fs_info->sectorsize;
key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
- key.offset = end - extent_start;
+ key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
@@ -425,6 +426,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
extent_count++;
+
+ start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
}
if (extent_count != expected_extent_count) {
@@ -446,7 +449,6 @@ out:
}
static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path,
int new_extents)
@@ -459,7 +461,8 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (new_extents == 0)
return 0;
- info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ info = search_free_space_info(trans, trans->fs_info, block_group, path,
+ 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
goto out;
@@ -474,12 +477,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count > block_group->bitmap_high_thresh) {
- ret = convert_free_space_to_bitmaps(trans, fs_info, block_group,
- path);
+ ret = convert_free_space_to_bitmaps(trans, block_group, path);
} else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count < block_group->bitmap_low_thresh) {
- ret = convert_free_space_to_extents(trans, fs_info, block_group,
- path);
+ ret = convert_free_space_to_extents(trans, block_group, path);
}
out:
@@ -576,12 +577,11 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
* the bitmap.
*/
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path,
u64 start, u64 size, int remove)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = block_group->fs_info->free_space_root;
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
@@ -682,7 +682,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ ret = update_free_space_extent_count(trans, block_group, path,
new_extents);
out:
@@ -690,12 +690,11 @@ out:
}
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_key key;
u64 found_start, found_end;
u64 end = start + size;
@@ -769,7 +768,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ ret = update_free_space_extent_count(trans, block_group, path,
new_extents);
out:
@@ -777,7 +776,6 @@ out:
}
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
@@ -786,36 +784,35 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
int ret;
if (block_group->needs_free_space) {
- ret = __add_block_group_free_space(trans, fs_info, block_group,
- path);
+ ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
}
- info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ info = search_free_space_info(NULL, trans->fs_info, block_group, path,
+ 0);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
btrfs_release_path(path);
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- return modify_free_space_bitmap(trans, fs_info, block_group,
- path, start, size, 1);
+ return modify_free_space_bitmap(trans, block_group, path,
+ start, size, 1);
} else {
- return remove_free_space_extent(trans, fs_info, block_group,
- path, start, size);
+ return remove_free_space_extent(trans, block_group, path,
+ start, size);
}
}
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 start, u64 size)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_path *path;
int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
@@ -824,7 +821,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
goto out;
}
- block_group = btrfs_lookup_block_group(fs_info, start);
+ block_group = btrfs_lookup_block_group(trans->fs_info, start);
if (!block_group) {
ASSERT(0);
ret = -ENOENT;
@@ -832,8 +829,8 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
}
mutex_lock(&block_group->free_space_lock);
- ret = __remove_from_free_space_tree(trans, fs_info, block_group, path,
- start, size);
+ ret = __remove_from_free_space_tree(trans, block_group, path, start,
+ size);
mutex_unlock(&block_group->free_space_lock);
btrfs_put_block_group(block_group);
@@ -845,12 +842,11 @@ out:
}
static int add_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_key key, new_key;
u64 found_start, found_end;
u64 end = start + size;
@@ -965,7 +961,7 @@ insert:
goto out;
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ ret = update_free_space_extent_count(trans, block_group, path,
new_extents);
out:
@@ -973,17 +969,16 @@ out:
}
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_free_space_info *info;
u32 flags;
int ret;
if (block_group->needs_free_space) {
- ret = __add_block_group_free_space(trans, fs_info, block_group,
- path);
+ ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
}
@@ -995,23 +990,22 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- return modify_free_space_bitmap(trans, fs_info, block_group,
- path, start, size, 0);
+ return modify_free_space_bitmap(trans, block_group, path,
+ start, size, 0);
} else {
- return add_free_space_extent(trans, fs_info, block_group, path,
- start, size);
+ return add_free_space_extent(trans, block_group, path, start,
+ size);
}
}
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 start, u64 size)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_path *path;
int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
@@ -1020,7 +1014,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
goto out;
}
- block_group = btrfs_lookup_block_group(fs_info, start);
+ block_group = btrfs_lookup_block_group(trans->fs_info, start);
if (!block_group) {
ASSERT(0);
ret = -ENOENT;
@@ -1028,8 +1022,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
}
mutex_lock(&block_group->free_space_lock);
- ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start,
- size);
+ ret = __add_to_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
btrfs_put_block_group(block_group);
@@ -1046,10 +1039,9 @@ out:
* through the normal add/remove hooks.
*/
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
{
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root = trans->fs_info->extent_root;
struct btrfs_path *path, *path2;
struct btrfs_key key;
u64 start, end;
@@ -1066,7 +1058,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- ret = add_new_free_space_info(trans, fs_info, block_group, path2);
+ ret = add_new_free_space_info(trans, block_group, path2);
if (ret)
goto out;
@@ -1099,7 +1091,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
break;
if (start < key.objectid) {
- ret = __add_to_free_space_tree(trans, fs_info,
+ ret = __add_to_free_space_tree(trans,
block_group,
path2, start,
key.objectid -
@@ -1109,7 +1101,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
}
start = key.objectid;
if (key.type == BTRFS_METADATA_ITEM_KEY)
- start += fs_info->nodesize;
+ start += trans->fs_info->nodesize;
else
start += key.offset;
} else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
@@ -1124,8 +1116,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
break;
}
if (start < end) {
- ret = __add_to_free_space_tree(trans, fs_info, block_group,
- path2, start, end - start);
+ ret = __add_to_free_space_tree(trans, block_group, path2,
+ start, end - start);
if (ret)
goto out_locked;
}
@@ -1165,7 +1157,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
while (node) {
block_group = rb_entry(node, struct btrfs_block_group_cache,
cache_node);
- ret = populate_free_space_tree(trans, fs_info, block_group);
+ ret = populate_free_space_tree(trans, block_group);
if (ret)
goto abort;
node = rb_next(node);
@@ -1269,7 +1261,6 @@ abort:
}
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
@@ -1277,19 +1268,19 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
block_group->needs_free_space = 0;
- ret = add_new_free_space_info(trans, fs_info, block_group, path);
+ ret = add_new_free_space_info(trans, block_group, path);
if (ret)
return ret;
- return __add_to_free_space_tree(trans, fs_info, block_group, path,
+ return __add_to_free_space_tree(trans, block_group, path,
block_group->key.objectid,
block_group->key.offset);
}
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path = NULL;
int ret = 0;
@@ -1306,7 +1297,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans,
goto out;
}
- ret = __add_block_group_free_space(trans, fs_info, block_group, path);
+ ret = __add_block_group_free_space(trans, block_group, path);
out:
btrfs_free_path(path);
@@ -1317,10 +1308,9 @@ out:
}
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_path *path;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -1328,7 +1318,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
if (block_group->needs_free_space) {
@@ -1439,7 +1429,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
total_found += add_new_free_space(block_group,
- fs_info,
extent_start,
offset);
if (total_found > CACHING_CTL_WAKE_UP) {
@@ -1453,8 +1442,8 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
}
}
if (prev_bit == 1) {
- total_found += add_new_free_space(block_group, fs_info,
- extent_start, end);
+ total_found += add_new_free_space(block_group, extent_start,
+ end);
extent_count++;
}
@@ -1511,8 +1500,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
caching_ctl->progress = key.objectid;
- total_found += add_new_free_space(block_group, fs_info,
- key.objectid,
+ total_found += add_new_free_space(block_group, key.objectid,
key.objectid + key.offset);
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 874b4feecad2..3133651d7d70 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -19,16 +19,12 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 start, u64 size);
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 start, u64 size);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -38,19 +34,15 @@ search_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, int cow);
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size);
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size);
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
int free_space_test_bit(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0b86cf10cf2a..89b208201783 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1018,8 +1018,10 @@ static noinline int cow_file_range(struct inode *inode,
ram_size, /* ram_bytes */
BTRFS_COMPRESS_NONE, /* compress_type */
BTRFS_ORDERED_REGULAR /* type */);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
goto out_reserve;
+ }
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
@@ -1156,13 +1158,10 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /*
- * atomic_sub_return implies a barrier for waitqueue_active
- */
+ /* atomic_sub_return implies a barrier */
if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M &&
- waitqueue_active(&fs_info->async_submit_wait))
- wake_up(&fs_info->async_submit_wait);
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
if (async_cow->inode)
submit_compressed_extents(async_cow->inode, async_cow);
@@ -1373,6 +1372,13 @@ next_slot:
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
goto out_check;
+ /*
+ * Do the same check as in btrfs_cross_ref_exist but
+ * without the unnecessary search.
+ */
+ if (btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
if (btrfs_extent_readonly(fs_info, disk_bytenr))
@@ -1754,6 +1760,7 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root,
&inode->runtime_flags);
root->nr_delalloc_inodes--;
if (!root->nr_delalloc_inodes) {
+ ASSERT(list_empty(&root->delalloc_inodes));
spin_lock(&fs_info->delalloc_root_lock);
BUG_ON(list_empty(&root->delalloc_root));
list_del_init(&root->delalloc_root);
@@ -3158,6 +3165,9 @@ out:
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
+ /* Try to release some metadata so we don't get an OOM but don't wait */
+ btrfs_btree_balance_dirty_nodelay(fs_info);
+
return ret;
}
@@ -3300,177 +3310,31 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
}
/*
- * This is called in transaction commit time. If there are no orphan
- * files in the subvolume, it removes orphan item and frees block_rsv
- * structure.
- */
-void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *block_rsv;
- int ret;
-
- if (atomic_read(&root->orphan_inodes) ||
- root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
- return;
-
- spin_lock(&root->orphan_lock);
- if (atomic_read(&root->orphan_inodes)) {
- spin_unlock(&root->orphan_lock);
- return;
- }
-
- if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
- spin_unlock(&root->orphan_lock);
- return;
- }
-
- block_rsv = root->orphan_block_rsv;
- root->orphan_block_rsv = NULL;
- spin_unlock(&root->orphan_lock);
-
- if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
- btrfs_root_refs(&root->root_item) > 0) {
- ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
- root->root_key.objectid);
- if (ret)
- btrfs_abort_transaction(trans, ret);
- else
- clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
- &root->state);
- }
-
- if (block_rsv) {
- WARN_ON(block_rsv->size > 0);
- btrfs_free_block_rsv(fs_info, block_rsv);
- }
-}
-
-/*
- * This creates an orphan entry for the given inode in case something goes
- * wrong in the middle of an unlink/truncate.
- *
- * NOTE: caller of this function should reserve 5 units of metadata for
- * this function.
+ * This creates an orphan entry for the given inode in case something goes wrong
+ * in the middle of an unlink.
*/
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
- struct btrfs_block_rsv *block_rsv = NULL;
- int reserve = 0;
- bool insert = false;
int ret;
- if (!root->orphan_block_rsv) {
- block_rsv = btrfs_alloc_block_rsv(fs_info,
- BTRFS_BLOCK_RSV_TEMP);
- if (!block_rsv)
- return -ENOMEM;
- }
-
- if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags))
- insert = true;
-
- if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags))
- reserve = 1;
-
- spin_lock(&root->orphan_lock);
- /* If someone has created ->orphan_block_rsv, be happy to use it. */
- if (!root->orphan_block_rsv) {
- root->orphan_block_rsv = block_rsv;
- } else if (block_rsv) {
- btrfs_free_block_rsv(fs_info, block_rsv);
- block_rsv = NULL;
- }
-
- if (insert)
- atomic_inc(&root->orphan_inodes);
- spin_unlock(&root->orphan_lock);
-
- /* grab metadata reservation from transaction handle */
- if (reserve) {
- ret = btrfs_orphan_reserve_metadata(trans, inode);
- ASSERT(!ret);
- if (ret) {
- /*
- * dec doesn't need spin_lock as ->orphan_block_rsv
- * would be released only if ->orphan_inodes is
- * zero.
- */
- atomic_dec(&root->orphan_inodes);
- clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags);
- if (insert)
- clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags);
- return ret;
- }
- }
-
- /* insert an orphan item to track this unlinked/truncated file */
- if (insert) {
- ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
- if (ret) {
- if (reserve) {
- clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags);
- btrfs_orphan_release_metadata(inode);
- }
- /*
- * btrfs_orphan_commit_root may race with us and set
- * ->orphan_block_rsv to zero, in order to avoid that,
- * decrease ->orphan_inodes after everything is done.
- */
- atomic_dec(&root->orphan_inodes);
- if (ret != -EEXIST) {
- clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags);
- btrfs_abort_transaction(trans, ret);
- return ret;
- }
- }
- ret = 0;
+ ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
return 0;
}
/*
- * We have done the truncate/delete so we can go ahead and remove the orphan
- * item for this particular inode.
+ * We have done the delete so we can go ahead and remove the orphan item for
+ * this particular inode.
*/
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
- struct btrfs_root *root = inode->root;
- int delete_item = 0;
- int ret = 0;
-
- if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags))
- delete_item = 1;
-
- if (delete_item && trans)
- ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-
- if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags))
- btrfs_orphan_release_metadata(inode);
-
- /*
- * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
- * to zero, in order to avoid that, decrease ->orphan_inodes after
- * everything is done.
- */
- if (delete_item)
- atomic_dec(&root->orphan_inodes);
-
- return ret;
+ return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
}
/*
@@ -3486,7 +3350,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
struct inode *inode;
u64 last_objectid = 0;
- int ret = 0, nr_unlink = 0, nr_truncate = 0;
+ int ret = 0, nr_unlink = 0;
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
return 0;
@@ -3586,12 +3450,31 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
key.offset = found_key.objectid - 1;
continue;
}
+
}
+
/*
- * Inode is already gone but the orphan item is still there,
- * kill the orphan item.
+ * If we have an inode with links, there are a couple of
+ * possibilities. Old kernels (before v3.12) used to create an
+ * orphan item for truncate indicating that there were possibly
+ * extent items past i_size that needed to be deleted. In v3.12,
+ * truncate was changed to update i_size in sync with the extent
+ * items, but the (useless) orphan item was still created. Since
+ * v4.18, we don't create the orphan item for truncate at all.
+ *
+ * So, this item could mean that we need to do a truncate, but
+ * only if this filesystem was last used on a pre-v3.12 kernel
+ * and was not cleanly unmounted. The odds of that are quite
+ * slim, and it's a pain to do the truncate now, so just delete
+ * the orphan item.
+ *
+ * It's also possible that this orphan item was supposed to be
+ * deleted but wasn't. The inode number may have been reused,
+ * but either way, we can delete the orphan item.
*/
- if (ret == -ENOENT) {
+ if (ret == -ENOENT || inode->i_nlink) {
+ if (!ret)
+ iput(inode);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3607,42 +3490,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
continue;
}
- /*
- * add this inode to the orphan list so btrfs_orphan_del does
- * the proper thing when we hit it
- */
- set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &BTRFS_I(inode)->runtime_flags);
- atomic_inc(&root->orphan_inodes);
-
- /* if we have links, this was a truncate, lets do that */
- if (inode->i_nlink) {
- if (WARN_ON(!S_ISREG(inode->i_mode))) {
- iput(inode);
- continue;
- }
- nr_truncate++;
-
- /* 1 for the orphan item deletion. */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- iput(inode);
- ret = PTR_ERR(trans);
- goto out;
- }
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- btrfs_end_transaction(trans);
- if (ret) {
- iput(inode);
- goto out;
- }
-
- ret = btrfs_truncate(inode, false);
- if (ret)
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- } else {
- nr_unlink++;
- }
+ nr_unlink++;
/* this will do delete_inode and everything for us */
iput(inode);
@@ -3654,12 +3502,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
- if (root->orphan_block_rsv)
- btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
- (u64)-1);
-
- if (root->orphan_block_rsv ||
- test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
+ if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
btrfs_end_transaction(trans);
@@ -3667,8 +3510,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
if (nr_unlink)
btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
- if (nr_truncate)
- btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
out:
if (ret)
@@ -3931,7 +3772,7 @@ cache_acl:
break;
}
- btrfs_update_iflags(inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
return 0;
make_bad:
@@ -4245,7 +4086,7 @@ out:
return ret;
}
-int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len)
@@ -4326,6 +4167,262 @@ out:
return ret;
}
+/*
+ * Helper to check if the subvolume references other subvolumes or if it's
+ * default.
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 dir_id;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Make sure this root isn't set as the default subvol */
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
+ di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
+ dir_id, "default", 7, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.objectid == root->root_key.objectid) {
+ ret = -EPERM;
+ btrfs_err(fs_info,
+ "deleting default subvolume %llu is not allowed",
+ key.objectid);
+ goto out;
+ }
+ btrfs_release_path(path);
+ }
+
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = 0;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == root->root_key.objectid &&
+ key.type == BTRFS_ROOT_REF_KEY)
+ ret = -ENOTEMPTY;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/* Delete all dentries for inodes belonging to the root */
+static void btrfs_prune_dentries(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+ u64 objectid = 0;
+
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
+ node = node->rb_left;
+ else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ if (atomic_read(&inode->i_count) > 1)
+ d_prune_aliases(inode);
+ /*
+ * btrfs_drop_inode will have it removed from the inode
+ * cache when its usage count hits zero.
+ */
+ iput(inode);
+ cond_resched();
+ spin_lock(&root->inode_lock);
+ goto again;
+ }
+
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+}
+
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_root *dest = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv block_rsv;
+ u64 root_flags;
+ int ret;
+ int err;
+
+ /*
+ * Don't allow to delete a subvolume with send in progress. This is
+ * inside the inode lock so the error handling that has to drop the bit
+ * again is not run concurrently.
+ */
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ if (dest->send_in_progress == 0) {
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ } else {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu during send",
+ dest->root_key.objectid);
+ return -EPERM;
+ }
+
+ down_write(&fs_info->subvol_sem);
+
+ err = may_destroy_subvol(dest);
+ if (err)
+ goto out_up_write;
+
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * One for dir inode,
+ * two for dir entries,
+ * two for root ref/backref.
+ */
+ err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
+ if (err)
+ goto out_up_write;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_release;
+ }
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
+
+ btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+
+ ret = btrfs_unlink_subvol(trans, root, dir,
+ dest->root_key.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ if (ret) {
+ err = ret;
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+
+ btrfs_record_root_in_trans(trans, dest);
+
+ memset(&dest->root_item.drop_progress, 0,
+ sizeof(dest->root_item.drop_progress));
+ dest->root_item.drop_level = 0;
+ btrfs_set_root_refs(&dest->root_item, 0);
+
+ if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
+ ret = btrfs_insert_orphan_item(trans,
+ fs_info->tree_root,
+ dest->root_key.objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ }
+
+ ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
+ BTRFS_UUID_KEY_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_remove(trans,
+ dest->root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ }
+
+out_end_trans:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
+ ret = btrfs_end_transaction(trans);
+ if (ret && !err)
+ err = ret;
+ inode->i_flags |= S_DEAD;
+out_release:
+ btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (err) {
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ } else {
+ d_invalidate(dentry);
+ btrfs_prune_dentries(dest);
+ ASSERT(dest->send_in_progress == 0);
+
+ /* the last ref */
+ if (dest->ino_cache_inode) {
+ iput(dest->ino_cache_inode);
+ dest->ino_cache_inode = NULL;
+ }
+ }
+
+ return err;
+}
+
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
@@ -4337,7 +4434,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
- return -EPERM;
+ return btrfs_delete_subvolume(dir, dentry);
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
@@ -4449,7 +4546,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int pending_del_slot = 0;
int extent_type = -1;
int ret;
- int err = 0;
u64 ino = btrfs_ino(BTRFS_I(inode));
u64 bytes_deleted = 0;
bool be_nice = false;
@@ -4501,22 +4597,19 @@ search_again:
* up a huge file in a single leaf. Most of the time that
* bytes_deleted is > 0, it will be huge by the time we get here
*/
- if (be_nice && bytes_deleted > SZ_32M) {
- if (btrfs_should_end_transaction(trans)) {
- err = -EAGAIN;
- goto error;
- }
+ if (be_nice && bytes_deleted > SZ_32M &&
+ btrfs_should_end_transaction(trans)) {
+ ret = -EAGAIN;
+ goto out;
}
-
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (ret > 0) {
+ ret = 0;
/* there are no items in the tree for us to truncate, we're
* done
*/
@@ -4627,7 +4720,7 @@ search_again:
* We have to bail so the last_size is set to
* just before this extent.
*/
- err = NEED_TRUNCATE_BLOCK;
+ ret = NEED_TRUNCATE_BLOCK;
break;
}
@@ -4666,7 +4759,10 @@ delete:
extent_num_bytes, 0,
btrfs_header_owner(leaf),
ino, extent_offset);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
if (btrfs_should_throttle_delayed_refs(trans, fs_info))
btrfs_async_run_delayed_refs(fs_info,
trans->delayed_ref_updates * 2,
@@ -4694,7 +4790,7 @@ delete:
pending_del_nr);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto error;
+ break;
}
pending_del_nr = 0;
}
@@ -4705,8 +4801,8 @@ delete:
trans->delayed_ref_updates = 0;
ret = btrfs_run_delayed_refs(trans,
updates * 2);
- if (ret && !err)
- err = ret;
+ if (ret)
+ break;
}
}
/*
@@ -4714,8 +4810,8 @@ delete:
* and let the transaction restart
*/
if (should_end) {
- err = -EAGAIN;
- goto error;
+ ret = -EAGAIN;
+ break;
}
goto search_again;
} else {
@@ -4723,32 +4819,37 @@ delete:
}
}
out:
- if (pending_del_nr) {
- ret = btrfs_del_items(trans, root, path, pending_del_slot,
+ if (ret >= 0 && pending_del_nr) {
+ int err;
+
+ err = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ if (err) {
+ btrfs_abort_transaction(trans, err);
+ ret = err;
+ }
}
-error:
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ASSERT(last_size >= new_size);
- if (!err && last_size > new_size)
+ if (!ret && last_size > new_size)
last_size = new_size;
btrfs_ordered_update_i_size(inode, last_size, NULL);
}
btrfs_free_path(path);
- if (be_nice && bytes_deleted > SZ_32M) {
+ if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
unsigned long updates = trans->delayed_ref_updates;
+ int err;
+
if (updates) {
trans->delayed_ref_updates = 0;
- ret = btrfs_run_delayed_refs(trans, updates * 2);
- if (ret && !err)
- err = ret;
+ err = btrfs_run_delayed_refs(trans, updates * 2);
+ if (err)
+ ret = err;
}
}
- return err;
+ return ret;
}
/*
@@ -5090,30 +5191,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags);
- /*
- * 1 for the orphan item we're going to add
- * 1 for the orphan item deletion.
- */
- trans = btrfs_start_transaction(root, 2);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- /*
- * We need to do this in case we fail at _any_ point during the
- * actual truncate. Once we do the truncate_setsize we could
- * invalidate pages which forces any outstanding ordered io to
- * be instantly completed which will give us extents that need
- * to be truncated. If we fail to get an orphan inode down we
- * could have left over extents that were never meant to live,
- * so we need to guarantee from this point on that everything
- * will be consistent.
- */
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- btrfs_end_transaction(trans);
- if (ret)
- return ret;
-
- /* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
/* Disable nonlocked read DIO to avoid the end less truncate */
@@ -5125,29 +5202,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (ret && inode->i_nlink) {
int err;
- /* To get a stable disk_i_size */
- err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
- if (err) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- return err;
- }
-
/*
- * failed to truncate, disk_i_size is only adjusted down
- * as we remove extents, so it should represent the true
- * size of the inode, so reset the in memory size and
- * delete our orphan entry.
+ * Truncate failed, so fix up the in-memory size. We
+ * adjusted disk_i_size down as we removed extents, so
+ * wait for disk_i_size to be stable and then update the
+ * in-memory size to match.
*/
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- return ret;
- }
- i_size_write(inode, BTRFS_I(inode)->disk_i_size);
- err = btrfs_orphan_del(trans, BTRFS_I(inode));
+ err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (err)
- btrfs_abort_transaction(trans, err);
- btrfs_end_transaction(trans);
+ return err;
+ i_size_write(inode, BTRFS_I(inode)->disk_i_size);
}
}
@@ -5277,13 +5341,52 @@ static void evict_inode_truncate_pages(struct inode *inode)
spin_unlock(&io_tree->lock);
}
+static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 min_size)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ int failures = 0;
+
+ for (;;) {
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ BTRFS_RESERVE_FLUSH_LIMIT);
+
+ if (ret && ++failures > 2) {
+ btrfs_warn(fs_info,
+ "could not allocate space for a delete; will truncate on mount");
+ return ERR_PTR(-ENOSPC);
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans) || !ret)
+ return trans;
+
+ /*
+ * Try to steal from the global reserve if there is space for
+ * it.
+ */
+ if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
+ !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
+ return trans;
+
+ /* If not, commit and try again. */
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+}
+
void btrfs_evict_inode(struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *rsv, *global_rsv;
- int steal_from_global = 0;
+ struct btrfs_block_rsv *rsv;
u64 min_size;
int ret;
@@ -5304,21 +5407,16 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_is_free_space_inode(BTRFS_I(inode))))
goto no_delete;
- if (is_bad_inode(inode)) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
+ if (is_bad_inode(inode))
goto no_delete;
- }
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
if (!special_file(inode->i_mode))
btrfs_wait_ordered_range(inode, 0, (u64)-1);
btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
- if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
- BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &BTRFS_I(inode)->runtime_flags));
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
goto no_delete;
- }
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
@@ -5327,130 +5425,63 @@ void btrfs_evict_inode(struct inode *inode)
}
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
- if (ret) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
+ if (ret)
goto no_delete;
- }
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
+ if (!rsv)
goto no_delete;
- }
rsv->size = min_size;
rsv->failfast = 1;
- global_rsv = &fs_info->global_block_rsv;
btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * This is a bit simpler than btrfs_truncate since we've already
- * reserved our space for our orphan item in the unlink, so we just
- * need to reserve some slack space in case we add bytes and update
- * inode item when doing the truncate.
- */
while (1) {
- ret = btrfs_block_rsv_refill(root, rsv, min_size,
- BTRFS_RESERVE_FLUSH_LIMIT);
-
- /*
- * Try and steal from the global reserve since we will
- * likely not use this space anyway, we want to try as
- * hard as possible to get this to work.
- */
- if (ret)
- steal_from_global++;
- else
- steal_from_global = 0;
- ret = 0;
-
- /*
- * steal_from_global == 0: we reserved stuff, hooray!
- * steal_from_global == 1: we didn't reserve stuff, boo!
- * steal_from_global == 2: we've committed, still not a lot of
- * room but maybe we'll have room in the global reserve this
- * time.
- * steal_from_global == 3: abandon all hope!
- */
- if (steal_from_global > 2) {
- btrfs_warn(fs_info,
- "Could not get space for a delete, will truncate on mount %d",
- ret);
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- btrfs_free_block_rsv(fs_info, rsv);
- goto no_delete;
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- btrfs_free_block_rsv(fs_info, rsv);
- goto no_delete;
- }
-
- /*
- * We can't just steal from the global reserve, we need to make
- * sure there is room to do it, if not we need to commit and try
- * again.
- */
- if (steal_from_global) {
- if (!btrfs_check_space_for_delayed_refs(trans, fs_info))
- ret = btrfs_block_rsv_migrate(global_rsv, rsv,
- min_size, 0);
- else
- ret = -ENOSPC;
- }
-
- /*
- * Couldn't steal from the global reserve, we have too much
- * pending stuff built up, commit the transaction and try it
- * again.
- */
- if (ret) {
- ret = btrfs_commit_transaction(trans);
- if (ret) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- btrfs_free_block_rsv(fs_info, rsv);
- goto no_delete;
- }
- continue;
- } else {
- steal_from_global = 0;
- }
+ trans = evict_refill_and_join(root, rsv, min_size);
+ if (IS_ERR(trans))
+ goto free_rsv;
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
- if (ret != -ENOSPC && ret != -EAGAIN)
- break;
-
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
- trans = NULL;
btrfs_btree_balance_dirty(fs_info);
+ if (ret && ret != -ENOSPC && ret != -EAGAIN)
+ goto free_rsv;
+ else if (!ret)
+ break;
}
- btrfs_free_block_rsv(fs_info, rsv);
-
/*
- * Errors here aren't a big deal, it just means we leave orphan items
- * in the tree. They will be cleaned up on the next mount.
+ * Errors here aren't a big deal, it just means we leave orphan items in
+ * the tree. They will be cleaned up on the next mount. If the inode
+ * number gets reused, cleanup deletes the orphan item without doing
+ * anything, and unlink reuses the existing orphan item.
+ *
+ * If it turns out that we are dropping too many of these, we might want
+ * to add a mechanism for retrying these after a commit.
*/
- if (ret == 0) {
- trans->block_rsv = root->orphan_block_rsv;
+ trans = evict_refill_and_join(root, rsv, min_size);
+ if (!IS_ERR(trans)) {
+ trans->block_rsv = rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
- } else {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ btrfs_end_transaction(trans);
}
- trans->block_rsv = &fs_info->trans_block_rsv;
if (!(root == fs_info->tree_root ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(fs_info);
+free_rsv:
+ btrfs_free_block_rsv(fs_info, rsv);
no_delete:
+ /*
+ * If we didn't successfully delete, the orphan item will still be in
+ * the tree and we'll retry on the next mount. Again, we might also want
+ * to retry these periodically in the future.
+ */
btrfs_remove_delayed_node(BTRFS_I(inode));
clear_inode(inode);
}
@@ -5626,69 +5657,6 @@ static void inode_tree_del(struct inode *inode)
}
}
-void btrfs_invalidate_inodes(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
- u64 objectid = 0;
-
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
- WARN_ON(btrfs_root_refs(&root->root_item) != 0);
-
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
-
- if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
- node = node->rb_left;
- else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- if (atomic_read(&inode->i_count) > 1)
- d_prune_aliases(inode);
- /*
- * btrfs_drop_inode will have it removed from
- * the inode cache when its usage count
- * hits zero.
- */
- iput(inode);
- cond_resched();
- spin_lock(&root->inode_lock);
- goto again;
- }
-
- if (cond_resched_lock(&root->inode_lock))
- goto again;
-
- node = rb_next(node);
- }
- spin_unlock(&root->inode_lock);
-}
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
@@ -5850,11 +5818,6 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
return 0;
}
-static void btrfs_dentry_release(struct dentry *dentry)
-{
- kfree(dentry->d_fsdata);
-}
-
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
@@ -6270,7 +6233,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
}
- btrfs_update_iflags(inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
}
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -6705,8 +6668,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
* 2 items for inode and inode ref
* 2 items for dir items
* 1 item for parent inode
+ * 1 item for orphan item deletion if O_TMPFILE
*/
- trans = btrfs_start_transaction(root, 5);
+ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
trans = NULL;
@@ -7083,7 +7047,7 @@ insert:
err = 0;
write_lock(&em_tree->lock);
- err = btrfs_add_extent_mapping(em_tree, &em, start, len);
+ err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
@@ -7368,6 +7332,14 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
btrfs_file_extent_other_encoding(leaf, fi))
goto out;
+ /*
+ * Do the same check as in btrfs_cross_ref_exist but without the
+ * unnecessary search.
+ */
+ if (btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out;
+
backref_offset = btrfs_file_extent_offset(leaf, fi);
if (orig_start) {
@@ -7568,6 +7540,125 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
return em;
}
+
+static int btrfs_get_blocks_direct_read(struct extent_map *em,
+ struct buffer_head *bh_result,
+ struct inode *inode,
+ u64 start, u64 len)
+{
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ return -ENOENT;
+
+ len = min(len, em->len - (start - em->start));
+
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = em->bdev;
+ set_buffer_mapped(bh_result);
+
+ return 0;
+}
+
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+ struct buffer_head *bh_result,
+ struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ u64 start, u64 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *em = *map;
+ int ret = 0;
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->block_start != EXTENT_MAP_HOLE)) {
+ int type;
+ u64 block_start, orig_start, orig_block_len, ram_bytes;
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = em->block_start + (start - em->start);
+
+ if (can_nocow_extent(inode, start, &len, &orig_start,
+ &orig_block_len, &ram_bytes) == 1 &&
+ btrfs_inc_nocow_writers(fs_info, block_start)) {
+ struct extent_map *em2;
+
+ em2 = btrfs_create_dio_extent(inode, start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+ btrfs_dec_nocow_writers(fs_info, block_start);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ *map = em = em2;
+ }
+
+ if (em2 && IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
+ goto out;
+ }
+ /*
+ * For inode marked NODATACOW or extent marked PREALLOC,
+ * use the existing or preallocated extent, so does not
+ * need to adjust btrfs_space_info's bytes_may_use.
+ */
+ btrfs_free_reserved_data_space_noquota(inode, start,
+ len);
+ goto skip_cow;
+ }
+ }
+
+ /* this will cow the extent */
+ len = bh_result->b_size;
+ free_extent_map(em);
+ *map = em = btrfs_new_extent_direct(inode, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ len = min(len, em->len - (start - em->start));
+
+skip_cow:
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = em->bdev;
+ set_buffer_mapped(bh_result);
+
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
+
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (!dio_data->overwrite && start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+
+ WARN_ON(dio_data->reserve < len);
+ dio_data->reserve -= len;
+ dio_data->unsubmitted_oe_range_end = start + len;
+ current->journal_info = dio_data;
+out:
+ return ret;
+}
+
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -7636,116 +7727,36 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto unlock_err;
}
- /* Just a good old fashioned hole, return */
- if (!create && (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- free_extent_map(em);
- goto unlock_err;
- }
-
- /*
- * We don't allocate a new extent in the following cases
- *
- * 1) The inode is marked as NODATACOW. In this case we'll just use the
- * existing extent.
- * 2) The extent is marked as PREALLOC. We're good to go here and can
- * just use the extent.
- *
- */
- if (!create) {
- len = min(len, em->len - (start - em->start));
- lockstart = start + len;
- goto unlock;
- }
-
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
- ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- em->block_start != EXTENT_MAP_HOLE)) {
- int type;
- u64 block_start, orig_start, orig_block_len, ram_bytes;
-
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- type = BTRFS_ORDERED_PREALLOC;
- else
- type = BTRFS_ORDERED_NOCOW;
- len = min(len, em->len - (start - em->start));
- block_start = em->block_start + (start - em->start);
-
- if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1 &&
- btrfs_inc_nocow_writers(fs_info, block_start)) {
- struct extent_map *em2;
-
- em2 = btrfs_create_dio_extent(inode, start, len,
- orig_start, block_start,
- len, orig_block_len,
- ram_bytes, type);
- btrfs_dec_nocow_writers(fs_info, block_start);
- if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
- em = em2;
- }
- if (em2 && IS_ERR(em2)) {
- ret = PTR_ERR(em2);
- goto unlock_err;
- }
- /*
- * For inode marked NODATACOW or extent marked PREALLOC,
- * use the existing or preallocated extent, so does not
- * need to adjust btrfs_space_info's bytes_may_use.
- */
- btrfs_free_reserved_data_space_noquota(inode,
- start, len);
- goto unlock;
- }
- }
-
- /*
- * this will cow the extent, reset the len in case we changed
- * it above
- */
- len = bh_result->b_size;
- free_extent_map(em);
- em = btrfs_new_extent_direct(inode, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto unlock_err;
- }
- len = min(len, em->len - (start - em->start));
-unlock:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
- set_buffer_mapped(bh_result);
if (create) {
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
+ ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
+ dio_data, start, len);
+ if (ret < 0)
+ goto unlock_err;
+ /* clear and unlock the entire range */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, 1, 0, &cached_state);
+ } else {
+ ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
+ start, len);
+ /* Can be negative only if we read from a hole */
+ if (ret < 0) {
+ ret = 0;
+ free_extent_map(em);
+ goto unlock_err;
+ }
/*
- * Need to update the i_size under the extent lock so buffered
- * readers will get the updated i_size when we unlock.
+ * We need to unlock only the end area that we aren't using.
+ * The rest is going to be unlocked by the endio routine.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
- i_size_write(inode, start + len);
-
- WARN_ON(dio_data->reserve < len);
- dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
- }
-
- /*
- * In the case of write we need to clear and unlock the entire range,
- * in the case of read we need to unlock only the end area that we
- * aren't using if there is any left over space.
- */
- if (lockstart < lockend) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, unlock_bits, 1, 0,
- &cached_state);
- } else {
- free_extent_state(cached_state);
+ lockstart = start + bh_result->b_size;
+ if (lockstart < lockend) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, unlock_bits, 1, 0,
+ &cached_state);
+ } else {
+ free_extent_state(cached_state);
+ }
}
free_extent_map(em);
@@ -8131,7 +8142,6 @@ static void __endio_write_update_ordered(struct inode *inode,
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
u64 last_offset;
- int ret;
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
wq = fs_info->endio_freespace_worker;
@@ -8141,32 +8151,31 @@ static void __endio_write_update_ordered(struct inode *inode,
func = btrfs_endio_write_helper;
}
-again:
- last_offset = ordered_offset;
- ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate);
- if (!ret)
- goto out_test;
-
- btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(wq, &ordered->work);
-out_test:
- /*
- * If btrfs_dec_test_ordered_pending does not find any ordered extent
- * in the range, we can exit.
- */
- if (ordered_offset == last_offset)
- return;
- /*
- * our bio might span multiple ordered extents. If we haven't
- * completed the accounting for the whole dio, go back and try again
- */
- if (ordered_offset < offset + bytes) {
- ordered_bytes = offset + bytes - ordered_offset;
- ordered = NULL;
- goto again;
+ while (ordered_offset < offset + bytes) {
+ last_offset = ordered_offset;
+ if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
+ &ordered_offset,
+ ordered_bytes,
+ uptodate)) {
+ btrfs_init_work(&ordered->work, func,
+ finish_ordered_fn,
+ NULL, NULL);
+ btrfs_queue_work(wq, &ordered->work);
+ }
+ /*
+ * If btrfs_dec_test_ordered_pending does not find any ordered
+ * extent in the range, we can exit.
+ */
+ if (ordered_offset == last_offset)
+ return;
+ /*
+ * Our bio might span multiple ordered extents. In this case
+ * we keep goin until we have accounted the whole dio.
+ */
+ if (ordered_offset < offset + bytes) {
+ ordered_bytes = offset + bytes - ordered_offset;
+ ordered = NULL;
+ }
}
}
@@ -8705,29 +8714,19 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
static int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_io_tree *tree;
-
- tree = &BTRFS_I(mapping->host)->io_tree;
- return extent_writepages(tree, mapping, wbc);
+ return extent_writepages(mapping, wbc);
}
static int
btrfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
- struct extent_io_tree *tree;
- tree = &BTRFS_I(mapping->host)->io_tree;
- return extent_readpages(tree, mapping, pages, nr_pages);
+ return extent_readpages(mapping, pages, nr_pages);
}
+
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
- struct extent_io_tree *tree;
- struct extent_map_tree *map;
- int ret;
-
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- map = &BTRFS_I(page->mapping->host)->extent_tree;
- ret = try_release_extent_mapping(map, tree, page, gfp_flags);
+ int ret = try_release_extent_mapping(page, gfp_flags);
if (ret == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
@@ -8868,8 +8867,8 @@ again:
*
* We are not allowed to take the i_mutex here so we have to play games to
* protect against truncate races as the page could now be beyond EOF. Because
- * vmtruncate() writes the inode size before removing pages, once we have the
- * page lock we can determine safely if the page is beyond EOF. If it is not
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is not
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
*/
@@ -9031,8 +9030,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- int ret = 0;
- int err = 0;
+ int ret;
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
@@ -9045,39 +9043,31 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
}
/*
- * Yes ladies and gentlemen, this is indeed ugly. The fact is we have
- * 3 things going on here
+ * Yes ladies and gentlemen, this is indeed ugly. We have a couple of
+ * things going on here:
*
- * 1) We need to reserve space for our orphan item and the space to
- * delete our orphan item. Lord knows we don't want to have a dangling
- * orphan item because we didn't reserve space to remove it.
+ * 1) We need to reserve space to update our inode.
*
- * 2) We need to reserve space to update our inode.
- *
- * 3) We need to have something to cache all the space that is going to
+ * 2) We need to have something to cache all the space that is going to
* be free'd up by the truncate operation, but also have some slack
* space reserved in case it uses space during the truncate (thank you
* very much snapshotting).
*
- * And we need these to all be separate. The fact is we can use a lot of
+ * And we need these to be separate. The fact is we can use a lot of
* space doing the truncate, and we have no earthly idea how much space
* we will use, so we need the truncate reservation to be separate so it
- * doesn't end up using space reserved for updating the inode or
- * removing the orphan item. We also need to be able to stop the
- * transaction and start a new one, which means we need to be able to
- * update the inode several times, and we have no idea of knowing how
- * many times that will be, so we can't just reserve 1 item for the
- * entirety of the operation, so that has to be done separately as well.
- * Then there is the orphan item, which does indeed need to be held on
- * to for the whole operation, and we need nobody to touch this reserved
- * space except the orphan code.
+ * doesn't end up using space reserved for updating the inode. We also
+ * need to be able to stop the transaction and start a new one, which
+ * means we need to be able to update the inode several times, and we
+ * have no idea of knowing how many times that will be, so we can't just
+ * reserve 1 item for the entirety of the operation, so that has to be
+ * done separately as well.
*
* So that leaves us with
*
- * 1) root->orphan_block_rsv - for the orphan deletion.
- * 2) rsv - for the truncate reservation, which we will steal from the
+ * 1) rsv - for the truncate reservation, which we will steal from the
* transaction reservation.
- * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
+ * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
@@ -9092,7 +9082,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
*/
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out;
}
@@ -9116,24 +9106,19 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
trans->block_rsv = &fs_info->trans_block_rsv;
- if (ret != -ENOSPC && ret != -EAGAIN) {
- if (ret < 0)
- err = ret;
+ if (ret != -ENOSPC && ret != -EAGAIN)
break;
- }
ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- err = ret;
+ if (ret)
break;
- }
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
- ret = err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
break;
}
@@ -9166,29 +9151,23 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
}
- if (ret == 0 && inode->i_nlink > 0) {
- trans->block_rsv = root->orphan_block_rsv;
- ret = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (ret)
- err = ret;
- }
-
if (trans) {
+ int ret2;
+
trans->block_rsv = &fs_info->trans_block_rsv;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret && !err)
- err = ret;
+ ret2 = btrfs_update_inode(trans, root, inode);
+ if (ret2 && !ret)
+ ret = ret2;
- ret = btrfs_end_transaction(trans);
+ ret2 = btrfs_end_transaction(trans);
+ if (ret2 && !ret)
+ ret = ret2;
btrfs_btree_balance_dirty(fs_info);
}
out:
btrfs_free_block_rsv(fs_info, rsv);
- if (ret && !err)
- err = ret;
-
- return err;
+ return ret;
}
/*
@@ -9324,13 +9303,6 @@ void btrfs_destroy_inode(struct inode *inode)
if (!root)
goto free;
- if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &BTRFS_I(inode)->runtime_flags)) {
- btrfs_info(fs_info, "inode %llu still on the orphan list",
- btrfs_ino(BTRFS_I(inode)));
- atomic_dec(&root->orphan_inodes);
- }
-
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
if (!ordered)
@@ -9964,6 +9936,13 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
+struct btrfs_delalloc_work {
+ struct inode *inode;
+ struct completion completion;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
struct btrfs_delalloc_work *delalloc_work;
@@ -9977,15 +9956,11 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
- if (delalloc_work->delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
+ iput(inode);
complete(&delalloc_work->completion);
}
-struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int delay_iput)
+static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
{
struct btrfs_delalloc_work *work;
@@ -9996,7 +9971,6 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- work->delay_iput = delay_iput;
WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
btrfs_run_delalloc_work, NULL, NULL);
@@ -10004,18 +9978,11 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
return work;
}
-void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
-{
- wait_for_completion(&work->completion);
- kfree(work);
-}
-
/*
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
- int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -10043,12 +10010,9 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
}
spin_unlock(&root->delalloc_lock);
- work = btrfs_alloc_delalloc_work(inode, delay_iput);
+ work = btrfs_alloc_delalloc_work(inode);
if (!work) {
- if (delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
+ iput(inode);
ret = -ENOMEM;
goto out;
}
@@ -10066,10 +10030,11 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
+ wait_for_completion(&work->completion);
+ kfree(work);
}
- if (!list_empty_careful(&splice)) {
+ if (!list_empty(&splice)) {
spin_lock(&root->delalloc_lock);
list_splice_tail(&splice, &root->delalloc_inodes);
spin_unlock(&root->delalloc_lock);
@@ -10078,7 +10043,7 @@ out:
return ret;
}
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+int btrfs_start_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -10086,14 +10051,13 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = __start_delalloc_inodes(root, delay_iput, -1);
+ ret = start_delalloc_inodes(root, -1);
if (ret > 0)
ret = 0;
return ret;
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
- int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -10116,7 +10080,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = __start_delalloc_inodes(root, delay_iput, nr);
+ ret = start_delalloc_inodes(root, nr);
btrfs_put_fs_root(root);
if (ret < 0)
goto out;
@@ -10131,7 +10095,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
ret = 0;
out:
- if (!list_empty_careful(&splice)) {
+ if (!list_empty(&splice)) {
spin_lock(&fs_info->delalloc_root_lock);
list_splice_tail(&splice, &fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
@@ -10669,5 +10633,4 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
const struct dentry_operations btrfs_dentry_operations = {
.d_delete = btrfs_dentry_delete,
- .d_release = btrfs_dentry_release,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 632e26d6f7ce..d29992f7dc63 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -93,20 +93,22 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
int no_time_update);
/* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags)
+static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
+ unsigned int flags)
{
- if (S_ISDIR(mode))
+ if (S_ISDIR(inode->i_mode))
return flags;
- else if (S_ISREG(mode))
+ else if (S_ISREG(inode->i_mode))
return flags & ~FS_DIRSYNC_FL;
else
return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
}
/*
- * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
+ * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
+ * ioctl.
*/
-static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
{
unsigned int iflags = 0;
@@ -136,20 +138,20 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
/*
* Update inode->i_flags based on the btrfs internal flags.
*/
-void btrfs_update_iflags(struct inode *inode)
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
{
- struct btrfs_inode *ip = BTRFS_I(inode);
+ struct btrfs_inode *binode = BTRFS_I(inode);
unsigned int new_fl = 0;
- if (ip->flags & BTRFS_INODE_SYNC)
+ if (binode->flags & BTRFS_INODE_SYNC)
new_fl |= S_SYNC;
- if (ip->flags & BTRFS_INODE_IMMUTABLE)
+ if (binode->flags & BTRFS_INODE_IMMUTABLE)
new_fl |= S_IMMUTABLE;
- if (ip->flags & BTRFS_INODE_APPEND)
+ if (binode->flags & BTRFS_INODE_APPEND)
new_fl |= S_APPEND;
- if (ip->flags & BTRFS_INODE_NOATIME)
+ if (binode->flags & BTRFS_INODE_NOATIME)
new_fl |= S_NOATIME;
- if (ip->flags & BTRFS_INODE_DIRSYNC)
+ if (binode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
set_mask_bits(&inode->i_flags,
@@ -159,15 +161,16 @@ void btrfs_update_iflags(struct inode *inode)
static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
{
- struct btrfs_inode *ip = BTRFS_I(file_inode(file));
- unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
+ struct btrfs_inode *binode = BTRFS_I(file_inode(file));
+ unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
if (copy_to_user(arg, &flags, sizeof(flags)))
return -EFAULT;
return 0;
}
-static int check_flags(unsigned int flags)
+/* Check if @flags are a supported and valid set of FS_*_FL flags */
+static int check_fsflags(unsigned int flags)
{
if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
@@ -186,13 +189,13 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_inode *ip = BTRFS_I(inode);
- struct btrfs_root *root = ip->root;
+ struct btrfs_inode *binode = BTRFS_I(inode);
+ struct btrfs_root *root = binode->root;
struct btrfs_trans_handle *trans;
- unsigned int flags, oldflags;
+ unsigned int fsflags, old_fsflags;
int ret;
- u64 ip_oldflags;
- unsigned int i_oldflags;
+ u64 old_flags;
+ unsigned int old_i_flags;
umode_t mode;
if (!inode_owner_or_capable(inode))
@@ -201,10 +204,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (btrfs_root_readonly(root))
return -EROFS;
- if (copy_from_user(&flags, arg, sizeof(flags)))
+ if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
return -EFAULT;
- ret = check_flags(flags);
+ ret = check_fsflags(fsflags);
if (ret)
return ret;
@@ -214,44 +217,44 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
inode_lock(inode);
- ip_oldflags = ip->flags;
- i_oldflags = inode->i_flags;
+ old_flags = binode->flags;
+ old_i_flags = inode->i_flags;
mode = inode->i_mode;
- flags = btrfs_mask_flags(inode->i_mode, flags);
- oldflags = btrfs_flags_to_ioctl(ip->flags);
- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+ if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
ret = -EPERM;
goto out_unlock;
}
}
- if (flags & FS_SYNC_FL)
- ip->flags |= BTRFS_INODE_SYNC;
+ if (fsflags & FS_SYNC_FL)
+ binode->flags |= BTRFS_INODE_SYNC;
else
- ip->flags &= ~BTRFS_INODE_SYNC;
- if (flags & FS_IMMUTABLE_FL)
- ip->flags |= BTRFS_INODE_IMMUTABLE;
+ binode->flags &= ~BTRFS_INODE_SYNC;
+ if (fsflags & FS_IMMUTABLE_FL)
+ binode->flags |= BTRFS_INODE_IMMUTABLE;
else
- ip->flags &= ~BTRFS_INODE_IMMUTABLE;
- if (flags & FS_APPEND_FL)
- ip->flags |= BTRFS_INODE_APPEND;
+ binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+ if (fsflags & FS_APPEND_FL)
+ binode->flags |= BTRFS_INODE_APPEND;
else
- ip->flags &= ~BTRFS_INODE_APPEND;
- if (flags & FS_NODUMP_FL)
- ip->flags |= BTRFS_INODE_NODUMP;
+ binode->flags &= ~BTRFS_INODE_APPEND;
+ if (fsflags & FS_NODUMP_FL)
+ binode->flags |= BTRFS_INODE_NODUMP;
else
- ip->flags &= ~BTRFS_INODE_NODUMP;
- if (flags & FS_NOATIME_FL)
- ip->flags |= BTRFS_INODE_NOATIME;
+ binode->flags &= ~BTRFS_INODE_NODUMP;
+ if (fsflags & FS_NOATIME_FL)
+ binode->flags |= BTRFS_INODE_NOATIME;
else
- ip->flags &= ~BTRFS_INODE_NOATIME;
- if (flags & FS_DIRSYNC_FL)
- ip->flags |= BTRFS_INODE_DIRSYNC;
+ binode->flags &= ~BTRFS_INODE_NOATIME;
+ if (fsflags & FS_DIRSYNC_FL)
+ binode->flags |= BTRFS_INODE_DIRSYNC;
else
- ip->flags &= ~BTRFS_INODE_DIRSYNC;
- if (flags & FS_NOCOW_FL) {
+ binode->flags &= ~BTRFS_INODE_DIRSYNC;
+ if (fsflags & FS_NOCOW_FL) {
if (S_ISREG(mode)) {
/*
* It's safe to turn csums off here, no extents exist.
@@ -259,10 +262,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
* status of the file and will not set it.
*/
if (inode->i_size == 0)
- ip->flags |= BTRFS_INODE_NODATACOW
- | BTRFS_INODE_NODATASUM;
+ binode->flags |= BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM;
} else {
- ip->flags |= BTRFS_INODE_NODATACOW;
+ binode->flags |= BTRFS_INODE_NODATACOW;
}
} else {
/*
@@ -270,10 +273,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
*/
if (S_ISREG(mode)) {
if (inode->i_size == 0)
- ip->flags &= ~(BTRFS_INODE_NODATACOW
+ binode->flags &= ~(BTRFS_INODE_NODATACOW
| BTRFS_INODE_NODATASUM);
} else {
- ip->flags &= ~BTRFS_INODE_NODATACOW;
+ binode->flags &= ~BTRFS_INODE_NODATACOW;
}
}
@@ -282,18 +285,18 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
* flag may be changed automatically if compression code won't make
* things smaller.
*/
- if (flags & FS_NOCOMP_FL) {
- ip->flags &= ~BTRFS_INODE_COMPRESS;
- ip->flags |= BTRFS_INODE_NOCOMPRESS;
+ if (fsflags & FS_NOCOMP_FL) {
+ binode->flags &= ~BTRFS_INODE_COMPRESS;
+ binode->flags |= BTRFS_INODE_NOCOMPRESS;
ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
if (ret && ret != -ENODATA)
goto out_drop;
- } else if (flags & FS_COMPR_FL) {
+ } else if (fsflags & FS_COMPR_FL) {
const char *comp;
- ip->flags |= BTRFS_INODE_COMPRESS;
- ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ binode->flags |= BTRFS_INODE_COMPRESS;
+ binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
comp = btrfs_compress_type2str(fs_info->compress_type);
if (!comp || comp[0] == 0)
@@ -308,7 +311,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
if (ret && ret != -ENODATA)
goto out_drop;
- ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
trans = btrfs_start_transaction(root, 1);
@@ -317,7 +320,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
goto out_drop;
}
- btrfs_update_iflags(inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, inode);
@@ -325,8 +328,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
btrfs_end_transaction(trans);
out_drop:
if (ret) {
- ip->flags = ip_oldflags;
- inode->i_flags = i_oldflags;
+ binode->flags = old_flags;
+ inode->i_flags = old_i_flags;
}
out_unlock:
@@ -335,6 +338,148 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
return ret;
}
+/*
+ * Translate btrfs internal inode flags to xflags as expected by the
+ * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are
+ * silently dropped.
+ */
+static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
+{
+ unsigned int xflags = 0;
+
+ if (flags & BTRFS_INODE_APPEND)
+ xflags |= FS_XFLAG_APPEND;
+ if (flags & BTRFS_INODE_IMMUTABLE)
+ xflags |= FS_XFLAG_IMMUTABLE;
+ if (flags & BTRFS_INODE_NOATIME)
+ xflags |= FS_XFLAG_NOATIME;
+ if (flags & BTRFS_INODE_NODUMP)
+ xflags |= FS_XFLAG_NODUMP;
+ if (flags & BTRFS_INODE_SYNC)
+ xflags |= FS_XFLAG_SYNC;
+
+ return xflags;
+}
+
+/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */
+static int check_xflags(unsigned int flags)
+{
+ if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME |
+ FS_XFLAG_NODUMP | FS_XFLAG_SYNC))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+/*
+ * Set the xflags from the internal inode flags. The remaining items of fsxattr
+ * are zeroed.
+ */
+static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
+{
+ struct btrfs_inode *binode = BTRFS_I(file_inode(file));
+ struct fsxattr fa;
+
+ memset(&fa, 0, sizeof(fa));
+ fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
+
+ if (copy_to_user(arg, &fa, sizeof(fa)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_inode *binode = BTRFS_I(inode);
+ struct btrfs_root *root = binode->root;
+ struct btrfs_trans_handle *trans;
+ struct fsxattr fa;
+ unsigned old_flags;
+ unsigned old_i_flags;
+ int ret = 0;
+
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ memset(&fa, 0, sizeof(fa));
+ if (copy_from_user(&fa, arg, sizeof(fa)))
+ return -EFAULT;
+
+ ret = check_xflags(fa.fsx_xflags);
+ if (ret)
+ return ret;
+
+ if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0)
+ return -EOPNOTSUPP;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
+ old_flags = binode->flags;
+ old_i_flags = inode->i_flags;
+
+ /* We need the capabilities to change append-only or immutable inode */
+ if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) ||
+ (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) &&
+ !capable(CAP_LINUX_IMMUTABLE)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (fa.fsx_xflags & FS_XFLAG_SYNC)
+ binode->flags |= BTRFS_INODE_SYNC;
+ else
+ binode->flags &= ~BTRFS_INODE_SYNC;
+ if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
+ binode->flags |= BTRFS_INODE_IMMUTABLE;
+ else
+ binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+ if (fa.fsx_xflags & FS_XFLAG_APPEND)
+ binode->flags |= BTRFS_INODE_APPEND;
+ else
+ binode->flags &= ~BTRFS_INODE_APPEND;
+ if (fa.fsx_xflags & FS_XFLAG_NODUMP)
+ binode->flags |= BTRFS_INODE_NODUMP;
+ else
+ binode->flags &= ~BTRFS_INODE_NODUMP;
+ if (fa.fsx_xflags & FS_XFLAG_NOATIME)
+ binode->flags |= BTRFS_INODE_NOATIME;
+ else
+ binode->flags &= ~BTRFS_INODE_NOATIME;
+
+ /* 1 item for the inode */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ btrfs_sync_inode_flags_to_i_flags(inode);
+ inode_inc_iversion(inode);
+ inode->i_ctime = current_time(inode);
+ ret = btrfs_update_inode(trans, root, inode);
+
+ btrfs_end_transaction(trans);
+
+out_unlock:
+ if (ret) {
+ binode->flags = old_flags;
+ inode->i_flags = old_i_flags;
+ }
+
+ inode_unlock(inode);
+ mnt_drop_write_file(file);
+
+ return ret;
+}
+
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
struct inode *inode = file_inode(file);
@@ -424,7 +569,6 @@ static noinline int create_subvol(struct inode *dir,
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
- u64 qgroup_reserved;
uuid_le new_uuid;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
@@ -449,8 +593,7 @@ static noinline int create_subvol(struct inode *dir,
* The same as the snapshot creation, please see the comment
* of create_snapshot().
*/
- ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
- 8, &qgroup_reserved, false);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
if (ret)
goto fail_free;
@@ -573,7 +716,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_ino(BTRFS_I(dir)), index, name, namelen);
BUG_ON(ret);
- ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid,
+ ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -640,7 +783,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
wait_event(root->subv_writers->wait,
percpu_counter_sum(&root->subv_writers->counter) == 0);
- ret = btrfs_start_delalloc_inodes(root, 0);
+ ret = btrfs_start_delalloc_inodes(root);
if (ret)
goto dec_and_free;
@@ -658,7 +801,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
*/
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv, 8,
- &pending_snapshot->qgroup_reserved,
false);
if (ret)
goto dec_and_free;
@@ -1457,7 +1599,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
- mutex_lock(&fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -1565,7 +1706,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
out_free:
kfree(vol_args);
out:
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
mnt_drop_write_file(file);
return ret;
@@ -1832,60 +1972,6 @@ out:
return ret;
}
-/*
- * helper to check if the subvolume references other subvolumes
- */
-static noinline int may_destroy_subvol(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
- struct btrfs_dir_item *di;
- struct btrfs_key key;
- u64 dir_id;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- /* Make sure this root isn't set as the default subvol */
- dir_id = btrfs_super_root_dir(fs_info->super_copy);
- di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
- dir_id, "default", 7, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.objectid == root->root_key.objectid) {
- ret = -EPERM;
- btrfs_err(fs_info,
- "deleting default subvolume %llu is not allowed",
- key.objectid);
- goto out;
- }
- btrfs_release_path(path);
- }
-
- key.objectid = root->root_key.objectid;
- key.type = BTRFS_ROOT_REF_KEY;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- BUG_ON(ret == 0);
-
- ret = 0;
- if (path->slots[0] > 0) {
- path->slots[0]--;
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == root->root_key.objectid &&
- key.type == BTRFS_ROOT_REF_KEY)
- ret = -ENOTEMPTY;
- }
-out:
- btrfs_free_path(path);
- return ret;
-}
-
static noinline int key_in_sk(struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk)
{
@@ -2066,7 +2152,7 @@ static noinline int search_ioctl(struct inode *inode,
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
btrfs_free_path(path);
- return -ENOENT;
+ return PTR_ERR(root);
}
}
@@ -2200,8 +2286,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- btrfs_err(info, "could not find root %llu", tree_id);
- ret = -ENOENT;
+ ret = PTR_ERR(root);
goto out;
}
@@ -2256,6 +2341,165 @@ out:
return ret;
}
+static int btrfs_search_path_in_tree_user(struct inode *inode,
+ struct btrfs_ioctl_ino_lookup_user_args *args)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct super_block *sb = inode->i_sb;
+ struct btrfs_key upper_limit = BTRFS_I(inode)->location;
+ u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
+ u64 dirid = args->dirid;
+ unsigned long item_off;
+ unsigned long item_len;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_root_ref *rref;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key, key2;
+ struct extent_buffer *leaf;
+ struct inode *temp_inode;
+ char *ptr;
+ int slot;
+ int len;
+ int total_len = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * If the bottom subvolume does not exist directly under upper_limit,
+ * construct the path in from the bottom up.
+ */
+ if (dirid != upper_limit.objectid) {
+ ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
+
+ key.objectid = treeid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(root, path, dirid,
+ BTRFS_INODE_REF_KEY);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(leaf, iref);
+ ptr -= len + 1;
+ total_len += len + 1;
+ if (ptr < args->path) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+
+ *(ptr + len) = '/';
+ read_extent_buffer(leaf, ptr,
+ (unsigned long)(iref + 1), len);
+
+ /* Check the read+exec permission of this directory */
+ ret = btrfs_previous_item(root, path, dirid,
+ BTRFS_INODE_ITEM_KEY);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key2, slot);
+ if (key2.objectid != dirid) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ temp_inode = btrfs_iget(sb, &key2, root, NULL);
+ ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
+ iput(temp_inode);
+ if (ret) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ if (key.offset == upper_limit.objectid)
+ break;
+ if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ btrfs_release_path(path);
+ key.objectid = key.offset;
+ key.offset = (u64)-1;
+ dirid = key.objectid;
+ }
+
+ memmove(args->path, ptr, total_len);
+ args->path[total_len] = '\0';
+ btrfs_release_path(path);
+ }
+
+ /* Get the bottom subvolume's name from ROOT_REF */
+ root = fs_info->tree_root;
+ key.objectid = treeid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = args->treeid;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ item_off = btrfs_item_ptr_offset(leaf, slot);
+ item_len = btrfs_item_size_nr(leaf, slot);
+ /* Check if dirid in ROOT_REF corresponds to passed dirid */
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Copy subvolume's name */
+ item_off += sizeof(struct btrfs_root_ref);
+ item_len -= sizeof(struct btrfs_root_ref);
+ read_extent_buffer(leaf, args->name, item_off, item_len);
+ args->name[item_len] = 0;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static noinline int btrfs_ioctl_ino_lookup(struct file *file,
void __user *argp)
{
@@ -2298,6 +2542,265 @@ out:
return ret;
}
+/*
+ * Version of ino_lookup ioctl (unprivileged)
+ *
+ * The main differences from ino_lookup ioctl are:
+ *
+ * 1. Read + Exec permission will be checked using inode_permission() during
+ * path construction. -EACCES will be returned in case of failure.
+ * 2. Path construction will be stopped at the inode number which corresponds
+ * to the fd with which this ioctl is called. If constructed path does not
+ * exist under fd's inode, -EACCES will be returned.
+ * 3. The name of bottom subvolume is also searched and filled.
+ */
+static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_ino_lookup_user_args *args;
+ struct inode *inode;
+ int ret;
+
+ args = memdup_user(argp, sizeof(*args));
+ if (IS_ERR(args))
+ return PTR_ERR(args);
+
+ inode = file_inode(file);
+
+ if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
+ BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * The subvolume does not exist under fd with which this is
+ * called
+ */
+ kfree(args);
+ return -EACCES;
+ }
+
+ ret = btrfs_search_path_in_tree_user(inode, args);
+
+ if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = -EFAULT;
+
+ kfree(args);
+ return ret;
+}
+
+/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
+static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_get_subvol_info_args *subvol_info;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root_item *root_item;
+ struct btrfs_root_ref *rref;
+ struct extent_buffer *leaf;
+ unsigned long item_off;
+ unsigned long item_len;
+ struct inode *inode;
+ int slot;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
+ if (!subvol_info) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ inode = file_inode(file);
+ fs_info = BTRFS_I(inode)->root->fs_info;
+
+ /* Get root_item of inode's subvolume */
+ key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ root_item = &root->root_item;
+
+ subvol_info->treeid = key.objectid;
+
+ subvol_info->generation = btrfs_root_generation(root_item);
+ subvol_info->flags = btrfs_root_flags(root_item);
+
+ memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
+ memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
+ BTRFS_UUID_SIZE);
+ memcpy(subvol_info->received_uuid, root_item->received_uuid,
+ BTRFS_UUID_SIZE);
+
+ subvol_info->ctransid = btrfs_root_ctransid(root_item);
+ subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
+ subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
+
+ subvol_info->otransid = btrfs_root_otransid(root_item);
+ subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
+ subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
+
+ subvol_info->stransid = btrfs_root_stransid(root_item);
+ subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
+ subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
+
+ subvol_info->rtransid = btrfs_root_rtransid(root_item);
+ subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
+ subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
+
+ if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
+ /* Search root tree for ROOT_BACKREF of this subvolume */
+ root = fs_info->tree_root;
+
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid == subvol_info->treeid &&
+ key.type == BTRFS_ROOT_BACKREF_KEY) {
+ subvol_info->parent_id = key.offset;
+
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
+
+ item_off = btrfs_item_ptr_offset(leaf, slot)
+ + sizeof(struct btrfs_root_ref);
+ item_len = btrfs_item_size_nr(leaf, slot)
+ - sizeof(struct btrfs_root_ref);
+ read_extent_buffer(leaf, subvol_info->name,
+ item_off, item_len);
+ } else {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
+ ret = -EFAULT;
+
+out:
+ btrfs_free_path(path);
+ kzfree(subvol_info);
+ return ret;
+}
+
+/*
+ * Return ROOT_REF information of the subvolume containing this inode
+ * except the subvolume name.
+ */
+static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
+ struct btrfs_root_ref *rref;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct inode *inode;
+ u64 objectid;
+ int slot;
+ int ret;
+ u8 found;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ rootrefs = memdup_user(argp, sizeof(*rootrefs));
+ if (IS_ERR(rootrefs)) {
+ btrfs_free_path(path);
+ return PTR_ERR(rootrefs);
+ }
+
+ inode = file_inode(file);
+ root = BTRFS_I(inode)->root->fs_info->tree_root;
+ objectid = BTRFS_I(inode)->root->root_key.objectid;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = rootrefs->min_treeid;
+ found = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ rootrefs->rootref[found].treeid = key.offset;
+ rootrefs->rootref[found].dirid =
+ btrfs_root_ref_dirid(leaf, rref);
+ found++;
+
+ ret = btrfs_next_item(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+out:
+ if (!ret || ret == -EOVERFLOW) {
+ rootrefs->num_items = found;
+ /* update min_treeid for next search */
+ if (found)
+ rootrefs->min_treeid =
+ rootrefs->rootref[found - 1].treeid + 1;
+ if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
+ ret = -EFAULT;
+ }
+
+ kfree(rootrefs);
+ btrfs_free_path(path);
+
+ return ret;
+}
+
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
void __user *arg)
{
@@ -2309,12 +2812,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args;
- struct btrfs_trans_handle *trans;
- struct btrfs_block_rsv block_rsv;
- u64 root_flags;
- u64 qgroup_reserved;
int namelen;
- int ret;
int err = 0;
if (!S_ISDIR(dir->i_mode))
@@ -2398,133 +2896,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
}
inode_lock(inode);
-
- /*
- * Don't allow to delete a subvolume with send in progress. This is
- * inside the i_mutex so the error handling that has to drop the bit
- * again is not run concurrently.
- */
- spin_lock(&dest->root_item_lock);
- root_flags = btrfs_root_flags(&dest->root_item);
- if (dest->send_in_progress == 0) {
- btrfs_set_root_flags(&dest->root_item,
- root_flags | BTRFS_ROOT_SUBVOL_DEAD);
- spin_unlock(&dest->root_item_lock);
- } else {
- spin_unlock(&dest->root_item_lock);
- btrfs_warn(fs_info,
- "Attempt to delete subvolume %llu during send",
- dest->root_key.objectid);
- err = -EPERM;
- goto out_unlock_inode;
- }
-
- down_write(&fs_info->subvol_sem);
-
- err = may_destroy_subvol(dest);
- if (err)
- goto out_up_write;
-
- btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
- /*
- * One for dir inode, two for dir entries, two for root
- * ref/backref.
- */
- err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
- 5, &qgroup_reserved, true);
- if (err)
- goto out_up_write;
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
- goto out_release;
- }
- trans->block_rsv = &block_rsv;
- trans->bytes_reserved = block_rsv.size;
-
- btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
-
- ret = btrfs_unlink_subvol(trans, root, dir,
- dest->root_key.objectid,
- dentry->d_name.name,
- dentry->d_name.len);
- if (ret) {
- err = ret;
- btrfs_abort_transaction(trans, ret);
- goto out_end_trans;
- }
-
- btrfs_record_root_in_trans(trans, dest);
-
- memset(&dest->root_item.drop_progress, 0,
- sizeof(dest->root_item.drop_progress));
- dest->root_item.drop_level = 0;
- btrfs_set_root_refs(&dest->root_item, 0);
-
- if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
- ret = btrfs_insert_orphan_item(trans,
- fs_info->tree_root,
- dest->root_key.objectid);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- err = ret;
- goto out_end_trans;
- }
- }
-
- ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- dest->root_key.objectid);
- if (ret && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- err = ret;
- goto out_end_trans;
- }
- if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
- ret = btrfs_uuid_tree_rem(trans, fs_info,
- dest->root_item.received_uuid,
- BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- dest->root_key.objectid);
- if (ret && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- err = ret;
- goto out_end_trans;
- }
- }
-
-out_end_trans:
- trans->block_rsv = NULL;
- trans->bytes_reserved = 0;
- ret = btrfs_end_transaction(trans);
- if (ret && !err)
- err = ret;
- inode->i_flags |= S_DEAD;
-out_release:
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
-out_up_write:
- up_write(&fs_info->subvol_sem);
- if (err) {
- spin_lock(&dest->root_item_lock);
- root_flags = btrfs_root_flags(&dest->root_item);
- btrfs_set_root_flags(&dest->root_item,
- root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
- spin_unlock(&dest->root_item_lock);
- }
-out_unlock_inode:
+ err = btrfs_delete_subvolume(dir, dentry);
inode_unlock(inode);
- if (!err) {
- d_invalidate(dentry);
- btrfs_invalidate_inodes(dest);
+ if (!err)
d_delete(dentry);
- ASSERT(dest->send_in_progress == 0);
- /* the last ref */
- if (dest->ino_cache_inode) {
- iput(dest->ino_cache_inode);
- dest->ino_cache_inode = NULL;
- }
- }
out_dput:
dput(dentry);
out_unlock_dir:
@@ -2613,7 +2989,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- mutex_lock(&fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2628,7 +3003,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return ret;
}
@@ -2654,8 +3028,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
/* Check for compatibility reject unknown flags */
- if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
- return -EOPNOTSUPP;
+ if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -2954,8 +3330,6 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
put_page(pg);
}
}
- kfree(cmp->src_pages);
- kfree(cmp->dst_pages);
}
static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
@@ -2964,40 +3338,14 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
{
int ret;
int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
- struct page **src_pgarr, **dst_pgarr;
- /*
- * We must gather up all the pages before we initiate our
- * extent locking. We use an array for the page pointers. Size
- * of the array is bounded by len, which is in turn bounded by
- * BTRFS_MAX_DEDUPE_LEN.
- */
- src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
- dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
- if (!src_pgarr || !dst_pgarr) {
- kfree(src_pgarr);
- kfree(dst_pgarr);
- return -ENOMEM;
- }
cmp->num_pages = num_pages;
- cmp->src_pages = src_pgarr;
- cmp->dst_pages = dst_pgarr;
-
- /*
- * If deduping ranges in the same inode, locking rules make it mandatory
- * to always lock pages in ascending order to avoid deadlocks with
- * concurrent tasks (such as starting writeback/delalloc).
- */
- if (src == dst && dst_loff < loff) {
- swap(src_pgarr, dst_pgarr);
- swap(loff, dst_loff);
- }
- ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff);
+ ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff);
if (ret)
goto out;
- ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff);
+ ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff);
out:
if (ret)
@@ -3067,31 +3415,23 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
return 0;
}
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
- struct inode *dst, u64 dst_loff)
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
+ struct inode *dst, u64 dst_loff,
+ struct cmp_pages *cmp)
{
int ret;
u64 len = olen;
- struct cmp_pages cmp;
bool same_inode = (src == dst);
u64 same_lock_start = 0;
u64 same_lock_len = 0;
- if (len == 0)
- return 0;
-
- if (same_inode)
- inode_lock(src);
- else
- btrfs_double_inode_lock(src, dst);
-
ret = extent_same_check_offsets(src, loff, &len, olen);
if (ret)
- goto out_unlock;
+ return ret;
ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
if (ret)
- goto out_unlock;
+ return ret;
if (same_inode) {
/*
@@ -3108,32 +3448,21 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
* allow an unaligned length so long as it ends at
* i_size.
*/
- if (len != olen) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ if (len != olen)
+ return -EINVAL;
/* Check for overlapping ranges */
- if (dst_loff + len > loff && dst_loff < loff + len) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ if (dst_loff + len > loff && dst_loff < loff + len)
+ return -EINVAL;
same_lock_start = min_t(u64, loff, dst_loff);
same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
}
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
again:
- ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
+ ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
if (ret)
- goto out_unlock;
+ return ret;
if (same_inode)
ret = lock_extent_range(src, same_lock_start, same_lock_len,
@@ -3154,7 +3483,7 @@ again:
* Ranges in the io trees already unlocked. Now unlock all
* pages before waiting for all IO to complete.
*/
- btrfs_cmp_data_free(&cmp);
+ btrfs_cmp_data_free(cmp);
if (same_inode) {
btrfs_wait_ordered_range(src, same_lock_start,
same_lock_len);
@@ -3167,12 +3496,12 @@ again:
ASSERT(ret == 0);
if (WARN_ON(ret)) {
/* ranges in the io trees already unlocked */
- btrfs_cmp_data_free(&cmp);
+ btrfs_cmp_data_free(cmp);
return ret;
}
/* pass original length for comparison so we stay within i_size */
- ret = btrfs_cmp_data(olen, &cmp);
+ ret = btrfs_cmp_data(olen, cmp);
if (ret == 0)
ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
@@ -3182,18 +3511,91 @@ again:
else
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
- btrfs_cmp_data_free(&cmp);
+ btrfs_cmp_data_free(cmp);
+
+ return ret;
+}
+
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
+ struct inode *dst, u64 dst_loff)
+{
+ int ret;
+ struct cmp_pages cmp;
+ int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
+ bool same_inode = (src == dst);
+ u64 i, tail_len, chunk_count;
+
+ if (olen == 0)
+ return 0;
+
+ if (same_inode)
+ inode_lock(src);
+ else
+ btrfs_double_inode_lock(src, dst);
+
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
+ chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
+ if (chunk_count == 0)
+ num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
+
+ /*
+ * If deduping ranges in the same inode, locking rules make it
+ * mandatory to always lock pages in ascending order to avoid deadlocks
+ * with concurrent tasks (such as starting writeback/delalloc).
+ */
+ if (same_inode && dst_loff < loff)
+ swap(loff, dst_loff);
+
+ /*
+ * We must gather up all the pages before we initiate our extent
+ * locking. We use an array for the page pointers. Size of the array is
+ * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN.
+ */
+ cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_ZERO);
+ cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!cmp.src_pages || !cmp.dst_pages) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ for (i = 0; i < chunk_count; i++) {
+ ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
+ dst, dst_loff, &cmp);
+ if (ret)
+ goto out_unlock;
+
+ loff += BTRFS_MAX_DEDUPE_LEN;
+ dst_loff += BTRFS_MAX_DEDUPE_LEN;
+ }
+
+ if (tail_len > 0)
+ ret = btrfs_extent_same_range(src, loff, tail_len, dst,
+ dst_loff, &cmp);
+
out_unlock:
if (same_inode)
inode_unlock(src);
else
btrfs_double_inode_unlock(src, dst);
+out_free:
+ kvfree(cmp.src_pages);
+ kvfree(cmp.dst_pages);
+
return ret;
}
-#define BTRFS_MAX_DEDUPE_LEN SZ_16M
-
ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
struct file *dst_file, u64 dst_loff)
{
@@ -3202,9 +3604,6 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
ssize_t res;
- if (olen > BTRFS_MAX_DEDUPE_LEN)
- olen = BTRFS_MAX_DEDUPE_LEN;
-
if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
/*
* Btrfs does not support blocksize < page_size. As a
@@ -3826,11 +4225,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
src->i_sb != inode->i_sb)
return -EXDEV;
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- return -EINVAL;
-
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
return -EISDIR;
@@ -3840,6 +4234,13 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
inode_lock(src);
}
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/* determine range to clone */
ret = -EINVAL;
if (off + len > src->i_size || off + len < off)
@@ -4007,8 +4408,8 @@ out:
return ret;
}
-void btrfs_get_block_group_info(struct list_head *groups_list,
- struct btrfs_ioctl_space_info *space)
+static void get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space)
{
struct btrfs_block_group_cache *block_group;
@@ -4124,8 +4525,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
down_read(&info->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
if (!list_empty(&info->block_groups[c])) {
- btrfs_get_block_group_info(
- &info->block_groups[c], &space);
+ get_block_group_info(&info->block_groups[c],
+ &space);
memcpy(dest, &space, sizeof(space));
dest++;
space_args.total_spaces++;
@@ -4490,14 +4891,14 @@ out_loi:
return ret;
}
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
bargs->flags = bctl->flags;
- if (atomic_read(&fs_info->balance_running))
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
if (atomic_read(&fs_info->balance_pause_req))
bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
@@ -4508,13 +4909,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
- if (lock) {
- spin_lock(&fs_info->balance_lock);
- memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
- spin_unlock(&fs_info->balance_lock);
- } else {
- memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
- }
+ spin_lock(&fs_info->balance_lock);
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ spin_unlock(&fs_info->balance_lock);
}
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
@@ -4535,7 +4932,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
again:
if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
@@ -4550,21 +4946,22 @@ again:
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl) {
/* this is either (2) or (3) */
- if (!atomic_read(&fs_info->balance_running)) {
+ if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
mutex_unlock(&fs_info->balance_mutex);
- if (!mutex_trylock(&fs_info->volume_mutex))
- goto again;
+ /*
+ * Lock released to allow other waiters to continue,
+ * we'll reexamine the status again.
+ */
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl &&
- !atomic_read(&fs_info->balance_running)) {
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
/* this is (3) */
need_unlock = false;
goto locked;
}
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
goto again;
} else {
/* this is (2) */
@@ -4617,7 +5014,6 @@ locked:
goto out_bargs;
}
- bctl->fs_info = fs_info;
if (arg) {
memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
@@ -4636,14 +5032,14 @@ locked:
do_balance:
/*
- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP
- * goes to to btrfs_balance. bctl is freed in __cancel_balance,
- * or, if restriper was paused all the way until unmount, in
- * free_fs_info. The flag is cleared in __cancel_balance.
+ * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
+ * btrfs_balance. bctl is freed in reset_balance_state, or, if
+ * restriper was paused all the way until unmount, in free_fs_info.
+ * The flag should be cleared after reset_balance_state.
*/
need_unlock = false;
- ret = btrfs_balance(bctl, bargs);
+ ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
if (arg) {
@@ -4657,7 +5053,6 @@ out_bargs:
kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
if (need_unlock)
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out:
@@ -4701,7 +5096,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
goto out;
}
- update_ioctl_balance_args(fs_info, 1, bargs);
+ btrfs_update_ioctl_balance_args(fs_info, bargs);
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
@@ -5038,8 +5433,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
BTRFS_UUID_SIZE);
if (received_uuid_changed &&
!btrfs_is_empty_uuid(root_item->received_uuid)) {
- ret = btrfs_uuid_tree_rem(trans, fs_info,
- root_item->received_uuid,
+ ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
if (ret && ret != -ENOENT) {
@@ -5063,7 +5457,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid,
+ ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
if (ret < 0 && ret != -EEXIST) {
@@ -5497,7 +5891,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, -1);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
@@ -5565,6 +5959,16 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_features(file, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
+ case FS_IOC_FSGETXATTR:
+ return btrfs_ioctl_fsgetxattr(file, argp);
+ case FS_IOC_FSSETXATTR:
+ return btrfs_ioctl_fssetxattr(file, argp);
+ case BTRFS_IOC_GET_SUBVOL_INFO:
+ return btrfs_ioctl_get_subvol_info(file, argp);
+ case BTRFS_IOC_GET_SUBVOL_ROOTREF:
+ return btrfs_ioctl_get_subvol_rootref(file, argp);
+ case BTRFS_IOC_INO_LOOKUP_USER:
+ return btrfs_ioctl_ino_lookup_user(file, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index e4faefac9d16..1da768e5ef75 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -66,22 +66,16 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
- /*
- * atomic_dec_and_test implies a barrier for waitqueue_active
- */
- if (atomic_dec_and_test(&eb->blocking_writers) &&
- waitqueue_active(&eb->write_lock_wq))
- wake_up(&eb->write_lock_wq);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_writers))
+ cond_wake_up_nomb(&eb->write_lock_wq);
} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
- /*
- * atomic_dec_and_test implies a barrier for waitqueue_active
- */
- if (atomic_dec_and_test(&eb->blocking_readers) &&
- waitqueue_active(&eb->read_lock_wq))
- wake_up(&eb->read_lock_wq);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ cond_wake_up_nomb(&eb->read_lock_wq);
}
}
@@ -221,12 +215,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
- /*
- * atomic_dec_and_test implies a barrier for waitqueue_active
- */
- if (atomic_dec_and_test(&eb->blocking_readers) &&
- waitqueue_active(&eb->read_lock_wq))
- wake_up(&eb->read_lock_wq);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ cond_wake_up_nomb(&eb->read_lock_wq);
atomic_dec(&eb->read_locks);
}
@@ -275,12 +266,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_dec(&eb->blocking_writers);
- /*
- * Make sure counter is updated before we wake up waiters.
- */
+ /* Use the lighter barrier after atomic */
smp_mb__after_atomic();
- if (waitqueue_active(&eb->write_lock_wq))
- wake_up(&eb->write_lock_wq);
+ cond_wake_up_nomb(&eb->write_lock_wq);
} else {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
atomic_dec(&eb->spinning_writers);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 0667ea07f766..b6a4cc178bee 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -17,6 +17,43 @@
#define LZO_LEN 4
+/*
+ * Btrfs LZO compression format
+ *
+ * Regular and inlined LZO compressed data extents consist of:
+ *
+ * 1. Header
+ * Fixed size. LZO_LEN (4) bytes long, LE32.
+ * Records the total size (including the header) of compressed data.
+ *
+ * 2. Segment(s)
+ * Variable size. Each segment includes one segment header, followd by data
+ * payload.
+ * One regular LZO compressed extent can have one or more segments.
+ * For inlined LZO compressed extent, only one segment is allowed.
+ * One segment represents at most one page of uncompressed data.
+ *
+ * 2.1 Segment header
+ * Fixed size. LZO_LEN (4) bytes long, LE32.
+ * Records the total size of the segment (not including the header).
+ * Segment header never crosses page boundary, thus it's possible to
+ * have at most 3 padding zeros at the end of the page.
+ *
+ * 2.2 Data Payload
+ * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
+ * which is 4419 for a 4KiB page.
+ *
+ * Example:
+ * Page 1:
+ * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
+ * 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
+ * ...
+ * 0x0ff0 | SegHdr N | Data payload N ... |00|
+ * ^^ padding zeros
+ * Page 2:
+ * 0x1000 | SegHdr N+1| Data payload N+1 ... |
+ */
+
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -258,6 +295,7 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long working_bytes;
size_t in_len;
size_t out_len;
+ const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
unsigned long in_offset;
unsigned long in_page_bytes_left;
unsigned long tot_in;
@@ -271,10 +309,22 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
data_in = kmap(pages_in[0]);
tot_len = read_compress_length(data_in);
+ /*
+ * Compressed data header check.
+ *
+ * The real compressed size can't exceed the maximum extent length, and
+ * all pages should be used (whole unused page with just the segment
+ * header is not possible). If this happens it means the compressed
+ * extent is corrupted.
+ */
+ if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
+ tot_len < srclen - PAGE_SIZE) {
+ ret = -EUCLEAN;
+ goto done;
+ }
tot_in = LZO_LEN;
in_offset = LZO_LEN;
- tot_len = min_t(size_t, srclen, tot_len);
in_page_bytes_left = PAGE_SIZE - LZO_LEN;
tot_out = 0;
@@ -285,6 +335,17 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
in_offset += LZO_LEN;
tot_in += LZO_LEN;
+ /*
+ * Segment header check.
+ *
+ * The segment length must not exceed the maximum LZO
+ * compression size, nor the total compressed size.
+ */
+ if (in_len > max_segment_len || tot_in + in_len > tot_len) {
+ ret = -EUCLEAN;
+ goto done;
+ }
+
tot_in += in_len;
working_bytes = in_len;
may_late_unmap = need_unmap = false;
@@ -335,7 +396,7 @@ cont:
}
}
- out_len = lzo1x_worst_compress(PAGE_SIZE);
+ out_len = max_segment_len;
ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
&out_len);
if (need_unmap)
@@ -369,15 +430,24 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
+ size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
int ret = 0;
char *kaddr;
unsigned long bytes;
- BUG_ON(srclen < LZO_LEN);
+ if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
+ return -EUCLEAN;
+ in_len = read_compress_length(data_in);
+ if (in_len != srclen)
+ return -EUCLEAN;
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
+ if (in_len != srclen - LZO_LEN * 2) {
+ ret = -EUCLEAN;
+ goto out;
+ }
data_in += LZO_LEN;
out_len = PAGE_SIZE;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6db8bb2f2c28..2e1a1694a33d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -343,11 +343,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- /*
- * Implicit memory barrier after test_and_set_bit
- */
- if (waitqueue_active(&entry->wait))
- wake_up(&entry->wait);
+ /* test_and_set_bit implies a barrier */
+ cond_wake_up_nomb(&entry->wait);
} else {
ret = 1;
}
@@ -410,11 +407,8 @@ have_entry:
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- /*
- * Implicit memory barrier after test_and_set_bit
- */
- if (waitqueue_active(&entry->wait))
- wake_up(&entry->wait);
+ /* test_and_set_bit implies a barrier */
+ cond_wake_up_nomb(&entry->wait);
} else {
ret = 1;
}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 21a831d3d087..a4e11cf04671 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -166,6 +166,25 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
}
}
+/*
+ * Helper to output refs and locking status of extent buffer. Useful to debug
+ * race condition related problems.
+ */
+static void print_eb_refs_lock(struct extent_buffer *eb)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+ btrfs_info(eb->fs_info,
+"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
+ atomic_read(&eb->refs), atomic_read(&eb->write_locks),
+ atomic_read(&eb->read_locks),
+ atomic_read(&eb->blocking_writers),
+ atomic_read(&eb->blocking_readers),
+ atomic_read(&eb->spinning_writers),
+ atomic_read(&eb->spinning_readers),
+ eb->lock_owner, current->pid);
+#endif
+}
+
void btrfs_print_leaf(struct extent_buffer *l)
{
struct btrfs_fs_info *fs_info;
@@ -193,6 +212,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
"leaf %llu gen %llu total ptrs %d free space %d owner %llu",
btrfs_header_bytenr(l), btrfs_header_generation(l), nr,
btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l));
+ print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
@@ -347,6 +367,7 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
btrfs_header_bytenr(c), level, btrfs_header_generation(c),
nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr,
btrfs_header_owner(c));
+ print_eb_refs_lock(c);
for (i = 0; i < nr; i++) {
btrfs_node_key_to_cpu(c, &key, i);
pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n",
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9fb758d5077a..1874a6d2e6f5 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1882,8 +1882,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
- trace_qgroup_update_counters(fs_info, qg->qgroupid,
- cur_old_count, cur_new_count);
+ trace_qgroup_update_counters(fs_info, qg, cur_old_count,
+ cur_new_count);
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
@@ -2014,8 +2014,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
BUG_ON(!fs_info->quota_root);
- trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes,
- nr_old_roots, nr_new_roots);
+ trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
+ num_bytes, nr_old_roots, nr_new_roots);
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
@@ -2580,6 +2580,21 @@ out:
}
/*
+ * Check if the leaf is the last leaf. Which means all node pointers
+ * are at their last position.
+ */
+static bool is_last_leaf(struct btrfs_path *path)
+{
+ int i;
+
+ for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+ if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
+ return false;
+ }
+ return true;
+}
+
+/*
* returns < 0 on error, 0 when more leafs are to be scanned.
* returns 1 when done.
*/
@@ -2590,8 +2605,8 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
+ bool done;
int slot;
int ret;
@@ -2620,12 +2635,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
mutex_unlock(&fs_info->qgroup_rescan_lock);
return ret;
}
+ done = is_last_leaf(path);
btrfs_item_key_to_cpu(path->nodes[0], &found,
btrfs_header_nritems(path->nodes[0]) - 1);
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
- btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
if (!scratch_leaf) {
ret = -ENOMEM;
@@ -2664,8 +2679,9 @@ out:
btrfs_tree_read_unlock_blocking(scratch_leaf);
free_extent_buffer(scratch_leaf);
}
- btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ if (done && !ret)
+ ret = 1;
return ret;
}
@@ -2681,6 +2697,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path = btrfs_alloc_path();
if (!path)
goto out;
+ /*
+ * Rescan should only search for commit root, and any later difference
+ * should be recorded by qgroup
+ */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
err = 0;
while (!err && !btrfs_fs_closing(fs_info)) {
@@ -2760,26 +2782,36 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
{
int ret = 0;
- if (!init_flags &&
- (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
- !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
- ret = -EINVAL;
- goto err;
+ if (!init_flags) {
+ /* we're resuming qgroup rescan at mount time */
+ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN))
+ btrfs_warn(fs_info,
+ "qgroup rescan init failed, qgroup is not enabled");
+ else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ btrfs_warn(fs_info,
+ "qgroup rescan init failed, qgroup rescan is not queued");
+ return -EINVAL;
}
mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
if (init_flags) {
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ btrfs_warn(fs_info,
+ "qgroup rescan is already in progress");
ret = -EINPROGRESS;
- else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ } else if (!(fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAG_ON)) {
+ btrfs_warn(fs_info,
+ "qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
+ }
if (ret) {
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- goto err;
+ return ret;
}
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
}
@@ -2798,13 +2830,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_init_work(&fs_info->qgroup_rescan_work,
btrfs_qgroup_rescan_helper,
btrfs_qgroup_rescan_worker, NULL, NULL);
-
- if (ret) {
-err:
- btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
- return ret;
- }
-
return 0;
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9abd950e7f78..5e4ad134b9ad 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -163,6 +163,12 @@ struct btrfs_raid_bio {
* bitmap to record which horizontal stripe has data
*/
unsigned long *dbitmap;
+
+ /* allocated with real_stripes-many pointers for finish_*() calls */
+ void **finish_pointers;
+
+ /* allocated with stripe_npages-many bits for finish_*() calls */
+ unsigned long *finish_pbitmap;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -981,9 +987,14 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
- rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
- DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) *
- sizeof(long), GFP_NOFS);
+ rbio = kzalloc(sizeof(*rbio) +
+ sizeof(*rbio->stripe_pages) * num_pages +
+ sizeof(*rbio->bio_pages) * num_pages +
+ sizeof(*rbio->finish_pointers) * real_stripes +
+ sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
+ sizeof(*rbio->finish_pbitmap) *
+ BITS_TO_LONGS(stripe_npages),
+ GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -1005,13 +1016,20 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
atomic_set(&rbio->stripes_pending, 0);
/*
- * the stripe_pages and bio_pages array point to the extra
+ * the stripe_pages, bio_pages, etc arrays point to the extra
* memory we allocated past the end of the rbio
*/
p = rbio + 1;
- rbio->stripe_pages = p;
- rbio->bio_pages = p + sizeof(struct page *) * num_pages;
- rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
+#define CONSUME_ALLOC(ptr, count) do { \
+ ptr = p; \
+ p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
+ } while (0)
+ CONSUME_ALLOC(rbio->stripe_pages, num_pages);
+ CONSUME_ALLOC(rbio->bio_pages, num_pages);
+ CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
+ CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
+ CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
+#undef CONSUME_ALLOC
if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
nr_data = real_stripes - 1;
@@ -1180,7 +1198,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
struct btrfs_bio *bbio = rbio->bbio;
- void *pointers[rbio->real_stripes];
+ void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
@@ -2350,8 +2368,8 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
struct btrfs_bio *bbio = rbio->bbio;
- void *pointers[rbio->real_stripes];
- DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
+ void **pointers = rbio->finish_pointers;
+ unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b041b945a7ae..879b76fa881a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4299,7 +4299,7 @@ out:
return inode;
}
-static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
+static struct reloc_control *alloc_reloc_control(void)
{
struct reloc_control *rc;
@@ -4344,7 +4344,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
DESCRIBE_FLAG(RAID5, "raid5");
DESCRIBE_FLAG(RAID6, "raid6");
if (flags)
- snprintf(buf, buf - bp + sizeof(buf), "|0x%llx", flags);
+ snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
#undef DESCRIBE_FLAG
}
@@ -4366,7 +4366,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
int rw = 0;
int err = 0;
- rc = alloc_reloc_control(fs_info);
+ rc = alloc_reloc_control();
if (!rc)
return -ENOMEM;
@@ -4562,7 +4562,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (list_empty(&reloc_roots))
goto out;
- rc = alloc_reloc_control(fs_info);
+ rc = alloc_reloc_control();
if (!rc) {
err = -ENOMEM;
goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 52b39a0924e9..a59005862010 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3984,6 +3984,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
spin_lock(&fs_info->unused_bgs_lock);
if (list_empty(&cache->bg_list)) {
btrfs_get_block_group(cache);
+ trace_btrfs_add_unused_block_group(cache);
list_add_tail(&cache->bg_list,
&fs_info->unused_bgs);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c0074d2d7d6d..c47f62b19226 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -235,6 +235,7 @@ struct orphan_dir_info {
struct rb_node node;
u64 ino;
u64 gen;
+ u64 last_dir_index_offset;
};
struct name_cache_entry {
@@ -2844,12 +2845,6 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
struct rb_node *parent = NULL;
struct orphan_dir_info *entry, *odi;
- odi = kmalloc(sizeof(*odi), GFP_KERNEL);
- if (!odi)
- return ERR_PTR(-ENOMEM);
- odi->ino = dir_ino;
- odi->gen = 0;
-
while (*p) {
parent = *p;
entry = rb_entry(parent, struct orphan_dir_info, node);
@@ -2858,11 +2853,17 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
} else if (dir_ino > entry->ino) {
p = &(*p)->rb_right;
} else {
- kfree(odi);
return entry;
}
}
+ odi = kmalloc(sizeof(*odi), GFP_KERNEL);
+ if (!odi)
+ return ERR_PTR(-ENOMEM);
+ odi->ino = dir_ino;
+ odi->gen = 0;
+ odi->last_dir_index_offset = 0;
+
rb_link_node(&odi->node, parent, p);
rb_insert_color(&odi->node, &sctx->orphan_dirs);
return odi;
@@ -2917,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
struct btrfs_key found_key;
struct btrfs_key loc;
struct btrfs_dir_item *di;
+ struct orphan_dir_info *odi = NULL;
/*
* Don't try to rmdir the top/root subvolume dir.
@@ -2931,6 +2933,11 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+
+ odi = get_orphan_dir_info(sctx, dir);
+ if (odi)
+ key.offset = odi->last_dir_index_offset;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -2958,30 +2965,33 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
dm = get_waiting_dir_move(sctx, loc.objectid);
if (dm) {
- struct orphan_dir_info *odi;
-
odi = add_orphan_dir_info(sctx, dir);
if (IS_ERR(odi)) {
ret = PTR_ERR(odi);
goto out;
}
odi->gen = dir_gen;
+ odi->last_dir_index_offset = found_key.offset;
dm->rmdir_ino = dir;
ret = 0;
goto out;
}
if (loc.objectid > send_progress) {
- struct orphan_dir_info *odi;
-
- odi = get_orphan_dir_info(sctx, dir);
- free_orphan_dir_info(sctx, odi);
+ odi = add_orphan_dir_info(sctx, dir);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ odi->last_dir_index_offset = found_key.offset;
ret = 0;
goto out;
}
path->slots[0]++;
}
+ free_orphan_dir_info(sctx, odi);
ret = 1;
@@ -3259,13 +3269,16 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
if (rmdir_ino) {
struct orphan_dir_info *odi;
+ u64 gen;
odi = get_orphan_dir_info(sctx, rmdir_ino);
if (!odi) {
/* already deleted */
goto finish;
}
- ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
+ gen = odi->gen;
+
+ ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
if (ret < 0)
goto out;
if (!ret)
@@ -3276,13 +3289,12 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
ret = -ENOMEM;
goto out;
}
- ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+ ret = get_cur_path(sctx, rmdir_ino, gen, name);
if (ret < 0)
goto out;
ret = send_rmdir(sctx, name);
if (ret < 0)
goto out;
- free_orphan_dir_info(sctx, odi);
}
finish:
@@ -6454,7 +6466,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
*/
if (root->send_in_progress < 0)
btrfs_err(root->fs_info,
- "send_in_progres unbalanced %d root %llu",
+ "send_in_progress unbalanced %d root %llu",
root->send_in_progress, root->root_key.objectid);
spin_unlock(&root->root_item_lock);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0628092b0b1b..81107ad49f3a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -323,6 +323,7 @@ enum {
Opt_ssd, Opt_nossd,
Opt_ssd_spread, Opt_nossd_spread,
Opt_subvol,
+ Opt_subvol_empty,
Opt_subvolid,
Opt_thread_pool,
Opt_treelog, Opt_notreelog,
@@ -388,6 +389,7 @@ static const match_table_t tokens = {
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd_spread, "nossd_spread"},
{Opt_subvol, "subvol=%s"},
+ {Opt_subvol_empty, "subvol="},
{Opt_subvolid, "subvolid=%s"},
{Opt_thread_pool, "thread_pool=%u"},
{Opt_treelog, "treelog"},
@@ -461,6 +463,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, DEGRADED);
break;
case Opt_subvol:
+ case Opt_subvol_empty:
case Opt_subvolid:
case Opt_subvolrootid:
case Opt_device:
@@ -1782,10 +1785,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
ret = btrfs_parse_options(fs_info, data, *flags);
- if (ret) {
- ret = -EINVAL;
+ if (ret)
goto restore;
- }
btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4848a4318fb5..4a4e960c7c66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -210,12 +210,42 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
NULL
};
+/*
+ * Features which depend on feature bits and may differ between each fs.
+ *
+ * /sys/fs/btrfs/features lists all available features of this kernel while
+ * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or
+ * can be changed online.
+ */
static const struct attribute_group btrfs_feature_attr_group = {
.name = "features",
.is_visible = btrfs_feature_visible,
.attrs = btrfs_supported_feature_attrs,
};
+static ssize_t rmdir_subvol_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "0\n");
+}
+BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
+
+static struct attribute *btrfs_supported_static_feature_attrs[] = {
+ BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
+ NULL
+};
+
+/*
+ * Features which only depend on kernel version.
+ *
+ * These are listed in /sys/fs/btrfs/features along with
+ * btrfs_feature_attr_group
+ */
+static const struct attribute_group btrfs_static_feature_attr_group = {
+ .name = "features",
+ .attrs = btrfs_supported_static_feature_attrs,
+};
+
static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
{
u64 val;
@@ -514,10 +544,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
}
#define NUM_FEATURE_BITS 64
-static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
-static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
+#define BTRFS_FEATURE_NAME_MAX 13
+static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
+static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS];
-static const u64 supported_feature_masks[3] = {
+static const u64 supported_feature_masks[FEAT_MAX] = {
[FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
[FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
@@ -589,7 +620,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
return;
}
- list_for_each_entry(fs_devs, fs_uuids, list) {
+ list_for_each_entry(fs_devs, fs_uuids, fs_list) {
__btrfs_sysfs_remove_fsid(fs_devs);
}
}
@@ -609,7 +640,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
}
-const char * const btrfs_feature_set_names[3] = {
+const char * const btrfs_feature_set_names[FEAT_MAX] = {
[FEAT_COMPAT] = "compat",
[FEAT_COMPAT_RO] = "compat_ro",
[FEAT_INCOMPAT] = "incompat",
@@ -673,7 +704,7 @@ static void init_feature_attrs(void)
if (fa->kobj_attr.attr.name)
continue;
- snprintf(name, 13, "%s:%u",
+ snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u",
btrfs_feature_set_names[set], i);
fa->kobj_attr.attr.name = name;
@@ -900,8 +931,15 @@ int __init btrfs_init_sysfs(void)
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
if (ret)
goto out2;
+ ret = sysfs_merge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ if (ret)
+ goto out_remove_group;
return 0;
+
+out_remove_group:
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
out2:
debugfs_remove_recursive(btrfs_debugfs_root_dentry);
out1:
@@ -912,6 +950,8 @@ out1:
void __cold btrfs_exit_sysfs(void)
{
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
debugfs_remove_recursive(btrfs_debugfs_root_dentry);
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index b567560d9aa9..c6ee600aff89 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -9,7 +9,7 @@
extern u64 btrfs_debugfs_test;
enum btrfs_feature_set {
- FEAT_COMPAT,
+ FEAT_COMPAT = 0,
FEAT_COMPAT_RO,
FEAT_INCOMPAT,
FEAT_MAX
@@ -77,7 +77,7 @@ attr_to_btrfs_feature_attr(struct attribute *attr)
}
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-extern const char * const btrfs_feature_set_names[3];
+extern const char * const btrfs_feature_set_names[FEAT_MAX];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 30ed438da2a9..db72b3b6209e 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -219,11 +219,13 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
kfree(cache);
}
-void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
memset(trans, 0, sizeof(*trans));
trans->transid = 1;
trans->type = __TRANS_DUMMY;
+ trans->fs_info = fs_info;
}
int btrfs_run_sanity_tests(void)
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index a5a0b9500d3e..70ff9f9d86a1 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -9,7 +9,8 @@
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_run_sanity_tests(void);
-#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
+#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
+#define test_err(fmt, ...) pr_err("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
struct btrfs_root;
struct btrfs_trans_handle;
@@ -28,7 +29,8 @@ void btrfs_free_dummy_root(struct btrfs_root *root);
struct btrfs_block_group_cache *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length);
void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
-void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
#else
static inline int btrfs_run_sanity_tests(void)
{
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 31e8a9ec228c..7d72eab6d32c 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -26,31 +26,31 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
u32 value_len = strlen(value);
int ret = 0;
- test_msg("Running btrfs_split_item tests\n");
+ test_msg("running btrfs_split_item tests");
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Could not allocate fs_info\n");
+ test_err("could not allocate fs_info");
return -ENOMEM;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Could not allocate root\n");
+ test_err("could not allocate root");
ret = PTR_ERR(root);
goto out;
}
path = btrfs_alloc_path();
if (!path) {
- test_msg("Could not allocate path\n");
+ test_err("could not allocate path");
ret = -ENOMEM;
goto out;
}
path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!eb) {
- test_msg("Could not allocate dummy buffer\n");
+ test_err("could not allocate dummy buffer");
ret = -ENOMEM;
goto out;
}
@@ -75,7 +75,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
*/
ret = btrfs_split_item(NULL, root, path, &key, 17);
if (ret) {
- test_msg("Split item failed %d\n", ret);
+ test_err("split item failed %d", ret);
goto out;
}
@@ -86,14 +86,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
btrfs_item_key_to_cpu(eb, &key, 0);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 0) {
- test_msg("Invalid key at slot 0\n");
+ test_err("invalid key at slot 0");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(0);
if (btrfs_item_size(eb, item) != strlen(split1)) {
- test_msg("Invalid len in the first split\n");
+ test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
}
@@ -101,8 +101,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
strlen(split1));
if (memcmp(buf, split1, strlen(split1))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the first split have='%.*s' want '%s'\n",
+ test_err(
+"data in the buffer doesn't match what it should in the first split have='%.*s' want '%s'",
(int)strlen(split1), buf, split1);
ret = -EINVAL;
goto out;
@@ -111,14 +111,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
btrfs_item_key_to_cpu(eb, &key, 1);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 3) {
- test_msg("Invalid key at slot 1\n");
+ test_err("invalid key at slot 1");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(1);
if (btrfs_item_size(eb, item) != strlen(split2)) {
- test_msg("Invalid len in the second split\n");
+ test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
}
@@ -126,8 +126,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
strlen(split2));
if (memcmp(buf, split2, strlen(split2))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the second split\n");
+ test_err(
+ "data in the buffer doesn't match what it should in the second split");
ret = -EINVAL;
goto out;
}
@@ -136,21 +136,21 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
/* Do it again so we test memmoving the other items in the leaf */
ret = btrfs_split_item(NULL, root, path, &key, 4);
if (ret) {
- test_msg("Second split item failed %d\n", ret);
+ test_err("second split item failed %d", ret);
goto out;
}
btrfs_item_key_to_cpu(eb, &key, 0);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 0) {
- test_msg("Invalid key at slot 0\n");
+ test_err("invalid key at slot 0");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(0);
if (btrfs_item_size(eb, item) != strlen(split3)) {
- test_msg("Invalid len in the first split\n");
+ test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
}
@@ -158,8 +158,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
strlen(split3));
if (memcmp(buf, split3, strlen(split3))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the third split");
+ test_err(
+ "data in the buffer doesn't match what it should in the third split");
ret = -EINVAL;
goto out;
}
@@ -167,14 +167,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
btrfs_item_key_to_cpu(eb, &key, 1);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 1) {
- test_msg("Invalid key at slot 1\n");
+ test_err("invalid key at slot 1");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(1);
if (btrfs_item_size(eb, item) != strlen(split4)) {
- test_msg("Invalid len in the second split\n");
+ test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
}
@@ -182,8 +182,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
strlen(split4));
if (memcmp(buf, split4, strlen(split4))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the fourth split\n");
+ test_err(
+ "data in the buffer doesn't match what it should in the fourth split");
ret = -EINVAL;
goto out;
}
@@ -191,14 +191,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
btrfs_item_key_to_cpu(eb, &key, 2);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 3) {
- test_msg("Invalid key at slot 2\n");
+ test_err("invalid key at slot 2");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(2);
if (btrfs_item_size(eb, item) != strlen(split2)) {
- test_msg("Invalid len in the second split\n");
+ test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
}
@@ -206,8 +206,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
strlen(split2));
if (memcmp(buf, split2, strlen(split2))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the last chunk\n");
+ test_err(
+ "data in the buffer doesn't match what it should in the last chunk");
ret = -EINVAL;
goto out;
}
@@ -220,6 +220,6 @@ out:
int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize)
{
- test_msg("Running extent buffer operation tests\n");
+ test_msg("running extent buffer operation tests");
return test_btrfs_split_item(sectorsize, nodesize);
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 76aa5a678a96..d9269a531a4d 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -46,7 +46,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
cond_resched();
loops++;
if (loops > 100000) {
- printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+ printk(KERN_ERR
+ "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n",
+ start, end, nr_pages, ret);
break;
}
}
@@ -66,11 +68,11 @@ static int test_find_delalloc(u32 sectorsize)
u64 found;
int ret = -EINVAL;
- test_msg("Running find delalloc tests\n");
+ test_msg("running find delalloc tests");
inode = btrfs_new_test_inode();
if (!inode) {
- test_msg("Failed to allocate test inode\n");
+ test_err("failed to allocate test inode");
return -ENOMEM;
}
@@ -84,7 +86,7 @@ static int test_find_delalloc(u32 sectorsize)
for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
- test_msg("Failed to allocate test page\n");
+ test_err("failed to allocate test page");
ret = -ENOMEM;
goto out;
}
@@ -107,11 +109,11 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
- test_msg("Should have found at least one delalloc\n");
+ test_err("should have found at least one delalloc");
goto out_bits;
}
if (start != 0 || end != (sectorsize - 1)) {
- test_msg("Expected start 0 end %u, got start %llu end %llu\n",
+ test_err("expected start 0 end %u, got start %llu end %llu",
sectorsize - 1, start, end);
goto out_bits;
}
@@ -129,7 +131,7 @@ static int test_find_delalloc(u32 sectorsize)
locked_page = find_lock_page(inode->i_mapping,
test_start >> PAGE_SHIFT);
if (!locked_page) {
- test_msg("Couldn't find the locked page\n");
+ test_err("couldn't find the locked page");
goto out_bits;
}
set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
@@ -138,17 +140,17 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
- test_msg("Couldn't find delalloc in our range\n");
+ test_err("couldn't find delalloc in our range");
goto out_bits;
}
if (start != test_start || end != max_bytes - 1) {
- test_msg("Expected start %Lu end %Lu, got start %Lu, end "
- "%Lu\n", test_start, max_bytes - 1, start, end);
+ test_err("expected start %llu end %llu, got start %llu, end %llu",
+ test_start, max_bytes - 1, start, end);
goto out_bits;
}
if (process_page_range(inode, start, end,
PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
- test_msg("There were unlocked pages in the range\n");
+ test_err("there were unlocked pages in the range");
goto out_bits;
}
unlock_extent(&tmp, start, end);
@@ -164,7 +166,7 @@ static int test_find_delalloc(u32 sectorsize)
locked_page = find_lock_page(inode->i_mapping, test_start >>
PAGE_SHIFT);
if (!locked_page) {
- test_msg("Couldn't find the locked page\n");
+ test_err("couldn't find the locked page");
goto out_bits;
}
start = test_start;
@@ -172,11 +174,11 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (found) {
- test_msg("Found range when we shouldn't have\n");
+ test_err("found range when we shouldn't have");
goto out_bits;
}
if (end != (u64)-1) {
- test_msg("Did not return the proper end offset\n");
+ test_err("did not return the proper end offset");
goto out_bits;
}
@@ -193,17 +195,17 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
- test_msg("Didn't find our range\n");
+ test_err("didn't find our range");
goto out_bits;
}
if (start != test_start || end != total_dirty - 1) {
- test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+ test_err("expected start %llu end %llu, got start %llu end %llu",
test_start, total_dirty - 1, start, end);
goto out_bits;
}
if (process_page_range(inode, start, end,
PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
- test_msg("Pages in range were not all locked\n");
+ test_err("pages in range were not all locked");
goto out_bits;
}
unlock_extent(&tmp, start, end);
@@ -215,7 +217,7 @@ static int test_find_delalloc(u32 sectorsize)
page = find_get_page(inode->i_mapping,
(max_bytes + SZ_1M) >> PAGE_SHIFT);
if (!page) {
- test_msg("Couldn't find our page\n");
+ test_err("couldn't find our page");
goto out_bits;
}
ClearPageDirty(page);
@@ -234,18 +236,17 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
- test_msg("Didn't find our range\n");
+ test_err("didn't find our range");
goto out_bits;
}
if (start != test_start && end != test_start + PAGE_SIZE - 1) {
- test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
- test_start, test_start + PAGE_SIZE - 1, start,
- end);
+ test_err("expected start %llu end %llu, got start %llu end %llu",
+ test_start, test_start + PAGE_SIZE - 1, start, end);
goto out_bits;
}
if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
PROCESS_UNLOCK)) {
- test_msg("Pages in range were not all locked\n");
+ test_err("pages in range were not all locked");
goto out_bits;
}
ret = 0;
@@ -271,14 +272,14 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb,
bit = !!test_bit(i, bitmap);
bit1 = !!extent_buffer_test_bit(eb, 0, i);
if (bit1 != bit) {
- test_msg("Bits do not match\n");
+ test_err("bits do not match");
return -EINVAL;
}
bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
i % BITS_PER_BYTE);
if (bit1 != bit) {
- test_msg("Offset bits do not match\n");
+ test_err("offset bits do not match");
return -EINVAL;
}
}
@@ -295,7 +296,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
memset(bitmap, 0, len);
memzero_extent_buffer(eb, 0, len);
if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
- test_msg("Bitmap was not zeroed\n");
+ test_err("bitmap was not zeroed");
return -EINVAL;
}
@@ -303,7 +304,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Setting all bits failed\n");
+ test_err("setting all bits failed");
return ret;
}
@@ -311,7 +312,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Clearing all bits failed\n");
+ test_err("clearing all bits failed");
return ret;
}
@@ -324,7 +325,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
sizeof(long) * BITS_PER_BYTE);
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Setting straddling pages failed\n");
+ test_err("setting straddling pages failed");
return ret;
}
@@ -337,7 +338,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
sizeof(long) * BITS_PER_BYTE);
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Clearing straddling pages failed\n");
+ test_err("clearing straddling pages failed");
return ret;
}
}
@@ -361,7 +362,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Random bit pattern failed\n");
+ test_err("random bit pattern failed");
return ret;
}
@@ -376,7 +377,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
struct extent_buffer *eb;
int ret;
- test_msg("Running extent buffer bitmap tests\n");
+ test_msg("running extent buffer bitmap tests");
/*
* In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than
@@ -389,13 +390,13 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
bitmap = kmalloc(len, GFP_KERNEL);
if (!bitmap) {
- test_msg("Couldn't allocate test bitmap\n");
+ test_err("couldn't allocate test bitmap");
return -ENOMEM;
}
eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
if (!eb) {
- test_msg("Couldn't allocate test extent buffer\n");
+ test_err("couldn't allocate test extent buffer");
kfree(bitmap);
return -ENOMEM;
}
@@ -408,7 +409,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
free_extent_buffer(eb);
eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
if (!eb) {
- test_msg("Couldn't allocate test extent buffer\n");
+ test_err("couldn't allocate test extent buffer");
kfree(bitmap);
return -ENOMEM;
}
@@ -424,7 +425,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
{
int ret;
- test_msg("Running extent I/O tests\n");
+ test_msg("running extent I/O tests");
ret = test_find_delalloc(sectorsize);
if (ret)
@@ -432,6 +433,6 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
ret = test_eb_bitmaps(sectorsize, nodesize);
out:
- test_msg("Extent I/O tests finished\n");
+ test_msg("extent I/O tests finished");
return ret;
}
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 79e0a5f4d9c9..385a5316e4bf 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -19,8 +19,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
- test_msg(
-"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n",
+ test_err(
+"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
em->start, em->len, em->block_start,
em->block_len, refcount_read(&em->refs));
@@ -47,7 +47,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
* ->add_extent_mapping(0, 16K)
* -> #handle -EEXIST
*/
-static void test_case_1(struct extent_map_tree *em_tree)
+static void test_case_1(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
{
struct extent_map *em;
u64 start = 0;
@@ -90,14 +91,14 @@ static void test_case_1(struct extent_map_tree *em_tree)
em->len = len;
em->block_start = start;
em->block_len = len;
- ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
if (ret)
- test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret);
+ test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
if (em &&
(em->start != 0 || extent_map_end(em) != SZ_16K ||
em->block_start != 0 || em->block_len != SZ_16K))
- test_msg(
-"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+ test_err(
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
free_extent_map(em);
@@ -112,7 +113,8 @@ out:
* Reading the inline ending up with EEXIST, ie. read an inline
* extent and discard page cache and read it again.
*/
-static void test_case_2(struct extent_map_tree *em_tree)
+static void test_case_2(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
{
struct extent_map *em;
int ret;
@@ -153,14 +155,14 @@ static void test_case_2(struct extent_map_tree *em_tree)
em->len = SZ_1K;
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
- ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
if (ret)
- test_msg("case2 [0 1K]: ret %d\n", ret);
+ test_err("case2 [0 1K]: ret %d", ret);
if (em &&
(em->start != 0 || extent_map_end(em) != SZ_1K ||
em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1))
- test_msg(
-"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+ test_err(
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
ret, em->start, em->len, em->block_start,
em->block_len);
free_extent_map(em);
@@ -169,7 +171,8 @@ out:
free_extent_map_tree(em_tree);
}
-static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
+static void __test_case_3(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree, u64 start)
{
struct extent_map *em;
u64 len = SZ_4K;
@@ -198,9 +201,9 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_16K;
- ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
if (ret)
- test_msg("case3 [0x%llx 0x%llx): ret %d\n",
+ test_err("case3 [0x%llx 0x%llx): ret %d",
start, start + len, ret);
/*
* Since bytes within em are contiguous, em->block_start is identical to
@@ -209,8 +212,8 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
if (em &&
(start < em->start || start + len > extent_map_end(em) ||
em->start != em->block_start || em->len != em->block_len))
- test_msg(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+ test_err(
+"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
free_extent_map(em);
@@ -235,14 +238,16 @@ out:
* -> add_extent_mapping()
* -> add_extent_mapping()
*/
-static void test_case_3(struct extent_map_tree *em_tree)
+static void test_case_3(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
{
- __test_case_3(em_tree, 0);
- __test_case_3(em_tree, SZ_8K);
- __test_case_3(em_tree, (12 * 1024ULL));
+ __test_case_3(fs_info, em_tree, 0);
+ __test_case_3(fs_info, em_tree, SZ_8K);
+ __test_case_3(fs_info, em_tree, (12 * 1024ULL));
}
-static void __test_case_4(struct extent_map_tree *em_tree, u64 start)
+static void __test_case_4(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree, u64 start)
{
struct extent_map *em;
u64 len = SZ_4K;
@@ -283,14 +288,14 @@ static void __test_case_4(struct extent_map_tree *em_tree, u64 start)
em->len = SZ_32K;
em->block_start = 0;
em->block_len = SZ_32K;
- ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
if (ret)
- test_msg("case4 [0x%llx 0x%llx): ret %d\n",
+ test_err("case4 [0x%llx 0x%llx): ret %d",
start, len, ret);
if (em &&
(start < em->start || start + len > extent_map_end(em)))
- test_msg(
-"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+ test_err(
+"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
start, len, ret, em->start, em->len, em->block_start,
em->block_len);
free_extent_map(em);
@@ -324,30 +329,45 @@ out:
* # handle -EEXIST when adding
* # [0, 32K)
*/
-static void test_case_4(struct extent_map_tree *em_tree)
+static void test_case_4(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
{
- __test_case_4(em_tree, 0);
- __test_case_4(em_tree, SZ_4K);
+ __test_case_4(fs_info, em_tree, 0);
+ __test_case_4(fs_info, em_tree, SZ_4K);
}
int btrfs_test_extent_map(void)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct extent_map_tree *em_tree;
- test_msg("Running extent_map tests\n");
+ test_msg("running extent_map tests");
+
+ /*
+ * Note: the fs_info is not set up completely, we only need
+ * fs_info::fsid for the tracepoint.
+ */
+ fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE);
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info");
+ return -ENOMEM;
+ }
em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
if (!em_tree)
/* Skip the test on error. */
- return 0;
+ goto out;
extent_map_tree_init(em_tree);
- test_case_1(em_tree);
- test_case_2(em_tree);
- test_case_3(em_tree);
- test_case_4(em_tree);
+ test_case_1(fs_info, em_tree);
+ test_case_2(fs_info, em_tree);
+ test_case_3(fs_info, em_tree);
+ test_case_4(fs_info, em_tree);
kfree(em_tree);
+out:
+ btrfs_free_dummy_fs_info(fs_info);
+
return 0;
}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index d3c9f8a59ba5..5c2f77e9439b 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -20,63 +20,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
{
int ret = 0;
- test_msg("Running extent only tests\n");
+ test_msg("running extent only tests");
/* First just make sure we can remove an entire entry */
ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
- test_msg("Error adding initial extents %d\n", ret);
+ test_err("error adding initial extents %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
- test_msg("Error removing extent %d\n", ret);
+ test_err("error removing extent %d", ret);
return ret;
}
if (test_check_exists(cache, 0, SZ_4M)) {
- test_msg("Full remove left some lingering space\n");
+ test_err("full remove left some lingering space");
return -1;
}
/* Ok edge and middle cases now */
ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
- test_msg("Error adding half extent %d\n", ret);
+ test_err("error adding half extent %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M);
if (ret) {
- test_msg("Error removing tail end %d\n", ret);
+ test_err("error removing tail end %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
- test_msg("Error removing front end %d\n", ret);
+ test_err("error removing front end %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_2M, 4096);
if (ret) {
- test_msg("Error removing middle piece %d\n", ret);
+ test_err("error removing middle piece %d", ret);
return ret;
}
if (test_check_exists(cache, 0, SZ_1M)) {
- test_msg("Still have space at the front\n");
+ test_err("still have space at the front");
return -1;
}
if (test_check_exists(cache, SZ_2M, 4096)) {
- test_msg("Still have space in the middle\n");
+ test_err("still have space in the middle");
return -1;
}
if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) {
- test_msg("Still have space at the end\n");
+ test_err("still have space at the end");
return -1;
}
@@ -92,34 +92,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache,
u64 next_bitmap_offset;
int ret;
- test_msg("Running bitmap only tests\n");
+ test_msg("running bitmap only tests");
ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't create a bitmap entry %d\n", ret);
+ test_err("couldn't create a bitmap entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
- test_msg("Error removing bitmap full range %d\n", ret);
+ test_err("error removing bitmap full range %d", ret);
return ret;
}
if (test_check_exists(cache, 0, SZ_4M)) {
- test_msg("Left some space in bitmap\n");
+ test_err("left some space in bitmap");
return -1;
}
ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add to our bitmap entry %d\n", ret);
+ test_err("couldn't add to our bitmap entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M);
if (ret) {
- test_msg("Couldn't remove middle chunk %d\n", ret);
+ test_err("couldn't remove middle chunk %d", ret);
return ret;
}
@@ -133,19 +133,19 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache,
ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add space that straddles two bitmaps %d\n",
+ test_err("couldn't add space that straddles two bitmaps %d",
ret);
return ret;
}
ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M);
if (ret) {
- test_msg("Couldn't remove overlapping space %d\n", ret);
+ test_err("couldn't remove overlapping space %d", ret);
return ret;
}
if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) {
- test_msg("Left some space when removing overlapping\n");
+ test_err("left some space when removing overlapping");
return -1;
}
@@ -161,7 +161,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
int ret;
- test_msg("Running bitmap and extent tests\n");
+ test_msg("running bitmap and extent tests");
/*
* First let's do something simple, an extent at the same offset as the
@@ -170,42 +170,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
if (ret) {
- test_msg("Couldn't create bitmap entry %d\n", ret);
+ test_err("couldn't create bitmap entry %d", ret);
return ret;
}
ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
- test_msg("Couldn't remove extent entry %d\n", ret);
+ test_err("couldn't remove extent entry %d", ret);
return ret;
}
if (test_check_exists(cache, 0, SZ_1M)) {
- test_msg("Left remnants after our remove\n");
+ test_err("left remnants after our remove");
return -1;
}
/* Now to add back the extent entry and remove from the bitmap */
ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
- test_msg("Couldn't re-add extent entry %d\n", ret);
+ test_err("couldn't re-add extent entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M);
if (ret) {
- test_msg("Couldn't remove from bitmap %d\n", ret);
+ test_err("couldn't remove from bitmap %d", ret);
return ret;
}
if (test_check_exists(cache, SZ_4M, SZ_1M)) {
- test_msg("Left remnants in the bitmap\n");
+ test_err("left remnants in the bitmap");
return -1;
}
@@ -215,18 +215,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add to a bitmap %d\n", ret);
+ test_err("couldn't add to a bitmap %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M);
if (ret) {
- test_msg("Couldn't remove overlapping space %d\n", ret);
+ test_err("couldn't remove overlapping space %d", ret);
return ret;
}
if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) {
- test_msg("Left over pieces after removing overlapping\n");
+ test_err("left over pieces after removing overlapping");
return -1;
}
@@ -235,24 +235,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
/* Now with the extent entry offset into the bitmap */
ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add space to the bitmap %d\n", ret);
+ test_err("couldn't add space to the bitmap %d", ret);
return ret;
}
ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0);
if (ret) {
- test_msg("Couldn't add extent to the cache %d\n", ret);
+ test_err("couldn't add extent to the cache %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M);
if (ret) {
- test_msg("Problem removing overlapping space %d\n", ret);
+ test_err("problem removing overlapping space %d", ret);
return ret;
}
if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) {
- test_msg("Left something behind when removing space");
+ test_err("left something behind when removing space");
return -1;
}
@@ -269,25 +269,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
__btrfs_remove_free_space_cache(cache->free_space_ctl);
ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add bitmap %d\n", ret);
+ test_err("couldn't add bitmap %d", ret);
return ret;
}
ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M,
5 * SZ_1M, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M);
if (ret) {
- test_msg("Failed to free our space %d\n", ret);
+ test_err("failed to free our space %d", ret);
return ret;
}
if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) {
- test_msg("Left stuff over\n");
+ test_err("left stuff over");
return -1;
}
@@ -301,19 +301,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1);
if (ret) {
- test_msg("Couldn't add bitmap entry %d\n", ret);
+ test_err("couldn't add bitmap entry %d", ret);
return ret;
}
ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M);
if (ret) {
- test_msg("Error removing bitmap and extent overlapping %d\n", ret);
+ test_err("error removing bitmap and extent overlapping %d", ret);
return ret;
}
@@ -335,12 +335,14 @@ check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
const int num_bitmaps)
{
if (cache->free_space_ctl->free_extents != num_extents) {
- test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ test_err(
+ "incorrect # of extent entries in the cache: %d, expected %d",
cache->free_space_ctl->free_extents, num_extents);
return -EINVAL;
}
if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
- test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ test_err(
+ "incorrect # of extent entries in the cache: %d, expected %d",
cache->free_space_ctl->total_bitmaps, num_bitmaps);
return -EINVAL;
}
@@ -358,7 +360,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
* allocate.
*/
if (cache->free_space_ctl->free_space != 0) {
- test_msg("Cache free space is not 0\n");
+ test_err("cache free space is not 0");
return -EINVAL;
}
@@ -366,7 +368,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
&max_extent_size);
if (offset != 0) {
- test_msg("Space allocation did not fail, returned offset: %llu",
+ test_err("space allocation did not fail, returned offset: %llu",
offset);
return -EINVAL;
}
@@ -402,7 +404,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
};
const struct btrfs_free_space_op *orig_free_space_ops;
- test_msg("Running space stealing from bitmap to extent\n");
+ test_msg("running space stealing from bitmap to extent");
/*
* For this test, we want to ensure we end up with an extent entry
@@ -430,7 +432,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
@@ -438,7 +440,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K,
SZ_128M - SZ_512K, 1);
if (ret) {
- test_msg("Couldn't add bitmap entry %d\n", ret);
+ test_err("couldn't add bitmap entry %d", ret);
return ret;
}
@@ -457,17 +459,17 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
SZ_128M + 768 * SZ_1K,
SZ_128M - 768 * SZ_1K);
if (ret) {
- test_msg("Failed to free part of bitmap space %d\n", ret);
+ test_err("failed to free part of bitmap space %d", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) {
- test_msg("Free space range missing\n");
+ test_err("free space range missing");
return -ENOENT;
}
if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) {
- test_msg("Free space range missing\n");
+ test_err("free space range missing");
return -ENOENT;
}
@@ -477,7 +479,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
if (test_check_exists(cache, SZ_128M + 768 * SZ_1K,
SZ_128M - 768 * SZ_1K)) {
- test_msg("Bitmap region not removed from space cache\n");
+ test_err("bitmap region not removed from space cache");
return -EINVAL;
}
@@ -486,7 +488,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* covered by the bitmap, isn't marked as free.
*/
if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) {
- test_msg("Invalid bitmap region marked as free\n");
+ test_err("invalid bitmap region marked as free");
return -EINVAL;
}
@@ -495,7 +497,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* by the bitmap too, isn't marked as free either.
*/
if (test_check_exists(cache, SZ_128M, SZ_256K)) {
- test_msg("Invalid bitmap region marked as free\n");
+ test_err("invalid bitmap region marked as free");
return -EINVAL;
}
@@ -506,12 +508,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
/* Confirm the region is marked as free. */
if (!test_check_exists(cache, SZ_128M, SZ_512K)) {
- test_msg("Bitmap region not marked as free\n");
+ test_err("bitmap region not marked as free");
return -ENOENT;
}
@@ -531,7 +533,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
@@ -550,12 +552,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
/* Confirm the region is marked as free. */
if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) {
- test_msg("Extent region not marked as free\n");
+ test_err("extent region not marked as free");
return -ENOENT;
}
@@ -583,12 +585,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* allocate the whole free space at once.
*/
if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) {
- test_msg("Expected region not marked as free\n");
+ test_err("expected region not marked as free");
return -ENOENT;
}
if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) {
- test_msg("Cache free space is not 1Mb + %u\n", sectorsize);
+ test_err("cache free space is not 1Mb + %u", sectorsize);
return -EINVAL;
}
@@ -596,7 +598,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
0, SZ_1M, 0,
&max_extent_size);
if (offset != (SZ_128M - SZ_256K)) {
- test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ test_err(
+ "failed to allocate 1Mb from space cache, returned offset is: %llu",
offset);
return -EINVAL;
}
@@ -610,7 +613,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
return ret;
if (cache->free_space_ctl->free_space != sectorsize) {
- test_msg("Cache free space is not %u\n", sectorsize);
+ test_err("cache free space is not %u", sectorsize);
return -EINVAL;
}
@@ -618,7 +621,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
0, sectorsize, 0,
&max_extent_size);
if (offset != (SZ_128M + SZ_16M)) {
- test_msg("Failed to allocate %u, returned offset : %llu\n",
+ test_err("failed to allocate %u, returned offset : %llu",
sectorsize, offset);
return -EINVAL;
}
@@ -640,14 +643,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
/* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1);
if (ret) {
- test_msg("Couldn't add bitmap entry %d\n", ret);
+ test_err("couldn't add bitmap entry %d", ret);
return ret;
}
@@ -664,17 +667,17 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K);
if (ret) {
- test_msg("Failed to free part of bitmap space %d\n", ret);
+ test_err("failed to free part of bitmap space %d", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) {
- test_msg("Free space range missing\n");
+ test_err("free space range missing");
return -ENOENT;
}
if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) {
- test_msg("Free space range missing\n");
+ test_err("free space range missing");
return -ENOENT;
}
@@ -683,7 +686,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* as free anymore.
*/
if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) {
- test_msg("Bitmap region not removed from space cache\n");
+ test_err("bitmap region not removed from space cache");
return -EINVAL;
}
@@ -692,7 +695,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* covered by the bitmap, isn't marked as free.
*/
if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
- test_msg("Invalid bitmap region marked as free\n");
+ test_err("invalid bitmap region marked as free");
return -EINVAL;
}
@@ -703,12 +706,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
/* Confirm the region is marked as free. */
if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
- test_msg("Bitmap region not marked as free\n");
+ test_err("bitmap region not marked as free");
return -ENOENT;
}
@@ -728,7 +731,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
@@ -739,12 +742,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
/* Confirm the region is marked as free. */
if (!test_check_exists(cache, SZ_128M, SZ_128K)) {
- test_msg("Extent region not marked as free\n");
+ test_err("extent region not marked as free");
return -ENOENT;
}
@@ -772,19 +775,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* allocate the whole free space at once.
*/
if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) {
- test_msg("Expected region not marked as free\n");
+ test_err("expected region not marked as free");
return -ENOENT;
}
if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) {
- test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize);
+ test_err("cache free space is not 1Mb + %u", 2 * sectorsize);
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0,
&max_extent_size);
if (offset != (SZ_128M - 768 * SZ_1K)) {
- test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ test_err(
+ "failed to allocate 1Mb from space cache, returned offset is: %llu",
offset);
return -EINVAL;
}
@@ -798,7 +802,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
return ret;
if (cache->free_space_ctl->free_space != 2 * sectorsize) {
- test_msg("Cache free space is not %u\n", 2 * sectorsize);
+ test_err("cache free space is not %u", 2 * sectorsize);
return -EINVAL;
}
@@ -806,9 +810,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
0, 2 * sectorsize, 0,
&max_extent_size);
if (offset != SZ_32M) {
- test_msg("Failed to allocate %u, offset: %llu\n",
- 2 * sectorsize,
- offset);
+ test_err("failed to allocate %u, offset: %llu",
+ 2 * sectorsize, offset);
return -EINVAL;
}
@@ -829,7 +832,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
- test_msg("Running btrfs free space cache tests\n");
+ test_msg("running btrfs free space cache tests");
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info)
return -ENOMEM;
@@ -843,7 +846,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
cache = btrfs_alloc_dummy_block_group(fs_info,
BITS_PER_BITMAP * sectorsize + PAGE_SIZE);
if (!cache) {
- test_msg("Couldn't run the tests\n");
+ test_err("couldn't run the tests");
btrfs_free_dummy_fs_info(fs_info);
return 0;
}
@@ -871,6 +874,6 @@ out:
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
- test_msg("Free space cache tests finished\n");
+ test_msg("free space cache tests finished");
return ret;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index e1f9666c4974..89346da890cf 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
info = search_free_space_info(trans, fs_info, cache, path, 0);
if (IS_ERR(info)) {
- test_msg("Could not find free space info\n");
+ test_err("could not find free space info");
ret = PTR_ERR(info);
goto out;
}
@@ -40,7 +40,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
if (extent_count != num_extents) {
- test_msg("Extent count is wrong\n");
+ test_err("extent count is wrong");
ret = -EINVAL;
goto out;
}
@@ -99,7 +99,7 @@ out:
btrfs_release_path(path);
return ret;
invalid:
- test_msg("Free space tree is invalid\n");
+ test_err("free space tree is invalid");
ret = -EINVAL;
goto out;
}
@@ -117,7 +117,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
info = search_free_space_info(trans, fs_info, cache, path, 0);
if (IS_ERR(info)) {
- test_msg("Could not find free space info\n");
+ test_err("could not find free space info");
btrfs_release_path(path);
return PTR_ERR(info);
}
@@ -131,15 +131,15 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
/* Flip it to the other format and check that for good measure. */
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- ret = convert_free_space_to_extents(trans, fs_info, cache, path);
+ ret = convert_free_space_to_extents(trans, cache, path);
if (ret) {
- test_msg("Could not convert to extents\n");
+ test_err("could not convert to extents");
return ret;
}
} else {
- ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path);
+ ret = convert_free_space_to_bitmaps(trans, cache, path);
if (ret) {
- test_msg("Could not convert to bitmaps\n");
+ test_err("could not convert to bitmaps");
return ret;
}
}
@@ -170,11 +170,11 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
const struct free_space_extent extents[] = {};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
@@ -194,10 +194,10 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid, alignment);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
@@ -217,12 +217,12 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid +
cache->key.offset - alignment,
alignment);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
@@ -243,11 +243,11 @@ static int test_remove_middle(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid + alignment,
alignment);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
@@ -266,26 +266,26 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
- cache->key.objectid, alignment);
+ ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
@@ -304,27 +304,27 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + 2 * alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
@@ -343,34 +343,34 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
- cache->key.objectid, alignment);
+ ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + 2 * alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
@@ -391,34 +391,34 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
- cache->key.objectid, alignment);
+ ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + 4 * alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + 2 * alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
@@ -444,14 +444,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
ret = -ENOMEM;
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate dummy root\n");
+ test_err("couldn't allocate dummy root");
ret = PTR_ERR(root);
goto out;
}
@@ -463,7 +463,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
if (!root->node) {
- test_msg("Couldn't allocate dummy buffer\n");
+ test_err("couldn't allocate dummy buffer");
ret = -ENOMEM;
goto out;
}
@@ -473,7 +473,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment);
if (!cache) {
- test_msg("Couldn't allocate dummy block group cache\n");
+ test_err("couldn't allocate dummy block group cache");
ret = -ENOMEM;
goto out;
}
@@ -482,26 +482,25 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
cache->needs_free_space = 1;
cache->fs_info = root->fs_info;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, root->fs_info);
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
ret = -ENOMEM;
goto out;
}
- ret = add_block_group_free_space(&trans, root->fs_info, cache);
+ ret = add_block_group_free_space(&trans, cache);
if (ret) {
- test_msg("Could not add block group free space\n");
+ test_err("could not add block group free space");
goto out;
}
if (bitmaps) {
- ret = convert_free_space_to_bitmaps(&trans, root->fs_info,
- cache, path);
+ ret = convert_free_space_to_bitmaps(&trans, cache, path);
if (ret) {
- test_msg("Could not convert block group to bitmaps\n");
+ test_err("could not convert block group to bitmaps");
goto out;
}
}
@@ -510,14 +509,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
if (ret)
goto out;
- ret = remove_block_group_free_space(&trans, root->fs_info, cache);
+ ret = remove_block_group_free_space(&trans, cache);
if (ret) {
- test_msg("Could not remove block group free space\n");
+ test_err("could not remove block group free space");
goto out;
}
if (btrfs_header_nritems(root->node) != 0) {
- test_msg("Free space tree has leftover items\n");
+ test_err("free space tree has leftover items");
ret = -EINVAL;
goto out;
}
@@ -539,14 +538,16 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize,
ret = run_test(test_func, 0, sectorsize, nodesize, alignment);
if (ret) {
- test_msg("%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u\n",
+ test_err(
+ "%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u",
test_func, sectorsize, nodesize, alignment);
test_ret = ret;
}
ret = run_test(test_func, 1, sectorsize, nodesize, alignment);
if (ret) {
- test_msg("%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u\n",
+ test_err(
+ "%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u",
test_func, sectorsize, nodesize, alignment);
test_ret = ret;
}
@@ -577,7 +578,7 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize)
*/
bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE;
- test_msg("Running free space tree tests\n");
+ test_msg("running free space tree tests");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
int ret;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index e0ba799536b4..64043f028820 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -228,7 +228,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
inode = btrfs_new_test_inode();
if (!inode) {
- test_msg("Couldn't allocate inode\n");
+ test_err("couldn't allocate inode");
return ret;
}
@@ -238,19 +238,19 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ test_err("couldn't allocate root");
goto out;
}
root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!root->node) {
- test_msg("Couldn't allocate dummy buffer\n");
+ test_err("couldn't allocate dummy buffer");
goto out;
}
@@ -268,11 +268,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0);
if (IS_ERR(em)) {
em = NULL;
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
free_extent_map(em);
@@ -287,20 +287,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
if (em->start != 0 || em->len != 5) {
- test_msg("Unexpected extent wanted start 0 len 5, got start "
- "%llu len %llu\n", em->start, em->len);
+ test_err(
+ "unexpected extent wanted start 0 len 5, got start %llu len %llu",
+ em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -308,21 +309,22 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_INLINE) {
- test_msg("Expected an inline, got %llu\n", em->block_start);
+ test_err("expected an inline, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != (sectorsize - 5)) {
- test_msg("Unexpected extent wanted start %llu len 1, got start "
- "%llu len %llu\n", offset, em->start, em->len);
+ test_err(
+ "unexpected extent wanted start %llu len 1, got start %llu len %llu",
+ offset, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
/*
@@ -335,20 +337,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != 4) {
- test_msg("Unexpected extent wanted start %llu len 4, got start "
- "%llu len %llu\n", offset, em->start, em->len);
+ test_err(
+ "unexpected extent wanted start %llu len 4, got start %llu len %llu",
+ offset, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -357,24 +360,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* Regular extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize - 1) {
- test_msg("Unexpected extent wanted start %llu len 4095, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ test_err(
+ "unexpected extent wanted start %llu len 4095, got start %llu len %llu",
+ offset, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -384,25 +388,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* The next 3 are split extents */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -413,21 +417,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -435,31 +439,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n",
+ test_err("wrong orig offset, want %llu, have %llu",
orig_start, em->orig_start);
goto out;
}
disk_bytenr += (em->start - orig_start);
if (em->block_start != disk_bytenr) {
- test_msg("Wrong block start, want %llu, have %llu\n",
+ test_err("wrong block start, want %llu, have %llu",
disk_bytenr, em->block_start);
goto out;
}
@@ -469,26 +473,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* Prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
prealloc_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -498,26 +502,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* The next 3 are a half written prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
prealloc_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -528,30 +532,30 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_HOLE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
- test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+ test_err("unexpected orig offset, wanted %llu, have %llu",
orig_start, em->orig_start);
goto out;
}
if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
- test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ test_err("unexpected block start, wanted %llu, have %llu",
disk_bytenr + (em->start - em->orig_start),
em->block_start);
goto out;
@@ -561,31 +565,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
prealloc_only, em->flags);
goto out;
}
if (em->orig_start != orig_start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+ test_err("wrong orig offset, want %llu, have %llu", orig_start,
em->orig_start);
goto out;
}
if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
- test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ test_err("unexpected block start, wanted %llu, have %llu",
disk_bytenr + (em->start - em->orig_start),
em->block_start);
goto out;
@@ -596,31 +600,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* Now for the compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u,"
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
compressed_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n",
+ test_err("wrong orig offset, want %llu, have %llu",
em->start, em->orig_start);
goto out;
}
if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
- test_msg("Unexpected compress type, wanted %d, got %d\n",
+ test_err("unexpected compress type, wanted %d, got %d",
BTRFS_COMPRESS_ZLIB, em->compress_type);
goto out;
}
@@ -630,31 +634,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* Split compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u,"
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
compressed_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n",
+ test_err("wrong orig offset, want %llu, have %llu",
em->start, em->orig_start);
goto out;
}
if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
- test_msg("Unexpected compress type, wanted %d, got %d\n",
+ test_err("unexpected compress type, wanted %d, got %d",
BTRFS_COMPRESS_ZLIB, em->compress_type);
goto out;
}
@@ -665,25 +669,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -692,32 +696,32 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != disk_bytenr) {
- test_msg("Block start does not match, want %llu got %llu\n",
+ test_err("block start does not match, want %llu got %llu",
disk_bytenr, em->block_start);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
compressed_only, em->flags);
goto out;
}
if (em->orig_start != orig_start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n",
+ test_err("wrong orig offset, want %llu, have %llu",
em->start, orig_start);
goto out;
}
if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
- test_msg("Unexpected compress type, wanted %d, got %d\n",
+ test_err("unexpected compress type, wanted %d, got %d",
BTRFS_COMPRESS_ZLIB, em->compress_type);
goto out;
}
@@ -728,25 +732,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6,
sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -755,11 +759,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole extent, got %llu\n", em->block_start);
+ test_err("expected a hole extent, got %llu", em->block_start);
goto out;
}
/*
@@ -768,18 +772,18 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
* test.
*/
if (em->start != offset || em->len != 3 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 3 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != vacancy_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
vacancy_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -788,25 +792,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u,"
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -830,7 +834,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
inode = btrfs_new_test_inode();
if (!inode) {
- test_msg("Couldn't allocate inode\n");
+ test_err("couldn't allocate inode");
return ret;
}
@@ -840,19 +844,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ test_err("couldn't allocate root");
goto out;
}
root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!root->node) {
- test_msg("Couldn't allocate dummy buffer\n");
+ test_err("couldn't allocate dummy buffer");
goto out;
}
@@ -871,21 +875,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
if (em->start != 0 || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start 0 len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start 0 len %u, got start %llu len %llu",
sectorsize, em->start, em->len);
goto out;
}
if (em->flags != vacancy_only) {
- test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+ test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
em->flags);
goto out;
}
@@ -894,21 +898,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize,
2 * sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != sectorsize) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != sectorsize || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %u len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %u len %u, got start %llu len %llu",
sectorsize, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, wanted 0 got %lu\n",
+ test_err("unexpected flags set, wanted 0 got %lu",
em->flags);
goto out;
}
@@ -931,19 +935,19 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
inode = btrfs_new_test_inode();
if (!inode) {
- test_msg("Couldn't allocate inode\n");
+ test_err("couldn't allocate inode");
return ret;
}
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ test_err("couldn't allocate root");
goto out;
}
@@ -954,12 +958,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 1) {
ret = -EINVAL;
- test_msg("Miscount, wanted 1, got %u\n",
+ test_err("miscount, wanted 1, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -969,12 +973,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 2) {
ret = -EINVAL;
- test_msg("Miscount, wanted 2, got %u\n",
+ test_err("miscount, wanted 2, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -986,12 +990,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
EXTENT_DELALLOC | EXTENT_DIRTY |
EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
- test_msg("clear_extent_bit returned %d\n", ret);
+ test_err("clear_extent_bit returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 2) {
ret = -EINVAL;
- test_msg("Miscount, wanted 2, got %u\n",
+ test_err("miscount, wanted 2, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1002,12 +1006,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
+ sectorsize - 1,
0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 2) {
ret = -EINVAL;
- test_msg("Miscount, wanted 2, got %u\n",
+ test_err("miscount, wanted 2, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1020,12 +1024,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 4) {
ret = -EINVAL;
- test_msg("Miscount, wanted 4, got %u\n",
+ test_err("miscount, wanted 4, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1037,12 +1041,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 3) {
ret = -EINVAL;
- test_msg("Miscount, wanted 3, got %u\n",
+ test_err("miscount, wanted 3, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1054,12 +1058,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
- test_msg("clear_extent_bit returned %d\n", ret);
+ test_err("clear_extent_bit returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 4) {
ret = -EINVAL;
- test_msg("Miscount, wanted 4, got %u\n",
+ test_err("miscount, wanted 4, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1072,12 +1076,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 3) {
ret = -EINVAL;
- test_msg("Miscount, wanted 3, got %u\n",
+ test_err("miscount, wanted 3, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1087,12 +1091,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
- test_msg("clear_extent_bit returned %d\n", ret);
+ test_err("clear_extent_bit returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents) {
ret = -EINVAL;
- test_msg("Miscount, wanted 0, got %u\n",
+ test_err("miscount, wanted 0, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1115,14 +1119,14 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
- test_msg("Running btrfs_get_extent tests\n");
+ test_msg("running btrfs_get_extent tests");
ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
return ret;
- test_msg("Running hole first btrfs_get_extent test\n");
+ test_msg("running hole first btrfs_get_extent test");
ret = test_hole_first(sectorsize, nodesize);
if (ret)
return ret;
- test_msg("Running outstanding_extents tests\n");
+ test_msg("running outstanding_extents tests");
return test_extent_accounting(sectorsize, nodesize);
}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 39b95783f736..ace94db09d29 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -24,7 +24,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, NULL);
ins.objectid = bytenr;
ins.type = BTRFS_EXTENT_ITEM_KEY;
@@ -32,14 +32,14 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
if (ret) {
- test_msg("Couldn't insert ref %d\n", ret);
+ test_err("couldn't insert ref %d", ret);
btrfs_free_path(path);
return ret;
}
@@ -74,7 +74,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
u64 refs;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, NULL);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -82,14 +82,14 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
- test_msg("Couldn't find extent ref\n");
+ test_err("couldn't find extent ref");
btrfs_free_path(path);
return ret;
}
@@ -111,7 +111,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
if (ret)
- test_msg("Failed to insert backref\n");
+ test_err("failed to insert backref");
btrfs_free_path(path);
return ret;
}
@@ -124,7 +124,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
struct btrfs_path *path;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, NULL);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -132,14 +132,14 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
- test_msg("Didn't find our key %d\n", ret);
+ test_err("didn't find our key %d", ret);
btrfs_free_path(path);
return ret;
}
@@ -158,7 +158,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
u64 refs;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, NULL);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -166,14 +166,14 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
- test_msg("Couldn't find extent ref\n");
+ test_err("couldn't find extent ref");
btrfs_free_path(path);
return ret;
}
@@ -195,7 +195,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
- test_msg("Couldn't find backref %d\n", ret);
+ test_err("couldn't find backref %d", ret);
btrfs_free_path(path);
return ret;
}
@@ -213,12 +213,12 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
struct ulist *new_roots = NULL;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, fs_info);
- test_msg("Qgroup basic add\n");
+ test_msg("qgroup basic add");
ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID);
if (ret) {
- test_msg("Couldn't create a qgroup %d\n", ret);
+ test_err("couldn't create a qgroup %d", ret);
return ret;
}
@@ -231,7 +231,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -245,20 +245,20 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return ret;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
old_roots = NULL;
@@ -268,7 +268,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -281,19 +281,19 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return -EINVAL;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
@@ -314,9 +314,9 @@ static int test_multiple_refs(struct btrfs_root *root,
struct ulist *new_roots = NULL;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, fs_info);
- test_msg("Qgroup multiple refs test\n");
+ test_msg("qgroup multiple refs test");
/*
* We have BTRFS_FS_TREE_OBJECTID created already from the
@@ -324,7 +324,7 @@ static int test_multiple_refs(struct btrfs_root *root,
*/
ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID);
if (ret) {
- test_msg("Couldn't create a qgroup %d\n", ret);
+ test_err("couldn't create a qgroup %d", ret);
return ret;
}
@@ -332,7 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -346,20 +346,20 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return ret;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
@@ -367,7 +367,7 @@ static int test_multiple_refs(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -381,26 +381,26 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return ret;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, 0)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
nodesize, 0)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
@@ -408,7 +408,7 @@ static int test_multiple_refs(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -422,26 +422,26 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return ret;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
0, 0)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
@@ -457,13 +457,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
return -ENOMEM;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ test_err("couldn't allocate root");
ret = PTR_ERR(root);
goto out;
}
@@ -485,7 +485,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
*/
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
if (!root->node) {
- test_msg("Couldn't allocate dummy buffer\n");
+ test_err("couldn't allocate dummy buffer");
ret = -ENOMEM;
goto out;
}
@@ -495,7 +495,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
- test_msg("Couldn't allocate a fs root\n");
+ test_err("couldn't allocate a fs root");
ret = PTR_ERR(tmp_root);
goto out;
}
@@ -504,13 +504,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
root->fs_info->fs_root = tmp_root;
ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
if (ret) {
- test_msg("Couldn't insert fs root %d\n", ret);
+ test_err("couldn't insert fs root %d", ret);
goto out;
}
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
- test_msg("Couldn't allocate a fs root\n");
+ test_err("couldn't allocate a fs root");
ret = PTR_ERR(tmp_root);
goto out;
}
@@ -518,11 +518,11 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID;
ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
if (ret) {
- test_msg("Couldn't insert fs root %d\n", ret);
+ test_err("couldn't insert fs root %d", ret);
goto out;
}
- test_msg("Running qgroup tests\n");
+ test_msg("running qgroup tests");
ret = test_no_shared_qgroup(root, sectorsize, nodesize);
if (ret)
goto out;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c944b4769e3c..4485eae41e88 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -877,12 +877,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
atomic_dec(&cur_trans->num_writers);
extwriter_counter_dec(cur_trans, trans->type);
- /*
- * Make sure counter is updated before we wake up waiters.
- */
- smp_mb();
- if (waitqueue_active(&cur_trans->writer_wait))
- wake_up(&cur_trans->writer_wait);
+ cond_wake_up(&cur_trans->writer_wait);
btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
@@ -1250,7 +1245,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
- btrfs_orphan_commit_root(trans, root);
btrfs_save_ino_cache(root, trans);
@@ -1640,15 +1634,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
goto fail;
}
- ret = btrfs_uuid_tree_add(trans, fs_info, new_uuid.b,
- BTRFS_UUID_KEY_SUBVOL, objectid);
+ ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL,
+ objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info,
- new_root_item->received_uuid,
+ ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
if (ret && ret != -EEXIST) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8c0826bc2c7..94439482a0ec 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -139,7 +139,6 @@ struct btrfs_pending_snapshot {
struct btrfs_path *path;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
- u64 qgroup_reserved;
/* extra metadata reservation for relocation */
int error;
bool readonly;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8f23a94dab77..f8220ec02036 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -222,11 +222,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- /*
- * Implicit memory barrier after atomic_dec_and_test
- */
- if (waitqueue_active(&root->log_writer_wait))
- wake_up(&root->log_writer_wait);
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&root->log_writer_wait);
}
}
@@ -2988,11 +2985,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /*
- * Implicit memory barrier after atomic_dec_and_test
- */
- if (waitqueue_active(&log_root_tree->log_writer_wait))
- wake_up(&log_root_tree->log_writer_wait);
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&log_root_tree->log_writer_wait);
}
if (ret) {
@@ -3116,10 +3110,11 @@ out_wake_log_root:
mutex_unlock(&log_root_tree->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
*/
- if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
- wake_up(&log_root_tree->log_commit_wait[index2]);
+ cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
mutex_lock(&root->log_mutex);
btrfs_remove_all_log_ctxs(root, index1, ret);
@@ -3128,10 +3123,11 @@ out:
mutex_unlock(&root->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
*/
- if (waitqueue_active(&root->log_commit_wait[index1]))
- wake_up(&root->log_commit_wait[index1]);
+ cond_wake_up(&root->log_commit_wait[index1]);
return ret;
}
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 1ba7ca2a4200..3b2ae342e649 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -79,10 +79,10 @@ out:
return ret;
}
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid_cpu)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
struct btrfs_path *path = NULL;
@@ -144,10 +144,10 @@ out:
return ret;
}
-int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
struct btrfs_path *path = NULL;
@@ -239,7 +239,7 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
goto out;
}
- ret = btrfs_uuid_tree_rem(trans, uuid_root->fs_info, uuid, type, subid);
+ ret = btrfs_uuid_tree_remove(trans, uuid, type, subid);
btrfs_end_transaction(trans);
out:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index be3fc701f389..e034ad9e23b4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
+ .raid_name = "raid10",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
+ .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
},
[BTRFS_RAID_RAID1] = {
.sub_stripes = 1,
@@ -49,6 +52,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
+ .raid_name = "raid1",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
},
[BTRFS_RAID_DUP] = {
.sub_stripes = 1,
@@ -58,6 +64,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 2,
+ .raid_name = "dup",
+ .bg_flag = BTRFS_BLOCK_GROUP_DUP,
+ .mindev_error = 0,
},
[BTRFS_RAID_RAID0] = {
.sub_stripes = 1,
@@ -67,6 +76,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
+ .raid_name = "raid0",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
+ .mindev_error = 0,
},
[BTRFS_RAID_SINGLE] = {
.sub_stripes = 1,
@@ -76,6 +88,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
+ .raid_name = "single",
+ .bg_flag = 0,
+ .mindev_error = 0,
},
[BTRFS_RAID_RAID5] = {
.sub_stripes = 1,
@@ -85,6 +100,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 1,
.ncopies = 2,
+ .raid_name = "raid5",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
+ .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
},
[BTRFS_RAID_RAID6] = {
.sub_stripes = 1,
@@ -94,33 +112,19 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 2,
.devs_increment = 1,
.ncopies = 3,
+ .raid_name = "raid6",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
+ .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
},
};
-const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
- [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
- [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
- [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0,
- [BTRFS_RAID_SINGLE] = 0,
- [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5,
- [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
-};
+const char *get_raid_name(enum btrfs_raid_types type)
+{
+ if (type >= BTRFS_NR_RAID_TYPES)
+ return NULL;
-/*
- * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
- * condition is not met. Zero means there's no corresponding
- * BTRFS_ERROR_DEV_*_NOT_MET value.
- */
-const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
- [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
- [BTRFS_RAID_DUP] = 0,
- [BTRFS_RAID_RAID0] = 0,
- [BTRFS_RAID_SINGLE] = 0,
- [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
- [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
-};
+ return btrfs_raid_array[type].raid_name;
+}
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
@@ -167,12 +171,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
- * volume_mutex
- * ------------
- * coarse lock owned by a mounted filesystem; used to exclude some operations
- * that cannot run in parallel and affect the higher-level properties of the
- * filesystem like: device add/deleting/resize/replace, or balance
- *
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
@@ -197,6 +195,41 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* device_list_mutex
* chunk_mutex
* balance_mutex
+ *
+ *
+ * Exclusive operations, BTRFS_FS_EXCL_OP
+ * ======================================
+ *
+ * Maintains the exclusivity of the following operations that apply to the
+ * whole filesystem and cannot run in parallel.
+ *
+ * - Balance (*)
+ * - Device add
+ * - Device remove
+ * - Device replace (*)
+ * - Resize
+ *
+ * The device operations (as above) can be in one of the following states:
+ *
+ * - Running state
+ * - Paused state
+ * - Completed state
+ *
+ * Only device operations marked with (*) can go into the Paused state for the
+ * following reasons:
+ *
+ * - ioctl (only Balance can be Paused through ioctl)
+ * - filesystem remounted as read-only
+ * - filesystem unmounted and mounted as read-only
+ * - system power-cycle and filesystem mounted as read-only
+ * - filesystem or device errors leading to forced read-only
+ *
+ * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
+ * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * A device operation in Paused or Running state can be canceled or resumed
+ * either by ioctl (Balance only) or when remounted as read-write.
+ * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * completed.
*/
DEFINE_MUTEX(uuid_mutex);
@@ -227,14 +260,14 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->resized_devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
- INIT_LIST_HEAD(&fs_devs->list);
+ INIT_LIST_HEAD(&fs_devs->fs_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
return fs_devs;
}
-static void free_device(struct btrfs_device *device)
+void btrfs_free_device(struct btrfs_device *device)
{
rcu_string_free(device->name);
bio_put(device->flush_bio);
@@ -249,7 +282,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
- free_device(device);
+ btrfs_free_device(device);
}
kfree(fs_devices);
}
@@ -273,8 +306,8 @@ void __exit btrfs_cleanup_fs_uuids(void)
while (!list_empty(&fs_uuids)) {
fs_devices = list_entry(fs_uuids.next,
- struct btrfs_fs_devices, list);
- list_del(&fs_devices->list);
+ struct btrfs_fs_devices, fs_list);
+ list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
@@ -282,7 +315,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
/*
* Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
* Returned struct is not linked onto any lists and must be destroyed using
- * free_device.
+ * btrfs_free_device.
*/
static struct btrfs_device *__alloc_device(void)
{
@@ -327,10 +360,9 @@ static struct btrfs_device *__alloc_device(void)
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
u64 devid, const u8 *uuid)
{
- struct list_head *head = &fs_devices->devices;
struct btrfs_device *dev;
- list_for_each_entry(dev, head, dev_list) {
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
if (dev->devid == devid &&
(!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
return dev;
@@ -343,7 +375,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
- list_for_each_entry(fs_devices, &fs_uuids, list) {
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
return fs_devices;
}
@@ -607,7 +639,7 @@ static void btrfs_free_stale_devices(const char *path,
struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
struct btrfs_device *dev, *tmp_dev;
- list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
+ list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {
if (fs_devs->opened)
continue;
@@ -632,13 +664,13 @@ static void btrfs_free_stale_devices(const char *path,
/* delete the stale device */
if (fs_devs->num_devices == 1) {
btrfs_sysfs_remove_fsid(fs_devs);
- list_del(&fs_devs->list);
+ list_del(&fs_devs->fs_list);
free_fs_devices(fs_devs);
break;
} else {
fs_devs->num_devices--;
list_del(&dev->dev_list);
- free_device(dev);
+ btrfs_free_device(dev);
}
}
}
@@ -732,7 +764,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
- list_add(&fs_devices->list, &fs_uuids);
+ list_add(&fs_devices->fs_list, &fs_uuids);
device = NULL;
} else {
@@ -753,7 +785,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
- free_device(device);
+ btrfs_free_device(device);
return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
@@ -866,7 +898,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
name = rcu_string_strdup(orig_dev->name->str,
GFP_KERNEL);
if (!name) {
- free_device(device);
+ btrfs_free_device(device);
goto error;
}
rcu_assign_pointer(device->name, name);
@@ -938,7 +970,7 @@ again:
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
- free_device(device);
+ btrfs_free_device(device);
}
if (fs_devices->seed) {
@@ -956,7 +988,7 @@ static void free_device_rcu(struct rcu_head *head)
struct btrfs_device *device;
device = container_of(head, struct btrfs_device, rcu);
- free_device(device);
+ btrfs_free_device(device);
}
static void btrfs_close_bdev(struct btrfs_device *device)
@@ -1005,7 +1037,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)
new_device->fs_devices = device->fs_devices;
}
-static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
struct list_head pending_put;
@@ -1050,7 +1082,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
int ret;
mutex_lock(&uuid_mutex);
- ret = __btrfs_close_devices(fs_devices);
+ ret = close_fs_devices(fs_devices);
if (!fs_devices->opened) {
seed_devices = fs_devices->seed;
fs_devices->seed = NULL;
@@ -1060,23 +1092,22 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
while (seed_devices) {
fs_devices = seed_devices;
seed_devices = fs_devices->seed;
- __btrfs_close_devices(fs_devices);
+ close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
return ret;
}
-static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
- struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
int ret = 0;
flags |= FMODE_EXCL;
- list_for_each_entry(device, head, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
/* Just open everything we can; ignore failures here */
if (btrfs_open_one_device(fs_devices, device, flags, holder))
continue;
@@ -1115,15 +1146,16 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
{
int ret;
- mutex_lock(&uuid_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
} else {
list_sort(NULL, &fs_devices->devices, devid_cmp);
- ret = __btrfs_open_devices(fs_devices, flags, holder);
+ ret = open_fs_devices(fs_devices, flags, holder);
}
- mutex_unlock(&uuid_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
return ret;
}
@@ -1201,31 +1233,29 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
*/
bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
- mutex_lock(&uuid_mutex);
bdev = blkdev_get_by_path(path, flags, holder);
- if (IS_ERR(bdev)) {
- ret = PTR_ERR(bdev);
- goto error;
- }
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
ret = -EINVAL;
goto error_bdev_put;
}
+ mutex_lock(&uuid_mutex);
device = device_list_add(path, disk_super);
if (IS_ERR(device))
ret = PTR_ERR(device);
else
*fs_devices_ret = device->fs_devices;
+ mutex_unlock(&uuid_mutex);
btrfs_release_disk_super(page);
error_bdev_put:
blkdev_put(bdev, flags);
-error:
- mutex_unlock(&uuid_mutex);
+
return ret;
}
@@ -1857,11 +1887,11 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
} while (read_seqretry(&fs_info->profiles_lock, seq));
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
- if (!(all_avail & btrfs_raid_group[i]))
+ if (!(all_avail & btrfs_raid_array[i].bg_flag))
continue;
if (num_devices < btrfs_raid_array[i].devs_min) {
- int ret = btrfs_raid_mindev_error[i];
+ int ret = btrfs_raid_array[i].mindev_error;
if (ret)
return ret;
@@ -1917,13 +1947,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 num_devices;
int ret = 0;
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&uuid_mutex);
- num_devices = fs_info->fs_devices->num_devices;
+ num_devices = fs_devices->num_devices;
btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
WARN_ON(num_devices < 1);
@@ -1986,27 +2016,32 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
* (super_copy) should hold the device list mutex.
*/
+ /*
+ * In normal cases the cur_devices == fs_devices. But in case
+ * of deleting a seed device, the cur_devices should point to
+ * its own fs_devices listed under the fs_devices->seed.
+ */
cur_devices = device->fs_devices;
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
list_del_rcu(&device->dev_list);
- device->fs_devices->num_devices--;
- device->fs_devices->total_devices--;
+ cur_devices->num_devices--;
+ cur_devices->total_devices--;
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
- device->fs_devices->missing_devices--;
+ cur_devices->missing_devices--;
btrfs_assign_next_active_device(fs_info, device, NULL);
if (device->bdev) {
- device->fs_devices->open_devices--;
+ cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(fs_devices, device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
/*
* at this point, the device is zero sized and detached from
@@ -2020,8 +2055,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
call_rcu(&device->rcu, free_device_rcu);
if (cur_devices->open_devices == 0) {
- struct btrfs_fs_devices *fs_devices;
- fs_devices = fs_info->fs_devices;
while (fs_devices) {
if (fs_devices->seed == cur_devices) {
fs_devices->seed = cur_devices->seed;
@@ -2030,20 +2063,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
fs_devices = fs_devices->seed;
}
cur_devices->seed = NULL;
- __btrfs_close_devices(cur_devices);
+ close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
out:
mutex_unlock(&uuid_mutex);
- mutex_unlock(&fs_info->volume_mutex);
return ret;
error_undo:
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
- &fs_info->fs_devices->alloc_list);
+ &fs_devices->alloc_list);
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
@@ -2112,7 +2144,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
tmp_fs_devices = tmp_fs_devices->seed;
}
fs_devices->seed = NULL;
- __btrfs_close_devices(fs_devices);
+ close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
}
@@ -2120,23 +2152,23 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
- mutex_lock(&uuid_mutex);
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+
WARN_ON(!tgtdev);
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
+ btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
if (tgtdev->bdev)
- fs_info->fs_devices->open_devices--;
+ fs_devices->open_devices--;
- fs_info->fs_devices->num_devices--;
+ fs_devices->num_devices--;
btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
list_del_rcu(&tgtdev->dev_list);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- mutex_unlock(&uuid_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
/*
* The update_dev_time() with in btrfs_scratch_superblocks()
@@ -2188,10 +2220,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
struct btrfs_device *tmp;
devices = &fs_info->fs_devices->devices;
- /*
- * It is safe to read the devices since the volume_mutex
- * is held by the caller.
- */
list_for_each_entry(tmp, devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&tmp->dev_state) && !tmp->bdev) {
@@ -2259,7 +2287,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
return PTR_ERR(old_devices);
}
- list_add(&old_devices->list, &fs_uuids);
+ list_add(&old_devices->fs_list, &fs_uuids);
memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
seed_devices->opened = 1;
@@ -2570,7 +2598,7 @@ error_trans:
if (trans)
btrfs_end_transaction(trans);
error_free_device:
- free_device(device);
+ btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev && !unlocked) {
@@ -2580,99 +2608,6 @@ error:
return ret;
}
-int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device *srcdev,
- struct btrfs_device **device_out)
-{
- struct btrfs_device *device;
- struct block_device *bdev;
- struct list_head *devices;
- struct rcu_string *name;
- u64 devid = BTRFS_DEV_REPLACE_DEVID;
- int ret = 0;
-
- *device_out = NULL;
- if (fs_info->fs_devices->seeding) {
- btrfs_err(fs_info, "the filesystem is a seed filesystem!");
- return -EINVAL;
- }
-
- bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
- fs_info->bdev_holder);
- if (IS_ERR(bdev)) {
- btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev);
- }
-
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
-
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
- if (device->bdev == bdev) {
- btrfs_err(fs_info,
- "target device is in the filesystem!");
- ret = -EEXIST;
- goto error;
- }
- }
-
-
- if (i_size_read(bdev->bd_inode) <
- btrfs_device_get_total_bytes(srcdev)) {
- btrfs_err(fs_info,
- "target device is smaller than source device!");
- ret = -EINVAL;
- goto error;
- }
-
-
- device = btrfs_alloc_device(NULL, &devid, NULL);
- if (IS_ERR(device)) {
- ret = PTR_ERR(device);
- goto error;
- }
-
- name = rcu_string_strdup(device_path, GFP_KERNEL);
- if (!name) {
- free_device(device);
- ret = -ENOMEM;
- goto error;
- }
- rcu_assign_pointer(device->name, name);
-
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- device->generation = 0;
- device->io_width = fs_info->sectorsize;
- device->io_align = fs_info->sectorsize;
- device->sector_size = fs_info->sectorsize;
- device->total_bytes = btrfs_device_get_total_bytes(srcdev);
- device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
- device->bytes_used = btrfs_device_get_bytes_used(srcdev);
- device->commit_total_bytes = srcdev->commit_total_bytes;
- device->commit_bytes_used = device->bytes_used;
- device->fs_info = fs_info;
- device->bdev = bdev;
- set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->mode = FMODE_EXCL;
- device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
- device->fs_devices = fs_info->fs_devices;
- list_add(&device->dev_list, &fs_info->fs_devices->devices);
- fs_info->fs_devices->num_devices++;
- fs_info->fs_devices->open_devices++;
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
- *device_out = device;
- return ret;
-
-error:
- blkdev_put(bdev, FMODE_EXCL);
- return ret;
-}
-
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
@@ -3273,24 +3208,12 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
}
/*
- * Should be called with both balance and volume mutexes held to
- * serialize other volume operations (add_dev/rm_dev/resize) with
- * restriper. Same goes for unset_balance_control.
+ * Clear the balance status in fs_info and delete the balance item from disk.
*/
-static void set_balance_control(struct btrfs_balance_control *bctl)
-{
- struct btrfs_fs_info *fs_info = bctl->fs_info;
-
- BUG_ON(fs_info->balance_ctl);
-
- spin_lock(&fs_info->balance_lock);
- fs_info->balance_ctl = bctl;
- spin_unlock(&fs_info->balance_lock);
-}
-
-static void unset_balance_control(struct btrfs_fs_info *fs_info)
+static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ int ret;
BUG_ON(!fs_info->balance_ctl);
@@ -3299,6 +3222,9 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->balance_lock);
kfree(bctl);
+ ret = del_balance_item(fs_info);
+ if (ret)
+ btrfs_handle_fs_error(fs_info, ret, NULL);
}
/*
@@ -3835,18 +3761,6 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info)
atomic_read(&fs_info->balance_cancel_req) == 0);
}
-static void __cancel_balance(struct btrfs_fs_info *fs_info)
-{
- int ret;
-
- unset_balance_control(fs_info);
- ret = del_balance_item(fs_info);
- if (ret)
- btrfs_handle_fs_error(fs_info, ret, NULL);
-
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-}
-
/* Non-zero return value signifies invalidity */
static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
u64 allowed)
@@ -3857,12 +3771,12 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
}
/*
- * Should be called with both balance and volume mutexes held
+ * Should be called with balance mutexe held
*/
-int btrfs_balance(struct btrfs_balance_control *bctl,
+int btrfs_balance(struct btrfs_fs_info *fs_info,
+ struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
{
- struct btrfs_fs_info *fs_info = bctl->fs_info;
u64 meta_target, data_target;
u64 allowed;
int mixed = 0;
@@ -3891,7 +3805,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
!(bctl->flags & BTRFS_BALANCE_METADATA) ||
memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
btrfs_err(fs_info,
- "with mixed groups data and metadata balance options must be the same");
+ "balance: mixed groups data and metadata options must be the same");
ret = -EINVAL;
goto out;
}
@@ -3913,23 +3827,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID6);
if (validate_convert_profile(&bctl->data, allowed)) {
+ int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
+
btrfs_err(fs_info,
- "unable to start balance with target data profile %llu",
- bctl->data.target);
+ "balance: invalid convert data profile %s",
+ get_raid_name(index));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->meta, allowed)) {
+ int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
+
btrfs_err(fs_info,
- "unable to start balance with target metadata profile %llu",
- bctl->meta.target);
+ "balance: invalid convert metadata profile %s",
+ get_raid_name(index));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->sys, allowed)) {
+ int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
+
btrfs_err(fs_info,
- "unable to start balance with target system profile %llu",
- bctl->sys.target);
+ "balance: invalid convert system profile %s",
+ get_raid_name(index));
ret = -EINVAL;
goto out;
}
@@ -3950,10 +3870,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
!(bctl->meta.target & allowed))) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
btrfs_info(fs_info,
- "force reducing metadata integrity");
+ "balance: force reducing metadata integrity");
} else {
btrfs_err(fs_info,
- "balance will reduce metadata integrity, use force if you want this");
+ "balance: reduces metadata integrity, use --force if you want this");
ret = -EINVAL;
goto out;
}
@@ -3967,9 +3887,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
bctl->data.target : fs_info->avail_data_alloc_bits;
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
+ int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
+ int data_index = btrfs_bg_flags_to_raid_index(data_target);
+
btrfs_warn(fs_info,
- "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
- meta_target, data_target);
+ "balance: metadata profile %s has lower redundancy than data profile %s",
+ get_raid_name(meta_index), get_raid_name(data_index));
}
ret = insert_balance_item(fs_info, bctl);
@@ -3978,7 +3901,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
BUG_ON(ret == -EEXIST);
- set_balance_control(bctl);
+ BUG_ON(fs_info->balance_ctl);
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
} else {
BUG_ON(ret != -EEXIST);
spin_lock(&fs_info->balance_lock);
@@ -3986,22 +3912,24 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
spin_unlock(&fs_info->balance_lock);
}
- atomic_inc(&fs_info->balance_running);
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
mutex_unlock(&fs_info->balance_mutex);
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
- atomic_dec(&fs_info->balance_running);
+ clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
if (bargs) {
memset(bargs, 0, sizeof(*bargs));
- update_ioctl_balance_args(fs_info, 0, bargs);
+ btrfs_update_ioctl_balance_args(fs_info, bargs);
}
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
- __cancel_balance(fs_info);
+ reset_balance_state(fs_info);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
wake_up(&fs_info->balance_wait_q);
@@ -4009,11 +3937,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
return ret;
out:
if (bctl->flags & BTRFS_BALANCE_RESUME)
- __cancel_balance(fs_info);
- else {
+ reset_balance_state(fs_info);
+ else
kfree(bctl);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
- }
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+
return ret;
}
@@ -4022,16 +3950,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
-
if (fs_info->balance_ctl) {
- btrfs_info(fs_info, "continuing balance");
- ret = btrfs_balance(fs_info->balance_ctl, NULL);
+ btrfs_info(fs_info, "balance: resuming");
+ ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
}
-
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
return ret;
}
@@ -4040,15 +3964,15 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
struct task_struct *tsk;
- spin_lock(&fs_info->balance_lock);
+ mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
- spin_unlock(&fs_info->balance_lock);
+ mutex_unlock(&fs_info->balance_mutex);
return 0;
}
- spin_unlock(&fs_info->balance_lock);
+ mutex_unlock(&fs_info->balance_mutex);
if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
- btrfs_info(fs_info, "force skipping balance");
+ btrfs_info(fs_info, "balance: resume skipped");
return 0;
}
@@ -4100,7 +4024,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
- bctl->fs_info = fs_info;
bctl->flags = btrfs_balance_flags(leaf, item);
bctl->flags |= BTRFS_BALANCE_RESUME;
@@ -4111,15 +4034,26 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
- WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+ /*
+ * This should never happen, as the paused balance state is recovered
+ * during mount without any chance of other exclusive ops to collide.
+ *
+ * This gives the exclusive op status to balance and keeps in paused
+ * state until user intervention (cancel or umount). If the ownership
+ * cannot be assigned, show a message but do not fail. The balance
+ * is in a paused state and must have fs_info::balance_ctl properly
+ * set up.
+ */
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ btrfs_warn(fs_info,
+ "balance: cannot set exclusive op status, resume manually");
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
-
- set_balance_control(bctl);
-
+ BUG_ON(fs_info->balance_ctl);
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
out:
btrfs_free_path(path);
return ret;
@@ -4135,16 +4069,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
return -ENOTCONN;
}
- if (atomic_read(&fs_info->balance_running)) {
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
atomic_inc(&fs_info->balance_pause_req);
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
- atomic_read(&fs_info->balance_running) == 0);
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
mutex_lock(&fs_info->balance_mutex);
/* we are good with balance_ctl ripped off from under us */
- BUG_ON(atomic_read(&fs_info->balance_running));
+ BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_pause_req);
} else {
ret = -ENOTCONN;
@@ -4156,38 +4090,49 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
- if (sb_rdonly(fs_info->sb))
- return -EROFS;
-
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
return -ENOTCONN;
}
+ /*
+ * A paused balance with the item stored on disk can be resumed at
+ * mount time if the mount is read-write. Otherwise it's still paused
+ * and we must not allow cancelling as it deletes the item.
+ */
+ if (sb_rdonly(fs_info->sb)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -EROFS;
+ }
+
atomic_inc(&fs_info->balance_cancel_req);
/*
* if we are running just wait and return, balance item is
* deleted in btrfs_balance in this case
*/
- if (atomic_read(&fs_info->balance_running)) {
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
- atomic_read(&fs_info->balance_running) == 0);
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
mutex_lock(&fs_info->balance_mutex);
} else {
- /* __cancel_balance needs volume_mutex */
mutex_unlock(&fs_info->balance_mutex);
- mutex_lock(&fs_info->volume_mutex);
+ /*
+ * Lock released to allow other waiters to continue, we'll
+ * reexamine the status again.
+ */
mutex_lock(&fs_info->balance_mutex);
- if (fs_info->balance_ctl)
- __cancel_balance(fs_info);
-
- mutex_unlock(&fs_info->volume_mutex);
+ if (fs_info->balance_ctl) {
+ reset_balance_state(fs_info);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_info(fs_info, "balance: canceled");
+ }
}
- BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+ BUG_ON(fs_info->balance_ctl ||
+ test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_cancel_req);
mutex_unlock(&fs_info->balance_mutex);
return 0;
@@ -4264,8 +4209,7 @@ static int btrfs_uuid_scan_kthread(void *data)
}
update_tree:
if (!btrfs_is_empty_uuid(root_item.uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info,
- root_item.uuid,
+ ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
key.objectid);
if (ret < 0) {
@@ -4276,7 +4220,7 @@ update_tree:
}
if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info,
+ ret = btrfs_uuid_tree_add(trans,
root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
key.objectid);
@@ -4482,7 +4426,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
+ path->reada = READA_BACK;
mutex_lock(&fs_info->chunk_mutex);
@@ -6043,9 +5987,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
- u64 chunk_start, u64 physical, u64 devid,
- u64 **logical, int *naddrs, int *stripe_len)
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6077,8 +6020,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
BUG_ON(!buf); /* -ENOMEM */
for (i = 0; i < map->num_stripes; i++) {
- if (devid && map->stripes[i].dev->devid != devid)
- continue;
if (map->stripes[i].physical > physical ||
map->stripes[i].physical + length <= physical)
continue;
@@ -6410,7 +6351,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
* on error. Returned struct is not linked onto any lists and must be
- * destroyed with free_device.
+ * destroyed with btrfs_free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
@@ -6433,7 +6374,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
ret = find_next_devid(fs_info, &tmp);
if (ret) {
- free_device(dev);
+ btrfs_free_device(dev);
return ERR_PTR(ret);
}
}
@@ -6684,8 +6625,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (IS_ERR(fs_devices))
return fs_devices;
- ret = __btrfs_open_devices(fs_devices, FMODE_READ,
- fs_info->bdev_holder);
+ ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
fs_devices = ERR_PTR(ret);
@@ -6693,7 +6633,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
}
if (!fs_devices->seeding) {
- __btrfs_close_devices(fs_devices);
+ close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
fs_devices = ERR_PTR(-EINVAL);
goto out;
@@ -7002,6 +6942,10 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
+ /*
+ * uuid_mutex is needed only if we are mounting a sprout FS
+ * otherwise we don't need it.
+ */
mutex_lock(&uuid_mutex);
mutex_lock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 79096884654f..5139ec8daf4c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -208,6 +208,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ struct list_head fs_list;
u64 num_devices;
u64 open_devices;
@@ -229,7 +230,6 @@ struct btrfs_fs_devices {
struct list_head resized_devices;
/* devices not currently being allocated */
struct list_head alloc_list;
- struct list_head list;
struct btrfs_fs_devices *seed;
int seeding;
@@ -329,11 +329,12 @@ struct btrfs_raid_attr {
int tolerated_failures; /* max tolerated fail devs */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
+ int mindev_error; /* error code if min devs requisite is unmet */
+ const char raid_name[8]; /* name of the raid */
+ u64 bg_flag; /* block group flag of the raid */
};
extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES];
-extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
struct map_lookup {
u64 type;
@@ -351,8 +352,6 @@ struct map_lookup {
struct btrfs_balance_args;
struct btrfs_balance_progress;
struct btrfs_balance_control {
- struct btrfs_fs_info *fs_info;
-
struct btrfs_balance_args data;
struct btrfs_balance_args meta;
struct btrfs_balance_args sys;
@@ -393,9 +392,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret);
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
- u64 chunk_start, u64 physical, u64 devid,
- u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
@@ -421,6 +419,7 @@ int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
+void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
const char *device_path, u64 devid);
void __exit btrfs_cleanup_fs_uuids(void);
@@ -431,11 +430,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
-int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device *srcdev,
- struct btrfs_device **device_out);
-int btrfs_balance(struct btrfs_balance_control *bctl,
+int btrfs_balance(struct btrfs_fs_info *fs_info,
+ struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
@@ -553,6 +549,8 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
+const char *get_raid_name(enum btrfs_raid_types type);
+
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans);
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 7e4a1e2f0696..85817991ee68 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -1,11 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
#
-# Makefile for Linux CIFS VFS client
+# Makefile for Linux CIFS/SMB2/SMB3 VFS client
#
+ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_CIFS) += cifs.o
-cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
- link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
+cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
+ inode.o link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
cifs_unicode.o nterr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 4bc4a7ac61d9..116146022aa1 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -42,7 +42,7 @@ cifs_dump_mem(char *label, void *data, int length)
data, length, true);
}
-void cifs_dump_detail(void *buf)
+void cifs_dump_detail(void *buf, struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
struct smb_hdr *smb = (struct smb_hdr *)buf;
@@ -50,7 +50,8 @@ void cifs_dump_detail(void *buf)
cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
smb->Command, smb->Status.CifsError,
smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
- cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb));
+ cifs_dbg(VFS, "smb buf %p len %u\n", smb,
+ server->ops->calc_smb_size(smb, server));
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -83,7 +84,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n",
mid_entry->multiRsp, mid_entry->multiEnd);
if (mid_entry->resp_buf) {
- cifs_dump_detail(mid_entry->resp_buf);
+ cifs_dump_detail(mid_entry->resp_buf, server);
cifs_dump_mem("existing buf: ",
mid_entry->resp_buf, 62);
}
@@ -113,6 +114,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
seq_printf(m, " type: %d ", dev_type);
if (tcon->seal)
seq_printf(m, " Encrypted");
+ if (tcon->nocase)
+ seq_printf(m, " nocase");
if (tcon->unix_ext)
seq_printf(m, " POSIX Extensions");
if (tcon->ses->server->ops->dump_share_caps)
@@ -237,6 +240,10 @@ skip_rdma:
server->credits, server->dialect);
if (server->sign)
seq_printf(m, " signed");
+#ifdef CONFIG_CIFS_SMB311
+ if (server->posix_ext_supported)
+ seq_printf(m, " posix");
+#endif /* 3.1.1 */
i++;
list_for_each(tmp2, &server->smb_ses_list) {
ses = list_entry(tmp2, struct cifs_ses,
@@ -489,32 +496,32 @@ cifs_proc_init(void)
cifs_debug_data_proc_show);
#ifdef CONFIG_CIFS_STATS
- proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops);
+ proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops);
#endif /* STATS */
- proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
- proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
- proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
+ proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops);
+ proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops);
+ proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs,
&cifs_linux_ext_proc_fops);
- proc_create("SecurityFlags", 0, proc_fs_cifs,
+ proc_create("SecurityFlags", 0644, proc_fs_cifs,
&cifs_security_flags_proc_fops);
- proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
+ proc_create("LookupCacheEnabled", 0644, proc_fs_cifs,
&cifs_lookup_cache_proc_fops);
#ifdef CONFIG_CIFS_SMB_DIRECT
- proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs,
+ proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs,
&cifs_rdma_readwrite_threshold_proc_fops);
- proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs,
+ proc_create("smbd_max_frmr_depth", 0644, proc_fs_cifs,
&cifs_smbd_max_frmr_depth_proc_fops);
- proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs,
+ proc_create("smbd_keep_alive_interval", 0644, proc_fs_cifs,
&cifs_smbd_keep_alive_interval_proc_fops);
- proc_create("smbd_max_receive_size", 0, proc_fs_cifs,
+ proc_create("smbd_max_receive_size", 0644, proc_fs_cifs,
&cifs_smbd_max_receive_size_proc_fops);
- proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs,
+ proc_create("smbd_max_fragmented_recv_size", 0644, proc_fs_cifs,
&cifs_smbd_max_fragmented_recv_size_proc_fops);
- proc_create("smbd_max_send_size", 0, proc_fs_cifs,
+ proc_create("smbd_max_send_size", 0644, proc_fs_cifs,
&cifs_smbd_max_send_size_proc_fops);
- proc_create("smbd_send_credit_target", 0, proc_fs_cifs,
+ proc_create("smbd_send_credit_target", 0644, proc_fs_cifs,
&cifs_smbd_send_credit_target_proc_fops);
- proc_create("smbd_receive_credit_max", 0, proc_fs_cifs,
+ proc_create("smbd_receive_credit_max", 0644, proc_fs_cifs,
&cifs_smbd_receive_credit_max_proc_fops);
#endif
}
@@ -572,6 +579,8 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
cifsFYI = bv;
else if ((c[0] > '1') && (c[0] <= '9'))
cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
+ else
+ return -EINVAL;
return count;
}
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 0e74690d11bc..f4f3f0853c6e 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -23,7 +23,7 @@
#define _H_CIFS_DEBUG
void cifs_dump_mem(char *label, void *data, int length);
-void cifs_dump_detail(void *);
+void cifs_dump_detail(void *buf, struct TCP_Server_Info *ptcp_info);
void cifs_dump_mids(struct TCP_Server_Info *);
extern bool traceSMB; /* flag which enables the function below */
void dump_smb(void *, int);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 350fa55a1bf7..9731d0d891e7 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -50,6 +50,7 @@
* root mountable
*/
#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */
+#define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */
struct cifs_sb_info {
struct rb_root tlink_tree;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5a5a0158cc8f..eb7b6573f322 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -58,13 +58,15 @@ bool traceSMB;
bool enable_oplocks = true;
bool linuxExtEnabled = true;
bool lookupCacheEnabled = true;
+bool disable_legacy_dialects; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
module_param(CIFSMaxBufSize, uint, 0444);
-MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
+MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header) "
+ "for CIFS requests. "
"Default: 16384 Range: 8192 to 130048");
unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
module_param(cifs_min_rcv, uint, 0444);
@@ -76,11 +78,21 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
"Range: 2 to 256");
unsigned int cifs_max_pending = CIFS_MAX_REQ;
module_param(cifs_max_pending, uint, 0444);
-MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
+MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
+ "CIFS/SMB1 dialect (N/A for SMB3) "
"Default: 32767 Range: 2 to 32767.");
module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
+module_param(disable_legacy_dialects, bool, 0644);
+MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be "
+ "helpful to restrict the ability to "
+ "override the default dialects (SMB2.1, "
+ "SMB3 and SMB3.02) on mount with old "
+ "dialects (CIFS/SMB1 and SMB2) since "
+ "vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker"
+ " and less secure. Default: n/N/0");
+
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
@@ -469,10 +481,20 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",persistenthandles");
else if (tcon->use_resilient)
seq_puts(s, ",resilienthandles");
+
+#ifdef CONFIG_CIFS_SMB311
+ if (tcon->posix_extensions)
+ seq_puts(s, ",posix");
+ else if (tcon->unix_ext)
+ seq_puts(s, ",unix");
+ else
+ seq_puts(s, ",nounix");
+#else
if (tcon->unix_ext)
seq_puts(s, ",unix");
else
seq_puts(s, ",nounix");
+#endif /* SMB311 */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
seq_puts(s, ",posixpaths");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
@@ -495,6 +517,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",sfu");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
seq_puts(s, ",nobrl");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE)
+ seq_puts(s, ",nohandlecache");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
seq_puts(s, ",cifsacl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
@@ -897,6 +921,17 @@ struct file_system_type cifs_fs_type = {
/* .fs_flags */
};
MODULE_ALIAS_FS("cifs");
+
+static struct file_system_type smb3_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "smb3",
+ .mount = cifs_do_mount,
+ .kill_sb = cifs_kill_sb,
+ /* .fs_flags */
+};
+MODULE_ALIAS_FS("smb3");
+MODULE_ALIAS("smb3");
+
const struct inode_operations cifs_dir_inode_ops = {
.create = cifs_create,
.atomic_open = cifs_atomic_open,
@@ -1435,6 +1470,12 @@ init_cifs(void)
if (rc)
goto out_init_cifs_idmap;
+ rc = register_filesystem(&smb3_fs_type);
+ if (rc) {
+ unregister_filesystem(&cifs_fs_type);
+ goto out_init_cifs_idmap;
+ }
+
return 0;
out_init_cifs_idmap:
@@ -1465,8 +1506,9 @@ out_clean_proc:
static void __exit
exit_cifs(void)
{
- cifs_dbg(NOISY, "exit_cifs\n");
+ cifs_dbg(NOISY, "exit_smb3\n");
unregister_filesystem(&cifs_fs_type);
+ unregister_filesystem(&smb3_fs_type);
cifs_dfs_release_automount_timer();
#ifdef CONFIG_CIFS_ACL
exit_cifs_idmap();
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 013ba2aed8d9..5f0231803431 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.11"
+#define CIFS_VERSION "2.12"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index cb950a5fa078..08d1cdd96701 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -176,6 +176,7 @@ struct smb_rqst {
struct kvec *rq_iov; /* array of kvecs */
unsigned int rq_nvec; /* number of kvecs in array */
struct page **rq_pages; /* pointer to array of page ptrs */
+ unsigned int rq_offset; /* the offset to the 1st page */
unsigned int rq_npages; /* number pages in array */
unsigned int rq_pagesz; /* page size to use */
unsigned int rq_tailsz; /* length of last page */
@@ -244,7 +245,7 @@ struct smb_version_operations {
int (*map_error)(char *, bool);
/* find mid corresponding to the response message */
struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *);
- void (*dump_detail)(void *);
+ void (*dump_detail)(void *buf, struct TCP_Server_Info *ptcp_info);
void (*clear_stats)(struct cifs_tcon *);
void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
@@ -372,7 +373,7 @@ struct smb_version_operations {
int (*close_dir)(const unsigned int, struct cifs_tcon *,
struct cifs_fid *);
/* calculate a size of SMB message */
- unsigned int (*calc_smb_size)(void *);
+ unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
/* check for STATUS_PENDING and process it in a positive case */
bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
/* check for STATUS_NETWORK_SESSION_EXPIRED */
@@ -417,7 +418,7 @@ struct smb_version_operations {
/* create lease context buffer for CREATE request */
char * (*create_lease_buf)(u8 *, u8);
/* parse lease context buffer and return oplock/epoch info */
- __u8 (*parse_lease_buf)(void *, unsigned int *);
+ __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey);
ssize_t (*copychunk_range)(const unsigned int,
struct cifsFileInfo *src_file,
struct cifsFileInfo *target_file,
@@ -457,7 +458,7 @@ struct smb_version_operations {
struct mid_q_entry **);
enum securityEnum (*select_sectype)(struct TCP_Server_Info *,
enum securityEnum);
-
+ int (*next_header)(char *);
};
struct smb_version_values {
@@ -521,10 +522,12 @@ struct smb_vol {
bool sfu_remap:1; /* remap seven reserved chars ala SFU */
bool posix_paths:1; /* unset to not ask for posix pathnames. */
bool no_linux_ext:1;
+ bool linux_ext:1;
bool sfu_emul:1;
bool nullauth:1; /* attempt to authenticate with null user */
bool nocase:1; /* request case insensitive filenames */
bool nobrl:1; /* disable sending byte range locks to srv */
+ bool nohandlecache:1; /* disable caching dir handles if srvr probs */
bool mand_lock:1; /* send mandatory not posix byte range lock reqs */
bool seal:1; /* request transport encryption on share */
bool nodfs:1; /* Do not request DFS, even if available */
@@ -630,7 +633,7 @@ struct TCP_Server_Info {
bool oplocks:1; /* enable oplocks */
unsigned int maxReq; /* Clients should submit no more */
/* than maxReq distinct unanswered SMBs to the server when using */
- /* multiplexed reads or writes */
+ /* multiplexed reads or writes (for SMB1/CIFS only, not SMB2/SMB3) */
unsigned int maxBuf; /* maxBuf specifies the maximum */
/* message size the server can send or receive for non-raw SMBs */
/* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
@@ -681,6 +684,7 @@ struct TCP_Server_Info {
__le16 cipher_type;
/* save initital negprot hash */
__u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
+ bool posix_ext_supported;
#endif /* 3.1.1 */
struct delayed_work reconnect; /* reconnect workqueue job */
struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
@@ -953,9 +957,13 @@ struct cifs_tcon {
bool print:1; /* set if connection to printer share */
bool retry:1;
bool nocase:1;
+ bool nohandlecache:1; /* if strange server resource prob can turn off */
bool seal:1; /* transport encryption for this mounted share */
bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol
for this mount even if server would support */
+#ifdef CONFIG_CIFS_SMB311
+ bool posix_extensions; /* if true SMB3.11 posix extensions enabled */
+#endif /* CIFS_311 */
bool local_lease:1; /* check leases (only) on local system not remote */
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool broken_sparse_sup; /* if server or share does not support sparse */
@@ -979,6 +987,9 @@ struct cifs_tcon {
struct fscache_cookie *fscache; /* cookie for share */
#endif
struct list_head pending_opens; /* list of incomplete opens */
+ bool valid_root_fid:1; /* Do we have a useable root fid */
+ struct mutex prfid_mutex; /* prevents reopen race after dead ses*/
+ struct cifs_fid *prfid; /* handle to the directory at top of share */
/* BB add field for back pointer to sb struct(s)? */
};
@@ -1071,6 +1082,7 @@ struct cifs_open_parms {
int create_options;
const char *path;
struct cifs_fid *fid;
+ umode_t mode;
bool reconnect:1;
};
@@ -1169,10 +1181,11 @@ struct cifs_readdata {
struct smbd_mr *mr;
#endif
unsigned int pagesz;
+ unsigned int page_offset;
unsigned int tailsz;
unsigned int credits;
unsigned int nr_pages;
- struct page *pages[];
+ struct page **pages;
};
struct cifs_writedata;
@@ -1194,10 +1207,11 @@ struct cifs_writedata {
struct smbd_mr *mr;
#endif
unsigned int pagesz;
+ unsigned int page_offset;
unsigned int tailsz;
unsigned int credits;
unsigned int nr_pages;
- struct page *pages[];
+ struct page **pages;
};
/*
@@ -1692,16 +1706,17 @@ GLOBAL_EXTERN atomic_t smBufAllocCount;
GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
-GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
-GLOBAL_EXTERN bool lookupCacheEnabled;
-GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
+extern bool enable_oplocks; /* enable or disable oplocks */
+extern bool lookupCacheEnabled;
+extern unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
-GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
-GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
-GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */
-GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
-GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
-GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
+extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
+extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
+extern unsigned int CIFSMaxBufSize; /* max size not including hdr */
+extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
+extern unsigned int cifs_min_small; /* min size of small buf pool */
+extern unsigned int cifs_max_pending; /* MAX requests at once to server*/
+extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */
#ifdef CONFIG_CIFS_ACL
GLOBAL_EXTERN struct rb_root uidtree;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 365a414a75e9..7933c5f9c076 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -21,6 +21,7 @@
#ifndef _CIFSPROTO_H
#define _CIFSPROTO_H
#include <linux/nls.h>
+#include "trace.h"
struct statfs;
struct smb_vol;
@@ -47,6 +48,7 @@ extern void _free_xid(unsigned int);
cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \
__func__, __xid, \
from_kuid(&init_user_ns, current_fsuid())); \
+ trace_smb3_enter(__xid, __func__); \
__xid; \
})
@@ -54,7 +56,11 @@ extern void _free_xid(unsigned int);
do { \
_free_xid(curr_xid); \
cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \
- __func__, curr_xid, (int)rc); \
+ __func__, curr_xid, (int)rc); \
+ if (rc) \
+ trace_smb3_exit_err(curr_xid, __func__, (int)rc); \
+ else \
+ trace_smb3_exit_done(curr_xid, __func__); \
} while (0)
extern int init_cifs_idmap(void);
extern void exit_cifs_idmap(void);
@@ -124,7 +130,7 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
-extern unsigned int smbCalcSize(void *buf);
+extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
@@ -197,7 +203,9 @@ extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read);
extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
- struct page *page, unsigned int to_read);
+ struct page *page,
+ unsigned int page_offset,
+ unsigned int to_read);
extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
struct cifs_sb_info *cifs_sb);
extern int cifs_match_super(struct super_block *, void *);
@@ -525,6 +533,8 @@ int cifs_async_writev(struct cifs_writedata *wdata,
void cifs_writev_complete(struct work_struct *work);
struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
work_func_t complete);
+struct cifs_writedata *cifs_writedata_direct_alloc(struct page **pages,
+ work_func_t complete);
void cifs_writedata_release(struct kref *refcount);
int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1529a088383d..5aca336642c0 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -106,6 +106,12 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
open_file->oplock_break_cancelled = true;
}
spin_unlock(&tcon->open_file_lock);
+
+ mutex_lock(&tcon->prfid_mutex);
+ tcon->valid_root_fid = false;
+ memset(tcon->prfid, 0, sizeof(struct cifs_fid));
+ mutex_unlock(&tcon->prfid_mutex);
+
/*
* BB Add call to invalidate_inodes(sb) for all superblocks mounted
* to this tcon.
@@ -1946,6 +1952,7 @@ cifs_writedata_release(struct kref *refcount)
if (wdata->cfile)
cifsFileInfo_put(wdata->cfile);
+ kvfree(wdata->pages);
kfree(wdata);
}
@@ -2069,12 +2076,22 @@ cifs_writev_complete(struct work_struct *work)
struct cifs_writedata *
cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
{
+ struct page **pages =
+ kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+ if (pages)
+ return cifs_writedata_direct_alloc(pages, complete);
+
+ return NULL;
+}
+
+struct cifs_writedata *
+cifs_writedata_direct_alloc(struct page **pages, work_func_t complete)
+{
struct cifs_writedata *wdata;
- /* writedata + number of page pointers */
- wdata = kzalloc(sizeof(*wdata) +
- sizeof(struct page *) * nr_pages, GFP_NOFS);
+ wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
if (wdata != NULL) {
+ wdata->pages = pages;
kref_init(&wdata->refcount);
INIT_LIST_HEAD(&wdata->list);
init_completion(&wdata->done);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7a10a5d0731f..e5a2fe7f0dd4 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -61,6 +61,7 @@
#define RFC1001_PORT 139
extern mempool_t *cifs_req_poolp;
+extern bool disable_legacy_dialects;
/* FIXME: should these be tunable? */
#define TLINK_ERROR_EXPIRE (1 * HZ)
@@ -76,9 +77,10 @@ enum {
Opt_mapposix, Opt_nomapposix,
Opt_mapchars, Opt_nomapchars, Opt_sfu,
Opt_nosfu, Opt_nodfs, Opt_posixpaths,
- Opt_noposixpaths, Opt_nounix,
+ Opt_noposixpaths, Opt_nounix, Opt_unix,
Opt_nocase,
Opt_brl, Opt_nobrl,
+ Opt_handlecache, Opt_nohandlecache,
Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids,
Opt_nosetuids, Opt_dynperm, Opt_nodynperm,
Opt_nohard, Opt_nosoft,
@@ -144,10 +146,16 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_noposixpaths, "noposixpaths" },
{ Opt_nounix, "nounix" },
{ Opt_nounix, "nolinux" },
+ { Opt_nounix, "noposix" },
+ { Opt_unix, "unix" },
+ { Opt_unix, "linux" },
+ { Opt_unix, "posix" },
{ Opt_nocase, "nocase" },
{ Opt_nocase, "ignorecase" },
{ Opt_brl, "brl" },
{ Opt_nobrl, "nobrl" },
+ { Opt_handlecache, "handlecache" },
+ { Opt_nohandlecache, "nohandlecache" },
{ Opt_nobrl, "nolock" },
{ Opt_forcemandatorylock, "forcemandatorylock" },
{ Opt_forcemandatorylock, "forcemand" },
@@ -591,10 +599,11 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
int
cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
- unsigned int to_read)
+ unsigned int page_offset, unsigned int to_read)
{
struct msghdr smb_msg;
- struct bio_vec bv = {.bv_page = page, .bv_len = to_read};
+ struct bio_vec bv = {
+ .bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -848,6 +857,7 @@ cifs_demultiplex_thread(void *p)
int length;
struct TCP_Server_Info *server = p;
unsigned int pdu_length;
+ unsigned int next_offset;
char *buf = NULL;
struct task_struct *task_to_wake = NULL;
struct mid_q_entry *mid_entry;
@@ -874,24 +884,29 @@ cifs_demultiplex_thread(void *p)
length = cifs_read_from_socket(server, buf, pdu_length);
if (length < 0)
continue;
- server->total_read = length;
+
+ if (server->vals->header_preamble_size == 0)
+ server->total_read = 0;
+ else
+ server->total_read = length;
/*
* The right amount was read from socket - 4 bytes,
* so we can now interpret the length field.
*/
pdu_length = get_rfc1002_length(buf);
- server->pdu_size = pdu_length;
cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
if (!is_smb_response(server, buf[0]))
continue;
+next_pdu:
+ server->pdu_size = pdu_length;
/* make sure we have enough to get to the MID */
- if (pdu_length < HEADER_SIZE(server) - 1 -
+ if (server->pdu_size < HEADER_SIZE(server) - 1 -
server->vals->header_preamble_size) {
cifs_dbg(VFS, "SMB response too short (%u bytes)\n",
- pdu_length);
+ server->pdu_size);
cifs_reconnect(server);
wake_up(&server->response_q);
continue;
@@ -906,6 +921,12 @@ cifs_demultiplex_thread(void *p)
continue;
server->total_read += length;
+ if (server->ops->next_header) {
+ next_offset = server->ops->next_header(buf);
+ if (next_offset)
+ server->pdu_size = next_offset;
+ }
+
if (server->ops->is_transform_hdr &&
server->ops->receive_transform &&
server->ops->is_transform_hdr(buf)) {
@@ -948,10 +969,18 @@ cifs_demultiplex_thread(void *p)
HEADER_SIZE(server));
#ifdef CONFIG_CIFS_DEBUG2
if (server->ops->dump_detail)
- server->ops->dump_detail(buf);
+ server->ops->dump_detail(buf, server);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
-
+ }
+ if (pdu_length > server->pdu_size) {
+ if (!allocate_buffers(server))
+ continue;
+ pdu_length -= server->pdu_size;
+ server->total_read = 0;
+ server->large_buf = false;
+ buf = server->smallbuf;
+ goto next_pdu;
}
} /* end while !EXITING */
@@ -1143,10 +1172,18 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
switch (match_token(value, cifs_smb_version_tokens, args)) {
case Smb_1:
+ if (disable_legacy_dialects) {
+ cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+ return 1;
+ }
vol->ops = &smb1_operations;
vol->vals = &smb1_values;
break;
case Smb_20:
+ if (disable_legacy_dialects) {
+ cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+ return 1;
+ }
vol->ops = &smb20_operations;
vol->vals = &smb20_values;
break;
@@ -1426,8 +1463,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->posix_paths = 0;
break;
case Opt_nounix:
+ if (vol->linux_ext)
+ cifs_dbg(VFS,
+ "conflicting unix mount options\n");
vol->no_linux_ext = 1;
break;
+ case Opt_unix:
+ if (vol->no_linux_ext)
+ cifs_dbg(VFS,
+ "conflicting unix mount options\n");
+ vol->linux_ext = 1;
+ break;
case Opt_nocase:
vol->nocase = 1;
break;
@@ -1445,6 +1491,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
(S_IALLUGO & ~(S_ISUID | S_IXGRP)))
vol->file_mode = S_IALLUGO;
break;
+ case Opt_nohandlecache:
+ vol->nohandlecache = 1;
+ break;
+ case Opt_handlecache:
+ vol->nohandlecache = 0;
+ break;
case Opt_forcemandatorylock:
vol->mand_lock = 1;
break;
@@ -2967,6 +3019,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
}
}
+#ifdef CONFIG_CIFS_SMB311
+ if ((volume_info->linux_ext) && (ses->server->posix_ext_supported)) {
+ if (ses->server->vals->protocol_id == SMB311_PROT_ID)
+ tcon->posix_extensions = true;
+ }
+#endif /* 311 */
+
/*
* BB Do we need to wrap session_mutex around this TCon call and Unix
* SetFS as we do on SessSetup and reconnect?
@@ -3022,6 +3081,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
*/
tcon->retry = volume_info->retry;
tcon->nocase = volume_info->nocase;
+ tcon->nohandlecache = volume_info->nohandlecache;
tcon->local_lease = volume_info->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
@@ -3580,6 +3640,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
if (pvolume_info->nobrl)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+ if (pvolume_info->nohandlecache)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_HANDLE_CACHE;
if (pvolume_info->nostrictsync)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
if (pvolume_info->mand_lock)
@@ -3922,6 +3984,12 @@ try_mount_again:
goto remote_path_check;
}
+#ifdef CONFIG_CIFS_SMB311
+ /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
+ if (tcon->posix_extensions)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#endif /* SMB3.11 */
+
/* tell server which Unix caps we support */
if (cap_unix(tcon->ses)) {
/* reset of caps checks mount to see if unix extensions
@@ -4353,6 +4421,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
vol_info->UNC = master_tcon->treeName;
vol_info->retry = master_tcon->retry;
vol_info->nocase = master_tcon->nocase;
+ vol_info->nohandlecache = master_tcon->nohandlecache;
vol_info->local_lease = master_tcon->local_lease;
vol_info->no_linux_ext = !master_tcon->unix_ext;
vol_info->sectype = master_tcon->ses->sectype;
@@ -4382,8 +4451,14 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
goto out;
}
+#ifdef CONFIG_CIFS_SMB311
+ /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
+ if (tcon->posix_extensions)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#endif /* SMB3.11 */
if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
+
out:
kfree(vol_info->username);
kzfree(vol_info->password);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 925844343038..ddae52bd1993 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -369,7 +369,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
oparms.path = full_path;
oparms.fid = fid;
oparms.reconnect = false;
-
+ oparms.mode = mode;
rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc) {
cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
@@ -780,21 +780,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
free_xid(xid);
- return (struct dentry *)tlink;
+ return ERR_CAST(tlink);
}
pTcon = tlink_tcon(tlink);
rc = check_name(direntry, pTcon);
- if (rc)
- goto lookup_out;
+ if (unlikely(rc)) {
+ cifs_put_tlink(tlink);
+ free_xid(xid);
+ return ERR_PTR(rc);
+ }
/* can not grab the rename sem here since it would
deadlock in the cases (beginning of sys_rename itself)
in which we already have the sb rename sem */
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
- rc = -ENOMEM;
- goto lookup_out;
+ cifs_put_tlink(tlink);
+ free_xid(xid);
+ return ERR_PTR(-ENOMEM);
}
if (d_really_is_positive(direntry)) {
@@ -813,29 +817,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
parent_dir_inode->i_sb, xid, NULL);
}
- if ((rc == 0) && (newInode != NULL)) {
- d_add(direntry, newInode);
+ if (rc == 0) {
/* since paths are not looked up by component - the parent
directories are presumed to be good here */
renew_parental_timestamps(direntry);
-
} else if (rc == -ENOENT) {
- rc = 0;
cifs_set_time(direntry, jiffies);
- d_add(direntry, NULL);
- /* if it was once a directory (but how can we tell?) we could do
- shrink_dcache_parent(direntry); */
- } else if (rc != -EACCES) {
- cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
- /* We special case check for Access Denied - since that
- is a common return code */
+ newInode = NULL;
+ } else {
+ if (rc != -EACCES) {
+ cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
+ /* We special case check for Access Denied - since that
+ is a common return code */
+ }
+ newInode = ERR_PTR(rc);
}
-
-lookup_out:
kfree(full_path);
cifs_put_tlink(tlink);
free_xid(xid);
- return ERR_PTR(rc);
+ return d_splice_alias(newInode, direntry);
}
static int
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 23fd430fe74a..87eece6fbd48 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2880,13 +2880,13 @@ out:
}
static struct cifs_readdata *
-cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
+cifs_readdata_direct_alloc(struct page **pages, work_func_t complete)
{
struct cifs_readdata *rdata;
- rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages),
- GFP_KERNEL);
+ rdata = kzalloc(sizeof(*rdata), GFP_KERNEL);
if (rdata != NULL) {
+ rdata->pages = pages;
kref_init(&rdata->refcount);
INIT_LIST_HEAD(&rdata->list);
init_completion(&rdata->done);
@@ -2896,6 +2896,22 @@ cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
return rdata;
}
+static struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
+{
+ struct page **pages =
+ kzalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+ struct cifs_readdata *ret = NULL;
+
+ if (pages) {
+ ret = cifs_readdata_direct_alloc(pages, complete);
+ if (!ret)
+ kfree(pages);
+ }
+
+ return ret;
+}
+
void
cifs_readdata_release(struct kref *refcount)
{
@@ -2910,6 +2926,7 @@ cifs_readdata_release(struct kref *refcount)
if (rdata->cfile)
cifsFileInfo_put(rdata->cfile);
+ kvfree(rdata->pages);
kfree(rdata);
}
@@ -3009,12 +3026,20 @@ uncached_fill_pages(struct TCP_Server_Info *server,
int result = 0;
unsigned int i;
unsigned int nr_pages = rdata->nr_pages;
+ unsigned int page_offset = rdata->page_offset;
rdata->got_bytes = 0;
rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
size_t n;
+ unsigned int segment_size = rdata->pagesz;
+
+ if (i == 0)
+ segment_size -= page_offset;
+ else
+ page_offset = 0;
+
if (len <= 0) {
/* no need to hold page hostage */
@@ -3023,24 +3048,25 @@ uncached_fill_pages(struct TCP_Server_Info *server,
put_page(page);
continue;
}
+
n = len;
- if (len >= PAGE_SIZE) {
+ if (len >= segment_size)
/* enough data to fill the page */
- n = PAGE_SIZE;
- len -= n;
- } else {
- zero_user(page, len, PAGE_SIZE - len);
+ n = segment_size;
+ else
rdata->tailsz = len;
- len = 0;
- }
+ len -= n;
+
if (iter)
- result = copy_page_from_iter(page, 0, n, iter);
+ result = copy_page_from_iter(
+ page, page_offset, n, iter);
#ifdef CONFIG_CIFS_SMB_DIRECT
else if (rdata->mr)
result = n;
#endif
else
- result = cifs_read_page_from_socket(server, page, n);
+ result = cifs_read_page_from_socket(
+ server, page, page_offset, n);
if (result < 0)
break;
@@ -3113,6 +3139,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->bytes = cur_len;
rdata->pid = pid;
rdata->pagesz = PAGE_SIZE;
+ rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
rdata->credits = credits;
@@ -3557,6 +3584,7 @@ readpages_fill_pages(struct TCP_Server_Info *server,
u64 eof;
pgoff_t eof_index;
unsigned int nr_pages = rdata->nr_pages;
+ unsigned int page_offset = rdata->page_offset;
/* determine the eof that the server (probably) has */
eof = CIFS_I(rdata->mapping->host)->server_eof;
@@ -3567,13 +3595,21 @@ readpages_fill_pages(struct TCP_Server_Info *server,
rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
- size_t n = PAGE_SIZE;
+ unsigned int to_read = rdata->pagesz;
+ size_t n;
+
+ if (i == 0)
+ to_read -= page_offset;
+ else
+ page_offset = 0;
+
+ n = to_read;
- if (len >= PAGE_SIZE) {
- len -= PAGE_SIZE;
+ if (len >= to_read) {
+ len -= to_read;
} else if (len > 0) {
/* enough for partial page, fill and zero the rest */
- zero_user(page, len, PAGE_SIZE - len);
+ zero_user(page, len + page_offset, to_read - len);
n = rdata->tailsz = len;
len = 0;
} else if (page->index > eof_index) {
@@ -3605,13 +3641,15 @@ readpages_fill_pages(struct TCP_Server_Info *server,
}
if (iter)
- result = copy_page_from_iter(page, 0, n, iter);
+ result = copy_page_from_iter(
+ page, page_offset, n, iter);
#ifdef CONFIG_CIFS_SMB_DIRECT
else if (rdata->mr)
result = n;
#endif
else
- result = cifs_read_page_from_socket(server, page, n);
+ result = cifs_read_page_from_socket(
+ server, page, page_offset, n);
if (result < 0)
break;
@@ -3790,6 +3828,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata->bytes = bytes;
rdata->pid = pid;
rdata->pagesz = PAGE_SIZE;
+ rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->copy_into_pages = cifs_readpages_copy_into_pages;
rdata->credits = credits;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3c371f7f5963..745fd7fe8d0e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -746,7 +746,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
cifs_dbg(FYI, "Getting info on %s\n", full_path);
if ((data == NULL) && (*inode != NULL)) {
- if (CIFS_CACHE_READ(CIFS_I(*inode))) {
+ if (CIFS_CACHE_READ(CIFS_I(*inode)) &&
+ CIFS_I(*inode)->time != 0) {
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
goto cgii_exit;
}
@@ -1857,15 +1858,15 @@ cifs_inode_needs_reval(struct inode *inode)
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ if (cifs_i->time == 0)
+ return true;
+
if (CIFS_CACHE_READ(cifs_i))
return false;
if (!lookupCacheEnabled)
return true;
- if (cifs_i->time == 0)
- return true;
-
if (!cifs_sb->actimeo)
return true;
@@ -2104,10 +2105,14 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
static void cifs_setsize(struct inode *inode, loff_t offset)
{
+ struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
spin_lock(&inode->i_lock);
i_size_write(inode, offset);
spin_unlock(&inode->i_lock);
+ /* Cached inode must be refreshed on truncate */
+ cifs_i->time = 0;
truncate_pagecache(inode, offset);
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 460084a8eac5..aba3fc3058da 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -117,6 +117,8 @@ tconInfoAlloc(void)
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
spin_lock_init(&ret_buf->open_file_lock);
+ mutex_init(&ret_buf->prfid_mutex);
+ ret_buf->prfid = kzalloc(sizeof(struct cifs_fid), GFP_KERNEL);
#ifdef CONFIG_CIFS_STATS
spin_lock_init(&ret_buf->stat_lock);
#endif
@@ -134,6 +136,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free)
atomic_dec(&tconInfoAllocCount);
kfree(buf_to_free->nativeFileSystem);
kzfree(buf_to_free->password);
+ kfree(buf_to_free->prfid);
kfree(buf_to_free);
}
@@ -145,7 +148,7 @@ cifs_buf_get(void)
* SMB2 header is bigger than CIFS one - no problems to clean some
* more bytes for CIFS.
*/
- size_t buf_size = sizeof(struct smb2_hdr);
+ size_t buf_size = sizeof(struct smb2_sync_hdr);
/*
* We could use negotiated size instead of max_msgsize -
@@ -339,7 +342,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
/* otherwise, there is enough to get to the BCC */
if (check_smb_hdr(smb))
return -EIO;
- clc_len = smbCalcSize(smb);
+ clc_len = smbCalcSize(smb, server);
if (4 + rfclen != total_read) {
cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index cc88f4f0325e..d7ad0dfe4e68 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -903,7 +903,7 @@ map_smb_to_linux_error(char *buf, bool logErr)
* portion, the number of word parameters and the data portion of the message
*/
unsigned int
-smbCalcSize(void *buf)
+smbCalcSize(void *buf, struct TCP_Server_Info *server)
{
struct smb_hdr *ptr = (struct smb_hdr *)buf;
return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a27fc8791551..eeab81c9452f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -650,7 +650,8 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
char *cur_ent;
char *end_of_smb = cfile->srch_inf.ntwrk_buf_start +
server->ops->calc_smb_size(
- cfile->srch_inf.ntwrk_buf_start);
+ cfile->srch_inf.ntwrk_buf_start,
+ server);
cur_ent = cfile->srch_inf.srch_entries_start;
first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
@@ -831,7 +832,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
max_len = tcon->ses->server->ops->calc_smb_size(
- cifsFile->srch_inf.ntwrk_buf_start);
+ cifsFile->srch_inf.ntwrk_buf_start,
+ tcon->ses->server);
end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 401a5d856636..0ffa18094335 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -61,9 +61,4 @@
/* Maximum buffer size value we can send with 1 credit */
#define SMB2_MAX_BUFFER_SIZE 65536
-static inline struct smb2_sync_hdr *get_sync_hdr(void *buf)
-{
- return &(((struct smb2_hdr *)buf)->sync_hdr);
-}
-
#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1238cd3552f9..a6e786e39248 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -44,26 +44,38 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
__u32 create_options, void *data, int command)
{
int rc, tmprc = 0;
- __le16 *utf16_path;
+ __le16 *utf16_path = NULL;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
+ bool use_cached_root_handle = false;
- utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
+ if ((strcmp(full_path, "") == 0) && (create_options == 0) &&
+ (desired_access == FILE_READ_ATTRIBUTES) &&
+ (create_disposition == FILE_OPEN) &&
+ (tcon->nohandlecache == false)) {
+ rc = open_shroot(xid, tcon, &fid);
+ if (rc == 0)
+ use_cached_root_handle = true;
+ }
+
+ if (use_cached_root_handle == false) {
+ utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
- oparms.tcon = tcon;
- oparms.desired_access = desired_access;
- oparms.disposition = create_disposition;
- oparms.create_options = create_options;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms.tcon = tcon;
+ oparms.desired_access = desired_access;
+ oparms.disposition = create_disposition;
+ oparms.create_options = create_options;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
- if (rc) {
- kfree(utf16_path);
- return rc;
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ if (rc) {
+ kfree(utf16_path);
+ return rc;
+ }
}
switch (command) {
@@ -107,7 +119,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
break;
}
- rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ if (use_cached_root_handle == false)
+ rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
if (tmprc)
rc = tmprc;
kfree(utf16_path);
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 3bfc9c990724..20a2d304c603 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -27,6 +27,7 @@
#include "smb2proto.h"
#include "smb2status.h"
#include "smb2glob.h"
+#include "trace.h"
struct status_to_posix_error {
__le32 smb2_status;
@@ -2450,13 +2451,16 @@ smb2_print_status(__le32 status)
int
map_smb2_to_linux_error(char *buf, bool log_err)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
unsigned int i;
int rc = -EIO;
__le32 smb2err = shdr->Status;
- if (smb2err == 0)
+ if (smb2err == 0) {
+ trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId,
+ le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId));
return 0;
+ }
/* mask facility */
if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) &&
@@ -2478,5 +2482,8 @@ map_smb2_to_linux_error(char *buf, bool log_err)
cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n",
__le32_to_cpu(smb2err), rc);
+ trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId,
+ le16_to_cpu(shdr->Command),
+ le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc);
return rc;
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 68ea8491c160..cb5728e3d87d 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -94,8 +94,8 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
};
#ifdef CONFIG_CIFS_SMB311
-static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen,
- size_t hdr_preamble_size)
+static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
+ __u32 non_ctxlen)
{
__u16 neg_count;
__u32 nc_offset, size_of_pad_before_neg_ctxts;
@@ -109,12 +109,11 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen,
/* Make sure that negotiate contexts start after gss security blob */
nc_offset = le32_to_cpu(pneg_rsp->NegotiateContextOffset);
- if (nc_offset < non_ctxlen - hdr_preamble_size /* RFC1001 len */) {
+ if (nc_offset < non_ctxlen) {
printk_once(KERN_WARNING "invalid negotiate context offset\n");
return 0;
}
- size_of_pad_before_neg_ctxts = nc_offset -
- (non_ctxlen - hdr_preamble_size);
+ size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen;
/* Verify that at least minimal negotiate contexts fit within frame */
if (len < nc_offset + (neg_count * sizeof(struct smb2_neg_context))) {
@@ -131,25 +130,20 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen,
#endif /* CIFS_SMB311 */
int
-smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
+smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
{
- struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
- struct smb2_hdr *hdr = &pdu->hdr;
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr;
__u64 mid;
- __u32 len = get_rfc1002_length(buf);
__u32 clc_len; /* calculated length */
int command;
-
- /* BB disable following printk later */
- cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n",
- __func__, length, len);
+ int pdu_size = sizeof(struct smb2_sync_pdu);
+ int hdr_size = sizeof(struct smb2_sync_hdr);
/*
* Add function to do table lookup of StructureSize by command
* ie Validate the wct via smb2_struct_sizes table above
*/
-
if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
struct smb2_transform_hdr *thdr =
(struct smb2_transform_hdr *)buf;
@@ -173,8 +167,8 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
}
mid = le64_to_cpu(shdr->MessageId);
- if (length < sizeof(struct smb2_pdu)) {
- if ((length >= sizeof(struct smb2_hdr))
+ if (len < pdu_size) {
+ if ((len >= hdr_size)
&& (shdr->Status != 0)) {
pdu->StructureSize2 = 0;
/*
@@ -187,8 +181,7 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
}
return 1;
}
- if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE -
- srvr->vals->header_preamble_size) {
+ if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE) {
cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n",
mid);
return 1;
@@ -227,44 +220,38 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
}
}
- if (srvr->vals->header_preamble_size + len != length) {
- cifs_dbg(VFS, "Total length %u RFC1002 length %zu mismatch mid %llu\n",
- length, srvr->vals->header_preamble_size + len, mid);
- return 1;
- }
-
- clc_len = smb2_calc_size(hdr);
+ clc_len = smb2_calc_size(buf, srvr);
#ifdef CONFIG_CIFS_SMB311
if (shdr->Command == SMB2_NEGOTIATE)
- clc_len += get_neg_ctxt_len(hdr, len, clc_len,
- srvr->vals->header_preamble_size);
+ clc_len += get_neg_ctxt_len(shdr, len, clc_len);
#endif /* SMB311 */
- if (srvr->vals->header_preamble_size + len != clc_len) {
- cifs_dbg(FYI, "Calculated size %u length %zu mismatch mid %llu\n",
- clc_len, srvr->vals->header_preamble_size + len, mid);
+ if (len != clc_len) {
+ cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
+ clc_len, len, mid);
/* create failed on symlink */
if (command == SMB2_CREATE_HE &&
shdr->Status == STATUS_STOPPED_ON_SYMLINK)
return 0;
/* Windows 7 server returns 24 bytes more */
- if (clc_len + 24 - srvr->vals->header_preamble_size == len && command == SMB2_OPLOCK_BREAK_HE)
+ if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE)
return 0;
/* server can return one byte more due to implied bcc[0] */
- if (clc_len == srvr->vals->header_preamble_size + len + 1)
+ if (clc_len == len + 1)
return 0;
/*
* MacOS server pads after SMB2.1 write response with 3 bytes
* of junk. Other servers match RFC1001 len to actual
* SMB2/SMB3 frame length (header + smb2 response specific data)
+ * Some windows servers do too when compounding is used.
* Log the server error (once), but allow it and continue
* since the frame is parseable.
*/
- if (clc_len < srvr->vals->header_preamble_size /* RFC1001 header size */ + len) {
+ if (clc_len < len) {
printk_once(KERN_WARNING
- "SMB2 server sent bad RFC1001 len %d not %zu\n",
- len, clc_len - srvr->vals->header_preamble_size);
+ "SMB2 server sent bad RFC1001 len %d not %d\n",
+ len, clc_len);
return 0;
}
@@ -305,15 +292,14 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
* area and the offset to it (from the beginning of the smb are also returned.
*/
char *
-smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
+smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(hdr);
*off = 0;
*len = 0;
/* error responses do not have data area */
if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
- (((struct smb2_err_rsp *)hdr)->StructureSize) ==
+ (((struct smb2_err_rsp *)shdr)->StructureSize) ==
SMB2_ERROR_STRUCTURE_SIZE2)
return NULL;
@@ -325,42 +311,44 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
switch (shdr->Command) {
case SMB2_NEGOTIATE:
*off = le16_to_cpu(
- ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset);
+ ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferOffset);
*len = le16_to_cpu(
- ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferLength);
+ ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferLength);
break;
case SMB2_SESSION_SETUP:
*off = le16_to_cpu(
- ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferOffset);
+ ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferOffset);
*len = le16_to_cpu(
- ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferLength);
+ ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferLength);
break;
case SMB2_CREATE:
*off = le32_to_cpu(
- ((struct smb2_create_rsp *)hdr)->CreateContextsOffset);
+ ((struct smb2_create_rsp *)shdr)->CreateContextsOffset);
*len = le32_to_cpu(
- ((struct smb2_create_rsp *)hdr)->CreateContextsLength);
+ ((struct smb2_create_rsp *)shdr)->CreateContextsLength);
break;
case SMB2_QUERY_INFO:
*off = le16_to_cpu(
- ((struct smb2_query_info_rsp *)hdr)->OutputBufferOffset);
+ ((struct smb2_query_info_rsp *)shdr)->OutputBufferOffset);
*len = le32_to_cpu(
- ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength);
+ ((struct smb2_query_info_rsp *)shdr)->OutputBufferLength);
break;
case SMB2_READ:
- *off = ((struct smb2_read_rsp *)hdr)->DataOffset;
- *len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength);
+ /* TODO: is this a bug ? */
+ *off = ((struct smb2_read_rsp *)shdr)->DataOffset;
+ *len = le32_to_cpu(((struct smb2_read_rsp *)shdr)->DataLength);
break;
case SMB2_QUERY_DIRECTORY:
*off = le16_to_cpu(
- ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset);
+ ((struct smb2_query_directory_rsp *)shdr)->OutputBufferOffset);
*len = le32_to_cpu(
- ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
+ ((struct smb2_query_directory_rsp *)shdr)->OutputBufferLength);
break;
case SMB2_IOCTL:
*off = le32_to_cpu(
- ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
- *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+ ((struct smb2_ioctl_rsp *)shdr)->OutputOffset);
+ *len = le32_to_cpu(
+ ((struct smb2_ioctl_rsp *)shdr)->OutputCount);
break;
case SMB2_CHANGE_NOTIFY:
default:
@@ -403,15 +391,14 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
* portion, the number of word parameters and the data portion of the message.
*/
unsigned int
-smb2_calc_size(void *buf)
+smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
{
- struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
- struct smb2_hdr *hdr = &pdu->hdr;
- struct smb2_sync_hdr *shdr = get_sync_hdr(hdr);
+ struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf;
+ struct smb2_sync_hdr *shdr = &pdu->sync_hdr;
int offset; /* the offset from the beginning of SMB to data area */
int data_length; /* the length of the variable length data area */
/* Structure Size has already been checked to make sure it is 64 */
- int len = 4 + le16_to_cpu(shdr->StructureSize);
+ int len = le16_to_cpu(shdr->StructureSize);
/*
* StructureSize2, ie length of fixed parameter area has already
@@ -422,7 +409,7 @@ smb2_calc_size(void *buf)
if (has_smb2_data_area[le16_to_cpu(shdr->Command)] == false)
goto calc_size_exit;
- smb2_get_data_area_len(&offset, &data_length, hdr);
+ smb2_get_data_area_len(&offset, &data_length, shdr);
cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset);
if (data_length > 0) {
@@ -430,15 +417,14 @@ smb2_calc_size(void *buf)
* Check to make sure that data area begins after fixed area,
* Note that last byte of the fixed area is part of data area
* for some commands, typically those with odd StructureSize,
- * so we must add one to the calculation (and 4 to account for
- * the size of the RFC1001 hdr.
+ * so we must add one to the calculation.
*/
- if (offset + 4 + 1 < len) {
+ if (offset + 1 < len) {
cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n",
- offset + 4 + 1, len);
+ offset + 1, len);
data_length = 0;
} else {
- len = 4 + offset + data_length;
+ len = offset + data_length;
}
}
calc_size_exit:
@@ -465,8 +451,14 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
/* Windows doesn't allow paths beginning with \ */
if (from[0] == '\\')
start_of_path = from + 1;
+#ifdef CONFIG_CIFS_SMB311
+ /* SMB311 POSIX extensions paths do not include leading slash */
+ else if (cifs_sb_master_tcon(cifs_sb)->posix_extensions)
+ start_of_path = from + 1;
+#endif /* 311 */
else
start_of_path = from;
+
to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len,
cifs_sb->local_nls, map_type);
return to;
@@ -621,7 +613,7 @@ smb2_is_valid_lease_break(char *buffer)
bool
smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_oplock_break_rsp *rsp = (struct smb2_oplock_break_rsp *)buffer;
+ struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
struct list_head *tmp, *tmp1, *tmp2;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -630,7 +622,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
cifs_dbg(FYI, "Checking for oplock break\n");
- if (rsp->hdr.sync_hdr.Command != SMB2_OPLOCK_BREAK)
+ if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
return false;
if (rsp->StructureSize !=
@@ -721,7 +713,7 @@ smb2_cancelled_close_fid(struct work_struct *work)
int
smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *sync_hdr = get_sync_hdr(buffer);
+ struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer;
struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
struct cifs_tcon *tcon;
struct close_cancelled_open *cancelled;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 9c6d95ffca97..950d0ab2cc61 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -123,7 +123,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype)
static unsigned int
smb2_get_credits(struct mid_q_entry *mid)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(mid->resp_buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf;
return le16_to_cpu(shdr->CreditRequest);
}
@@ -190,7 +190,7 @@ static struct mid_q_entry *
smb2_find_mid(struct TCP_Server_Info *server, char *buf)
{
struct mid_q_entry *mid;
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
__u64 wire_mid = le64_to_cpu(shdr->MessageId);
if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
@@ -212,15 +212,16 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
}
static void
-smb2_dump_detail(void *buf)
+smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
shdr->ProcessId);
- cifs_dbg(VFS, "smb buf %p len %u\n", buf, smb2_calc_size(buf));
+ cifs_dbg(VFS, "smb buf %p len %u\n", buf,
+ server->ops->calc_smb_size(buf, server));
#endif
}
@@ -322,6 +323,40 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
}
#endif /* STATS2 */
+/*
+ * Open the directory at the root of a share
+ */
+int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
+{
+ struct cifs_open_parms oparams;
+ int rc;
+ __le16 srch_path = 0; /* Null - since an open of top of share */
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+
+ mutex_lock(&tcon->prfid_mutex);
+ if (tcon->valid_root_fid) {
+ cifs_dbg(FYI, "found a cached root file handle\n");
+ memcpy(pfid, tcon->prfid, sizeof(struct cifs_fid));
+ mutex_unlock(&tcon->prfid_mutex);
+ return 0;
+ }
+
+ oparams.tcon = tcon;
+ oparams.create_options = 0;
+ oparams.desired_access = FILE_READ_ATTRIBUTES;
+ oparams.disposition = FILE_OPEN;
+ oparams.fid = pfid;
+ oparams.reconnect = false;
+
+ rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL);
+ if (rc == 0) {
+ memcpy(tcon->prfid, pfid, sizeof(struct cifs_fid));
+ tcon->valid_root_fid = true;
+ }
+ mutex_unlock(&tcon->prfid_mutex);
+ return rc;
+}
+
static void
smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
{
@@ -330,6 +365,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
+ bool no_cached_open = tcon->nohandlecache;
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES;
@@ -338,7 +374,11 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+ if (no_cached_open)
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+ else
+ rc = open_shroot(xid, tcon, &fid);
+
if (rc)
return;
@@ -352,7 +392,8 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
FS_DEVICE_INFORMATION);
SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ if (no_cached_open)
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
return;
}
@@ -394,6 +435,9 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
+ if ((*full_path == 0) && tcon->valid_root_fid)
+ return 0;
+
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
@@ -704,9 +748,11 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
seq_puts(m, " TRIM-support,");
seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+ seq_printf(m, "\n\ttid: 0x%x", tcon->tid);
if (tcon->perf_sector_size)
seq_printf(m, "\tOptimal sector size: 0x%x",
tcon->perf_sector_size);
+ seq_printf(m, "\tMaximal Access: 0x%x", tcon->maximal_access);
}
static void
@@ -1257,7 +1303,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
static bool
smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
if (shdr->Status != STATUS_PENDING)
return false;
@@ -1275,12 +1321,13 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
static bool
smb2_is_session_expired(char *buf)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
- if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED)
+ if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED &&
+ shdr->Status != STATUS_USER_SESSION_DELETED)
return false;
- cifs_dbg(FYI, "Session expired\n");
+ cifs_dbg(FYI, "Session expired or deleted\n");
return true;
}
@@ -1474,8 +1521,6 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int sub_offset;
unsigned int print_len;
unsigned int print_offset;
- struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = ses->server;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
@@ -1499,7 +1544,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
err_buf = err_iov.iov_base;
if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
- err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) {
+ err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) {
kfree(utf16_path);
return -ENOENT;
}
@@ -1512,14 +1557,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
print_len = le16_to_cpu(symlink->PrintNameLength);
print_offset = le16_to_cpu(symlink->PrintNameOffset);
- if (err_iov.iov_len + server->vals->header_preamble_size <
- SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
+ if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
kfree(utf16_path);
return -ENOENT;
}
- if (err_iov.iov_len + server->vals->header_preamble_size <
- SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
+ if (err_iov.iov_len <
+ SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
kfree(utf16_path);
return -ENOENT;
}
@@ -1593,8 +1637,11 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
oparms.create_options = 0;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
- if (!utf16_path)
- return ERR_PTR(-ENOMEM);
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ free_xid(xid);
+ return ERR_PTR(rc);
+ }
oparms.tcon = tcon;
oparms.desired_access = READ_CONTROL;
@@ -1652,8 +1699,11 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
access_flags = WRITE_DAC;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ free_xid(xid);
+ return rc;
+ }
oparms.tcon = tcon;
oparms.desired_access = access_flags;
@@ -1713,15 +1763,21 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
- if (keep_size == false)
- return -EOPNOTSUPP;
+ if (keep_size == false) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
/*
* Must check if file sparse since fallocate -z (zero range) assumes
* non-sparse allocation
*/
- if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE))
- return -EOPNOTSUPP;
+ if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
/*
* need to make sure we are not asked to extend the file since the SMB3
@@ -1730,8 +1786,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
* which for a non sparse file would zero the newly extended range
*/
if (keep_size == false)
- if (i_size_read(inode) < offset + len)
- return -EOPNOTSUPP;
+ if (i_size_read(inode) < offset + len) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
cifs_dbg(FYI, "offset %lld len %lld", offset, len);
@@ -1764,8 +1823,11 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
/* Need to make file sparse, if not already, before freeing range. */
/* Consider adding equivalent for compressed since it could also work */
- if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse))
- return -EOPNOTSUPP;
+ if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
cifs_dbg(FYI, "offset %lld len %lld", offset, len);
@@ -1796,8 +1858,10 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
- if (keep_size == false)
- return -EOPNOTSUPP;
+ if (keep_size == false) {
+ free_xid(xid);
+ return rc;
+ }
/*
* Files are non-sparse by default so falloc may be a no-op
@@ -1806,14 +1870,16 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
*/
if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) {
if (keep_size == true)
- return 0;
+ rc = 0;
/* check if extending file */
else if (i_size_read(inode) >= off + len)
/* not extending file and already not sparse */
- return 0;
+ rc = 0;
/* BB: in future add else clause to extend file */
else
- return -EOPNOTSUPP;
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
}
if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
@@ -1825,8 +1891,11 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
* ie potentially making a few extra pages at the beginning
* or end of the file non-sparse via set_sparse is harmless.
*/
- if ((off > 8192) || (off + len + 8192 < i_size_read(inode)))
- return -EOPNOTSUPP;
+ if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
}
@@ -2035,7 +2104,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock)
}
static __u8
-smb2_parse_lease_buf(void *buf, unsigned int *epoch)
+smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
{
struct create_lease *lc = (struct create_lease *)buf;
@@ -2046,13 +2115,16 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch)
}
static __u8
-smb3_parse_lease_buf(void *buf, unsigned int *epoch)
+smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
{
struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
*epoch = le16_to_cpu(lc->lcontext.Epoch);
if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
return SMB2_OPLOCK_LEVEL_NOCHANGE;
+ if (lease_key)
+ memcpy(lease_key, &lc->lcontext.LeaseKeyLow,
+ SMB2_LEASE_KEY_SIZE);
return le32_to_cpu(lc->lcontext.LeaseState);
}
@@ -2070,12 +2142,11 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile)
}
static void
-fill_transform_hdr(struct TCP_Server_Info *server,
- struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq)
+fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
+ struct smb_rqst *old_rq)
{
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base;
- unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base);
memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
@@ -2083,8 +2154,6 @@ fill_transform_hdr(struct TCP_Server_Info *server,
tr_hdr->Flags = cpu_to_le16(0x01);
get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
- inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - server->vals->header_preamble_size);
- inc_rfc1001_len(tr_hdr, orig_len);
}
/* We can not use the normal sg_set_buf() as we will sometimes pass a
@@ -2096,11 +2165,16 @@ static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
}
+/* Assumes:
+ * rqst->rq_iov[0] is rfc1002 length
+ * rqst->rq_iov[1] is tranform header
+ * rqst->rq_iov[2+] data to be encrypted/decrypted
+ */
static struct scatterlist *
init_sg(struct smb_rqst *rqst, u8 *sign)
{
- unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages + 1;
- unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+ unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages;
+ unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
struct scatterlist *sg;
unsigned int i;
unsigned int j;
@@ -2110,10 +2184,10 @@ init_sg(struct smb_rqst *rqst, u8 *sign)
return NULL;
sg_init_table(sg, sg_len);
- smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len);
- for (i = 1; i < rqst->rq_nvec; i++)
- smb2_sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base,
- rqst->rq_iov[i].iov_len);
+ smb2_sg_set_buf(&sg[0], rqst->rq_iov[1].iov_base + 20, assoc_data_len);
+ for (i = 1; i < rqst->rq_nvec - 1; i++)
+ smb2_sg_set_buf(&sg[i], rqst->rq_iov[i+1].iov_base,
+ rqst->rq_iov[i+1].iov_len);
for (j = 0; i < sg_len - 1; i++, j++) {
unsigned int len = (j < rqst->rq_npages - 1) ? rqst->rq_pagesz
: rqst->rq_tailsz;
@@ -2145,9 +2219,10 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
}
/*
* Encrypt or decrypt @rqst message. @rqst has the following format:
- * iov[0] - transform header (associate data),
- * iov[1-N] and pages - data to encrypt.
- * On success return encrypted data in iov[1-N] and pages, leave iov[0]
+ * iov[0] - rfc1002 length
+ * iov[1] - transform header (associate data),
+ * iov[2-N] and pages - data to encrypt.
+ * On success return encrypted data in iov[2-N] and pages, leave iov[0-1]
* untouched.
*/
static int
@@ -2155,7 +2230,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
{
struct smb2_transform_hdr *tr_hdr =
(struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base;
- unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20 - server->vals->header_preamble_size;
+ unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
int rc = 0;
struct scatterlist *sg;
u8 sign[SMB2_SIGNATURE_SIZE] = {};
@@ -2242,6 +2317,10 @@ free_req:
return rc;
}
+/*
+ * This is called from smb_send_rqst. At this point we have the rfc1002
+ * header as the first element in the vector.
+ */
static int
smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
struct smb_rqst *old_rq)
@@ -2250,6 +2329,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
struct page **pages;
struct smb2_transform_hdr *tr_hdr;
unsigned int npages = old_rq->rq_npages;
+ unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base);
int i;
int rc = -ENOMEM;
@@ -2268,24 +2348,34 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
goto err_free_pages;
}
- iov = kmalloc_array(old_rq->rq_nvec, sizeof(struct kvec), GFP_KERNEL);
+ /* Make space for one extra iov to hold the transform header */
+ iov = kmalloc_array(old_rq->rq_nvec + 1, sizeof(struct kvec),
+ GFP_KERNEL);
if (!iov)
goto err_free_pages;
/* copy all iovs from the old except the 1st one (rfc1002 length) */
- memcpy(&iov[1], &old_rq->rq_iov[1],
+ memcpy(&iov[2], &old_rq->rq_iov[1],
sizeof(struct kvec) * (old_rq->rq_nvec - 1));
+ /* copy the rfc1002 iov */
+ iov[0].iov_base = old_rq->rq_iov[0].iov_base;
+ iov[0].iov_len = old_rq->rq_iov[0].iov_len;
+
new_rq->rq_iov = iov;
- new_rq->rq_nvec = old_rq->rq_nvec;
+ new_rq->rq_nvec = old_rq->rq_nvec + 1;
tr_hdr = kmalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL);
if (!tr_hdr)
goto err_free_iov;
- /* fill the 1st iov with a transform header */
- fill_transform_hdr(server, tr_hdr, old_rq);
- new_rq->rq_iov[0].iov_base = tr_hdr;
- new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr);
+ /* fill the 2nd iov with a transform header */
+ fill_transform_hdr(tr_hdr, orig_len, old_rq);
+ new_rq->rq_iov[1].iov_base = tr_hdr;
+ new_rq->rq_iov[1].iov_len = sizeof(struct smb2_transform_hdr);
+
+ /* Update rfc1002 header */
+ inc_rfc1001_len(new_rq->rq_iov[0].iov_base,
+ sizeof(struct smb2_transform_hdr));
/* copy pages form the old */
for (i = 0; i < npages; i++) {
@@ -2325,7 +2415,7 @@ smb3_free_transform_rq(struct smb_rqst *rqst)
put_page(rqst->rq_pages[i]);
kfree(rqst->rq_pages);
/* free transform header */
- kfree(rqst->rq_iov[0].iov_base);
+ kfree(rqst->rq_iov[1].iov_base);
kfree(rqst->rq_iov);
}
@@ -2342,18 +2432,19 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
unsigned int buf_data_size, struct page **pages,
unsigned int npages, unsigned int page_data_size)
{
- struct kvec iov[2];
+ struct kvec iov[3];
struct smb_rqst rqst = {NULL};
- struct smb2_hdr *hdr;
int rc;
- iov[0].iov_base = buf;
- iov[0].iov_len = sizeof(struct smb2_transform_hdr);
- iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr);
- iov[1].iov_len = buf_data_size;
+ iov[0].iov_base = NULL;
+ iov[0].iov_len = 0;
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct smb2_transform_hdr);
+ iov[2].iov_base = buf + sizeof(struct smb2_transform_hdr);
+ iov[2].iov_len = buf_data_size;
rqst.rq_iov = iov;
- rqst.rq_nvec = 2;
+ rqst.rq_nvec = 3;
rqst.rq_pages = pages;
rqst.rq_npages = npages;
rqst.rq_pagesz = PAGE_SIZE;
@@ -2365,10 +2456,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
if (rc)
return rc;
- memmove(buf + server->vals->header_preamble_size, iov[1].iov_base, buf_data_size);
- hdr = (struct smb2_hdr *)buf;
- hdr->smb2_buf_length = cpu_to_be32(buf_data_size + page_data_size);
- server->total_read = buf_data_size + page_data_size + server->vals->header_preamble_size;
+ memmove(buf, iov[2].iov_base, buf_data_size);
+
+ server->total_read = buf_data_size + page_data_size;
return rc;
}
@@ -2393,7 +2483,7 @@ read_data_into_pages(struct TCP_Server_Info *server, struct page **pages,
zero_user(page, len, PAGE_SIZE - len);
len = 0;
}
- length = cifs_read_page_from_socket(server, page, n);
+ length = cifs_read_page_from_socket(server, page, 0, n);
if (length < 0)
return length;
server->total_read += length;
@@ -2441,7 +2531,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
unsigned int cur_page_idx;
unsigned int pad_len;
struct cifs_readdata *rdata = mid->callback_data;
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
struct bio_vec *bvec = NULL;
struct iov_iter iter;
struct kvec iov;
@@ -2472,7 +2562,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- data_offset = server->ops->read_data_offset(buf) + server->vals->header_preamble_size;
+ data_offset = server->ops->read_data_offset(buf);
#ifdef CONFIG_CIFS_SMB_DIRECT
use_rdma_mr = rdata->mr;
#endif
@@ -2568,12 +2658,11 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
unsigned int npages;
struct page **pages;
unsigned int len;
- unsigned int buflen = server->pdu_size + server->vals->header_preamble_size;
+ unsigned int buflen = server->pdu_size;
int rc;
int i = 0;
- len = min_t(unsigned int, buflen, server->vals->read_rsp_size -
- server->vals->header_preamble_size +
+ len = min_t(unsigned int, buflen, server->vals->read_rsp_size +
sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1;
rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len);
@@ -2581,8 +2670,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
return rc;
server->total_read += rc;
- len = le32_to_cpu(tr_hdr->OriginalMessageSize) +
- server->vals->header_preamble_size -
+ len = le32_to_cpu(tr_hdr->OriginalMessageSize) -
server->vals->read_rsp_size;
npages = DIV_ROUND_UP(len, PAGE_SIZE);
@@ -2609,8 +2697,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
if (rc)
goto free_pages;
- rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size -
- server->vals->header_preamble_size,
+ rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size,
pages, npages, len);
if (rc)
goto free_pages;
@@ -2647,7 +2734,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid_entry;
/* switch to large buffer if too big for a small one */
- if (pdu_length + server->vals->header_preamble_size > MAX_CIFS_SMALL_BUFFER_SIZE) {
+ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE) {
server->large_buf = true;
memcpy(server->bigbuf, buf, server->total_read);
buf = server->bigbuf;
@@ -2655,13 +2742,12 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
/* now read the rest */
length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
- pdu_length - HEADER_SIZE(server) + 1 +
- server->vals->header_preamble_size);
+ pdu_length - HEADER_SIZE(server) + 1);
if (length < 0)
return length;
server->total_read += length;
- buf_size = pdu_length + server->vals->header_preamble_size - sizeof(struct smb2_transform_hdr);
+ buf_size = pdu_length - sizeof(struct smb2_transform_hdr);
length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0);
if (length)
return length;
@@ -2690,7 +2776,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid)
struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
- if (pdu_length + server->vals->header_preamble_size < sizeof(struct smb2_transform_hdr) +
+ if (pdu_length < sizeof(struct smb2_transform_hdr) +
sizeof(struct smb2_sync_hdr)) {
cifs_dbg(VFS, "Transform message is too small (%u)\n",
pdu_length);
@@ -2699,14 +2785,14 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid)
return -ECONNABORTED;
}
- if (pdu_length + server->vals->header_preamble_size < orig_len + sizeof(struct smb2_transform_hdr)) {
+ if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) {
cifs_dbg(VFS, "Transform message is broken\n");
cifs_reconnect(server);
wake_up(&server->response_q);
return -ECONNABORTED;
}
- if (pdu_length + server->vals->header_preamble_size > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
+ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
return receive_encrypted_read(server, mid);
return receive_encrypted_standard(server, mid);
@@ -2717,11 +2803,23 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
- return handle_read_data(server, mid, buf, server->pdu_size +
- server->vals->header_preamble_size,
+ return handle_read_data(server, mid, buf, server->pdu_size,
NULL, 0, 0);
}
+static int
+smb2_next_header(char *buf)
+{
+ struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf;
+
+ if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM)
+ return sizeof(struct smb2_transform_hdr) +
+ le32_to_cpu(t_hdr->OriginalMessageSize);
+
+ return le32_to_cpu(hdr->NextCommand);
+}
+
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -2813,6 +2911,7 @@ struct smb_version_operations smb20_operations = {
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
+ .next_header = smb2_next_header,
};
struct smb_version_operations smb21_operations = {
@@ -2907,6 +3006,7 @@ struct smb_version_operations smb21_operations = {
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
+ .next_header = smb2_next_header,
};
struct smb_version_operations smb30_operations = {
@@ -3011,6 +3111,7 @@ struct smb_version_operations smb30_operations = {
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
+ .next_header = smb2_next_header,
};
#ifdef CONFIG_CIFS_SMB311
@@ -3111,6 +3212,7 @@ struct smb_version_operations smb311_operations = {
.query_all_EAs = smb2_query_eas,
.set_EA = smb2_set_ea,
#endif /* CIFS_XATTR */
+ .next_header = smb2_next_header,
};
#endif /* CIFS_SMB311 */
@@ -3122,8 +3224,8 @@ struct smb_version_values smb20_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3143,8 +3245,8 @@ struct smb_version_values smb21_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3164,8 +3266,8 @@ struct smb_version_values smb3any_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3185,8 +3287,8 @@ struct smb_version_values smbdefault_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3206,8 +3308,8 @@ struct smb_version_values smb30_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3227,8 +3329,8 @@ struct smb_version_values smb302_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3249,8 +3351,8 @@ struct smb_version_values smb311_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 0f48741a0130..281fbc1dc720 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -49,6 +49,7 @@
#include "cifspdu.h"
#include "cifs_spnego.h"
#include "smbdirect.h"
+#include "trace.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -79,7 +80,7 @@ static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
/* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */
};
-static int encryption_required(const struct cifs_tcon *tcon)
+static int smb3_encryption_required(const struct cifs_tcon *tcon)
{
if (!tcon)
return 0;
@@ -145,7 +146,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
shdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
if (tcon->ses && tcon->ses->server && tcon->ses->server->sign &&
- !encryption_required(tcon))
+ !smb3_encryption_required(tcon))
shdr->Flags |= SMB2_FLAGS_SIGNED;
out:
return;
@@ -367,6 +368,7 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
+#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
static void
build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
@@ -390,21 +392,35 @@ build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
}
static void
+build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
+{
+ pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE;
+ pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
+}
+
+static void
assemble_neg_contexts(struct smb2_negotiate_req *req,
unsigned int *total_len)
{
char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT;
+ unsigned int ctxt_len;
+ *total_len += 2; /* Add 2 due to round to 8 byte boundary for 1st ctxt */
build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
- /* Add 2 to size to round to 8 byte boundary */
+ ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
- pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context);
build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
- req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
- req->NegotiateContextCount = cpu_to_le16(2);
+ ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8;
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
+
+ build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
+ *total_len += sizeof(struct smb2_posix_neg_context);
- *total_len += 4 + sizeof(struct smb2_preauth_neg_context)
- + sizeof(struct smb2_encryption_neg_context);
+ req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
+ req->NegotiateContextCount = cpu_to_le16(3);
}
static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt)
@@ -449,12 +465,12 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server,
}
static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
- struct TCP_Server_Info *server)
+ struct TCP_Server_Info *server,
+ unsigned int len_of_smb)
{
struct smb2_neg_context *pctx;
unsigned int offset = le32_to_cpu(rsp->NegotiateContextOffset);
unsigned int ctxt_cnt = le16_to_cpu(rsp->NegotiateContextCount);
- unsigned int len_of_smb = be32_to_cpu(rsp->hdr.smb2_buf_length);
unsigned int len_of_ctxts, i;
int rc = 0;
@@ -475,8 +491,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
if (len_of_ctxts < sizeof(struct smb2_neg_context))
break;
- pctx = (struct smb2_neg_context *)(offset +
- server->vals->header_preamble_size + (char *)rsp);
+ pctx = (struct smb2_neg_context *)(offset + (char *)rsp);
clen = le16_to_cpu(pctx->DataLength);
if (clen > len_of_ctxts)
break;
@@ -487,6 +502,8 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES)
rc = decode_encrypt_ctx(server,
(struct smb2_encryption_neg_context *)pctx);
+ else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE)
+ server->posix_ext_supported = true;
else
cifs_dbg(VFS, "unknown negcontext of type %d ignored\n",
le16_to_cpu(pctx->ContextType));
@@ -501,6 +518,64 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
return rc;
}
+static struct create_posix *
+create_posix_buf(umode_t mode)
+{
+ struct create_posix *buf;
+
+ buf = kzalloc(sizeof(struct create_posix),
+ GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset =
+ cpu_to_le16(offsetof(struct create_posix, Mode));
+ buf->ccontext.DataLength = cpu_to_le32(4);
+ buf->ccontext.NameOffset =
+ cpu_to_le16(offsetof(struct create_posix, Name));
+ buf->ccontext.NameLength = cpu_to_le16(16);
+
+ /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+ buf->Name[0] = 0x93;
+ buf->Name[1] = 0xAD;
+ buf->Name[2] = 0x25;
+ buf->Name[3] = 0x50;
+ buf->Name[4] = 0x9C;
+ buf->Name[5] = 0xB4;
+ buf->Name[6] = 0x11;
+ buf->Name[7] = 0xE7;
+ buf->Name[8] = 0xB4;
+ buf->Name[9] = 0x23;
+ buf->Name[10] = 0x83;
+ buf->Name[11] = 0xDE;
+ buf->Name[12] = 0x96;
+ buf->Name[13] = 0x8B;
+ buf->Name[14] = 0xCD;
+ buf->Name[15] = 0x7C;
+ buf->Mode = cpu_to_le32(mode);
+ cifs_dbg(FYI, "mode on posix create 0%o", mode);
+ return buf;
+}
+
+static int
+add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ iov[num].iov_base = create_posix_buf(mode);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_posix);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset = cpu_to_le32(
+ sizeof(struct smb2_create_req) +
+ iov[num - 1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_posix));
+ *num_iovec = num + 1;
+ return 0;
+}
+
#else
static void assemble_neg_contexts(struct smb2_negotiate_req *req,
unsigned int *total_len)
@@ -691,7 +766,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES;
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
- &rsp->hdr);
+ (struct smb2_sync_hdr *)rsp);
/*
* See MS-SMB2 section 2.2.4: if no blob, client picks default which
* for us will be
@@ -718,7 +793,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
#ifdef CONFIG_CIFS_SMB311
if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
if (rsp->NegotiateContextCount)
- rc = smb311_decode_neg_context(rsp, server);
+ rc = smb311_decode_neg_context(rsp, server,
+ rsp_iov.iov_len);
else
cifs_dbg(VFS, "Missing expected negotiate contexts\n");
}
@@ -1054,7 +1130,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
goto out_put_spnego_key;
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
- ses->Suid = rsp->hdr.sync_hdr.SessionId;
+ ses->Suid = rsp->sync_hdr.SessionId;
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
@@ -1130,13 +1206,13 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
/* If true, rc here is expected and not an error */
if (sess_data->buf0_type != CIFS_NO_BUFFER &&
- rsp->hdr.sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
+ rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
rc = 0;
if (rc)
goto out;
- if (offsetof(struct smb2_sess_setup_rsp, Buffer) - ses->server->vals->header_preamble_size !=
+ if (offsetof(struct smb2_sess_setup_rsp, Buffer) !=
le16_to_cpu(rsp->SecurityBufferOffset)) {
cifs_dbg(VFS, "Invalid security buffer offset %d\n",
le16_to_cpu(rsp->SecurityBufferOffset));
@@ -1151,7 +1227,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
- ses->Suid = rsp->hdr.sync_hdr.SessionId;
+ ses->Suid = rsp->sync_hdr.SessionId;
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
out:
@@ -1209,7 +1285,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
- ses->Suid = rsp->hdr.sync_hdr.SessionId;
+ ses->Suid = rsp->sync_hdr.SessionId;
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
rc = SMB2_sess_establish_session(sess_data);
@@ -1276,6 +1352,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
sess_data->ses = ses;
sess_data->buf0_type = CIFS_NO_BUFFER;
sess_data->nls_cp = (struct nls_table *) nls_cp;
+ sess_data->previous_session = ses->Suid;
#ifdef CONFIG_CIFS_SMB311
/*
@@ -1403,7 +1480,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
return rc;
}
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
iov[0].iov_base = (char *)req;
@@ -1419,7 +1496,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
/* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */
if ((ses->server->dialect == SMB311_PROT_ID) &&
- !encryption_required(tcon))
+ !smb3_encryption_required(tcon))
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
@@ -1457,7 +1534,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tidStatus = CifsGood;
tcon->need_reconnect = false;
- tcon->tid = rsp->hdr.sync_hdr.TreeId;
+ tcon->tid = rsp->sync_hdr.TreeId;
strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
@@ -1477,7 +1554,7 @@ tcon_exit:
return rc;
tcon_error_exit:
- if (rsp && rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
+ if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
}
goto tcon_exit;
@@ -1508,7 +1585,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
flags |= CIFS_NO_RESP;
@@ -1575,7 +1652,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
static __u8
parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
- unsigned int *epoch)
+ unsigned int *epoch, char *lease_key)
{
char *data_offset;
struct create_context *cc;
@@ -1583,14 +1660,15 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
unsigned int remaining;
char *name;
- data_offset = (char *)rsp + server->vals->header_preamble_size + le32_to_cpu(rsp->CreateContextsOffset);
+ data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset);
remaining = le32_to_cpu(rsp->CreateContextsLength);
cc = (struct create_context *)data_offset;
while (remaining >= sizeof(struct create_context)) {
name = le16_to_cpu(cc->NameOffset) + (char *)cc;
if (le16_to_cpu(cc->NameLength) == 4 &&
strncmp(name, "RqLs", 4) == 0)
- return server->ops->parse_lease_buf(cc, epoch);
+ return server->ops->parse_lease_buf(cc, epoch,
+ lease_key);
next = le32_to_cpu(cc->Next);
if (!next)
@@ -1818,7 +1896,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
struct TCP_Server_Info *server;
struct cifs_tcon *tcon = oparms->tcon;
struct cifs_ses *ses = tcon->ses;
- struct kvec iov[4];
+ struct kvec iov[5]; /* make sure at least one for each open context */
struct kvec rsp_iov = {NULL, 0};
int resp_buftype;
int uni_path_len;
@@ -1827,7 +1905,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
int rc = 0;
unsigned int n_iov = 2;
__u32 file_attributes = 0;
- char *dhc_buf = NULL, *lc_buf = NULL;
+ char *dhc_buf = NULL, *lc_buf = NULL, *pc_buf = NULL;
int flags = 0;
unsigned int total_len;
@@ -1843,7 +1921,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
if (oparms->create_options & CREATE_OPTION_READONLY)
@@ -1944,6 +2022,27 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
dhc_buf = iov[n_iov-1].iov_base;
}
+#ifdef CONFIG_CIFS_SMB311
+ if (tcon->posix_extensions) {
+ if (n_iov > 2) {
+ struct create_context *ccontext =
+ (struct create_context *)iov[n_iov-1].iov_base;
+ ccontext->Next =
+ cpu_to_le32(iov[n_iov-1].iov_len);
+ }
+
+ rc = add_posix_context(iov, &n_iov, oparms->mode);
+ if (rc) {
+ cifs_small_buf_release(req);
+ kfree(copy_path);
+ kfree(lc_buf);
+ kfree(dhc_buf);
+ return rc;
+ }
+ pc_buf = iov[n_iov-1].iov_base;
+ }
+#endif /* SMB311 */
+
rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags,
&rsp_iov);
cifs_small_buf_release(req);
@@ -1956,8 +2055,13 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
resp_buftype = CIFS_NO_BUFFER;
rsp = NULL;
}
+ trace_smb3_open_err(xid, tcon->tid, ses->Suid,
+ oparms->create_options, oparms->desired_access, rc);
goto creat_exit;
- }
+ } else
+ trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid,
+ ses->Suid, oparms->create_options,
+ oparms->desired_access);
oparms->fid->persistent_fid = rsp->PersistentFileId;
oparms->fid->volatile_fid = rsp->VolatileFileId;
@@ -1972,13 +2076,15 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
- *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch);
+ *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch,
+ oparms->fid->lease_key);
else
*oplock = rsp->OplockLevel;
creat_exit:
kfree(copy_path);
kfree(lc_buf);
kfree(dhc_buf);
+ kfree(pc_buf);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
@@ -1994,7 +2100,6 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
{
struct smb2_ioctl_req *req;
struct smb2_ioctl_rsp *rsp;
- struct smb2_sync_hdr *shdr;
struct cifs_ses *ses;
struct kvec iov[2];
struct kvec rsp_iov;
@@ -2025,7 +2130,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->CtlCode = cpu_to_le32(opcode);
@@ -2088,6 +2193,10 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
cifs_small_buf_release(req);
rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
+ if (rc != 0)
+ trace_smb3_fsctl_err(xid, persistent_fid, tcon->tid,
+ ses->Suid, 0, opcode, rc);
+
if ((rc != 0) && (rc != -EINVAL)) {
cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
goto ioctl_exit;
@@ -2115,7 +2224,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto ioctl_exit;
}
- if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+ if (rsp_iov.iov_len < le32_to_cpu(rsp->OutputOffset) + *plen) {
cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
le32_to_cpu(rsp->OutputOffset));
*plen = 0;
@@ -2129,8 +2238,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto ioctl_exit;
}
- shdr = get_sync_hdr(rsp);
- memcpy(*out_data, (char *)shdr + le32_to_cpu(rsp->OutputOffset), *plen);
+ memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen);
ioctl_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -2162,8 +2270,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
}
int
-SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid)
+SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, int flags)
{
struct smb2_close_req *req;
struct smb2_close_rsp *rsp;
@@ -2172,7 +2280,6 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec rsp_iov;
int resp_buftype;
int rc = 0;
- int flags = 0;
unsigned int total_len;
cifs_dbg(FYI, "Close\n");
@@ -2184,7 +2291,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->PersistentFileId = persistent_fid;
@@ -2199,6 +2306,8 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
if (rc != 0) {
cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
+ trace_smb3_close_err(xid, persistent_fid, tcon->tid, ses->Suid,
+ rc);
goto close_exit;
}
@@ -2209,14 +2318,20 @@ close_exit:
return rc;
}
+int
+SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid)
+{
+ return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0);
+}
+
static int
-validate_iov(struct TCP_Server_Info *server,
- unsigned int offset, unsigned int buffer_length,
+validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size)
{
unsigned int smb_len = iov->iov_len;
- char *end_of_smb = smb_len + server->vals->header_preamble_size + (char *)iov->iov_base;
- char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)iov->iov_base;
+ char *end_of_smb = smb_len + (char *)iov->iov_base;
+ char *begin_of_buf = offset + (char *)iov->iov_base;
char *end_of_buf = begin_of_buf + buffer_length;
@@ -2246,18 +2361,17 @@ validate_iov(struct TCP_Server_Info *server,
* Caller must free buffer.
*/
static int
-validate_and_copy_iov(struct TCP_Server_Info *server,
- unsigned int offset, unsigned int buffer_length,
+validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int minbufsize,
char *data)
{
- char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)(iov->iov_base);
+ char *begin_of_buf = offset + (char *)iov->iov_base;
int rc;
if (!data)
return -EINVAL;
- rc = validate_iov(server, offset, buffer_length, iov, minbufsize);
+ rc = validate_iov(offset, buffer_length, iov, minbufsize);
if (rc)
return rc;
@@ -2292,7 +2406,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->InfoType = info_type;
@@ -2318,6 +2432,8 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+ trace_smb3_query_info_err(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type, rc);
goto qinf_exit;
}
@@ -2335,8 +2451,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
}
}
- rc = validate_and_copy_iov(ses->server,
- le16_to_cpu(rsp->OutputBufferOffset),
+ rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength),
&rsp_iov, min_len, *data);
@@ -2407,7 +2522,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
unsigned int credits_received = 1;
if (mid->mid_state == MID_RESPONSE_RECEIVED)
- credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest);
+ credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
DeleteMidQEntry(mid);
add_credits(server, credits_received, CIFS_ECHO_OP);
@@ -2536,7 +2651,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->PersistentFileId = persistent_fid;
@@ -2548,8 +2663,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
- if (rc != 0)
+ if (rc != 0) {
cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
+ trace_smb3_flush_err(xid, persistent_fid, tcon->tid, ses->Suid,
+ rc);
+ }
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
return rc;
@@ -2658,11 +2776,12 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rdata->iov[1].iov_base;
+ (struct smb2_sync_hdr *)rdata->iov[0].iov_base;
unsigned int credits_received = 1;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_pages = rdata->pages,
+ .rq_offset = rdata->page_offset,
.rq_npages = rdata->nr_pages,
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
@@ -2760,7 +2879,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
return rc;
}
- if (encryption_required(io_parms.tcon))
+ if (smb3_encryption_required(io_parms.tcon))
flags |= CIFS_TRANSFORM_REQ;
req_len = cpu_to_be32(total_len);
@@ -2791,7 +2910,13 @@ smb2_async_readv(struct cifs_readdata *rdata)
if (rc) {
kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
- }
+ trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid,
+ io_parms.tcon->tid, io_parms.tcon->ses->Suid,
+ io_parms.offset, io_parms.length);
+ } else
+ trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid,
+ io_parms.tcon->tid, io_parms.tcon->ses->Suid,
+ io_parms.offset, io_parms.length);
cifs_small_buf_release(buf);
return rc;
@@ -2804,7 +2929,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
int resp_buftype, rc = -EACCES;
struct smb2_read_plain_req *req = NULL;
struct smb2_read_rsp *rsp = NULL;
- struct smb2_sync_hdr *shdr;
struct kvec iov[1];
struct kvec rsp_iov;
unsigned int total_len;
@@ -2816,7 +2940,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc)
return rc;
- if (encryption_required(io_parms->tcon))
+ if (smb3_encryption_required(io_parms->tcon))
flags |= CIFS_TRANSFORM_REQ;
iov[0].iov_base = (char *)req;
@@ -2832,9 +2956,15 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
}
+ trace_smb3_read_err(rc, xid, req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, io_parms->length);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
return rc == -ENODATA ? 0 : rc;
- }
+ } else
+ trace_smb3_read_done(xid, req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, io_parms->length);
*nbytes = le32_to_cpu(rsp->DataLength);
if ((*nbytes > CIFS_MAX_MSGSIZE) ||
@@ -2845,10 +2975,8 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
*nbytes = 0;
}
- shdr = get_sync_hdr(rsp);
-
if (*buf) {
- memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes);
+ memcpy(*buf, (char *)rsp + rsp->DataOffset, *nbytes);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
} else if (resp_buftype != CIFS_NO_BUFFER) {
*buf = rsp_iov.iov_base;
@@ -2875,7 +3003,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest);
+ credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
if (wdata->result != 0)
break;
@@ -2952,7 +3080,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
goto async_writev_out;
}
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
shdr = (struct smb2_sync_hdr *)req;
@@ -3013,6 +3141,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
rqst.rq_pages = wdata->pages;
+ rqst.rq_offset = wdata->page_offset;
rqst.rq_npages = wdata->nr_pages;
rqst.rq_pagesz = wdata->pagesz;
rqst.rq_tailsz = wdata->tailsz;
@@ -3050,9 +3179,15 @@ smb2_async_writev(struct cifs_writedata *wdata,
wdata, flags);
if (rc) {
+ trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
+ tcon->tid, tcon->ses->Suid, wdata->offset,
+ wdata->bytes, rc);
kref_put(&wdata->refcount, release);
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
- }
+ } else
+ trace_smb3_write_done(0 /* no xid */, req->PersistentFileId,
+ tcon->tid, tcon->ses->Suid, wdata->offset,
+ wdata->bytes);
async_writev_out:
cifs_small_buf_release(req);
@@ -3090,7 +3225,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
if (io_parms->tcon->ses->server == NULL)
return -ECONNABORTED;
- if (encryption_required(io_parms->tcon))
+ if (smb3_encryption_required(io_parms->tcon))
flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
@@ -3116,10 +3251,19 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
if (rc) {
+ trace_smb3_write_err(xid, req->PersistentFileId,
+ io_parms->tcon->tid,
+ io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length, rc);
cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE);
cifs_dbg(VFS, "Send error in write = %d\n", rc);
- } else
+ } else {
*nbytes = le32_to_cpu(rsp->DataLength);
+ trace_smb3_write_done(xid, req->PersistentFileId,
+ io_parms->tcon->tid,
+ io_parms->tcon->ses->Suid,
+ io_parms->offset, *nbytes);
+ }
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -3200,7 +3344,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
switch (srch_inf->info_level) {
@@ -3251,7 +3395,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
if (rc == -ENODATA &&
- rsp->hdr.sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
srch_inf->endOfSearch = true;
rc = 0;
}
@@ -3259,8 +3403,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
goto qdir_exit;
}
- rc = validate_iov(server,
- le16_to_cpu(rsp->OutputBufferOffset),
+ rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
info_buf_size);
if (rc)
@@ -3275,10 +3418,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
cifs_buf_release(srch_inf->ntwrk_buf_start);
}
srch_inf->ntwrk_buf_start = (char *)rsp;
- srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ +
- (char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset);
- /* 4 for rfc1002 length field */
- end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr;
+ srch_inf->srch_entries_start = srch_inf->last_entry =
+ (char *)rsp + le16_to_cpu(rsp->OutputBufferOffset);
+ end_of_smb = rsp_iov.iov_len + (char *)rsp;
srch_inf->entries_in_buffer =
num_entries(srch_inf->srch_entries_start, end_of_smb,
&srch_inf->last_entry, info_buf_size);
@@ -3333,7 +3475,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.ProcessId = cpu_to_le32(pid);
@@ -3366,8 +3508,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_small_buf_release(req);
rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base;
- if (rc != 0)
+ if (rc != 0) {
cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
+ trace_smb3_set_info_err(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type, rc);
+ }
free_rsp_buf(resp_buftype, rsp);
kfree(iov);
@@ -3514,7 +3659,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
__u8 oplock_level)
{
int rc;
- struct smb2_oplock_break_req *req = NULL;
+ struct smb2_oplock_break *req = NULL;
struct cifs_ses *ses = tcon->ses;
int flags = CIFS_OBREAK_OP;
unsigned int total_len;
@@ -3528,7 +3673,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->VolatileFid = volatile_fid;
@@ -3593,7 +3738,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
req->InputBufferOffset =
cpu_to_le16(sizeof(struct smb2_query_info_req) - 1);
req->OutputBufferLength = cpu_to_le32(
- outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - server->vals->header_preamble_size);
+ outbuf_len + sizeof(struct smb2_query_info_rsp) - 1);
iov->iov_base = (char *)req;
iov->iov_len = total_len;
@@ -3610,7 +3755,6 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = ses->server;
struct smb2_fs_full_size_info *info = NULL;
int flags = 0;
@@ -3620,7 +3764,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
@@ -3631,10 +3775,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
}
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
- info = (struct smb2_fs_full_size_info *)(server->vals->header_preamble_size +
- le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr);
- rc = validate_iov(server,
- le16_to_cpu(rsp->OutputBufferOffset),
+ info = (struct smb2_fs_full_size_info *)(
+ le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
+ rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
sizeof(struct smb2_fs_full_size_info));
if (!rc)
@@ -3655,7 +3798,6 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype, max_len, min_len;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = ses->server;
unsigned int rsp_len, offset;
int flags = 0;
@@ -3678,7 +3820,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
@@ -3691,20 +3833,20 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
rsp_len = le32_to_cpu(rsp->OutputBufferLength);
offset = le16_to_cpu(rsp->OutputBufferOffset);
- rc = validate_iov(server, offset, rsp_len, &rsp_iov, min_len);
+ rc = validate_iov(offset, rsp_len, &rsp_iov, min_len);
if (rc)
goto qfsattr_exit;
if (level == FS_ATTRIBUTE_INFORMATION)
- memcpy(&tcon->fsAttrInfo, server->vals->header_preamble_size + offset
- + (char *)&rsp->hdr, min_t(unsigned int,
+ memcpy(&tcon->fsAttrInfo, offset
+ + (char *)rsp, min_t(unsigned int,
rsp_len, max_len));
else if (level == FS_DEVICE_INFORMATION)
- memcpy(&tcon->fsDevInfo, server->vals->header_preamble_size + offset
- + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
+ memcpy(&tcon->fsDevInfo, offset
+ + (char *)rsp, sizeof(FILE_SYSTEM_DEVICE_INFO));
else if (level == FS_SECTOR_SIZE_INFORMATION) {
struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *)
- (server->vals->header_preamble_size + offset + (char *)&rsp->hdr);
+ (offset + (char *)rsp);
tcon->ss_flags = le32_to_cpu(ss_info->Flags);
tcon->perf_sector_size =
le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf);
@@ -3735,7 +3877,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.ProcessId = cpu_to_le32(pid);
@@ -3758,6 +3900,8 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc);
cifs_stats_fail_inc(tcon, SMB2_LOCK_HE);
+ trace_smb3_lock_err(xid, persist_fid, tcon->tid,
+ tcon->ses->Suid, rc);
}
return rc;
@@ -3799,7 +3943,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.CreditRequest = cpu_to_le16(1);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index d28f358022c5..a345560001ce 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -122,25 +122,10 @@ struct smb2_sync_pdu {
__le16 StructureSize2; /* size of wct area (varies, request specific) */
} __packed;
-struct smb2_hdr {
- __be32 smb2_buf_length; /* big endian on wire */
- /* length is only two or three bytes - with */
- /* one or two byte type preceding it that MBZ */
- struct smb2_sync_hdr sync_hdr;
-} __packed;
-
-struct smb2_pdu {
- struct smb2_hdr hdr;
- __le16 StructureSize2; /* size of wct area (varies, request specific) */
-} __packed;
-
#define SMB3_AES128CMM_NONCE 11
#define SMB3_AES128GCM_NONCE 12
struct smb2_transform_hdr {
- __be32 smb2_buf_length; /* big endian on wire */
- /* length is only two or three bytes - with
- one or two byte type preceding it that MBZ */
__le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
__u8 Signature[16];
__u8 Nonce[16];
@@ -171,7 +156,7 @@ struct smb2_transform_hdr {
#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
struct smb2_err_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize;
__le16 Reserved; /* MBZ */
__le32 ByteCount; /* even if zero, at least one byte follows */
@@ -300,8 +285,16 @@ struct smb2_encryption_neg_context {
__le16 Ciphers[1]; /* Ciphers[0] since only one used now */
} __packed;
+#define POSIX_CTXT_DATA_LEN 8
+struct smb2_posix_neg_context {
+ __le16 ContextType; /* 0x100 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le64 Reserved1; /* In case needed for future (eg version or caps) */
+} __packed;
+
struct smb2_negotiate_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 65 */
__le16 SecurityMode;
__le16 DialectRevision;
@@ -341,7 +334,7 @@ struct smb2_sess_setup_req {
#define SMB2_SESSION_FLAG_IS_NULL 0x0002
#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
struct smb2_sess_setup_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 9 */
__le16 SessionFlags;
__le16 SecurityBufferOffset;
@@ -356,7 +349,7 @@ struct smb2_logoff_req {
} __packed;
struct smb2_logoff_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
@@ -452,7 +445,7 @@ struct smb2_tree_connect_req_extension {
} __packed;
struct smb2_tree_connect_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 16 */
__u8 ShareType; /* see below */
__u8 Reserved;
@@ -503,7 +496,7 @@ struct smb2_tree_disconnect_req {
} __packed;
struct smb2_tree_disconnect_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
@@ -615,7 +608,9 @@ struct smb2_tree_disconnect_rsp {
#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74
-#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9
+#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83
+#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C
+
struct smb2_create_req {
struct smb2_sync_hdr sync_hdr;
@@ -638,7 +633,7 @@ struct smb2_create_req {
} __packed;
struct smb2_create_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 89 */
__u8 OplockLevel;
__u8 Reserved;
@@ -727,6 +722,13 @@ struct create_durable {
} Data;
} __packed;
+struct create_posix {
+ struct create_context ccontext;
+ __u8 Name[16];
+ __le32 Mode;
+ __u32 Reserved;
+} __packed;
+
/* See MS-SMB2 2.2.13.2.11 */
/* Flags */
#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
@@ -894,7 +896,7 @@ struct smb2_ioctl_req {
} __packed;
struct smb2_ioctl_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 57 */
__u16 Reserved;
__le32 CtlCode;
@@ -921,7 +923,7 @@ struct smb2_close_req {
} __packed;
struct smb2_close_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* 60 */
__le16 Flags;
__le32 Reserved;
@@ -944,7 +946,7 @@ struct smb2_flush_req {
} __packed;
struct smb2_flush_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize;
__le16 Reserved;
} __packed;
@@ -976,7 +978,7 @@ struct smb2_read_plain_req {
} __packed;
struct smb2_read_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 17 */
__u8 DataOffset;
__u8 Reserved;
@@ -1007,7 +1009,7 @@ struct smb2_write_req {
} __packed;
struct smb2_write_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 17 */
__u8 DataOffset;
__u8 Reserved;
@@ -1041,7 +1043,7 @@ struct smb2_lock_req {
} __packed;
struct smb2_lock_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
@@ -1053,7 +1055,7 @@ struct smb2_echo_req {
} __packed;
struct smb2_echo_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__u16 Reserved;
} __packed;
@@ -1079,7 +1081,7 @@ struct smb2_query_directory_req {
} __packed;
struct smb2_query_directory_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
@@ -1128,7 +1130,7 @@ struct smb2_query_info_req {
} __packed;
struct smb2_query_info_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
@@ -1150,12 +1152,11 @@ struct smb2_set_info_req {
} __packed;
struct smb2_set_info_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 2 */
} __packed;
-/* oplock break without an rfc1002 header */
-struct smb2_oplock_break_req {
+struct smb2_oplock_break {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 24 */
__u8 OplockLevel;
@@ -1165,21 +1166,10 @@ struct smb2_oplock_break_req {
__u64 VolatileFid;
} __packed;
-/* oplock break with an rfc1002 header */
-struct smb2_oplock_break_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 24 */
- __u8 OplockLevel;
- __u8 Reserved;
- __le32 Reserved2;
- __u64 PersistentFid;
- __u64 VolatileFid;
-} __packed;
-
#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
struct smb2_lease_break {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 44 */
__le16 Reserved;
__le32 Flags;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 8ba24a95db71..908555b1c6b5 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -36,8 +36,9 @@ struct smb_rqst;
extern int map_smb2_to_linux_error(char *buf, bool log_err);
extern int smb2_check_message(char *buf, unsigned int length,
struct TCP_Server_Info *server);
-extern unsigned int smb2_calc_size(void *buf);
-extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr);
+extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server);
+extern char *smb2_get_data_area_len(int *off, int *len,
+ struct smb2_sync_hdr *shdr);
extern __le16 *cifs_convert_path_to_utf16(const char *from,
struct cifs_sb_info *cifs_sb);
@@ -65,6 +66,8 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
+extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *pfid);
extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
struct smb2_file_all_info *src);
extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
@@ -129,6 +132,8 @@ extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
char **out_data, u32 *plen /* returned data len */);
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
+extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, int flags);
extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 8806f3f76c1d..2c671123a6bf 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -480,7 +480,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
unsigned int rc;
char server_response_sig[16];
struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base;
+ (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
if ((shdr->Command == SMB2_NEGOTIATE) ||
(shdr->Command == SMB2_SESSION_SETUP) ||
@@ -605,14 +605,12 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
bool log_error)
{
unsigned int len = mid->resp_buf_size;
- struct kvec iov[2];
+ struct kvec iov[1];
struct smb_rqst rqst = { .rq_iov = iov,
- .rq_nvec = 2 };
+ .rq_nvec = 1 };
iov[0].iov_base = (char *)mid->resp_buf;
- iov[0].iov_len = 4;
- iov[1].iov_base = (char *)mid->resp_buf + 4;
- iov[1].iov_len = len;
+ iov[0].iov_len = len;
dump_smb(mid->resp_buf, min_t(u32, 80, len));
/* convert the length into a more usable form */
diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c
new file mode 100644
index 000000000000..bd4a546feec1
--- /dev/null
+++ b/fs/cifs/trace.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, Microsoft Corporation.
+ *
+ * Author(s): Steve French <stfrench@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
new file mode 100644
index 000000000000..61e74d455d90
--- /dev/null
+++ b/fs/cifs/trace.h
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018, Microsoft Corporation.
+ *
+ * Author(s): Steve French <stfrench@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cifs
+
+#if !defined(_CIFS_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _CIFS_TRACE_H
+
+#include <linux/tracepoint.h>
+
+/* For logging errors in read or write */
+DECLARE_EVENT_CLASS(smb3_rw_err_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset,
+ __u32 len,
+ int rc),
+ TP_ARGS(xid, fid, tid, sesid, offset, len, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ __field(__u32, len)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->rc = rc;
+ ),
+ TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset, __entry->len, __entry->rc)
+)
+
+#define DEFINE_SMB3_RW_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset, \
+ __u32 len, \
+ int rc), \
+ TP_ARGS(xid, fid, tid, sesid, offset, len, rc))
+
+DEFINE_SMB3_RW_ERR_EVENT(write_err);
+DEFINE_SMB3_RW_ERR_EVENT(read_err);
+
+
+/* For logging successful read or write */
+DECLARE_EVENT_CLASS(smb3_rw_done_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset,
+ __u32 len),
+ TP_ARGS(xid, fid, tid, sesid, offset, len),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ __field(__u32, len)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ __entry->len = len;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset, __entry->len)
+)
+
+#define DEFINE_SMB3_RW_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset, \
+ __u32 len), \
+ TP_ARGS(xid, fid, tid, sesid, offset, len))
+
+DEFINE_SMB3_RW_DONE_EVENT(write_done);
+DEFINE_SMB3_RW_DONE_EVENT(read_done);
+
+/*
+ * For handle based calls other than read and write, and get/set info
+ */
+DECLARE_EVENT_CLASS(smb3_fd_err_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ int rc),
+ TP_ARGS(xid, fid, tid, sesid, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->rc = rc;
+ ),
+ TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->rc)
+)
+
+#define DEFINE_SMB3_FD_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_fd_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int rc), \
+ TP_ARGS(xid, fid, tid, sesid, rc))
+
+DEFINE_SMB3_FD_ERR_EVENT(flush_err);
+DEFINE_SMB3_FD_ERR_EVENT(lock_err);
+DEFINE_SMB3_FD_ERR_EVENT(close_err);
+
+/*
+ * For handle based query/set info calls
+ */
+DECLARE_EVENT_CLASS(smb3_inf_err_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u8 infclass,
+ __u32 type,
+ int rc),
+ TP_ARGS(xid, fid, tid, sesid, infclass, type, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u8, infclass)
+ __field(__u32, type)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->infclass = infclass;
+ __entry->type = type;
+ __entry->rc = rc;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->infclass, __entry->type, __entry->rc)
+)
+
+#define DEFINE_SMB3_INF_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_inf_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u8 infclass, \
+ __u32 type, \
+ int rc), \
+ TP_ARGS(xid, fid, tid, sesid, infclass, type, rc))
+
+DEFINE_SMB3_INF_ERR_EVENT(query_info_err);
+DEFINE_SMB3_INF_ERR_EVENT(set_info_err);
+DEFINE_SMB3_INF_ERR_EVENT(fsctl_err);
+
+/*
+ * For logging SMB3 Status code and Command for responses which return errors
+ */
+DECLARE_EVENT_CLASS(smb3_cmd_err_class,
+ TP_PROTO(__u32 tid,
+ __u64 sesid,
+ __u16 cmd,
+ __u64 mid,
+ __u32 status,
+ int rc),
+ TP_ARGS(tid, sesid, cmd, mid, status, rc),
+ TP_STRUCT__entry(
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u16, cmd)
+ __field(__u64, mid)
+ __field(__u32, status)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->cmd = cmd;
+ __entry->mid = mid;
+ __entry->status = status;
+ __entry->rc = rc;
+ ),
+ TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d",
+ __entry->sesid, __entry->tid, __entry->cmd, __entry->mid,
+ __entry->status, __entry->rc)
+)
+
+#define DEFINE_SMB3_CMD_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_cmd_err_class, smb3_##name, \
+ TP_PROTO(__u32 tid, \
+ __u64 sesid, \
+ __u16 cmd, \
+ __u64 mid, \
+ __u32 status, \
+ int rc), \
+ TP_ARGS(tid, sesid, cmd, mid, status, rc))
+
+DEFINE_SMB3_CMD_ERR_EVENT(cmd_err);
+
+DECLARE_EVENT_CLASS(smb3_cmd_done_class,
+ TP_PROTO(__u32 tid,
+ __u64 sesid,
+ __u16 cmd,
+ __u64 mid),
+ TP_ARGS(tid, sesid, cmd, mid),
+ TP_STRUCT__entry(
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u16, cmd)
+ __field(__u64, mid)
+ ),
+ TP_fast_assign(
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->cmd = cmd;
+ __entry->mid = mid;
+ ),
+ TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu",
+ __entry->sesid, __entry->tid,
+ __entry->cmd, __entry->mid)
+)
+
+#define DEFINE_SMB3_CMD_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \
+ TP_PROTO(__u32 tid, \
+ __u64 sesid, \
+ __u16 cmd, \
+ __u64 mid), \
+ TP_ARGS(tid, sesid, cmd, mid))
+
+DEFINE_SMB3_CMD_DONE_EVENT(cmd_done);
+
+DECLARE_EVENT_CLASS(smb3_exit_err_class,
+ TP_PROTO(unsigned int xid,
+ const char *func_name,
+ int rc),
+ TP_ARGS(xid, func_name, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(const char *, func_name)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->func_name = func_name;
+ __entry->rc = rc;
+ ),
+ TP_printk("\t%s: xid=%u rc=%d",
+ __entry->func_name, __entry->xid, __entry->rc)
+)
+
+#define DEFINE_SMB3_EXIT_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_exit_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ const char *func_name, \
+ int rc), \
+ TP_ARGS(xid, func_name, rc))
+
+DEFINE_SMB3_EXIT_ERR_EVENT(exit_err);
+
+DECLARE_EVENT_CLASS(smb3_enter_exit_class,
+ TP_PROTO(unsigned int xid,
+ const char *func_name),
+ TP_ARGS(xid, func_name),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(const char *, func_name)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->func_name = func_name;
+ ),
+ TP_printk("\t%s: xid=%u",
+ __entry->func_name, __entry->xid)
+)
+
+#define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \
+DEFINE_EVENT(smb3_enter_exit_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ const char *func_name), \
+ TP_ARGS(xid, func_name))
+
+DEFINE_SMB3_ENTER_EXIT_EVENT(enter);
+DEFINE_SMB3_ENTER_EXIT_EVENT(exit_done);
+
+/*
+ * For smb2/smb3 open call
+ */
+DECLARE_EVENT_CLASS(smb3_open_err_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int create_options,
+ int desired_access,
+ int rc),
+ TP_ARGS(xid, tid, sesid, create_options, desired_access, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, create_options)
+ __field(int, desired_access)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->create_options = create_options;
+ __entry->desired_access = desired_access;
+ __entry->rc = rc;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->create_options, __entry->desired_access, __entry->rc)
+)
+
+#define DEFINE_SMB3_OPEN_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_open_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int create_options, \
+ int desired_access, \
+ int rc), \
+ TP_ARGS(xid, tid, sesid, create_options, desired_access, rc))
+
+DEFINE_SMB3_OPEN_ERR_EVENT(open_err);
+
+
+DECLARE_EVENT_CLASS(smb3_open_done_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ int create_options,
+ int desired_access),
+ TP_ARGS(xid, fid, tid, sesid, create_options, desired_access),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, create_options)
+ __field(int, desired_access)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->create_options = create_options;
+ __entry->desired_access = desired_access;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx cr_opts=0x%x des_access=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->create_options, __entry->desired_access)
+)
+
+#define DEFINE_SMB3_OPEN_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_open_done_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int create_options, \
+ int desired_access), \
+ TP_ARGS(xid, fid, tid, sesid, create_options, desired_access))
+
+DEFINE_SMB3_OPEN_DONE_EVENT(open_done);
+
+#endif /* _CIFS_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 927226a2122f..e7254e386b79 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -800,8 +800,8 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
#ifdef CONFIG_CIFS_SMB311
if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) {
struct kvec iov = {
- .iov_base = buf + 4,
- .iov_len = get_rfc1002_length(buf)
+ .iov_base = buf,
+ .iov_len = midQ->resp_buf_size
};
smb311_update_preauth_hash(ses, &iov, 1);
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 124b093d14e5..c4fb9ad7c808 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -808,10 +808,7 @@ static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, un
}
out:
mutex_unlock(&read_mutex);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int cramfs_readpage(struct file *file, struct page *page)
diff --git a/fs/dax.c b/fs/dax.c
index aaec72ded1b6..aa86d9f971a4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -677,7 +677,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
* downgrading page table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 5243989a60cc..a5e4a221435c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1037,6 +1037,7 @@ static void sctp_connect_to_sock(struct connection *con)
int result;
int addr_len;
struct socket *sock;
+ struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
if (con->nodeid == 0) {
log_print("attempt to connect sock 0 foiled");
@@ -1080,11 +1081,22 @@ static void sctp_connect_to_sock(struct connection *con)
log_print("connecting to %d", con->nodeid);
/* Turn off Nagle's algorithm */
- kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+ kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
sizeof(one));
+ /*
+ * Make sock->ops->connect() function return in specified time,
+ * since O_NONBLOCK argument in connect() function does not work here,
+ * then, we should restore the default value of this attribute.
+ */
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ sizeof(tv));
result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
- O_NONBLOCK);
+ 0);
+ memset(&tv, 0, sizeof(tv));
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ sizeof(tv));
+
if (result == -EINPROGRESS)
result = 0;
if (result == 0)
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08d3bd602f73..61c9514da5e9 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -101,14 +101,20 @@ static int eventfd_release(struct inode *inode, struct file *file)
return 0;
}
-static __poll_t eventfd_poll(struct file *file, poll_table *wait)
+static struct wait_queue_head *
+eventfd_get_poll_head(struct file *file, __poll_t events)
+{
+ struct eventfd_ctx *ctx = file->private_data;
+
+ return &ctx->wqh;
+}
+
+static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
{
struct eventfd_ctx *ctx = file->private_data;
__poll_t events = 0;
u64 count;
- poll_wait(file, &ctx->wqh, wait);
-
/*
* All writes to ctx->count occur within ctx->wqh.lock. This read
* can be done outside ctx->wqh.lock because we know that poll_wait
@@ -305,7 +311,8 @@ static const struct file_operations eventfd_fops = {
.show_fdinfo = eventfd_show_fdinfo,
#endif
.release = eventfd_release,
- .poll = eventfd_poll,
+ .get_poll_head = eventfd_get_poll_head,
+ .poll_mask = eventfd_poll_mask,
.read = eventfd_read,
.write = eventfd_write,
.llseek = noop_llseek,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 602ca4285b2e..67db22fe99c5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
pt->_key = epi->event.events;
if (!is_file_epoll(epi->ffd.file))
- return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
- epi->event.events;
+ return vfs_poll(epi->ffd.file, pt) & epi->event.events;
ep = epi->ffd.file->private_data;
poll_wait(epi->ffd.file, &ep->poll_wait, pt);
@@ -2025,7 +2024,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/* The target file descriptor must support poll */
error = -EPERM;
- if (!tf.file->f_op->poll)
+ if (!file_can_poll(tf.file))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d737ff082472..c42169459298 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -871,9 +871,9 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
if (fa->fa_file != filp)
continue;
- spin_lock_irq(&fa->fa_lock);
+ write_lock_irq(&fa->fa_lock);
fa->fa_file = NULL;
- spin_unlock_irq(&fa->fa_lock);
+ write_unlock_irq(&fa->fa_lock);
*fp = fa->fa_next;
call_rcu(&fa->fa_rcu, fasync_free_rcu);
@@ -918,13 +918,13 @@ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasy
if (fa->fa_file != filp)
continue;
- spin_lock_irq(&fa->fa_lock);
+ write_lock_irq(&fa->fa_lock);
fa->fa_fd = fd;
- spin_unlock_irq(&fa->fa_lock);
+ write_unlock_irq(&fa->fa_lock);
goto out;
}
- spin_lock_init(&new->fa_lock);
+ rwlock_init(&new->fa_lock);
new->magic = FASYNC_MAGIC;
new->fa_file = filp;
new->fa_fd = fd;
@@ -987,14 +987,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
struct fown_struct *fown;
- unsigned long flags;
if (fa->magic != FASYNC_MAGIC) {
printk(KERN_ERR "kill_fasync: bad magic number in "
"fasync_struct!\n");
return;
}
- spin_lock_irqsave(&fa->fa_lock, flags);
+ read_lock(&fa->fa_lock);
if (fa->fa_file) {
fown = &fa->fa_file->f_owner;
/* Don't send SIGURG to processes which have not set a
@@ -1003,7 +1002,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
- spin_unlock_irqrestore(&fa->fa_lock, flags);
+ read_unlock(&fa->fa_lock);
fa = rcu_dereference(fa->fa_next);
}
}
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index ce4785fd81c6..a51425634f65 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -193,13 +193,9 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
return ERR_PTR(-ENAMETOOLONG);
ino = vxfs_inode_by_name(dip, dp);
- if (ino) {
+ if (ino)
ip = vxfs_iget(dip->i_sb, ino);
- if (IS_ERR(ip))
- return ERR_CAST(ip);
- }
- d_add(dp, ip);
- return NULL;
+ return d_splice_alias(ip, dp);
}
/**
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index f58716567972..35f5ee23566d 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -54,8 +54,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
continue;
if (start >= to)
break;
- if (gfs2_is_jdata(ip))
- set_buffer_uptodate(bh);
+ set_buffer_uptodate(bh);
gfs2_trans_add_data(ip->i_gl, bh);
}
}
@@ -747,18 +746,21 @@ out:
put_page(page);
gfs2_trans_end(sdp);
- if (pos + len > ip->i_inode.i_size)
- gfs2_trim_blocks(&ip->i_inode);
- goto out_trans_fail;
+ if (alloc_required) {
+ gfs2_inplace_release(ip);
+ if (pos + len > ip->i_inode.i_size)
+ gfs2_trim_blocks(&ip->i_inode);
+ }
+ goto out_qunlock;
out_endtrans:
gfs2_trans_end(sdp);
out_trans_fail:
- if (alloc_required) {
+ if (alloc_required)
gfs2_inplace_release(ip);
out_qunlock:
+ if (alloc_required)
gfs2_quota_unlock(ip);
- }
out_unlock:
if (&ip->i_inode == sdp->sd_rindex) {
gfs2_glock_dq(&m_ip->i_gh);
@@ -814,7 +816,6 @@ out:
* @inode: The inode
* @dibh: The buffer_head containing the on-disk inode
* @pos: The file position
- * @len: The length of the write
* @copied: How much was actually copied by the VFS
* @page: The page
*
@@ -824,17 +825,15 @@ out:
* Returns: errno
*/
static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
- loff_t pos, unsigned len, unsigned copied,
+ loff_t pos, unsigned copied,
struct page *page)
{
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
u64 to = pos + copied;
void *kaddr;
unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
- BUG_ON(pos + len > gfs2_max_stuffed_size(ip));
+ BUG_ON(pos + copied > gfs2_max_stuffed_size(ip));
kaddr = kmap_atomic(page);
memcpy(buf + pos, kaddr + pos, copied);
@@ -850,20 +849,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
i_size_write(inode, to);
mark_inode_dirty(inode);
}
-
- if (inode == sdp->sd_rindex) {
- adjust_fs_space(inode);
- sdp->sd_rindex_uptodate = 0;
- }
-
- brelse(dibh);
- gfs2_trans_end(sdp);
- if (inode == sdp->sd_rindex) {
- gfs2_glock_dq(&m_ip->i_gh);
- gfs2_holder_uninit(&m_ip->i_gh);
- }
- gfs2_glock_dq(&ip->i_gh);
- gfs2_holder_uninit(&ip->i_gh);
return copied;
}
@@ -877,9 +862,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
* @page: The page that has been written
* @fsdata: The fsdata (unused in GFS2)
*
- * The main write_end function for GFS2. We have a separate one for
- * stuffed files as they are slightly different, otherwise we just
- * put our locking around the VFS provided functions.
+ * The main write_end function for GFS2. We just put our locking around the VFS
+ * provided functions.
*
* Returns: errno
*/
@@ -900,32 +884,39 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
ret = gfs2_meta_inode_buffer(ip, &dibh);
- if (unlikely(ret)) {
- unlock_page(page);
- put_page(page);
- goto failed;
- }
+ if (unlikely(ret))
+ goto out;
- if (gfs2_is_stuffed(ip))
- return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
+ if (gfs2_is_stuffed(ip)) {
+ ret = gfs2_stuffed_write_end(inode, dibh, pos, copied, page);
+ page = NULL;
+ goto out2;
+ }
- if (!gfs2_is_writeback(ip))
+ if (gfs2_is_jdata(ip))
gfs2_page_add_databufs(ip, page, pos & ~PAGE_MASK, len);
+ else
+ gfs2_ordered_add_inode(ip);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ page = NULL;
if (tr->tr_num_buf_new)
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
else
gfs2_trans_add_meta(ip->i_gl, dibh);
-
+out2:
if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
sdp->sd_rindex_uptodate = 0;
}
brelse(dibh);
-failed:
+out:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
gfs2_trans_end(sdp);
gfs2_inplace_release(ip);
if (ip->i_qadata && ip->i_qadata->qa_qd_num)
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 278ed0869c3c..a7b586e02693 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -89,10 +89,12 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
map_bh(bh, inode->i_sb, block);
set_buffer_uptodate(bh);
- if (!gfs2_is_jdata(ip))
- mark_buffer_dirty(bh);
- if (!gfs2_is_writeback(ip))
+ if (gfs2_is_jdata(ip))
gfs2_trans_add_data(ip->i_gl, bh);
+ else {
+ mark_buffer_dirty(bh);
+ gfs2_ordered_add_inode(ip);
+ }
if (release) {
unlock_page(page);
@@ -176,8 +178,8 @@ out:
/**
* find_metapath - Find path through the metadata tree
* @sdp: The superblock
- * @mp: The metapath to return the result in
* @block: The disk block to look up
+ * @mp: The metapath to return the result in
* @height: The pre-calculated height of the metadata tree
*
* This routine returns a struct metapath structure that defines a path
@@ -188,8 +190,7 @@ out:
* filesystem with a blocksize of 4096.
*
* find_metapath() would return a struct metapath structure set to:
- * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
- * and mp_list[2] = 165.
+ * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
*
* That means that in order to get to the block containing the byte at
* offset 101342453, we would load the indirect block pointed to by pointer
@@ -279,6 +280,21 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp
return p + mp->mp_list[height];
}
+static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
+{
+ const struct buffer_head *bh = mp->mp_bh[height];
+ return (const __be64 *)(bh->b_data + bh->b_size);
+}
+
+static void clone_metapath(struct metapath *clone, struct metapath *mp)
+{
+ unsigned int hgt;
+
+ *clone = *mp;
+ for (hgt = 0; hgt < mp->mp_aheight; hgt++)
+ get_bh(clone->mp_bh[hgt]);
+}
+
static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
{
const __be64 *t;
@@ -420,20 +436,140 @@ static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __b
return (ptr - first);
}
-static inline void bmap_lock(struct gfs2_inode *ip, int create)
+typedef const __be64 *(*gfs2_metadata_walker)(
+ struct metapath *mp,
+ const __be64 *start, const __be64 *end,
+ u64 factor, void *data);
+
+#define WALK_STOP ((__be64 *)0)
+#define WALK_NEXT ((__be64 *)1)
+
+static int gfs2_walk_metadata(struct inode *inode, sector_t lblock,
+ u64 len, struct metapath *mp, gfs2_metadata_walker walker,
+ void *data)
{
- if (create)
- down_write(&ip->i_rw_mutex);
- else
- down_read(&ip->i_rw_mutex);
+ struct metapath clone;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ const __be64 *start, *end, *ptr;
+ u64 factor = 1;
+ unsigned int hgt;
+ int ret = 0;
+
+ for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--)
+ factor *= sdp->sd_inptrs;
+
+ for (;;) {
+ u64 step;
+
+ /* Walk indirect block. */
+ start = metapointer(hgt, mp);
+ end = metaend(hgt, mp);
+
+ step = (end - start) * factor;
+ if (step > len)
+ end = start + DIV_ROUND_UP_ULL(len, factor);
+
+ ptr = walker(mp, start, end, factor, data);
+ if (ptr == WALK_STOP)
+ break;
+ if (step >= len)
+ break;
+ len -= step;
+ if (ptr != WALK_NEXT) {
+ BUG_ON(!*ptr);
+ mp->mp_list[hgt] += ptr - start;
+ goto fill_up_metapath;
+ }
+
+lower_metapath:
+ /* Decrease height of metapath. */
+ if (mp != &clone) {
+ clone_metapath(&clone, mp);
+ mp = &clone;
+ }
+ brelse(mp->mp_bh[hgt]);
+ mp->mp_bh[hgt] = NULL;
+ if (!hgt)
+ break;
+ hgt--;
+ factor *= sdp->sd_inptrs;
+
+ /* Advance in metadata tree. */
+ (mp->mp_list[hgt])++;
+ start = metapointer(hgt, mp);
+ end = metaend(hgt, mp);
+ if (start >= end) {
+ mp->mp_list[hgt] = 0;
+ if (!hgt)
+ break;
+ goto lower_metapath;
+ }
+
+fill_up_metapath:
+ /* Increase height of metapath. */
+ if (mp != &clone) {
+ clone_metapath(&clone, mp);
+ mp = &clone;
+ }
+ ret = fillup_metapath(ip, mp, ip->i_height - 1);
+ if (ret < 0)
+ break;
+ hgt += ret;
+ for (; ret; ret--)
+ do_div(factor, sdp->sd_inptrs);
+ mp->mp_aheight = hgt + 1;
+ }
+ if (mp == &clone)
+ release_metapath(mp);
+ return ret;
}
-static inline void bmap_unlock(struct gfs2_inode *ip, int create)
+struct gfs2_hole_walker_args {
+ u64 blocks;
+};
+
+static const __be64 *gfs2_hole_walker(struct metapath *mp,
+ const __be64 *start, const __be64 *end,
+ u64 factor, void *data)
{
- if (create)
- up_write(&ip->i_rw_mutex);
- else
- up_read(&ip->i_rw_mutex);
+ struct gfs2_hole_walker_args *args = data;
+ const __be64 *ptr;
+
+ for (ptr = start; ptr < end; ptr++) {
+ if (*ptr) {
+ args->blocks += (ptr - start) * factor;
+ if (mp->mp_aheight == mp->mp_fheight)
+ return WALK_STOP;
+ return ptr; /* increase height */
+ }
+ }
+ args->blocks += (end - start) * factor;
+ return WALK_NEXT;
+}
+
+/**
+ * gfs2_hole_size - figure out the size of a hole
+ * @inode: The inode
+ * @lblock: The logical starting block number
+ * @len: How far to look (in blocks)
+ * @mp: The metapath at lblock
+ * @iomap: The iomap to store the hole size in
+ *
+ * This function modifies @mp.
+ *
+ * Returns: errno on error
+ */
+static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
+ struct metapath *mp, struct iomap *iomap)
+{
+ struct gfs2_hole_walker_args args = { };
+ int ret = 0;
+
+ ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args);
+ if (!ret)
+ iomap->length = args.blocks << inode->i_blkbits;
+ return ret;
}
static inline __be64 *gfs2_indirect_init(struct metapath *mp,
@@ -462,15 +598,11 @@ enum alloc_state {
};
/**
- * gfs2_bmap_alloc - Build a metadata tree of the requested height
+ * gfs2_iomap_alloc - Build a metadata tree of the requested height
* @inode: The GFS2 inode
- * @lblock: The logical starting block of the extent
- * @bh_map: This is used to return the mapping details
- * @zero_new: True if newly allocated blocks should be zeroed
+ * @iomap: The iomap structure
+ * @flags: iomap flags
* @mp: The metapath, with proper height information calculated
- * @maxlen: The max number of data blocks to alloc
- * @dblock: Pointer to return the resulting new block
- * @dblks: Pointer to return the number of blocks allocated
*
* In this routine we may have to alloc:
* i) Indirect blocks to grow the metadata tree height
@@ -483,6 +615,13 @@ enum alloc_state {
* blocks are available, there will only be one request per bmap call)
* and uses the state machine to initialise the blocks in order.
*
+ * Right now, this function will allocate at most one indirect block
+ * worth of data -- with a default block size of 4K, that's slightly
+ * less than 2M. If this limitation is ever removed to allow huge
+ * allocations, we would probably still want to limit the iomap size we
+ * return to avoid stalling other tasks during huge writes; the next
+ * iomap iteration would then find the blocks already allocated.
+ *
* Returns: errno on error
*/
@@ -497,6 +636,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
unsigned dblks = 0;
unsigned ptrs_per_blk;
const unsigned end_of_metadata = mp->mp_fheight - 1;
+ int ret;
enum alloc_state state;
__be64 *ptr;
__be64 zero_bn = 0;
@@ -507,6 +647,8 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
gfs2_trans_add_meta(ip->i_gl, dibh);
+ down_write(&ip->i_rw_mutex);
+
if (mp->mp_fheight == mp->mp_aheight) {
struct buffer_head *bh;
int eob;
@@ -542,11 +684,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
blks = dblks + iblks;
i = mp->mp_aheight;
do {
- int error;
n = blks - alloced;
- error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
- if (error)
- return error;
+ ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+ if (ret)
+ goto out;
alloced += n;
if (state != ALLOC_DATA || gfs2_is_jdata(ip))
gfs2_trans_add_unrevoke(sdp, bn, n);
@@ -602,7 +743,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
dblks = n;
ptr = metapointer(end_of_metadata, mp);
iomap->addr = bn << inode->i_blkbits;
- iomap->flags |= IOMAP_F_NEW;
+ iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
while (n-- > 0)
*ptr++ = cpu_to_be64(bn++);
break;
@@ -612,64 +753,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
iomap->length = (u64)dblks << inode->i_blkbits;
ip->i_height = mp->mp_fheight;
gfs2_add_inode_blocks(&ip->i_inode, alloced);
- gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
- return 0;
-}
-
-/**
- * hole_size - figure out the size of a hole
- * @inode: The inode
- * @lblock: The logical starting block number
- * @mp: The metapath
- *
- * Returns: The hole size in bytes
- *
- */
-static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct metapath mp_eof;
- u64 factor = 1;
- int hgt;
- u64 holesz = 0;
- const __be64 *first, *end, *ptr;
- const struct buffer_head *bh;
- u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
- int zeroptrs;
- bool done = false;
-
- /* Get another metapath, to the very last byte */
- find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
- for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
- bh = mp->mp_bh[hgt];
- if (bh) {
- zeroptrs = 0;
- first = metapointer(hgt, mp);
- end = (const __be64 *)(bh->b_data + bh->b_size);
-
- for (ptr = first; ptr < end; ptr++) {
- if (*ptr) {
- done = true;
- break;
- } else {
- zeroptrs++;
- }
- }
- } else {
- zeroptrs = sdp->sd_inptrs;
- }
- if (factor * zeroptrs >= lblock_stop - lblock + 1) {
- holesz = lblock_stop - lblock + 1;
- break;
- }
- holesz += factor * zeroptrs;
-
- factor *= sdp->sd_inptrs;
- if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
- (mp->mp_list[hgt - 1])++;
- }
- return holesz << inode->i_blkbits;
+ gfs2_dinode_out(ip, dibh->b_data);
+out:
+ up_write(&ip->i_rw_mutex);
+ return ret;
}
static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
@@ -685,121 +772,130 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
}
/**
- * gfs2_iomap_begin - Map blocks from an inode to disk blocks
+ * gfs2_iomap_get - Map blocks from an inode to disk blocks
* @inode: The inode
* @pos: Starting position in bytes
* @length: Length to map, in bytes
* @flags: iomap flags
* @iomap: The iomap structure
+ * @mp: The metapath
*
* Returns: errno
*/
-int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
- unsigned flags, struct iomap *iomap)
+static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap *iomap,
+ struct metapath *mp)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct metapath mp = { .mp_aheight = 1, };
- unsigned int factor = sdp->sd_sb.sb_bsize;
- const u64 *arr = sdp->sd_heightsize;
__be64 *ptr;
sector_t lblock;
- sector_t lend;
- int ret = 0;
+ sector_t lblock_stop;
+ int ret;
int eob;
- unsigned int len;
+ u64 len;
struct buffer_head *bh;
u8 height;
- trace_gfs2_iomap_start(ip, pos, length, flags);
- if (!length) {
- ret = -EINVAL;
- goto out;
- }
+ if (!length)
+ return -EINVAL;
if (gfs2_is_stuffed(ip)) {
if (flags & IOMAP_REPORT) {
+ if (pos >= i_size_read(inode))
+ return -ENOENT;
gfs2_stuffed_iomap(inode, iomap);
- if (pos >= iomap->length)
- ret = -ENOENT;
- goto out;
+ return 0;
}
BUG_ON(!(flags & IOMAP_WRITE));
}
-
lblock = pos >> inode->i_blkbits;
- lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;
-
iomap->offset = lblock << inode->i_blkbits;
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->type = IOMAP_HOLE;
- iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
- iomap->flags = IOMAP_F_MERGED;
- bmap_lock(ip, flags & IOMAP_WRITE);
+ lblock_stop = (pos + length - 1) >> inode->i_blkbits;
+ len = lblock_stop - lblock + 1;
- /*
- * Directory data blocks have a struct gfs2_meta_header header, so the
- * remaining size is smaller than the filesystem block size. Logical
- * block numbers for directories are in units of this remaining size!
- */
- if (gfs2_is_dir(ip)) {
- factor = sdp->sd_jbsize;
- arr = sdp->sd_jheightsize;
- }
+ down_read(&ip->i_rw_mutex);
- ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
+ ret = gfs2_meta_inode_buffer(ip, &mp->mp_bh[0]);
if (ret)
- goto out_release;
+ goto unlock;
height = ip->i_height;
- while ((lblock + 1) * factor > arr[height])
+ while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
height++;
- find_metapath(sdp, lblock, &mp, height);
+ find_metapath(sdp, lblock, mp, height);
if (height > ip->i_height || gfs2_is_stuffed(ip))
goto do_alloc;
- ret = lookup_metapath(ip, &mp);
+ ret = lookup_metapath(ip, mp);
if (ret)
- goto out_release;
+ goto unlock;
- if (mp.mp_aheight != ip->i_height)
+ if (mp->mp_aheight != ip->i_height)
goto do_alloc;
- ptr = metapointer(ip->i_height - 1, &mp);
+ ptr = metapointer(ip->i_height - 1, mp);
if (*ptr == 0)
goto do_alloc;
- iomap->type = IOMAP_MAPPED;
- iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
+ bh = mp->mp_bh[ip->i_height - 1];
+ len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, len, &eob);
- bh = mp.mp_bh[ip->i_height - 1];
- len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
+ iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
+ iomap->length = len << inode->i_blkbits;
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_MERGED;
if (eob)
iomap->flags |= IOMAP_F_BOUNDARY;
- iomap->length = (u64)len << inode->i_blkbits;
-out_release:
- release_metapath(&mp);
- bmap_unlock(ip, flags & IOMAP_WRITE);
out:
- trace_gfs2_iomap_end(ip, iomap, ret);
+ iomap->bdev = inode->i_sb->s_bdev;
+unlock:
+ up_read(&ip->i_rw_mutex);
return ret;
do_alloc:
- if (flags & IOMAP_WRITE) {
- ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
- } else if (flags & IOMAP_REPORT) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = len << inode->i_blkbits;
+ iomap->type = IOMAP_HOLE;
+ iomap->flags = 0;
+ if (flags & IOMAP_REPORT) {
loff_t size = i_size_read(inode);
if (pos >= size)
ret = -ENOENT;
- else if (height <= ip->i_height)
- iomap->length = hole_size(inode, lblock, &mp);
+ else if (height == ip->i_height)
+ ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
else
iomap->length = size - pos;
}
- goto out_release;
+ goto out;
+}
+
+static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap *iomap)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct metapath mp = { .mp_aheight = 1, };
+ int ret;
+
+ trace_gfs2_iomap_start(ip, pos, length, flags);
+ if (flags & IOMAP_WRITE) {
+ ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+ if (!ret && iomap->type == IOMAP_HOLE)
+ ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
+ release_metapath(&mp);
+ } else {
+ ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+ release_metapath(&mp);
+ }
+ trace_gfs2_iomap_end(ip, iomap, ret);
+ return ret;
}
+const struct iomap_ops gfs2_iomap_ops = {
+ .iomap_begin = gfs2_iomap_begin,
+};
+
/**
* gfs2_block_map - Map one or more blocks of an inode to a disk block
* @inode: The inode
@@ -825,25 +921,34 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
struct buffer_head *bh_map, int create)
{
struct gfs2_inode *ip = GFS2_I(inode);
- struct iomap iomap;
- int ret, flags = 0;
+ loff_t pos = (loff_t)lblock << inode->i_blkbits;
+ loff_t length = bh_map->b_size;
+ struct metapath mp = { .mp_aheight = 1, };
+ struct iomap iomap = { };
+ int ret;
clear_buffer_mapped(bh_map);
clear_buffer_new(bh_map);
clear_buffer_boundary(bh_map);
trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
- if (create)
- flags |= IOMAP_WRITE;
- ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
- bh_map->b_size, flags, &iomap);
- if (ret) {
- if (!create && ret == -ENOENT) {
- /* Return unmapped buffer beyond the end of file. */
+ if (create) {
+ ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
+ if (!ret && iomap.type == IOMAP_HOLE)
+ ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp);
+ release_metapath(&mp);
+ } else {
+ ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
+ release_metapath(&mp);
+
+ /* Return unmapped buffer beyond the end of file. */
+ if (ret == -ENOENT) {
ret = 0;
+ goto out;
}
- goto out;
}
+ if (ret)
+ goto out;
if (iomap.length > bh_map->b_size) {
iomap.length = bh_map->b_size;
@@ -945,8 +1050,10 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
err = 0;
}
- if (!gfs2_is_writeback(ip))
+ if (gfs2_is_jdata(ip))
gfs2_trans_add_data(ip->i_gl, bh);
+ else
+ gfs2_ordered_add_inode(ip);
zero_user(page, offset, length);
mark_buffer_dirty(bh);
@@ -1056,6 +1163,19 @@ out:
return error;
}
+int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap)
+{
+ struct metapath mp = { .mp_aheight = 1, };
+ int ret;
+
+ ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
+ if (!ret && iomap->type == IOMAP_HOLE)
+ ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp);
+ release_metapath(&mp);
+ return ret;
+}
+
/**
* sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
* @ip: inode
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index c3402fe00653..6b18fb323f0a 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -46,11 +46,13 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
}
}
+extern const struct iomap_ops gfs2_iomap_ops;
+
extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
extern int gfs2_block_map(struct inode *inode, sector_t lblock,
struct buffer_head *bh, int create);
-extern int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
- unsigned flags, struct iomap *iomap);
+extern int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap);
extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
u64 *dblock, unsigned *extlen);
extern int gfs2_setattr_size(struct inode *inode, u64 size);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4b71f021a9e2..7137db7b0119 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -733,7 +733,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
struct gfs2_inode *ip = GFS2_I(inode);
loff_t end = offset + len;
struct buffer_head *dibh;
- struct iomap iomap;
+ struct iomap iomap = { };
int error;
error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -749,8 +749,8 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
}
while (offset < end) {
- error = gfs2_iomap_begin(inode, offset, end - offset,
- IOMAP_WRITE, &iomap);
+ error = gfs2_iomap_get_alloc(inode, offset, end - offset,
+ &iomap);
if (error)
goto out;
offset = iomap.offset + iomap.length;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1b6b1e3f5caf..d2ad817e089f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -116,6 +116,7 @@ static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
{
+ BUG_ON(rbm->offset >= rbm->rgd->rd_data);
return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
rbm->offset;
}
@@ -696,8 +697,6 @@ struct gfs2_sbd {
u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
u32 sd_max_height; /* Max height of a file's metadata tree */
u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
- u32 sd_max_jheight; /* Max height of journaled file's meta tree */
- u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
struct gfs2_args sd_args; /* Mount arguments */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8700eb815638..feda55f67050 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -2006,10 +2006,6 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat,
return 0;
}
-const struct iomap_ops gfs2_iomap_ops = {
- .iomap_begin = gfs2_iomap_begin,
-};
-
static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 1862e310a067..20241436126d 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -14,6 +14,7 @@
#include <linux/spinlock.h>
#include <linux/writeback.h>
#include "incore.h"
+#include "inode.h"
/**
* gfs2_log_lock - acquire the right to mess with the log manager
@@ -50,8 +51,12 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_sbd *sdp;
+ if (!gfs2_is_ordered(ip))
+ return;
+
+ sdp = GFS2_SB(&ip->i_inode);
if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
spin_lock(&sdp->sd_ordered_lock);
if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3ba3f167641c..c2469833b4fb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -335,25 +335,6 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
sdp->sd_heightsize[x] = ~0;
gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
- sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
- sizeof(struct gfs2_dinode);
- sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
- for (x = 2;; x++) {
- u64 space, d;
- u32 m;
-
- space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
- d = space;
- m = do_div(d, sdp->sd_inptrs);
-
- if (d != sdp->sd_jheightsize[x - 1] || m)
- break;
- sdp->sd_jheightsize[x] = space;
- }
- sdp->sd_max_jheight = x;
- sdp->sd_jheightsize[x] = ~0;
- gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
-
sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
sizeof(struct gfs2_leaf)) /
GFS2_MIN_DIRENT_SIZE;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 7a98abd340ee..e8585dfd209f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,7 +735,10 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
if (!buffer_uptodate(bh))
goto unlock_out;
}
- gfs2_trans_add_data(ip->i_gl, bh);
+ if (gfs2_is_jdata(ip))
+ gfs2_trans_add_data(ip->i_gl, bh);
+ else
+ gfs2_ordered_add_inode(ip);
/* If we need to write to the next block as well */
if (to_write > (bsize - boff)) {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8b683917a27e..6bc5cfe710d1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -372,8 +372,8 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
start = bi->bi_bh->b_data;
if (bi->bi_clone)
start = bi->bi_clone;
- end = start + bi->bi_bh->b_size;
start += bi->bi_offset;
+ end = start + bi->bi_len;
BUG_ON(rbm.offset & 3);
start += (rbm.offset / GFS2_NBBY);
bytes = min_t(u32, len / GFS2_NBBY, (end - start));
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index c75cacaa349b..064c9a0ef046 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -143,32 +143,21 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
* @gl: The inode glock associated with the buffer
* @bh: The buffer to add
*
- * This is used in two distinct cases:
- * i) In ordered write mode
- * We put the data buffer on a list so that we can ensure that it's
- * synced to disk at the right time
- * ii) In journaled data mode
- * We need to journal the data block in the same way as metadata in
- * the functions above. The difference is that here we have a tag
- * which is two __be64's being the block number (as per meta data)
- * and a flag which says whether the data block needs escaping or
- * not. This means we need a new log entry for each 251 or so data
- * blocks, which isn't an enormous overhead but twice as much as
- * for normal metadata blocks.
+ * This is used in journaled data mode.
+ * We need to journal the data block in the same way as metadata in
+ * the functions above. The difference is that here we have a tag
+ * which is two __be64's being the block number (as per meta data)
+ * and a flag which says whether the data block needs escaping or
+ * not. This means we need a new log entry for each 251 or so data
+ * blocks, which isn't an enormous overhead but twice as much as
+ * for normal metadata blocks.
*/
void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
{
struct gfs2_trans *tr = current->journal_info;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct address_space *mapping = bh->b_page->mapping;
- struct gfs2_inode *ip = GFS2_I(mapping->host);
struct gfs2_bufdata *bd;
- if (!gfs2_is_jdata(ip)) {
- gfs2_ordered_add_inode(ip);
- return;
- }
-
lock_buffer(bh);
if (buffer_pinned(bh)) {
set_bit(TR_TOUCHED, &tr->tr_flags);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 75b254280ff6..3bf2ae0e467c 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -31,21 +31,15 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
res = hfs_brec_read(&fd, &rec, sizeof(rec));
if (res) {
- hfs_find_exit(&fd);
- if (res == -ENOENT) {
- /* No such entry */
- inode = NULL;
- goto done;
- }
- return ERR_PTR(res);
+ if (res != -ENOENT)
+ inode = ERR_PTR(res);
+ } else {
+ inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec);
+ if (!inode)
+ inode = ERR_PTR(-EACCES);
}
- inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec);
hfs_find_exit(&fd);
- if (!inode)
- return ERR_PTR(-EACCES);
-done:
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
/*
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2538b49cc349..b3309b83371a 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -543,9 +543,9 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
igrab(dir);
hlist_add_fake(&inode->i_hash);
mark_inode_dirty(inode);
+ dont_mount(dentry);
out:
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
void hfs_evict_inode(struct inode *inode)
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 15e06fb552da..b5254378f011 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -122,8 +122,7 @@ again:
if (S_ISREG(inode->i_mode))
HFSPLUS_I(inode)->linkid = linkid;
out:
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
fail:
hfs_find_exit(&fd);
return ERR_PTR(err);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 4823431d1c9d..b445b13fc59b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -549,7 +549,7 @@ static int ioctl_fsfreeze(struct file *filp)
{
struct super_block *sb = file_inode(filp)->i_sb;
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
/* If filesystem doesn't support freeze feature, return. */
@@ -566,7 +566,7 @@ static int ioctl_fsthaw(struct file *filp)
{
struct super_block *sb = file_inode(filp)->i_sb;
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
/* Thaw */
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ccf0f00030bf..1a6084d2b02e 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -28,13 +28,9 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un
return ERR_PTR(-ENAMETOOLONG);
ino = minix_inode_by_name(dentry);
- if (ino) {
+ if (ino)
inode = minix_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- }
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
diff --git a/fs/namei.c b/fs/namei.c
index a59968de1636..6df1f61855d6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -984,13 +984,15 @@ static bool safe_hardlink_source(struct inode *inode)
*/
static int may_linkat(struct path *link)
{
- struct inode *inode;
+ struct inode *inode = link->dentry->d_inode;
+
+ /* Inode writeback is not safe when the uid or gid are invalid. */
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EOVERFLOW;
if (!sysctl_protected_hardlinks)
return 0;
- inode = link->dentry->d_inode;
-
/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
* otherwise, it must be a safe source.
*/
@@ -2747,6 +2749,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
BUG_ON(!inode);
BUG_ON(victim->d_parent->d_inode != dir);
+
+ /* Inode writeback is not safe when the uid or gid are invalid. */
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EOVERFLOW;
+
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
@@ -3675,7 +3682,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
if (error)
return error;
- if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+ if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
+ !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD))
return -EPERM;
if (!dir->i_op->mknod)
diff --git a/fs/namespace.c b/fs/namespace.c
index 5f75969adff1..8ddd14806799 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1590,7 +1590,7 @@ static int do_umount(struct mount *mnt, int flags)
* Special case for "unmounting" root ...
* we just try to remount it readonly.
*/
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
down_write(&sb->s_umount);
if (!sb_rdonly(sb))
@@ -2333,7 +2333,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
down_write(&sb->s_umount);
if (ms_flags & MS_BIND)
err = change_mount_flags(path->mnt, ms_flags);
- else if (!capable(CAP_SYS_ADMIN))
+ else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
err = -EPERM;
else
err = do_remount_sb(sb, sb_flags, data, 0);
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index b7146526afff..4bee3a72b9f3 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -305,11 +305,10 @@ static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry,
ino_t ino = be64_to_cpu(oi->i_head.h_self);
brelse(bh);
inode = omfs_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ } else if (bh != ERR_PTR(-ENOENT)) {
+ inode = ERR_CAST(bh);
}
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
/* sanity check block's self pointer */
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 2200662a9bf1..607092f367ad 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,8 +256,7 @@ found:
break;
}
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 1b5707c44c3f..365cd73d9109 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -110,7 +110,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
struct orangefs_inode_s *parent = ORANGEFS_I(dir);
struct orangefs_kernel_op_s *new_op;
struct inode *inode;
- struct dentry *res;
int ret = -EINVAL;
/*
@@ -158,65 +157,18 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
new_op->downcall.resp.lookup.refn.fs_id,
ret);
- if (ret < 0) {
- if (ret == -ENOENT) {
- /*
- * if no inode was found, add a negative dentry to
- * dcache anyway; if we don't, we don't hold expected
- * lookup semantics and we most noticeably break
- * during directory renames.
- *
- * however, if the operation failed or exited, do not
- * add the dentry (e.g. in the case that a touch is
- * issued on a file that already exists that was
- * interrupted during this lookup -- no need to add
- * another negative dentry for an existing file)
- */
-
- gossip_debug(GOSSIP_NAME_DEBUG,
- "orangefs_lookup: Adding *negative* dentry "
- "%p for %pd\n",
- dentry,
- dentry);
-
- d_add(dentry, NULL);
- res = NULL;
- goto out;
- }
-
+ if (ret >= 0) {
+ orangefs_set_timeout(dentry);
+ inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+ } else if (ret == -ENOENT) {
+ inode = NULL;
+ } else {
/* must be a non-recoverable error */
- res = ERR_PTR(ret);
- goto out;
- }
-
- orangefs_set_timeout(dentry);
-
- inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
- if (IS_ERR(inode)) {
- gossip_debug(GOSSIP_NAME_DEBUG,
- "error %ld from iget\n", PTR_ERR(inode));
- res = ERR_CAST(inode);
- goto out;
+ inode = ERR_PTR(ret);
}
- gossip_debug(GOSSIP_NAME_DEBUG,
- "%s:%s:%d "
- "Found good inode [%lu] with count [%d]\n",
- __FILE__,
- __func__,
- __LINE__,
- inode->i_ino,
- (int)atomic_read(&inode->i_count));
-
- /* update dentry/inode pair into dcache */
- res = d_splice_alias(inode, dentry);
-
- gossip_debug(GOSSIP_NAME_DEBUG,
- "Lookup success (inode ct = %d)\n",
- (int)atomic_read(&inode->i_count));
-out:
op_release(new_op);
- return res;
+ return d_splice_alias(inode, dentry);
}
/* return 0 on success; non-zero otherwise */
diff --git a/fs/pipe.c b/fs/pipe.c
index 39d6f431da83..bb0840e234f3 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -509,19 +509,22 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
}
-/* No kernel lock held - fine */
-static __poll_t
-pipe_poll(struct file *filp, poll_table *wait)
+static struct wait_queue_head *
+pipe_get_poll_head(struct file *filp, __poll_t events)
{
- __poll_t mask;
struct pipe_inode_info *pipe = filp->private_data;
- int nrbufs;
- poll_wait(filp, &pipe->wait, wait);
+ return &pipe->wait;
+}
+
+/* No kernel lock held - fine */
+static __poll_t pipe_poll_mask(struct file *filp, __poll_t events)
+{
+ struct pipe_inode_info *pipe = filp->private_data;
+ int nrbufs = pipe->nrbufs;
+ __poll_t mask = 0;
/* Reading only -- no need for acquiring the semaphore. */
- nrbufs = pipe->nrbufs;
- mask = 0;
if (filp->f_mode & FMODE_READ) {
mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
if (!pipe->writers && filp->f_version != pipe->w_counter)
@@ -1020,7 +1023,8 @@ const struct file_operations pipefifo_fops = {
.llseek = no_llseek,
.read_iter = pipe_read,
.write_iter = pipe_write,
- .poll = pipe_poll,
+ .get_poll_head = pipe_get_poll_head,
+ .poll_mask = pipe_poll_mask,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
.fasync = pipe_fasync,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4e35593546b1..33ed1746927a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1807,15 +1807,22 @@ int pid_getattr(const struct path *path, struct kstat *stat,
/* dentry stuff */
/*
- * Exceptional case: normally we are not allowed to unhash a busy
- * directory. In this case, however, we can do it - no aliasing problems
- * due to the way we treat inodes.
- *
+ * Set <pid>/... inode ownership (can change due to setuid(), etc.)
+ */
+void pid_update_inode(struct task_struct *task, struct inode *inode)
+{
+ task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+
+ inode->i_mode &= ~(S_ISUID | S_ISGID);
+ security_task_to_inode(task, inode);
+}
+
+/*
* Rewrite the inode's ownerships here because the owning task may have
* performed a setuid(), etc.
*
*/
-int pid_revalidate(struct dentry *dentry, unsigned int flags)
+static int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
@@ -1827,10 +1834,7 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
task = get_proc_task(inode);
if (task) {
- task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
-
- inode->i_mode &= ~(S_ISUID | S_ISGID);
- security_task_to_inode(task, inode);
+ pid_update_inode(task, inode);
put_task_struct(task);
return 1;
}
@@ -1878,8 +1882,8 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
struct dentry *child, *dir = file->f_path.dentry;
struct qstr qname = QSTR_INIT(name, len);
struct inode *inode;
- unsigned type;
- ino_t ino;
+ unsigned type = DT_UNKNOWN;
+ ino_t ino = 1;
child = d_hash_and_lookup(dir, &qname);
if (!child) {
@@ -1888,22 +1892,23 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
if (IS_ERR(child))
goto end_instantiate;
if (d_in_lookup(child)) {
- int err = instantiate(d_inode(dir), child, task, ptr);
+ struct dentry *res;
+ res = instantiate(child, task, ptr);
d_lookup_done(child);
- if (err < 0) {
- dput(child);
+ if (IS_ERR(res))
goto end_instantiate;
+ if (unlikely(res)) {
+ dput(child);
+ child = res;
}
}
}
inode = d_inode(child);
ino = inode->i_ino;
type = inode->i_mode >> 12;
+end_instantiate:
dput(child);
return dir_emit(ctx, name, len, ino, type);
-
-end_instantiate:
- return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
/*
@@ -2065,19 +2070,19 @@ static const struct inode_operations proc_map_files_link_inode_operations = {
.setattr = proc_setattr,
};
-static int
-proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+static struct dentry *
+proc_map_files_instantiate(struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
fmode_t mode = (fmode_t)(unsigned long)ptr;
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK |
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
((mode & FMODE_READ ) ? S_IRUSR : 0) |
((mode & FMODE_WRITE) ? S_IWUSR : 0));
if (!inode)
- return -ENOENT;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
ei->op.proc_get_link = map_files_get_link;
@@ -2086,9 +2091,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
inode->i_size = 64;
d_set_d_op(dentry, &tid_map_files_dentry_operations);
- d_add(dentry, inode);
-
- return 0;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -2097,19 +2100,19 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
unsigned long vm_start, vm_end;
struct vm_area_struct *vma;
struct task_struct *task;
- int result;
+ struct dentry *result;
struct mm_struct *mm;
- result = -ENOENT;
+ result = ERR_PTR(-ENOENT);
task = get_proc_task(dir);
if (!task)
goto out;
- result = -EACCES;
+ result = ERR_PTR(-EACCES);
if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out_put_task;
- result = -ENOENT;
+ result = ERR_PTR(-ENOENT);
if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
goto out_put_task;
@@ -2123,7 +2126,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
goto out_no_vma;
if (vma->vm_file)
- result = proc_map_files_instantiate(dir, dentry, task,
+ result = proc_map_files_instantiate(dentry, task,
(void *)(unsigned long)vma->vm_file->f_mode);
out_no_vma:
@@ -2132,7 +2135,7 @@ out_no_vma:
out_put_task:
put_task_struct(task);
out:
- return ERR_PTR(result);
+ return result;
}
static const struct inode_operations proc_map_files_inode_operations = {
@@ -2433,16 +2436,16 @@ static const struct file_operations proc_pid_set_timerslack_ns_operations = {
.release = single_release,
};
-static int proc_pident_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_pident_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
const struct pid_entry *p = ptr;
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task, p->mode);
+ inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
if (S_ISDIR(inode->i_mode))
@@ -2452,13 +2455,9 @@ static int proc_pident_instantiate(struct inode *dir,
if (p->fop)
inode->i_fop = p->fop;
ei->op = p->op;
+ pid_update_inode(task, inode);
d_set_d_op(dentry, &pid_dentry_operations);
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2466,11 +2465,9 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
const struct pid_entry *ents,
unsigned int nents)
{
- int error;
struct task_struct *task = get_proc_task(dir);
const struct pid_entry *p, *last;
-
- error = -ENOENT;
+ struct dentry *res = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
@@ -2489,11 +2486,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
if (p >= last)
goto out;
- error = proc_pident_instantiate(dir, dentry, task, p);
+ res = proc_pident_instantiate(dentry, task, p);
out:
put_task_struct(task);
out_no_task:
- return ERR_PTR(error);
+ return res;
}
static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
@@ -3136,38 +3133,32 @@ void proc_flush_task(struct task_struct *task)
}
}
-static int proc_pid_instantiate(struct inode *dir,
- struct dentry * dentry,
+static struct dentry *proc_pid_instantiate(struct dentry * dentry,
struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
inode->i_op = &proc_tgid_base_inode_operations;
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
set_nlink(inode, nlink_tgid);
+ pid_update_inode(task, inode);
d_set_d_op(dentry, &pid_dentry_operations);
-
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- int result = -ENOENT;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
+ struct dentry *result = ERR_PTR(-ENOENT);
tgid = name_to_int(&dentry->d_name);
if (tgid == ~0U)
@@ -3182,10 +3173,10 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
if (!task)
goto out;
- result = proc_pid_instantiate(dir, dentry, task, NULL);
+ result = proc_pid_instantiate(dentry, task, NULL);
put_task_struct(task);
out:
- return ERR_PTR(result);
+ return result;
}
/*
@@ -3433,37 +3424,32 @@ static const struct inode_operations proc_tid_base_inode_operations = {
.setattr = proc_setattr,
};
-static int proc_task_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_task_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
-
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
+
inode->i_op = &proc_tid_base_inode_operations;
inode->i_fop = &proc_tid_base_operations;
- inode->i_flags|=S_IMMUTABLE;
+ inode->i_flags |= S_IMMUTABLE;
set_nlink(inode, nlink_tid);
+ pid_update_inode(task, inode);
d_set_d_op(dentry, &pid_dentry_operations);
-
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- int result = -ENOENT;
struct task_struct *task;
struct task_struct *leader = get_proc_task(dir);
unsigned tid;
struct pid_namespace *ns;
+ struct dentry *result = ERR_PTR(-ENOENT);
if (!leader)
goto out_no_task;
@@ -3483,13 +3469,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
if (!same_thread_group(leader, task))
goto out_drop_task;
- result = proc_task_instantiate(dir, dentry, task, NULL);
+ result = proc_task_instantiate(dentry, task, NULL);
out_drop_task:
put_task_struct(task);
out:
put_task_struct(leader);
out_no_task:
- return ERR_PTR(result);
+ return result;
}
/*
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6b80cd1e419a..05b9893e9a22 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -81,9 +81,41 @@ static const struct file_operations proc_fdinfo_file_operations = {
.release = single_release,
};
+static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
+{
+ struct files_struct *files = get_files_struct(task);
+ struct file *file;
+
+ if (!files)
+ return false;
+
+ rcu_read_lock();
+ file = fcheck_files(files, fd);
+ if (file)
+ *mode = file->f_mode;
+ rcu_read_unlock();
+ put_files_struct(files);
+ return !!file;
+}
+
+static void tid_fd_update_inode(struct task_struct *task, struct inode *inode,
+ fmode_t f_mode)
+{
+ task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+
+ if (S_ISLNK(inode->i_mode)) {
+ unsigned i_mode = S_IFLNK;
+ if (f_mode & FMODE_READ)
+ i_mode |= S_IRUSR | S_IXUSR;
+ if (f_mode & FMODE_WRITE)
+ i_mode |= S_IWUSR | S_IXUSR;
+ inode->i_mode = i_mode;
+ }
+ security_task_to_inode(task, inode);
+}
+
static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct files_struct *files;
struct task_struct *task;
struct inode *inode;
unsigned int fd;
@@ -96,35 +128,11 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
fd = proc_fd(inode);
if (task) {
- files = get_files_struct(task);
- if (files) {
- struct file *file;
-
- rcu_read_lock();
- file = fcheck_files(files, fd);
- if (file) {
- unsigned f_mode = file->f_mode;
-
- rcu_read_unlock();
- put_files_struct(files);
-
- task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
-
- if (S_ISLNK(inode->i_mode)) {
- unsigned i_mode = S_IFLNK;
- if (f_mode & FMODE_READ)
- i_mode |= S_IRUSR | S_IXUSR;
- if (f_mode & FMODE_WRITE)
- i_mode |= S_IWUSR | S_IXUSR;
- inode->i_mode = i_mode;
- }
-
- security_task_to_inode(task, inode);
- put_task_struct(task);
- return 1;
- }
- rcu_read_unlock();
- put_files_struct(files);
+ fmode_t f_mode;
+ if (tid_fd_mode(task, fd, &f_mode)) {
+ tid_fd_update_inode(task, inode, f_mode);
+ put_task_struct(task);
+ return 1;
}
put_task_struct(task);
}
@@ -166,34 +174,33 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
return ret;
}
-static int
-proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
- struct task_struct *task, const void *ptr)
+struct fd_data {
+ fmode_t mode;
+ unsigned fd;
+};
+
+static struct dentry *proc_fd_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
- unsigned fd = (unsigned long)ptr;
+ const struct fd_data *data = ptr;
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
- ei->fd = fd;
+ ei->fd = data->fd;
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
+ tid_fd_update_inode(task, inode, data->mode);
d_set_d_op(dentry, &tid_fd_dentry_operations);
- d_add(dentry, inode);
-
- /* Close the race of the process dying before we return the dentry */
- if (tid_fd_revalidate(dentry, 0))
- return 0;
- out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -201,19 +208,21 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
instantiate_t instantiate)
{
struct task_struct *task = get_proc_task(dir);
- int result = -ENOENT;
- unsigned fd = name_to_int(&dentry->d_name);
+ struct fd_data data = {.fd = name_to_int(&dentry->d_name)};
+ struct dentry *result = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
- if (fd == ~0U)
+ if (data.fd == ~0U)
+ goto out;
+ if (!tid_fd_mode(task, data.fd, &data.mode))
goto out;
- result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
+ result = instantiate(dentry, task, &data);
out:
put_task_struct(task);
out_no_task:
- return ERR_PTR(result);
+ return result;
}
static int proc_readfd_common(struct file *file, struct dir_context *ctx,
@@ -236,17 +245,22 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
for (fd = ctx->pos - 2;
fd < files_fdtable(files)->max_fds;
fd++, ctx->pos++) {
+ struct file *f;
+ struct fd_data data;
char name[10 + 1];
int len;
- if (!fcheck_files(files, fd))
+ f = fcheck_files(files, fd);
+ if (!f)
continue;
+ data.mode = f->f_mode;
rcu_read_unlock();
+ data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
- (void *)(unsigned long)fd))
+ &data))
goto out_fd_loop;
cond_resched();
rcu_read_lock();
@@ -304,31 +318,25 @@ const struct inode_operations proc_fd_inode_operations = {
.setattr = proc_setattr,
};
-static int
-proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
- struct task_struct *task, const void *ptr)
+static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
- unsigned fd = (unsigned long)ptr;
+ const struct fd_data *data = ptr;
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
- ei->fd = fd;
+ ei->fd = data->fd;
inode->i_fop = &proc_fdinfo_file_operations;
+ tid_fd_update_inode(task, inode, 0);
d_set_d_op(dentry, &tid_fd_dentry_operations);
- d_add(dentry, inode);
-
- /* Close the race of the process dying before we return the dentry */
- if (tid_fd_revalidate(dentry, 0))
- return 0;
- out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 02bb1914f5f7..7b4d9714f248 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -257,8 +257,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
if (!inode)
return ERR_PTR(-ENOMEM);
d_set_d_op(dentry, &proc_misc_dentry_ops);
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a318ae5b36b4..43c70c9e6b62 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -152,14 +152,14 @@ extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
-extern int pid_revalidate(struct dentry *, unsigned int);
+extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
-typedef int instantiate_t(struct inode *, struct dentry *,
+typedef struct dentry *instantiate_t(struct dentry *,
struct task_struct *, const void *);
extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
instantiate_t, struct task_struct *, const void *);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 59b17e509f46..dd2b35f78b09 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -87,28 +87,24 @@ static const struct inode_operations proc_ns_link_inode_operations = {
.setattr = proc_setattr,
};
-static int proc_ns_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_ns_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
const struct proc_ns_operations *ns_ops = ptr;
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
inode->i_op = &proc_ns_link_inode_operations;
ei->ns_ops = ns_ops;
+ pid_update_inode(task, inode);
d_set_d_op(dentry, &pid_dentry_operations);
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -147,12 +143,10 @@ const struct file_operations proc_ns_dir_operations = {
static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
- int error;
struct task_struct *task = get_proc_task(dir);
const struct proc_ns_operations **entry, **last;
unsigned int len = dentry->d_name.len;
-
- error = -ENOENT;
+ struct dentry *res = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
@@ -167,11 +161,11 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
if (entry == last)
goto out;
- error = proc_ns_instantiate(dir, dentry, task, *entry);
+ res = proc_ns_instantiate(dentry, task, *entry);
out:
put_task_struct(task);
out_no_task:
- return ERR_PTR(error);
+ return res;
}
const struct inode_operations proc_ns_dir_inode_operations = {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 8989936f2995..4d765e5e91ed 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -554,9 +554,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
if (!inode)
goto out;
- err = NULL;
d_set_d_op(dentry, &proc_sys_dentry_operations);
- d_add(dentry, inode);
+ err = d_splice_alias(inode, dentry);
out:
if (h)
@@ -684,6 +683,7 @@ static bool proc_sys_fill_cache(struct file *file,
if (IS_ERR(child))
return false;
if (d_in_lookup(child)) {
+ struct dentry *res;
inode = proc_sys_make_inode(dir->d_sb, head, table);
if (!inode) {
d_lookup_done(child);
@@ -691,7 +691,16 @@ static bool proc_sys_fill_cache(struct file *file,
return false;
}
d_set_d_op(child, &proc_sys_dentry_operations);
- d_add(child, inode);
+ res = d_splice_alias(inode, child);
+ d_lookup_done(child);
+ if (unlikely(res)) {
+ if (IS_ERR(res)) {
+ dput(child);
+ return false;
+ }
+ dput(child);
+ child = res;
+ }
}
}
inode = d_inode(child);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c486ad4b43f0..a20c6e495bb2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -937,7 +937,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
- * Documentation/vm/soft-dirty.txt for full description
+ * Documentation/admin-guide/mm/soft-dirty.rst for full description
* of how soft-dirty works.
*/
pte_t ptent = *pte;
@@ -1421,7 +1421,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
* Bits 57-60 zero
* Bit 61 page is file-page or shared-anon
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index eca27878079d..8d72221735d7 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -114,13 +114,9 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned i
brelse(bh);
foundinode = qnx4_iget(dir->i_sb, ino);
- if (IS_ERR(foundinode)) {
+ if (IS_ERR(foundinode))
QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
PTR_ERR(foundinode)));
- return ERR_CAST(foundinode);
- }
out:
- d_add(dentry, foundinode);
-
- return NULL;
+ return d_splice_alias(foundinode, dentry);
}
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
index 72c2770830be..e2e98e653b8d 100644
--- a/fs/qnx6/namei.c
+++ b/fs/qnx6/namei.c
@@ -29,15 +29,11 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
if (ino) {
foundinode = qnx6_iget(dir->i_sb, ino);
qnx6_put_page(page);
- if (IS_ERR(foundinode)) {
+ if (IS_ERR(foundinode))
pr_debug("lookup->iget -> error %ld\n",
PTR_ERR(foundinode));
- return ERR_CAST(foundinode);
- }
} else {
pr_debug("%s(): not found %s\n", __func__, name);
- return NULL;
}
- d_add(dentry, foundinode);
- return NULL;
+ return d_splice_alias(foundinode, dentry);
}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 8f06fd1f3d69..6ccb51993a76 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -213,7 +213,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
unsigned long offset, maxoff;
- struct inode *inode;
+ struct inode *inode = NULL;
struct romfs_inode ri;
const char *name; /* got from dentry */
int len, ret;
@@ -233,7 +233,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
for (;;) {
if (!offset || offset >= maxoff)
- goto out0;
+ break;
ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
if (ret < 0)
@@ -244,37 +244,19 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
len);
if (ret < 0)
goto error;
- if (ret == 1)
+ if (ret == 1) {
+ /* Hard link handling */
+ if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
+ offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+ inode = romfs_iget(dir->i_sb, offset);
break;
+ }
/* next entry */
offset = be32_to_cpu(ri.next) & ROMFH_MASK;
}
- /* Hard link handling */
- if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
- offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-
- inode = romfs_iget(dir->i_sb, offset);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- goto error;
- }
- goto outi;
-
- /*
- * it's a bit funky, _lookup needs to return an error code
- * (negative) or a NULL, both as a dentry. ENOENT should not
- * be returned, instead we need to create a negative dentry by
- * d_add(dentry, NULL); and return 0 as no error.
- * (Although as I see, it only matters on writable file
- * systems).
- */
-out0:
- inode = NULL;
-outi:
- d_add(dentry, inode);
- ret = 0;
+ return d_splice_alias(inode, dentry);
error:
return ERR_PTR(ret);
}
diff --git a/fs/select.c b/fs/select.c
index ba879c51288f..bc3cc0f98896 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -34,6 +34,29 @@
#include <linux/uaccess.h>
+__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
+{
+ if (file->f_op->poll) {
+ return file->f_op->poll(file, pt);
+ } else if (file_has_poll_mask(file)) {
+ unsigned int events = poll_requested_events(pt);
+ struct wait_queue_head *head;
+
+ if (pt && pt->_qproc) {
+ head = file->f_op->get_poll_head(file, events);
+ if (!head)
+ return DEFAULT_POLLMASK;
+ if (IS_ERR(head))
+ return EPOLLERR;
+ pt->_qproc(file, head, pt);
+ }
+
+ return file->f_op->poll_mask(file, events);
+ } else {
+ return DEFAULT_POLLMASK;
+ }
+}
+EXPORT_SYMBOL_GPL(vfs_poll);
/*
* Estimate expected accuracy in ns from a timeval.
@@ -233,7 +256,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
add_wait_queue(wait_address, &entry->wait);
}
-int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
ktime_t *expires, unsigned long slack)
{
int rc = -EINTR;
@@ -258,7 +281,6 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
return rc;
}
-EXPORT_SYMBOL(poll_schedule_timeout);
/**
* poll_select_set_timeout - helper function to setup the timeout value
@@ -503,14 +525,10 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
continue;
f = fdget(i);
if (f.file) {
- const struct file_operations *f_op;
- f_op = f.file->f_op;
- mask = DEFAULT_POLLMASK;
- if (f_op->poll) {
- wait_key_set(wait, in, out,
- bit, busy_flag);
- mask = (*f_op->poll)(f.file, wait);
- }
+ wait_key_set(wait, in, out, bit,
+ busy_flag);
+ mask = vfs_poll(f.file, wait);
+
fdput(f);
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
@@ -813,34 +831,29 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_busy_poll,
__poll_t busy_flag)
{
- __poll_t mask;
- int fd;
-
- mask = 0;
- fd = pollfd->fd;
- if (fd >= 0) {
- struct fd f = fdget(fd);
- mask = EPOLLNVAL;
- if (f.file) {
- /* userland u16 ->events contains POLL... bitmap */
- __poll_t filter = demangle_poll(pollfd->events) |
- EPOLLERR | EPOLLHUP;
- mask = DEFAULT_POLLMASK;
- if (f.file->f_op->poll) {
- pwait->_key = filter;
- pwait->_key |= busy_flag;
- mask = f.file->f_op->poll(f.file, pwait);
- if (mask & busy_flag)
- *can_busy_poll = true;
- }
- /* Mask out unneeded events. */
- mask &= filter;
- fdput(f);
- }
- }
+ int fd = pollfd->fd;
+ __poll_t mask = 0, filter;
+ struct fd f;
+
+ if (fd < 0)
+ goto out;
+ mask = EPOLLNVAL;
+ f = fdget(fd);
+ if (!f.file)
+ goto out;
+
+ /* userland u16 ->events contains POLL... bitmap */
+ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
+ pwait->_key = filter | busy_flag;
+ mask = vfs_poll(f.file, pwait);
+ if (mask & busy_flag)
+ *can_busy_poll = true;
+ mask &= filter; /* Mask out unneeded events. */
+ fdput(f);
+
+out:
/* ... and so does ->revents */
pollfd->revents = mangle_poll(mask);
-
return mask;
}
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d2187a813376..cbb42f77a2bd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -81,83 +81,86 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait)
static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
siginfo_t const *kinfo)
{
- long err;
+ struct signalfd_siginfo new;
BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);
/*
* Unused members should be zero ...
*/
- err = __clear_user(uinfo, sizeof(*uinfo));
+ memset(&new, 0, sizeof(new));
/*
* If you change siginfo_t structure, please be sure
* this code is fixed accordingly.
*/
- err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo);
- err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno);
- err |= __put_user(kinfo->si_code, &uinfo->ssi_code);
+ new.ssi_signo = kinfo->si_signo;
+ new.ssi_errno = kinfo->si_errno;
+ new.ssi_code = kinfo->si_code;
switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) {
case SIL_KILL:
- err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
- err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+ new.ssi_pid = kinfo->si_pid;
+ new.ssi_uid = kinfo->si_uid;
break;
case SIL_TIMER:
- err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
- err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
- err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
- err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+ new.ssi_tid = kinfo->si_tid;
+ new.ssi_overrun = kinfo->si_overrun;
+ new.ssi_ptr = (long) kinfo->si_ptr;
+ new.ssi_int = kinfo->si_int;
break;
case SIL_POLL:
- err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
- err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd);
+ new.ssi_band = kinfo->si_band;
+ new.ssi_fd = kinfo->si_fd;
break;
- case SIL_FAULT:
- err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
-#ifdef __ARCH_SI_TRAPNO
- err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
-#endif
-#ifdef BUS_MCEERR_AO
+ case SIL_FAULT_BNDERR:
+ case SIL_FAULT_PKUERR:
/*
- * Other callers might not initialize the si_lsb field,
- * so check explicitly for the right codes here.
+ * Fall through to the SIL_FAULT case. Both SIL_FAULT_BNDERR
+ * and SIL_FAULT_PKUERR are only generated by faults that
+ * deliver them synchronously to userspace. In case someone
+ * injects one of these signals and signalfd catches it treat
+ * it as SIL_FAULT.
*/
- if (kinfo->si_signo == SIGBUS &&
- kinfo->si_code == BUS_MCEERR_AO)
- err |= __put_user((short) kinfo->si_addr_lsb,
- &uinfo->ssi_addr_lsb);
+ case SIL_FAULT:
+ new.ssi_addr = (long) kinfo->si_addr;
+#ifdef __ARCH_SI_TRAPNO
+ new.ssi_trapno = kinfo->si_trapno;
#endif
-#ifdef BUS_MCEERR_AR
- /*
- * Other callers might not initialize the si_lsb field,
- * so check explicitly for the right codes here.
- */
- if (kinfo->si_signo == SIGBUS &&
- kinfo->si_code == BUS_MCEERR_AR)
- err |= __put_user((short) kinfo->si_addr_lsb,
- &uinfo->ssi_addr_lsb);
+ break;
+ case SIL_FAULT_MCEERR:
+ new.ssi_addr = (long) kinfo->si_addr;
+#ifdef __ARCH_SI_TRAPNO
+ new.ssi_trapno = kinfo->si_trapno;
#endif
+ new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
break;
case SIL_CHLD:
- err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
- err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
- err |= __put_user(kinfo->si_status, &uinfo->ssi_status);
- err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime);
- err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime);
+ new.ssi_pid = kinfo->si_pid;
+ new.ssi_uid = kinfo->si_uid;
+ new.ssi_status = kinfo->si_status;
+ new.ssi_utime = kinfo->si_utime;
+ new.ssi_stime = kinfo->si_stime;
break;
case SIL_RT:
- default:
/*
* This case catches also the signals queued by sigqueue().
*/
- err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
- err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
- err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
- err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+ new.ssi_pid = kinfo->si_pid;
+ new.ssi_uid = kinfo->si_uid;
+ new.ssi_ptr = (long) kinfo->si_ptr;
+ new.ssi_int = kinfo->si_int;
+ break;
+ case SIL_SYS:
+ new.ssi_call_addr = (long) kinfo->si_call_addr;
+ new.ssi_syscall = kinfo->si_syscall;
+ new.ssi_arch = kinfo->si_arch;
break;
}
- return err ? -EFAULT: sizeof(*uinfo);
+ if (copy_to_user(uinfo, &new, sizeof(struct signalfd_siginfo)))
+ return -EFAULT;
+
+ return sizeof(*uinfo);
}
static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 250b0755b908..4d5d20491ffd 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -51,14 +51,9 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un
if (dentry->d_name.len > SYSV_NAMELEN)
return ERR_PTR(-ENAMETOOLONG);
ino = sysv_inode_by_name(dentry);
-
- if (ino) {
+ if (ino)
inode = sysv_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- }
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index cdad49da3ff7..d84a2bee4f82 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -226,21 +226,20 @@ static int timerfd_release(struct inode *inode, struct file *file)
kfree_rcu(ctx, rcu);
return 0;
}
-
-static __poll_t timerfd_poll(struct file *file, poll_table *wait)
+
+static struct wait_queue_head *timerfd_get_poll_head(struct file *file,
+ __poll_t eventmask)
{
struct timerfd_ctx *ctx = file->private_data;
- __poll_t events = 0;
- unsigned long flags;
- poll_wait(file, &ctx->wqh, wait);
+ return &ctx->wqh;
+}
- spin_lock_irqsave(&ctx->wqh.lock, flags);
- if (ctx->ticks)
- events |= EPOLLIN;
- spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask)
+{
+ struct timerfd_ctx *ctx = file->private_data;
- return events;
+ return ctx->ticks ? EPOLLIN : 0;
}
static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
@@ -364,7 +363,8 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
static const struct file_operations timerfd_fops = {
.release = timerfd_release,
- .poll = timerfd_poll,
+ .get_poll_head = timerfd_get_poll_head,
+ .poll_mask = timerfd_poll_mask,
.read = timerfd_read,
.llseek = noop_llseek,
.show_fdinfo = timerfd_show,
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 9d7fb88e172e..4e267cc21c77 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -214,7 +214,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
int err;
union ubifs_key key;
struct inode *inode = NULL;
- struct ubifs_dent_node *dent;
+ struct ubifs_dent_node *dent = NULL;
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct fscrypt_name nm;
@@ -229,14 +229,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(err);
if (fname_len(&nm) > UBIFS_MAX_NLEN) {
- err = -ENAMETOOLONG;
- goto out_fname;
+ inode = ERR_PTR(-ENAMETOOLONG);
+ goto done;
}
dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
if (!dent) {
- err = -ENOMEM;
- goto out_fname;
+ inode = ERR_PTR(-ENOMEM);
+ goto done;
}
if (nm.hash) {
@@ -250,16 +250,16 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
}
if (err) {
- if (err == -ENOENT) {
+ if (err == -ENOENT)
dbg_gen("not found");
- goto done;
- }
- goto out_dent;
+ else
+ inode = ERR_PTR(err);
+ goto done;
}
if (dbg_check_name(c, dent, &nm)) {
- err = -EINVAL;
- goto out_dent;
+ inode = ERR_PTR(-EINVAL);
+ goto done;
}
inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
@@ -272,7 +272,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
ubifs_err(c, "dead directory entry '%pd', error %d",
dentry, err);
ubifs_ro_mode(c, err);
- goto out_dent;
+ goto done;
}
if (ubifs_crypt_is_encrypted(dir) &&
@@ -280,27 +280,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
!fscrypt_has_permitted_context(dir, inode)) {
ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu",
dir->i_ino, inode->i_ino);
- err = -EPERM;
- goto out_inode;
+ iput(inode);
+ inode = ERR_PTR(-EPERM);
}
done:
kfree(dent);
fscrypt_free_filename(&nm);
- /*
- * Note, d_splice_alias() would be required instead if we supported
- * NFS.
- */
- d_add(dentry, inode);
- return NULL;
-
-out_inode:
- iput(inode);
-out_dent:
- kfree(dent);
-out_fname:
- fscrypt_free_filename(&nm);
- return ERR_PTR(err);
+ return d_splice_alias(inode, dentry);
}
static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a3ed3c811dfa..df42e4cb4dc4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -260,6 +260,7 @@ xfs_vn_lookup(
struct dentry *dentry,
unsigned int flags)
{
+ struct inode *inode;
struct xfs_inode *cip;
struct xfs_name name;
int error;
@@ -269,14 +270,13 @@ xfs_vn_lookup(
xfs_dentry_to_name(&name, dentry);
error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
- if (unlikely(error)) {
- if (unlikely(error != -ENOENT))
- return ERR_PTR(error);
- d_add(dentry, NULL);
- return NULL;
- }
-
- return d_splice_alias(VFS_I(cip), dentry);
+ if (likely(!error))
+ inode = VFS_I(cip);
+ else if (likely(error == -ENOENT))
+ inode = NULL;
+ else
+ inode = ERR_PTR(error);
+ return d_splice_alias(inode, dentry);
}
STATIC struct dentry *
diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h
index 34a028a7bcc5..87d14476edc2 100644
--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -25,6 +25,7 @@ typedef atomic64_t atomic_long_t;
#define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i)
#define ATOMIC_LONG_PFX(x) atomic64 ## x
+#define ATOMIC_LONG_TYPE s64
#else
@@ -32,6 +33,7 @@ typedef atomic_t atomic_long_t;
#define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i)
#define ATOMIC_LONG_PFX(x) atomic ## x
+#define ATOMIC_LONG_TYPE int
#endif
@@ -90,6 +92,21 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
#define atomic_long_cmpxchg(l, old, new) \
(ATOMIC_LONG_PFX(_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), (old), (new)))
+
+#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
+ (ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
+ (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_acquire(l, old, new) \
+ (ATOMIC_LONG_PFX(_try_cmpxchg_acquire)((ATOMIC_LONG_PFX(_t) *)(l), \
+ (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_release(l, old, new) \
+ (ATOMIC_LONG_PFX(_try_cmpxchg_release)((ATOMIC_LONG_PFX(_t) *)(l), \
+ (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg(l, old, new) \
+ (ATOMIC_LONG_PFX(_try_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), \
+ (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+
+
#define atomic_long_xchg_relaxed(v, new) \
(ATOMIC_LONG_PFX(_xchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(v), (new)))
#define atomic_long_xchg_acquire(v, new) \
@@ -244,6 +261,8 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)
#define atomic_long_inc_not_zero(l) \
ATOMIC_LONG_PFX(_inc_not_zero)((ATOMIC_LONG_PFX(_t) *)(l))
+#define atomic_long_cond_read_relaxed(v, c) \
+ ATOMIC_LONG_PFX(_cond_read_relaxed)((ATOMIC_LONG_PFX(_t) *)(v), (c))
#define atomic_long_cond_read_acquire(v, c) \
ATOMIC_LONG_PFX(_cond_read_acquire)((ATOMIC_LONG_PFX(_t) *)(v), (c))
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 29458bbb2fa0..2cafdbb9ae4c 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -221,18 +221,17 @@ do { \
#endif
/**
- * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
+ * smp_cond_load_relaxed() - (Spin) wait for cond with no ordering guarantees
* @ptr: pointer to the variable to wait on
* @cond: boolean expression to wait for
*
- * Equivalent to using smp_load_acquire() on the condition variable but employs
- * the control dependency of the wait to reduce the barrier on many platforms.
+ * Equivalent to using READ_ONCE() on the condition variable.
*
* Due to C lacking lambda expressions we load the value of *ptr into a
* pre-named variable @VAL to be used in @cond.
*/
-#ifndef smp_cond_load_acquire
-#define smp_cond_load_acquire(ptr, cond_expr) ({ \
+#ifndef smp_cond_load_relaxed
+#define smp_cond_load_relaxed(ptr, cond_expr) ({ \
typeof(ptr) __PTR = (ptr); \
typeof(*ptr) VAL; \
for (;;) { \
@@ -241,10 +240,26 @@ do { \
break; \
cpu_relax(); \
} \
- smp_acquire__after_ctrl_dep(); \
VAL; \
})
#endif
+/**
+ * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
+ * @ptr: pointer to the variable to wait on
+ * @cond: boolean expression to wait for
+ *
+ * Equivalent to using smp_load_acquire() on the condition variable but employs
+ * the control dependency of the wait to reduce the barrier on many platforms.
+ */
+#ifndef smp_cond_load_acquire
+#define smp_cond_load_acquire(ptr, cond_expr) ({ \
+ typeof(*ptr) _val; \
+ _val = smp_cond_load_relaxed(ptr, cond_expr); \
+ smp_acquire__after_ctrl_dep(); \
+ _val; \
+})
+#endif
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_GENERIC_BARRIER_H */
diff --git a/include/asm-generic/compat.h b/include/asm-generic/compat.h
new file mode 100644
index 000000000000..28819451b6d1
--- /dev/null
+++ b/include/asm-generic/compat.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* This is an empty stub for 32-bit-only architectures */
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index 880a292d792f..ad2868263867 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -4,7 +4,16 @@
static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
{
+ /*
+ * Use the non-coherent ops if available. If an architecture wants a
+ * more fine-grained selection of operations it will have to implement
+ * get_arch_dma_ops itself or use the per-device dma_ops.
+ */
+#ifdef CONFIG_DMA_NONCOHERENT_OPS
+ return &dma_noncoherent_ops;
+#else
return &dma_direct_ops;
+#endif
}
#endif /* _ASM_GENERIC_DMA_MAPPING_H */
diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h
index 830d7659289b..6bb3cd3d695a 100644
--- a/include/asm-generic/pci.h
+++ b/include/asm-generic/pci.h
@@ -14,12 +14,4 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
}
#endif /* HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ */
-/*
- * By default, assume that no iommu is in use and that the PCI
- * space is mapped to address physical 0.
- */
-#ifndef PCI_DMA_BUS_IS_PHYS
-#define PCI_DMA_BUS_IS_PHYS (1)
-#endif
-
#endif /* _ASM_GENERIC_PCI_H */
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index b37b4ad7eb94..9cc457597ddf 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -26,7 +26,6 @@
* @lock: Pointer to queued spinlock structure
* Return: 1 if it is locked, 0 otherwise
*/
-#ifndef queued_spin_is_locked
static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
{
/*
@@ -35,7 +34,6 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
*/
return atomic_read(&lock->val);
}
-#endif
/**
* queued_spin_value_unlocked - is the spinlock structure unlocked?
@@ -100,7 +98,7 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock)
/*
* unlock() needs release semantics:
*/
- (void)atomic_sub_return_release(_Q_LOCKED_VAL, &lock->val);
+ smp_store_release(&lock->locked, 0);
}
#endif
diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h
index 034acd0c4956..0763f065b975 100644
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -29,13 +29,41 @@
#endif
typedef struct qspinlock {
- atomic_t val;
+ union {
+ atomic_t val;
+
+ /*
+ * By using the whole 2nd least significant byte for the
+ * pending bit, we can allow better optimization of the lock
+ * acquisition for the pending bit holder.
+ */
+#ifdef __LITTLE_ENDIAN
+ struct {
+ u8 locked;
+ u8 pending;
+ };
+ struct {
+ u16 locked_pending;
+ u16 tail;
+ };
+#else
+ struct {
+ u16 tail;
+ u16 locked_pending;
+ };
+ struct {
+ u8 reserved[2];
+ u8 pending;
+ u8 locked;
+ };
+#endif
+ };
} arch_spinlock_t;
/*
* Initializier
*/
-#define __ARCH_SPIN_LOCK_UNLOCKED { ATOMIC_INIT(0) }
+#define __ARCH_SPIN_LOCK_UNLOCKED { .val = ATOMIC_INIT(0) }
/*
* Bitfields in the atomic value:
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index 482461d8931d..cc414db9da0a 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -245,8 +245,7 @@ ssize_t af_alg_sendpage(struct socket *sock, struct page *page,
int offset, size_t size, int flags);
void af_alg_free_resources(struct af_alg_async_req *areq);
void af_alg_async_cb(struct crypto_async_request *_req, int err);
-__poll_t af_alg_poll(struct file *file, struct socket *sock,
- poll_table *wait);
+__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events);
struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk,
unsigned int areqlen);
int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 9d8aabecfe2d..b83e68dd006f 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -8,8 +8,6 @@ struct kioctx;
struct kiocb;
struct mm_struct;
-#define KIOCB_KEY 0
-
typedef int (kiocb_cancel_fn)(struct kiocb *);
/* prototypes */
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 8b276fd9a127..01ce3997cb42 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -654,6 +654,7 @@ static inline int atomic_dec_if_positive(atomic_t *v)
}
#endif
+#define atomic_cond_read_relaxed(v, c) smp_cond_load_relaxed(&(v)->counter, (c))
#define atomic_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c))
#ifdef CONFIG_GENERIC_ATOMIC64
@@ -1075,6 +1076,7 @@ static inline long long atomic64_fetch_andnot_release(long long i, atomic64_t *v
}
#endif
+#define atomic64_cond_read_relaxed(v, c) smp_cond_load_relaxed(&(v)->counter, (c))
#define atomic64_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c))
#include <asm-generic/atomic-long.h>
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 081281ad5772..b1a5562b3215 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -7,8 +7,7 @@
*/
#include <linux/types.h>
-
-#ifdef CONFIG_COMPAT
+#include <linux/compat_time.h>
#include <linux/stat.h>
#include <linux/param.h> /* for HZ */
@@ -21,8 +20,11 @@
#include <linux/unistd.h>
#include <asm/compat.h>
+
+#ifdef CONFIG_COMPAT
#include <asm/siginfo.h>
#include <asm/signal.h>
+#endif
#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
/*
@@ -83,6 +85,8 @@
static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#endif /* COMPAT_SYSCALL_DEFINEx */
+#ifdef CONFIG_COMPAT
+
#ifndef compat_user_stack_pointer
#define compat_user_stack_pointer() current_user_stack_pointer()
#endif
@@ -290,8 +294,6 @@ extern int compat_get_timespec(struct timespec *, const void __user *);
extern int compat_put_timespec(const struct timespec *, void __user *);
extern int compat_get_timeval(struct timeval *, const void __user *);
extern int compat_put_timeval(const struct timeval *, void __user *);
-extern int compat_get_timespec64(struct timespec64 *, const void __user *);
-extern int compat_put_timespec64(const struct timespec64 *, void __user *);
extern int get_compat_itimerspec64(struct itimerspec64 *its,
const struct compat_itimerspec __user *uits);
extern int put_compat_itimerspec64(const struct itimerspec64 *its,
@@ -330,6 +332,7 @@ extern int put_compat_rusage(const struct rusage *,
struct compat_rusage __user *);
struct compat_siginfo;
+struct __compat_aio_sigset;
struct compat_dirent {
u32 d_ino;
@@ -553,6 +556,12 @@ asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id,
compat_long_t nr,
struct io_event __user *events,
struct compat_timespec __user *timeout);
+asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id,
+ compat_long_t min_nr,
+ compat_long_t nr,
+ struct io_event __user *events,
+ struct compat_timespec __user *timeout,
+ const struct __compat_aio_sigset __user *usig);
/* fs/cookies.c */
asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t);
@@ -1016,7 +1025,9 @@ static inline struct compat_timeval ns_to_compat_timeval(s64 nsec)
#else /* !CONFIG_COMPAT */
#define is_compat_task() (0)
+#ifndef in_compat_syscall
static inline bool in_compat_syscall(void) { return false; }
+#endif
#endif /* CONFIG_COMPAT */
diff --git a/include/linux/compat_time.h b/include/linux/compat_time.h
new file mode 100644
index 000000000000..31f2774f1994
--- /dev/null
+++ b/include/linux/compat_time.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_COMPAT_TIME_H
+#define _LINUX_COMPAT_TIME_H
+
+#include <linux/types.h>
+#include <linux/time64.h>
+
+typedef s32 compat_time_t;
+
+struct compat_timespec {
+ compat_time_t tv_sec;
+ s32 tv_nsec;
+};
+
+struct compat_timeval {
+ compat_time_t tv_sec;
+ s32 tv_usec;
+};
+
+extern int compat_get_timespec64(struct timespec64 *, const void __user *);
+extern int compat_put_timespec64(const struct timespec64 *, void __user *);
+
+#endif /* _LINUX_COMPAT_TIME_H */
diff --git a/include/linux/cper.h b/include/linux/cper.h
index d14ef4e77c8a..9c703a0abe6e 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -381,7 +381,7 @@ struct cper_sec_proc_generic {
/* IA32/X64 Processor Error Section */
struct cper_sec_proc_ia {
__u64 validation_bits;
- __u8 lapic_id;
+ __u64 lapic_id;
__u8 cpuid[48];
};
@@ -551,5 +551,7 @@ const char *cper_mem_err_unpack(struct trace_seq *,
struct cper_mem_err_compact *);
void cper_print_proc_arm(const char *pfx,
const struct cper_sec_proc_arm *proc);
+void cper_print_proc_ia(const char *pfx,
+ const struct cper_sec_proc_ia *proc);
#endif
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 5e335b6203f4..e6c0448ebcc7 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -29,7 +29,7 @@
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info {
- spinlock_t lock;
+ raw_spinlock_t lock;
unsigned int flags; /* Private per-task flags */
/* For each stat XXX, add following, aligned appropriately
diff --git a/include/linux/device.h b/include/linux/device.h
index 477956990f5e..00b6c3b42437 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -88,6 +88,8 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *);
* @resume: Called to bring a device on this bus out of sleep mode.
* @num_vf: Called to find out how many virtual functions a device on this
* bus supports.
+ * @dma_configure: Called to setup DMA configuration on a device on
+ this bus.
* @pm: Power management operations of this bus, callback the specific
* device driver's pm-ops.
* @iommu_ops: IOMMU specific operations for this bus, used to attach IOMMU
@@ -96,8 +98,6 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *);
* @p: The private data of the driver core, only the driver core can
* touch this.
* @lock_key: Lock class key for use by the lock validator
- * @force_dma: Assume devices on this bus should be set up by dma_configure()
- * even if DMA capability is not explicitly described by firmware.
*
* A bus is a channel between the processor and one or more devices. For the
* purposes of the device model, all devices are connected via a bus, even if
@@ -130,14 +130,14 @@ struct bus_type {
int (*num_vf)(struct device *dev);
+ int (*dma_configure)(struct device *dev);
+
const struct dev_pm_ops *pm;
const struct iommu_ops *iommu_ops;
struct subsys_private *p;
struct lock_class_key lock_key;
-
- bool force_dma;
};
extern int __must_check bus_register(struct bus_type *bus);
@@ -904,6 +904,8 @@ struct dev_links_info {
* @offline: Set after successful invocation of bus type's .offline().
* @of_node_reused: Set if the device-tree node is shared with an ancestor
* device.
+ * @dma_32bit_limit: bridge limited to 32bit DMA even if the device itself
+ * indicates support for a higher limit in the dma_mask field.
*
* At the lowest level, every device in a Linux system is represented by an
* instance of struct device. The device structure contains the information
@@ -992,6 +994,7 @@ struct device {
bool offline_disabled:1;
bool offline:1;
bool of_node_reused:1;
+ bool dma_32bit_limit:1;
};
static inline struct device *kobj_to_dev(struct kobject *kobj)
diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index c7d844f09c3a..a785f2507159 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -30,8 +30,6 @@ struct bus_type;
extern void dma_debug_add_bus(struct bus_type *bus);
-extern void dma_debug_init(u32 num_entries);
-
extern int dma_debug_resize_entries(u32 num_entries);
extern void debug_dma_map_page(struct device *dev, struct page *page,
@@ -100,10 +98,6 @@ static inline void dma_debug_add_bus(struct bus_type *bus)
{
}
-static inline void dma_debug_init(u32 num_entries)
-{
-}
-
static inline int dma_debug_resize_entries(u32 num_entries)
{
return 0;
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 53ad6a47f513..8d9f33febde5 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -59,6 +59,11 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp, unsigned long attrs);
void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_addr, unsigned long attrs);
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+ enum dma_data_direction dir, unsigned long attrs);
int dma_direct_supported(struct device *dev, u64 mask);
-
+int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr);
#endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 92f20832fd28..e8ca5e654277 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -17,6 +17,7 @@
#define __DMA_IOMMU_H
#ifdef __KERNEL__
+#include <linux/types.h>
#include <asm/errno.h>
#ifdef CONFIG_IOMMU_DMA
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f8ab1c0f589e..f9cc309507d9 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -133,10 +133,10 @@ struct dma_map_ops {
#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
u64 (*get_required_mask)(struct device *dev);
#endif
- int is_phys;
};
extern const struct dma_map_ops dma_direct_ops;
+extern const struct dma_map_ops dma_noncoherent_ops;
extern const struct dma_map_ops dma_virt_ops;
#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
@@ -502,7 +502,7 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
#define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0)
#ifndef arch_dma_alloc_attrs
-#define arch_dma_alloc_attrs(dev, flag) (true)
+#define arch_dma_alloc_attrs(dev) (true)
#endif
static inline void *dma_alloc_attrs(struct device *dev, size_t size,
@@ -521,7 +521,7 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (!arch_dma_alloc_attrs(&dev, &flag))
+ if (!arch_dma_alloc_attrs(&dev))
return NULL;
if (!ops->alloc)
return NULL;
@@ -572,14 +572,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
return 0;
}
-/*
- * This is a hack for the legacy x86 forbid_dac and iommu_sac_force. Please
- * don't use this in new code.
- */
-#ifndef arch_dma_supported
-#define arch_dma_supported(dev, mask) (1)
-#endif
-
static inline void dma_check_mask(struct device *dev, u64 mask)
{
if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
@@ -592,9 +584,6 @@ static inline int dma_supported(struct device *dev, u64 mask)
if (!ops)
return 0;
- if (!arch_dma_supported(dev, mask))
- return 0;
-
if (!ops->dma_supported)
return 1;
return ops->dma_supported(dev, mask);
@@ -839,7 +828,7 @@ static inline int dma_mmap_wc(struct device *dev,
#define dma_mmap_writecombine dma_mmap_wc
#endif
-#if defined(CONFIG_NEED_DMA_MAP_STATE) || defined(CONFIG_DMA_API_DEBUG)
+#ifdef CONFIG_NEED_DMA_MAP_STATE
#define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME
#define DEFINE_DMA_UNMAP_LEN(LEN_NAME) __u32 LEN_NAME
#define dma_unmap_addr(PTR, ADDR_NAME) ((PTR)->ADDR_NAME)
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
new file mode 100644
index 000000000000..10b2654d549b
--- /dev/null
+++ b/include/linux/dma-noncoherent.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_DMA_NONCOHERENT_H
+#define _LINUX_DMA_NONCOHERENT_H 1
+
+#include <linux/dma-mapping.h>
+
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp, unsigned long attrs);
+void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_addr, unsigned long attrs);
+
+#ifdef CONFIG_DMA_NONCOHERENT_MMAP
+int arch_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs);
+#else
+#define arch_dma_mmap NULL
+#endif /* CONFIG_DMA_NONCOHERENT_MMAP */
+
+#ifdef CONFIG_DMA_NONCOHERENT_CACHE_SYNC
+void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+ enum dma_data_direction direction);
+#else
+#define arch_dma_cache_sync NULL
+#endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */
+
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir);
+#else
+static inline void arch_sync_dma_for_device(struct device *dev,
+ phys_addr_t paddr, size_t size, enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE */
+
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir);
+#else
+static inline void arch_sync_dma_for_cpu(struct device *dev,
+ phys_addr_t paddr, size_t size, enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
+
+#endif /* _LINUX_DMA_NONCOHERENT_H */
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 3016d8c456bc..56add823f190 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -397,7 +397,7 @@ typedef struct {
u32 set_bar_attributes;
u64 romsize;
u32 romimage;
-} efi_pci_io_protocol_32;
+} efi_pci_io_protocol_32_t;
typedef struct {
u64 poll_mem;
@@ -417,7 +417,7 @@ typedef struct {
u64 set_bar_attributes;
u64 romsize;
u64 romimage;
-} efi_pci_io_protocol_64;
+} efi_pci_io_protocol_64_t;
typedef struct {
void *poll_mem;
@@ -437,7 +437,7 @@ typedef struct {
void *set_bar_attributes;
uint64_t romsize;
void *romimage;
-} efi_pci_io_protocol;
+} efi_pci_io_protocol_t;
#define EFI_PCI_IO_ATTRIBUTE_ISA_MOTHERBOARD_IO 0x0001
#define EFI_PCI_IO_ATTRIBUTE_ISA_IO 0x0002
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5d306028f0a2..d4c37d371da5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1250,7 +1250,7 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
}
struct fasync_struct {
- spinlock_t fa_lock;
+ rwlock_t fa_lock;
int magic;
int fa_fd;
struct fasync_struct *fa_next; /* singly linked list */
@@ -1711,6 +1711,8 @@ struct file_operations {
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
+ struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+ __poll_t (*poll_mask) (struct file *, __poll_t);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 39988924de3a..2f1327c37a63 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -16,7 +16,7 @@
/*
* Heterogeneous Memory Management (HMM)
*
- * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and it
+ * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and it
* is for. Here we focus on the HMM API description, with some explanation of
* the underlying implementation.
*
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 752464f5a772..c74b0321922a 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1508,8 +1508,6 @@ static inline void ide_set_hwifdata (ide_hwif_t * hwif, void *data)
hwif->hwif_data = data;
}
-extern void ide_toggle_bounce(ide_drive_t *drive, int on);
-
u64 ide_get_lba_addr(struct ide_cmd *, int);
u8 ide_dump_status(ide_drive_t *, const char *, u8);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5426627f9c55..eeceac3376fc 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -432,11 +432,18 @@ extern bool force_irqthreads;
#define force_irqthreads (0)
#endif
-#ifndef __ARCH_SET_SOFTIRQ_PENDING
-#define set_softirq_pending(x) (local_softirq_pending() = (x))
-#define or_softirq_pending(x) (local_softirq_pending() |= (x))
+#ifndef local_softirq_pending
+
+#ifndef local_softirq_pending_ref
+#define local_softirq_pending_ref irq_stat.__softirq_pending
#endif
+#define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref))
+#define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x)))
+#define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x)))
+
+#endif /* local_softirq_pending */
+
/* Some architectures might implement lazy enabling/disabling of
* interrupts. In some cases, such as stop_machine, we might want
* to ensure that after a local_irq_disable(), interrupts have
diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h
index cb9a9248c8c0..70d01edcbf8b 100644
--- a/include/linux/iommu-helper.h
+++ b/include/linux/iommu-helper.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_IOMMU_HELPER_H
#define _LINUX_IOMMU_HELPER_H
+#include <linux/bug.h>
#include <linux/kernel.h>
static inline unsigned long iommu_device_max_index(unsigned long size,
@@ -14,9 +15,15 @@ static inline unsigned long iommu_device_max_index(unsigned long size,
return size;
}
-extern int iommu_is_span_boundary(unsigned int index, unsigned int nr,
- unsigned long shift,
- unsigned long boundary_size);
+static inline int iommu_is_span_boundary(unsigned int index, unsigned int nr,
+ unsigned long shift, unsigned long boundary_size)
+{
+ BUG_ON(!is_power_of_2(boundary_size));
+
+ shift = (shift + index) & (boundary_size - 1);
+ return shift + nr > boundary_size;
+}
+
extern unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
unsigned long start, unsigned int nr,
unsigned long shift,
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 65916a305f3d..b2067083aa94 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -512,6 +512,7 @@ enum {
IRQCHIP_SKIP_SET_WAKE = (1 << 4),
IRQCHIP_ONESHOT_SAFE = (1 << 5),
IRQCHIP_EOI_THREADED = (1 << 6),
+ IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7),
};
#include <linux/irqdesc.h>
diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h
index 4954948d1973..6e8895cd4d92 100644
--- a/include/linux/irq_cpustat.h
+++ b/include/linux/irq_cpustat.h
@@ -18,15 +18,11 @@
*/
#ifndef __ARCH_IRQ_STAT
-extern irq_cpustat_t irq_stat[]; /* defined in asm/hardirq.h */
-#define __IRQ_STAT(cpu, member) (irq_stat[cpu].member)
+DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); /* defined in asm/hardirq.h */
+#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat.member, cpu))
#endif
- /* arch independent irq_stat fields */
-#define local_softirq_pending() \
- __IRQ_STAT(smp_processor_id(), __softirq_pending)
-
- /* arch dependent irq_stat fields */
+/* arch dependent irq_stat fields */
#define nmi_count(cpu) __IRQ_STAT((cpu), __nmi_count) /* i386 */
#endif /* __irq_cpustat_h */
diff --git a/include/linux/irq_sim.h b/include/linux/irq_sim.h
index 0380d899b955..630a57e55db6 100644
--- a/include/linux/irq_sim.h
+++ b/include/linux/irq_sim.h
@@ -1,14 +1,11 @@
-#ifndef _LINUX_IRQ_SIM_H
-#define _LINUX_IRQ_SIM_H
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
+ * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl>
*/
+#ifndef _LINUX_IRQ_SIM_H
+#define _LINUX_IRQ_SIM_H
+
#include <linux/irq_work.h>
#include <linux/device.h>
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index f5af3b594e6e..cbb872c1b607 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -587,6 +587,7 @@ struct fwnode_handle;
int its_cpu_init(void);
int its_init(struct fwnode_handle *handle, struct rdists *rdists,
struct irq_domain *domain);
+int mbi_init(struct fwnode_handle *fwnode, struct irq_domain *parent);
static inline bool gic_enable_sre(void)
{
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 48c7e86bb556..dccfa65aee96 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -301,7 +301,13 @@ static inline struct irq_domain *irq_find_matching_host(struct device_node *node
static inline struct irq_domain *irq_find_host(struct device_node *node)
{
- return irq_find_matching_host(node, DOMAIN_BUS_ANY);
+ struct irq_domain *d;
+
+ d = irq_find_matching_host(node, DOMAIN_BUS_WIRED);
+ if (!d)
+ d = irq_find_matching_host(node, DOMAIN_BUS_ANY);
+
+ return d;
}
/**
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 7b4899c06f49..74ea5e2310a8 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -45,7 +45,7 @@ struct vmem_altmap {
* must be treated as an opaque object, rather than a "normal" struct page.
*
* A more complete discussion of unaddressable memory may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ * include/linux/hmm.h and Documentation/vm/hmm.rst.
*
* MEMORY_DEVICE_PUBLIC:
* Device memory that is cache coherent from device and CPU point of view. This
@@ -67,7 +67,7 @@ enum memory_type {
* page_free()
*
* Additional notes about MEMORY_DEVICE_PRIVATE may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief
+ * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief
* explanation in include/linux/memory_hotplug.h.
*
* The page_fault() callback must migrate page back, from device memory to
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index 2d4e23c9ea0a..f09e9cf2e4ab 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -197,6 +197,8 @@ struct cros_ec_dev {
u32 features[2];
};
+#define to_cros_ec_dev(dev) container_of(dev, struct cros_ec_dev, class_dev)
+
/**
* cros_ec_suspend - Handle a suspend operation for the ChromeOS EC device
*
diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h
index 638222e43e48..54a3cd808f9e 100644
--- a/include/linux/mfd/mc13xxx.h
+++ b/include/linux/mfd/mc13xxx.h
@@ -243,6 +243,8 @@ struct mc13xxx_platform_data {
#define MC13XXX_ADC0_LICELLCON (1 << 0)
#define MC13XXX_ADC0_CHRGICON (1 << 1)
#define MC13XXX_ADC0_BATICON (1 << 2)
+#define MC13XXX_ADC0_ADIN7SEL_DIE (1 << 4)
+#define MC13XXX_ADC0_ADIN7SEL_UID (2 << 4)
#define MC13XXX_ADC0_ADREFEN (1 << 10)
#define MC13XXX_ADC0_TSMOD0 (1 << 12)
#define MC13XXX_ADC0_TSMOD1 (1 << 13)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 2d07a1ed5a31..392e6af82701 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -174,7 +174,7 @@ struct mmu_notifier_ops {
* invalidate_range_start()/end() notifiers, as
* invalidate_range() alread catches the points in time when an
* external TLB range needs to be flushed. For more in depth
- * discussion on this see Documentation/vm/mmu_notifier.txt
+ * discussion on this see Documentation/vm/mmu_notifier.rst
*
* Note that this function might be called with just a sub-range
* of what was passed to invalidate_range_start()/end(), if
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 1f1bbb5b4679..5839d8062dfc 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -289,6 +289,8 @@ enum {
* MSI_FLAG_ACTIVATE_EARLY has been set.
*/
MSI_FLAG_MUST_REACTIVATE = (1 << 5),
+ /* Is level-triggered capable, using two messages */
+ MSI_FLAG_LEVEL_CAPABLE = (1 << 6),
};
int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 14bc0d5d0ee5..3093dd162424 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -146,9 +146,6 @@ extern void __mutex_init(struct mutex *lock, const char *name,
*/
static inline bool mutex_is_locked(struct mutex *lock)
{
- /*
- * XXX think about spin_is_locked
- */
return __mutex_owner(lock) != NULL;
}
diff --git a/include/linux/net.h b/include/linux/net.h
index 2248a052061d..3fd9d8c16581 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -147,6 +147,7 @@ struct proto_ops {
int (*getname) (struct socket *sock,
struct sockaddr *addr,
int peer);
+ __poll_t (*poll_mask) (struct socket *sock, __poll_t events);
__poll_t (*poll) (struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int (*ioctl) (struct socket *sock, unsigned int cmd,
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 8da5a1b31ece..165fd302b442 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -55,7 +55,9 @@ static inline struct device_node *of_cpu_device_node_get(int cpu)
return of_node_get(cpu_dev->of_node);
}
-int of_dma_configure(struct device *dev, struct device_node *np);
+int of_dma_configure(struct device *dev,
+ struct device_node *np,
+ bool force_dma);
void of_dma_deconfigure(struct device *dev);
#else /* CONFIG_OF */
@@ -105,7 +107,9 @@ static inline struct device_node *of_cpu_device_node_get(int cpu)
return NULL;
}
-static inline int of_dma_configure(struct device *dev, struct device_node *np)
+static inline int of_dma_configure(struct device *dev,
+ struct device_node *np,
+ bool force_dma)
{
return 0;
}
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 73178a2fcee0..55371cb827ad 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -670,7 +670,7 @@ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn,
int reg, int len, u32 val);
-#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
typedef u64 pci_bus_addr_t;
#else
typedef u32 pci_bus_addr_t;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99eb9a4e..bea0b0cd4bf7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -467,7 +467,7 @@ enum perf_addr_filter_action_t {
*/
struct perf_addr_filter {
struct list_head entry;
- struct inode *inode;
+ struct path path;
unsigned long offset;
unsigned long size;
enum perf_addr_filter_action_t action;
@@ -1016,6 +1016,14 @@ static inline int is_software_event(struct perf_event *event)
return event->event_caps & PERF_EV_CAP_SOFTWARE;
}
+/*
+ * Return 1 for event in sw context, 0 for event in hw context
+ */
+static inline int in_software_context(struct perf_event *event)
+{
+ return event->ctx->pmu->task_ctx_nr == perf_sw_context;
+}
+
extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 49f634d96118..3097c943fab9 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -356,6 +356,8 @@ extern int platform_pm_restore(struct device *dev);
#define platform_pm_restore NULL
#endif
+extern int platform_dma_configure(struct device *dev);
+
#ifdef CONFIG_PM_SLEEP
#define USE_PLATFORM_PM_SLEEP_OPS \
.suspend = platform_pm_suspend, \
diff --git a/include/linux/poll.h b/include/linux/poll.h
index f45ebd017eaa..fdf86b4cbc71 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -74,6 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
pt->_key = ~(__poll_t)0; /* all events enabled */
}
+static inline bool file_has_poll_mask(struct file *file)
+{
+ return file->f_op->get_poll_head && file->f_op->poll_mask;
+}
+
+static inline bool file_can_poll(struct file *file)
+{
+ return file->f_op->poll || file_has_poll_mask(file);
+}
+
+__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt);
+
struct poll_table_entry {
struct file *filp;
__poll_t key;
@@ -96,8 +108,6 @@ struct poll_wqueues {
extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
-extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
- ktime_t *expires, unsigned long slack);
extern u64 select_estimate_accuracy(struct timespec64 *tv);
#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 919b2a0b0307..037bf0ef1ae9 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -345,7 +345,6 @@ extern void user_single_step_siginfo(struct task_struct *tsk,
static inline void user_single_step_siginfo(struct task_struct *tsk,
struct pt_regs *regs, siginfo_t *info)
{
- memset(info, 0, sizeof(*info));
info->si_signo = SIGTRAP;
}
#endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 36360d07f25b..e679b175b411 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -108,7 +108,6 @@ void rcu_sched_qs(void);
void rcu_bh_qs(void);
void rcu_check_callbacks(int user);
void rcu_report_dead(unsigned int cpu);
-void rcu_cpu_starting(unsigned int cpu);
void rcutree_migrate_callbacks(int cpu);
#ifdef CONFIG_RCU_STALL_COMMON
@@ -188,13 +187,13 @@ static inline void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
/**
- * cond_resched_rcu_qs - Report potential quiescent states to RCU
+ * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
*
* This macro resembles cond_resched(), except that it is defined to
* report potential quiescent states to RCU-tasks even if the cond_resched()
* machinery were to be shut off, as some advocate for PREEMPT kernels.
*/
-#define cond_resched_rcu_qs() \
+#define cond_resched_tasks_rcu_qs() \
do { \
if (!cond_resched()) \
rcu_note_voluntary_context_switch_lite(current); \
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index ce9beec35e34..7b3c82e8a625 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -132,5 +132,6 @@ static inline void rcu_all_qs(void) { barrier(); }
#define rcutree_offline_cpu NULL
#define rcutree_dead_cpu NULL
#define rcutree_dying_cpu NULL
+static inline void rcu_cpu_starting(unsigned int cpu) { }
#endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index fd996cdf1833..914655848ef6 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -74,6 +74,7 @@ static inline void synchronize_rcu_bh_expedited(void)
void rcu_barrier(void);
void rcu_barrier_bh(void);
void rcu_barrier_sched(void);
+bool rcu_eqs_special_set(int cpu);
unsigned long get_state_synchronize_rcu(void);
void cond_synchronize_rcu(unsigned long oldstate);
unsigned long get_state_synchronize_sched(void);
@@ -100,5 +101,6 @@ int rcutree_online_cpu(unsigned int cpu);
int rcutree_offline_cpu(unsigned int cpu);
int rcutree_dead_cpu(unsigned int cpu);
int rcutree_dying_cpu(unsigned int cpu);
+void rcu_cpu_starting(unsigned int cpu);
#endif /* __LINUX_RCUTREE_H */
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 5f7ad0552c03..4f38068ffb71 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/rbtree.h>
+#include <linux/ktime.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/bug.h>
@@ -587,7 +588,10 @@ struct regmap *__devm_regmap_init_sdw(struct sdw_slave *sdw,
const struct regmap_config *config,
struct lock_class_key *lock_key,
const char *lock_name);
-
+struct regmap *__devm_regmap_init_slimbus(struct slim_device *slimbus,
+ const struct regmap_config *config,
+ struct lock_class_key *lock_key,
+ const char *lock_name);
/*
* Wrapper for regmap_init macros to include a unique lockdep key and name
* for each call. No-op if CONFIG_LOCKDEP is not set.
@@ -906,6 +910,19 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
__regmap_lockdep_wrapper(__devm_regmap_init_sdw, #config, \
sdw, config)
+/**
+ * devm_regmap_init_slimbus() - Initialise managed register map
+ *
+ * @slimbus: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap. The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_slimbus(slimbus, config) \
+ __regmap_lockdep_wrapper(__devm_regmap_init_slimbus, #config, \
+ slimbus, config)
int regmap_mmio_attach_clk(struct regmap *map, struct clk *clk);
void regmap_mmio_detach_clk(struct regmap *map);
void regmap_exit(struct regmap *map);
diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
index bcfdb918cd81..5d83d0c1d06c 100644
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -7,6 +7,7 @@
#include <linux/compiler.h>
#include <linux/types.h>
+#include <linux/time64.h>
struct timespec;
struct compat_timespec;
@@ -15,9 +16,7 @@ struct pollfd;
enum timespec_type {
TT_NONE = 0,
TT_NATIVE = 1,
-#ifdef CONFIG_COMPAT
TT_COMPAT = 2,
-#endif
};
/*
@@ -40,10 +39,8 @@ struct restart_block {
clockid_t clockid;
enum timespec_type type;
union {
- struct timespec __user *rmtp;
-#ifdef CONFIG_COMPAT
+ struct __kernel_timespec __user *rmtp;
struct compat_timespec __user *compat_rmtp;
-#endif
};
u64 expires;
} nanosleep;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ca3f3eae8980..14e4f9c12337 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1512,6 +1512,7 @@ static inline int task_nice(const struct task_struct *p)
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
+extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
@@ -1661,7 +1662,6 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
* explicit rescheduling in places that are safe. The return
* value indicates whether a reschedule was done in fact.
* cond_resched_lock() will drop the spinlock before scheduling,
- * cond_resched_softirq() will enable bhs before scheduling.
*/
#ifndef CONFIG_PREEMPT
extern int _cond_resched(void);
@@ -1681,13 +1681,6 @@ extern int __cond_resched_lock(spinlock_t *lock);
__cond_resched_lock(lock); \
})
-extern int __cond_resched_softirq(void);
-
-#define cond_resched_softirq() ({ \
- ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
- __cond_resched_softirq(); \
-})
-
static inline void cond_resched_rcu(void)
{
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2c570cd934af..76a8cb4ef178 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -28,7 +28,7 @@ extern struct mm_struct *mm_alloc(void);
*
* Use mmdrop() to release the reference acquired by mmgrab().
*
- * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
* of &mm_struct.mm_count vs &mm_struct.mm_users.
*/
static inline void mmgrab(struct mm_struct *mm)
@@ -62,7 +62,7 @@ static inline void mmdrop(struct mm_struct *mm)
*
* Use mmput() to release the reference acquired by mmget().
*
- * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
* of &mm_struct.mm_count vs &mm_struct.mm_users.
*/
static inline void mmget(struct mm_struct *mm)
@@ -170,6 +170,17 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif
+/**
+ * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
+ *
+ * This functions marks the beginning of the GFP_NOIO allocation scope.
+ * All further allocations will implicitly drop __GFP_IO flag and so
+ * they are safe for the IO critical section from the allocation recursion
+ * point of view. Use memalloc_noio_restore to end the scope with flags
+ * returned by this function.
+ *
+ * This function is safe to be used from any context.
+ */
static inline unsigned int memalloc_noio_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
@@ -177,11 +188,30 @@ static inline unsigned int memalloc_noio_save(void)
return flags;
}
+/**
+ * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function.
+ * Always make sure that that the given flags is the return value from the
+ * pairing memalloc_noio_save call.
+ */
static inline void memalloc_noio_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
}
+/**
+ * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
+ *
+ * This functions marks the beginning of the GFP_NOFS allocation scope.
+ * All further allocations will implicitly drop __GFP_FS flag and so
+ * they are safe for the FS critical section from the allocation recursion
+ * point of view. Use memalloc_nofs_restore to end the scope with flags
+ * returned by this function.
+ *
+ * This function is safe to be used from any context.
+ */
static inline unsigned int memalloc_nofs_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
@@ -189,6 +219,14 @@ static inline unsigned int memalloc_nofs_save(void)
return flags;
}
+/**
+ * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function.
+ * Always make sure that that the given flags is the return value from the
+ * pairing memalloc_nofs_save call.
+ */
static inline void memalloc_nofs_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
diff --git a/include/linux/signal.h b/include/linux/signal.h
index a9bc7e1b077e..3c5200137b24 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -28,6 +28,9 @@ enum siginfo_layout {
SIL_TIMER,
SIL_POLL,
SIL_FAULT,
+ SIL_FAULT_MCEERR,
+ SIL_FAULT_BNDERR,
+ SIL_FAULT_PKUERR,
SIL_CHLD,
SIL_RT,
SIL_SYS,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9065477ed255..89198379b39d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3250,8 +3250,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
int *peeked, int *off, int *err);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
int *err);
-__poll_t datagram_poll(struct file *file, struct socket *sock,
- struct poll_table_struct *wait);
+__poll_t datagram_poll_mask(struct socket *sock, __poll_t events);
int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
struct iov_iter *to, int size);
static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
new file mode 100644
index 000000000000..bb4bd15ae1f6
--- /dev/null
+++ b/include/linux/spi/spi-mem.h
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2018 Exceet Electronics GmbH
+ * Copyright (C) 2018 Bootlin
+ *
+ * Author: Boris Brezillon <boris.brezillon@bootlin.com>
+ */
+
+#ifndef __LINUX_SPI_MEM_H
+#define __LINUX_SPI_MEM_H
+
+#include <linux/spi/spi.h>
+
+#define SPI_MEM_OP_CMD(__opcode, __buswidth) \
+ { \
+ .buswidth = __buswidth, \
+ .opcode = __opcode, \
+ }
+
+#define SPI_MEM_OP_ADDR(__nbytes, __val, __buswidth) \
+ { \
+ .nbytes = __nbytes, \
+ .val = __val, \
+ .buswidth = __buswidth, \
+ }
+
+#define SPI_MEM_OP_NO_ADDR { }
+
+#define SPI_MEM_OP_DUMMY(__nbytes, __buswidth) \
+ { \
+ .nbytes = __nbytes, \
+ .buswidth = __buswidth, \
+ }
+
+#define SPI_MEM_OP_NO_DUMMY { }
+
+#define SPI_MEM_OP_DATA_IN(__nbytes, __buf, __buswidth) \
+ { \
+ .dir = SPI_MEM_DATA_IN, \
+ .nbytes = __nbytes, \
+ .buf.in = __buf, \
+ .buswidth = __buswidth, \
+ }
+
+#define SPI_MEM_OP_DATA_OUT(__nbytes, __buf, __buswidth) \
+ { \
+ .dir = SPI_MEM_DATA_OUT, \
+ .nbytes = __nbytes, \
+ .buf.out = __buf, \
+ .buswidth = __buswidth, \
+ }
+
+#define SPI_MEM_OP_NO_DATA { }
+
+/**
+ * enum spi_mem_data_dir - describes the direction of a SPI memory data
+ * transfer from the controller perspective
+ * @SPI_MEM_DATA_IN: data coming from the SPI memory
+ * @SPI_MEM_DATA_OUT: data sent the SPI memory
+ */
+enum spi_mem_data_dir {
+ SPI_MEM_DATA_IN,
+ SPI_MEM_DATA_OUT,
+};
+
+/**
+ * struct spi_mem_op - describes a SPI memory operation
+ * @cmd.buswidth: number of IO lines used to transmit the command
+ * @cmd.opcode: operation opcode
+ * @addr.nbytes: number of address bytes to send. Can be zero if the operation
+ * does not need to send an address
+ * @addr.buswidth: number of IO lines used to transmit the address cycles
+ * @addr.val: address value. This value is always sent MSB first on the bus.
+ * Note that only @addr.nbytes are taken into account in this
+ * address value, so users should make sure the value fits in the
+ * assigned number of bytes.
+ * @dummy.nbytes: number of dummy bytes to send after an opcode or address. Can
+ * be zero if the operation does not require dummy bytes
+ * @dummy.buswidth: number of IO lanes used to transmit the dummy bytes
+ * @data.buswidth: number of IO lanes used to send/receive the data
+ * @data.dir: direction of the transfer
+ * @data.buf.in: input buffer
+ * @data.buf.out: output buffer
+ */
+struct spi_mem_op {
+ struct {
+ u8 buswidth;
+ u8 opcode;
+ } cmd;
+
+ struct {
+ u8 nbytes;
+ u8 buswidth;
+ u64 val;
+ } addr;
+
+ struct {
+ u8 nbytes;
+ u8 buswidth;
+ } dummy;
+
+ struct {
+ u8 buswidth;
+ enum spi_mem_data_dir dir;
+ unsigned int nbytes;
+ /* buf.{in,out} must be DMA-able. */
+ union {
+ void *in;
+ const void *out;
+ } buf;
+ } data;
+};
+
+#define SPI_MEM_OP(__cmd, __addr, __dummy, __data) \
+ { \
+ .cmd = __cmd, \
+ .addr = __addr, \
+ .dummy = __dummy, \
+ .data = __data, \
+ }
+
+/**
+ * struct spi_mem - describes a SPI memory device
+ * @spi: the underlying SPI device
+ * @drvpriv: spi_mem_drviver private data
+ *
+ * Extra information that describe the SPI memory device and may be needed by
+ * the controller to properly handle this device should be placed here.
+ *
+ * One example would be the device size since some controller expose their SPI
+ * mem devices through a io-mapped region.
+ */
+struct spi_mem {
+ struct spi_device *spi;
+ void *drvpriv;
+};
+
+/**
+ * struct spi_mem_set_drvdata() - attach driver private data to a SPI mem
+ * device
+ * @mem: memory device
+ * @data: data to attach to the memory device
+ */
+static inline void spi_mem_set_drvdata(struct spi_mem *mem, void *data)
+{
+ mem->drvpriv = data;
+}
+
+/**
+ * struct spi_mem_get_drvdata() - get driver private data attached to a SPI mem
+ * device
+ * @mem: memory device
+ *
+ * Return: the data attached to the mem device.
+ */
+static inline void *spi_mem_get_drvdata(struct spi_mem *mem)
+{
+ return mem->drvpriv;
+}
+
+/**
+ * struct spi_controller_mem_ops - SPI memory operations
+ * @adjust_op_size: shrink the data xfer of an operation to match controller's
+ * limitations (can be alignment of max RX/TX size
+ * limitations)
+ * @supports_op: check if an operation is supported by the controller
+ * @exec_op: execute a SPI memory operation
+ *
+ * This interface should be implemented by SPI controllers providing an
+ * high-level interface to execute SPI memory operation, which is usually the
+ * case for QSPI controllers.
+ */
+struct spi_controller_mem_ops {
+ int (*adjust_op_size)(struct spi_mem *mem, struct spi_mem_op *op);
+ bool (*supports_op)(struct spi_mem *mem,
+ const struct spi_mem_op *op);
+ int (*exec_op)(struct spi_mem *mem,
+ const struct spi_mem_op *op);
+};
+
+/**
+ * struct spi_mem_driver - SPI memory driver
+ * @spidrv: inherit from a SPI driver
+ * @probe: probe a SPI memory. Usually where detection/initialization takes
+ * place
+ * @remove: remove a SPI memory
+ * @shutdown: take appropriate action when the system is shutdown
+ *
+ * This is just a thin wrapper around a spi_driver. The core takes care of
+ * allocating the spi_mem object and forwarding the probe/remove/shutdown
+ * request to the spi_mem_driver. The reason we use this wrapper is because
+ * we might have to stuff more information into the spi_mem struct to let
+ * SPI controllers know more about the SPI memory they interact with, and
+ * having this intermediate layer allows us to do that without adding more
+ * useless fields to the spi_device object.
+ */
+struct spi_mem_driver {
+ struct spi_driver spidrv;
+ int (*probe)(struct spi_mem *mem);
+ int (*remove)(struct spi_mem *mem);
+ void (*shutdown)(struct spi_mem *mem);
+};
+
+#if IS_ENABLED(CONFIG_SPI_MEM)
+int spi_controller_dma_map_mem_op_data(struct spi_controller *ctlr,
+ const struct spi_mem_op *op,
+ struct sg_table *sg);
+
+void spi_controller_dma_unmap_mem_op_data(struct spi_controller *ctlr,
+ const struct spi_mem_op *op,
+ struct sg_table *sg);
+#else
+static inline int
+spi_controller_dma_map_mem_op_data(struct spi_controller *ctlr,
+ const struct spi_mem_op *op,
+ struct sg_table *sg)
+{
+ return -ENOTSUPP;
+}
+
+static inline void
+spi_controller_dma_unmap_mem_op_data(struct spi_controller *ctlr,
+ const struct spi_mem_op *op,
+ struct sg_table *sg)
+{
+}
+#endif /* CONFIG_SPI_MEM */
+
+int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op);
+
+bool spi_mem_supports_op(struct spi_mem *mem,
+ const struct spi_mem_op *op);
+
+int spi_mem_exec_op(struct spi_mem *mem,
+ const struct spi_mem_op *op);
+
+int spi_mem_driver_register_with_owner(struct spi_mem_driver *drv,
+ struct module *owner);
+
+void spi_mem_driver_unregister(struct spi_mem_driver *drv);
+
+#define spi_mem_driver_register(__drv) \
+ spi_mem_driver_register_with_owner(__drv, THIS_MODULE)
+
+#define module_spi_mem_driver(__drv) \
+ module_driver(__drv, spi_mem_driver_register, \
+ spi_mem_driver_unregister)
+
+#endif /* __LINUX_SPI_MEM_H */
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index bc6bb325d1bf..a64235e05321 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -26,7 +26,7 @@ struct dma_chan;
struct property_entry;
struct spi_controller;
struct spi_transfer;
-struct spi_flash_read_message;
+struct spi_controller_mem_ops;
/*
* INTERFACES between SPI master-side drivers and SPI slave protocol handlers,
@@ -376,13 +376,11 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
* transfer_one callback.
* @handle_err: the subsystem calls the driver to handle an error that occurs
* in the generic implementation of transfer_one_message().
+ * @mem_ops: optimized/dedicated operations for interactions with SPI memory.
+ * This field is optional and should only be implemented if the
+ * controller has native support for memory like operations.
* @unprepare_message: undo any work done by prepare_message().
* @slave_abort: abort the ongoing transfer request on an SPI slave controller
- * @spi_flash_read: to support spi-controller hardwares that provide
- * accelerated interface to read from flash devices.
- * @spi_flash_can_dma: analogous to can_dma() interface, but for
- * controllers implementing spi_flash_read.
- * @flash_read_supported: spi device supports flash read
* @cs_gpios: Array of GPIOs to use as chip select lines; one per CS
* number. Any individual value may be -ENOENT for CS lines that
* are not GPIOs (driven by the SPI controller itself).
@@ -548,11 +546,6 @@ struct spi_controller {
int (*unprepare_message)(struct spi_controller *ctlr,
struct spi_message *message);
int (*slave_abort)(struct spi_controller *ctlr);
- int (*spi_flash_read)(struct spi_device *spi,
- struct spi_flash_read_message *msg);
- bool (*spi_flash_can_dma)(struct spi_device *spi,
- struct spi_flash_read_message *msg);
- bool (*flash_read_supported)(struct spi_device *spi);
/*
* These hooks are for drivers that use a generic implementation
@@ -564,6 +557,9 @@ struct spi_controller {
void (*handle_err)(struct spi_controller *ctlr,
struct spi_message *message);
+ /* Optimized handlers for SPI memory-like operations. */
+ const struct spi_controller_mem_ops *mem_ops;
+
/* gpio chip select */
int *cs_gpios;
@@ -1183,48 +1179,6 @@ static inline ssize_t spi_w8r16be(struct spi_device *spi, u8 cmd)
return be16_to_cpu(result);
}
-/**
- * struct spi_flash_read_message - flash specific information for
- * spi-masters that provide accelerated flash read interfaces
- * @buf: buffer to read data
- * @from: offset within the flash from where data is to be read
- * @len: length of data to be read
- * @retlen: actual length of data read
- * @read_opcode: read_opcode to be used to communicate with flash
- * @addr_width: number of address bytes
- * @dummy_bytes: number of dummy bytes
- * @opcode_nbits: number of lines to send opcode
- * @addr_nbits: number of lines to send address
- * @data_nbits: number of lines for data
- * @rx_sg: Scatterlist for receive data read from flash
- * @cur_msg_mapped: message has been mapped for DMA
- */
-struct spi_flash_read_message {
- void *buf;
- loff_t from;
- size_t len;
- size_t retlen;
- u8 read_opcode;
- u8 addr_width;
- u8 dummy_bytes;
- u8 opcode_nbits;
- u8 addr_nbits;
- u8 data_nbits;
- struct sg_table rx_sg;
- bool cur_msg_mapped;
-};
-
-/* SPI core interface for flash read support */
-static inline bool spi_flash_read_supported(struct spi_device *spi)
-{
- return spi->controller->spi_flash_read &&
- (!spi->controller->flash_read_supported ||
- spi->controller->flash_read_supported(spi));
-}
-
-int spi_flash_read(struct spi_device *spi,
- struct spi_flash_read_message *msg);
-
/*---------------------------------------------------------------------------*/
/*
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 4894d322d258..1e8a46435838 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -380,6 +380,24 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock)
raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
})
+/**
+ * spin_is_locked() - Check whether a spinlock is locked.
+ * @lock: Pointer to the spinlock.
+ *
+ * This function is NOT required to provide any memory ordering
+ * guarantees; it could be used for debugging purposes or, when
+ * additional synchronization is needed, accompanied with other
+ * constructs (memory barriers) enforcing the synchronization.
+ *
+ * Returns: 1 if @lock is locked, 0 otherwise.
+ *
+ * Note that the function only tells you that the spinlock is
+ * seen to be locked, not that it is locked on your CPU.
+ *
+ * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n,
+ * the return value is always 0 (see include/linux/spinlock_up.h).
+ * Therefore you should not rely heavily on the return value.
+ */
static __always_inline int spin_is_locked(spinlock_t *lock)
{
return raw_spin_is_locked(&lock->rlock);
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 33c1c698df09..91494d7e8e41 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -69,11 +69,45 @@ struct srcu_struct { };
void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
void (*func)(struct rcu_head *head));
-void cleanup_srcu_struct(struct srcu_struct *sp);
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
void synchronize_srcu(struct srcu_struct *sp);
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+static inline void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ _cleanup_srcu_struct(sp, false);
+}
+
+/**
+ * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory. Also,
+ * all grace-period processing must have completed.
+ *
+ * "Completed" means that the last synchronize_srcu() and
+ * synchronize_srcu_expedited() calls must have returned before the call
+ * to cleanup_srcu_struct_quiesced(). It also means that the callback
+ * from the last call_srcu() must have been invoked before the call to
+ * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help
+ * with this last. Violating these rules will get you a WARN_ON() splat
+ * (with high probability, anyway), and will also cause the srcu_struct
+ * to be leaked.
+ */
+static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
+{
+ _cleanup_srcu_struct(sp, true);
+}
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
diff --git a/include/linux/string.h b/include/linux/string.h
index dd39a690c841..4a5a0eb7df51 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -147,8 +147,8 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
extern void * memchr(const void *,int,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCPY_MCSAFE
-static inline __must_check int memcpy_mcsafe(void *dst, const void *src,
- size_t cnt)
+static inline __must_check unsigned long memcpy_mcsafe(void *dst,
+ const void *src, size_t cnt)
{
memcpy(dst, src, cnt);
return 0;
diff --git a/include/linux/swait.h b/include/linux/swait.h
index c98aaf677466..bf8cb0dee23c 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -5,10 +5,23 @@
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>
+#include <linux/wait.h>
#include <asm/current.h>
/*
- * Simple wait queues
+ * BROKEN wait-queues.
+ *
+ * These "simple" wait-queues are broken garbage, and should never be
+ * used. The comments below claim that they are "similar" to regular
+ * wait-queues, but the semantics are actually completely different, and
+ * every single user we have ever had has been buggy (or pointless).
+ *
+ * A "swake_up()" only wakes up _one_ waiter, which is not at all what
+ * "wake_up()" does, and has led to problems. In other cases, it has
+ * been fine, because there's only ever one waiter (kvm), but in that
+ * case gthe whole "simple" wait-queue is just pointless to begin with,
+ * since there is no "queue". Use "wake_up_process()" with a direct
+ * pointer instead.
*
* While these are very similar to regular wait queues (wait.h) the most
* important difference is that the simple waitqueue allows for deterministic
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2417d288e016..c063443d8638 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -53,7 +53,7 @@ static inline int current_is_kswapd(void)
/*
* Unaddressable device memory support. See include/linux/hmm.h and
- * Documentation/vm/hmm.txt. Short description is we need struct pages for
+ * Documentation/vm/hmm.rst. Short description is we need struct pages for
* device memory that is unaddressable (inaccessible) by CPU, so that we can
* migrate part of a process memory to device memory.
*
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 70fcda1a9049..390e814fdc8d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -290,6 +290,12 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id,
long nr,
struct io_event __user *events,
struct timespec __user *timeout);
+asmlinkage long sys_io_pgetevents(aio_context_t ctx_id,
+ long min_nr,
+ long nr,
+ struct io_event __user *events,
+ struct timespec __user *timeout,
+ const struct __aio_sigset *sig);
/* fs/xattr.c */
asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
@@ -536,7 +542,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
size_t len);
/* kernel/hrtimer.c */
-asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp);
+asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
+ struct __kernel_timespec __user *rmtp);
/* kernel/itimer.c */
asmlinkage long sys_getitimer(int which, struct itimerval __user *value);
@@ -567,14 +574,14 @@ asmlinkage long sys_timer_settime(timer_t timer_id, int flags,
struct itimerspec __user *old_setting);
asmlinkage long sys_timer_delete(timer_t timer_id);
asmlinkage long sys_clock_settime(clockid_t which_clock,
- const struct timespec __user *tp);
+ const struct __kernel_timespec __user *tp);
asmlinkage long sys_clock_gettime(clockid_t which_clock,
- struct timespec __user *tp);
+ struct __kernel_timespec __user *tp);
asmlinkage long sys_clock_getres(clockid_t which_clock,
- struct timespec __user *tp);
+ struct __kernel_timespec __user *tp);
asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
- const struct timespec __user *rqtp,
- struct timespec __user *rmtp);
+ const struct __kernel_timespec __user *rqtp,
+ struct __kernel_timespec __user *rmtp);
/* kernel/printk.c */
asmlinkage long sys_syslog(int type, char __user *buf, int len);
@@ -679,8 +686,8 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info);
/* ipc/mqueue.c */
asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr);
asmlinkage long sys_mq_unlink(const char __user *name);
-asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
-asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
+asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct __kernel_timespec __user *abs_timeout);
+asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct __kernel_timespec __user *abs_timeout);
asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
@@ -697,7 +704,7 @@ asmlinkage long sys_semget(key_t key, int nsems, int semflg);
asmlinkage long sys_semctl(int semid, int semnum, int cmd, unsigned long arg);
asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops,
unsigned nsops,
- const struct timespec __user *timeout);
+ const struct __kernel_timespec __user *timeout);
asmlinkage long sys_semop(int semid, struct sembuf __user *sops,
unsigned nsops);
diff --git a/include/linux/time.h b/include/linux/time.h
index 4b62a2c0a661..aed74463592d 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -10,9 +10,9 @@
extern struct timezone sys_tz;
int get_timespec64(struct timespec64 *ts,
- const struct timespec __user *uts);
+ const struct __kernel_timespec __user *uts);
int put_timespec64(const struct timespec64 *ts,
- struct timespec __user *uts);
+ struct __kernel_timespec __user *uts);
int get_itimerspec64(struct itimerspec64 *it,
const struct itimerspec __user *uit);
int put_itimerspec64(const struct itimerspec64 *it,
diff --git a/include/linux/time32.h b/include/linux/time32.h
index d2bcd4377b56..0b14f936100a 100644
--- a/include/linux/time32.h
+++ b/include/linux/time32.h
@@ -18,25 +18,14 @@
/* timespec64 is defined as timespec here */
static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
{
- return ts64;
+ return *(const struct timespec *)&ts64;
}
static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
{
- return ts;
+ return *(const struct timespec64 *)&ts;
}
-# define timespec_equal timespec64_equal
-# define timespec_compare timespec64_compare
-# define set_normalized_timespec set_normalized_timespec64
-# define timespec_add timespec64_add
-# define timespec_sub timespec64_sub
-# define timespec_valid timespec64_valid
-# define timespec_valid_strict timespec64_valid_strict
-# define timespec_to_ns timespec64_to_ns
-# define ns_to_timespec ns_to_timespec64
-# define timespec_add_ns timespec64_add_ns
-
#else
static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
{
@@ -55,6 +44,7 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
ret.tv_nsec = ts.tv_nsec;
return ret;
}
+#endif
static inline int timespec_equal(const struct timespec *a,
const struct timespec *b)
@@ -159,8 +149,6 @@ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns)
a->tv_nsec = ns;
}
-#endif
-
/**
* time_to_tm - converts the calendar time to local broken-down time
*
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 93d39499838e..0a7b2f79cec7 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -2,17 +2,20 @@
#ifndef _LINUX_TIME64_H
#define _LINUX_TIME64_H
-#include <uapi/linux/time.h>
#include <linux/math64.h>
typedef __s64 time64_t;
typedef __u64 timeu64_t;
-#if __BITS_PER_LONG == 64
-/* this trick allows us to optimize out timespec64_to_timespec */
-# define timespec64 timespec
-#define itimerspec64 itimerspec
-#else
+/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path
+ * and 32-bit emulation.
+ */
+#ifndef CONFIG_64BIT_TIME
+#define __kernel_timespec timespec
+#endif
+
+#include <uapi/linux/time.h>
+
struct timespec64 {
time64_t tv_sec; /* seconds */
long tv_nsec; /* nanoseconds */
@@ -23,8 +26,6 @@ struct itimerspec64 {
struct timespec64 it_value;
};
-#endif
-
/* Parameters used to convert the timespec values: */
#define MSEC_PER_SEC 1000L
#define USEC_PER_MSEC 1000L
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 588a0e4b1ab9..86bc2026efce 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -19,27 +19,25 @@ extern void xtime_update(unsigned long ticks);
extern int do_settimeofday64(const struct timespec64 *ts);
extern int do_sys_settimeofday64(const struct timespec64 *tv,
const struct timezone *tz);
-/*
- * Kernel time accessors
- */
-struct timespec64 current_kernel_time64(void);
/*
* timespec64 based interfaces
*/
-struct timespec64 get_monotonic_coarse64(void);
-extern void getrawmonotonic64(struct timespec64 *ts);
+extern void ktime_get_raw_ts64(struct timespec64 *ts);
extern void ktime_get_ts64(struct timespec64 *ts);
+extern void ktime_get_real_ts64(struct timespec64 *tv);
+extern void ktime_get_coarse_ts64(struct timespec64 *ts);
+extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
+
+void getboottime64(struct timespec64 *ts);
+
+/*
+ * time64_t base interfaces
+ */
extern time64_t ktime_get_seconds(void);
extern time64_t __ktime_get_real_seconds(void);
extern time64_t ktime_get_real_seconds(void);
-extern int __getnstimeofday64(struct timespec64 *tv);
-extern void getnstimeofday64(struct timespec64 *tv);
-extern void getboottime64(struct timespec64 *ts);
-
-#define ktime_get_real_ts64(ts) getnstimeofday64(ts)
-
/*
* ktime_t based interfaces
*/
@@ -53,6 +51,7 @@ enum tk_offsets {
extern ktime_t ktime_get(void);
extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
+extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs);
extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
extern ktime_t ktime_get_raw(void);
extern u32 ktime_get_resolution_ns(void);
@@ -65,6 +64,11 @@ static inline ktime_t ktime_get_real(void)
return ktime_get_with_offset(TK_OFFS_REAL);
}
+static inline ktime_t ktime_get_coarse_real(void)
+{
+ return ktime_get_coarse_with_offset(TK_OFFS_REAL);
+}
+
/**
* ktime_get_boottime - Returns monotonic time since boot in ktime_t format
*
@@ -76,6 +80,11 @@ static inline ktime_t ktime_get_boottime(void)
return ktime_get_with_offset(TK_OFFS_BOOT);
}
+static inline ktime_t ktime_get_coarse_boottime(void)
+{
+ return ktime_get_coarse_with_offset(TK_OFFS_BOOT);
+}
+
/**
* ktime_get_clocktai - Returns the TAI time of day in ktime_t format
*/
@@ -84,6 +93,11 @@ static inline ktime_t ktime_get_clocktai(void)
return ktime_get_with_offset(TK_OFFS_TAI);
}
+static inline ktime_t ktime_get_coarse_clocktai(void)
+{
+ return ktime_get_coarse_with_offset(TK_OFFS_TAI);
+}
+
/**
* ktime_mono_to_real - Convert monotonic time to clock realtime
*/
@@ -123,18 +137,40 @@ extern u64 ktime_get_boot_fast_ns(void);
extern u64 ktime_get_real_fast_ns(void);
/*
- * timespec64 interfaces utilizing the ktime based ones
+ * timespec64/time64_t interfaces utilizing the ktime based ones
+ * for API completeness, these could be implemented more efficiently
+ * if needed.
*/
-static inline void get_monotonic_boottime64(struct timespec64 *ts)
+static inline void ktime_get_boottime_ts64(struct timespec64 *ts)
{
*ts = ktime_to_timespec64(ktime_get_boottime());
}
-static inline void timekeeping_clocktai64(struct timespec64 *ts)
+static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts)
+{
+ *ts = ktime_to_timespec64(ktime_get_coarse_boottime());
+}
+
+static inline time64_t ktime_get_boottime_seconds(void)
+{
+ return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC);
+}
+
+static inline void ktime_get_clocktai_ts64(struct timespec64 *ts)
{
*ts = ktime_to_timespec64(ktime_get_clocktai());
}
+static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts)
+{
+ *ts = ktime_to_timespec64(ktime_get_coarse_clocktai());
+}
+
+static inline time64_t ktime_get_clocktai_seconds(void)
+{
+ return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC);
+}
+
/*
* RTC specific
*/
@@ -210,5 +246,30 @@ extern void read_persistent_clock64(struct timespec64 *ts);
extern void read_boot_clock64(struct timespec64 *ts);
extern int update_persistent_clock64(struct timespec64 now);
+/*
+ * deprecated aliases, don't use in new code
+ */
+#define getnstimeofday64(ts) ktime_get_real_ts64(ts)
+#define get_monotonic_boottime64(ts) ktime_get_boottime_ts64(ts)
+#define getrawmonotonic64(ts) ktime_get_raw_ts64(ts)
+#define timekeeping_clocktai64(ts) ktime_get_clocktai_ts64(ts)
+
+static inline struct timespec64 current_kernel_time64(void)
+{
+ struct timespec64 ts;
+
+ ktime_get_coarse_real_ts64(&ts);
+
+ return ts;
+}
+
+static inline struct timespec64 get_monotonic_coarse64(void)
+{
+ struct timespec64 ts;
+
+ ktime_get_coarse_ts64(&ts);
+
+ return ts;
+}
#endif
diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h
index 3616b4becb59..8762c2f45f8b 100644
--- a/include/linux/timekeeping32.h
+++ b/include/linux/timekeeping32.h
@@ -11,55 +11,13 @@ unsigned long get_seconds(void);
static inline struct timespec current_kernel_time(void)
{
- struct timespec64 now = current_kernel_time64();
-
- return timespec64_to_timespec(now);
-}
-
-#if BITS_PER_LONG == 64
-/**
- * Deprecated. Use do_settimeofday64().
- */
-static inline int do_settimeofday(const struct timespec *ts)
-{
- return do_settimeofday64(ts);
-}
-
-static inline int __getnstimeofday(struct timespec *ts)
-{
- return __getnstimeofday64(ts);
-}
-
-static inline void getnstimeofday(struct timespec *ts)
-{
- getnstimeofday64(ts);
-}
-
-static inline void ktime_get_ts(struct timespec *ts)
-{
- ktime_get_ts64(ts);
-}
-
-static inline void ktime_get_real_ts(struct timespec *ts)
-{
- getnstimeofday64(ts);
-}
+ struct timespec64 ts64;
-static inline void getrawmonotonic(struct timespec *ts)
-{
- getrawmonotonic64(ts);
-}
+ ktime_get_coarse_real_ts64(&ts64);
-static inline struct timespec get_monotonic_coarse(void)
-{
- return get_monotonic_coarse64();
+ return timespec64_to_timespec(ts64);
}
-static inline void getboottime(struct timespec *ts)
-{
- return getboottime64(ts);
-}
-#else
/**
* Deprecated. Use do_settimeofday64().
*/
@@ -71,20 +29,11 @@ static inline int do_settimeofday(const struct timespec *ts)
return do_settimeofday64(&ts64);
}
-static inline int __getnstimeofday(struct timespec *ts)
-{
- struct timespec64 ts64;
- int ret = __getnstimeofday64(&ts64);
-
- *ts = timespec64_to_timespec(ts64);
- return ret;
-}
-
static inline void getnstimeofday(struct timespec *ts)
{
struct timespec64 ts64;
- getnstimeofday64(&ts64);
+ ktime_get_real_ts64(&ts64);
*ts = timespec64_to_timespec(ts64);
}
@@ -100,7 +49,7 @@ static inline void ktime_get_real_ts(struct timespec *ts)
{
struct timespec64 ts64;
- getnstimeofday64(&ts64);
+ ktime_get_real_ts64(&ts64);
*ts = timespec64_to_timespec(ts64);
}
@@ -108,13 +57,17 @@ static inline void getrawmonotonic(struct timespec *ts)
{
struct timespec64 ts64;
- getrawmonotonic64(&ts64);
+ ktime_get_raw_ts64(&ts64);
*ts = timespec64_to_timespec(ts64);
}
static inline struct timespec get_monotonic_coarse(void)
{
- return timespec64_to_timespec(get_monotonic_coarse64());
+ struct timespec64 ts64;
+
+ ktime_get_coarse_ts64(&ts64);
+
+ return timespec64_to_timespec(ts64);
}
static inline void getboottime(struct timespec *ts)
@@ -124,7 +77,6 @@ static inline void getboottime(struct timespec *ts)
getboottime64(&ts64);
*ts = timespec64_to_timespec(ts64);
}
-#endif
/*
* Timespec interfaces utilizing the ktime based ones
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 26c152122a42..4a8841963c2e 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -124,6 +124,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
{
if (step) {
siginfo_t info;
+ clear_siginfo(&info);
user_single_step_siginfo(current, regs, &info);
force_sig_info(SIGTRAP, &info, current);
return;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index e67e12adb136..f5766e853a77 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -154,6 +154,12 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#define _copy_from_iter_flushcache _copy_from_iter_nocache
#endif
+#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
+size_t _copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i);
+#else
+#define _copy_to_iter_mcsafe _copy_to_iter
+#endif
+
static __always_inline __must_check
size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
@@ -163,6 +169,15 @@ size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
return _copy_from_iter_flushcache(addr, bytes, i);
}
+static __always_inline __must_check
+size_t copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i)
+{
+ if (unlikely(!check_copy_size(addr, bytes, false)))
+ return 0;
+ else
+ return _copy_to_iter_mcsafe(addr, bytes, i);
+}
+
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index ec9d6bc65855..53ce8176c313 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -271,7 +271,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
int flags);
int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags);
-__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait);
+__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events);
int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo);
int bt_sock_wait_ready(struct sock *sk, unsigned long flags);
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 71c72a939bf8..c5187438af38 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -121,6 +121,21 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
#endif
}
+static inline void sock_poll_busy_loop(struct socket *sock, __poll_t events)
+{
+ if (sk_can_busy_loop(sock->sk) &&
+ events && (events & POLL_BUSY_LOOP)) {
+ /* once, only if requested by syscall */
+ sk_busy_loop(sock->sk, 1);
+ }
+}
+
+/* if this socket can poll_ll, tell the system call */
+static inline __poll_t sock_poll_busy_flag(struct socket *sock)
+{
+ return sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0;
+}
+
/* used in the NIC receive handler to mark the skb */
static inline void skb_mark_napi_id(struct sk_buff *skb,
struct napi_struct *napi)
diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h
index f4c21b5a1242..b0eaeb02d46d 100644
--- a/include/net/iucv/af_iucv.h
+++ b/include/net/iucv/af_iucv.h
@@ -153,8 +153,6 @@ struct iucv_sock_list {
atomic_t autobind_name;
};
-__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
- poll_table *wait);
void iucv_sock_link(struct iucv_sock_list *l, struct sock *s);
void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s);
void iucv_accept_enqueue(struct sock *parent, struct sock *sk);
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 35498e613ff5..e6d349b2a791 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -109,8 +109,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
int sctp_inet_listen(struct socket *sock, int backlog);
void sctp_write_space(struct sock *sk);
void sctp_data_ready(struct sock *sk);
-__poll_t sctp_poll(struct file *file, struct socket *sock,
- poll_table *wait);
+__poll_t sctp_poll_mask(struct socket *sock, __poll_t events);
void sctp_sock_rfree(struct sk_buff *skb);
void sctp_copy_sock(struct sock *newsk, struct sock *sk,
struct sctp_association *asoc);
diff --git a/include/net/sock.h b/include/net/sock.h
index 74d725fdbe0f..4d2e8ad98985 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1591,8 +1591,6 @@ int sock_no_connect(struct socket *, struct sockaddr *, int, int);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *, int, bool);
int sock_no_getname(struct socket *, struct sockaddr *, int);
-__poll_t sock_no_poll(struct file *, struct socket *,
- struct poll_table_struct *);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 51dc7a26a2fa..f88f8a2cab0d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,8 +388,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
void tcp_init_transfer(struct sock *sk, int bpf_op);
-__poll_t tcp_poll(struct file *file, struct socket *sock,
- struct poll_table_struct *wait);
+__poll_t tcp_poll_mask(struct socket *sock, __poll_t events);
int tcp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
int tcp_setsockopt(struct sock *sk, int level, int optname,
diff --git a/include/net/udp.h b/include/net/udp.h
index 621778b80e3d..d8ca3b26964d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -276,7 +276,7 @@ int udp_init_sock(struct sock *sk);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int __udp_disconnect(struct sock *sk, int flags);
int udp_disconnect(struct sock *sk, int flags);
-__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
+__poll_t udp_poll_mask(struct socket *sock, __poll_t events);
struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
bool is_ipv6);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 965c650a5273..39b94ec965be 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -121,9 +121,9 @@ TRACE_EVENT(btrfs_transaction_commit,
__entry->root_objectid = root->root_key.objectid;
),
- TP_printk_btrfs("root = %llu(%s), gen = %llu",
+ TP_printk_btrfs("root=%llu(%s) gen=%llu",
show_root_type(__entry->root_objectid),
- (unsigned long long)__entry->generation)
+ __entry->generation)
);
DECLARE_EVENT_CLASS(btrfs__inode,
@@ -133,7 +133,7 @@ DECLARE_EVENT_CLASS(btrfs__inode,
TP_ARGS(inode),
TP_STRUCT__entry_btrfs(
- __field( ino_t, ino )
+ __field( u64, ino )
__field( blkcnt_t, blocks )
__field( u64, disk_i_size )
__field( u64, generation )
@@ -143,7 +143,7 @@ DECLARE_EVENT_CLASS(btrfs__inode,
),
TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
- __entry->ino = inode->i_ino;
+ __entry->ino = btrfs_ino(BTRFS_I(inode));
__entry->blocks = inode->i_blocks;
__entry->disk_i_size = BTRFS_I(inode)->disk_i_size;
__entry->generation = BTRFS_I(inode)->generation;
@@ -153,15 +153,15 @@ DECLARE_EVENT_CLASS(btrfs__inode,
BTRFS_I(inode)->root->root_key.objectid;
),
- TP_printk_btrfs("root=%llu(%s) gen=%llu ino=%lu blocks=%llu "
+ TP_printk_btrfs("root=%llu(%s) gen=%llu ino=%llu blocks=%llu "
"disk_i_size=%llu last_trans=%llu logged_trans=%llu",
show_root_type(__entry->root_objectid),
- (unsigned long long)__entry->generation,
- (unsigned long)__entry->ino,
+ __entry->generation,
+ __entry->ino,
(unsigned long long)__entry->blocks,
- (unsigned long long)__entry->disk_i_size,
- (unsigned long long)__entry->last_trans,
- (unsigned long long)__entry->logged_trans)
+ __entry->disk_i_size,
+ __entry->last_trans,
+ __entry->logged_trans)
);
DEFINE_EVENT(btrfs__inode, btrfs_inode_new,
@@ -244,23 +244,25 @@ TRACE_EVENT_CONDITION(btrfs_get_extent,
"block_len=%llu flags=%s refs=%u "
"compress_type=%u",
show_root_type(__entry->root_objectid),
- (unsigned long long)__entry->ino,
- (unsigned long long)__entry->start,
- (unsigned long long)__entry->len,
- (unsigned long long)__entry->orig_start,
+ __entry->ino,
+ __entry->start,
+ __entry->len,
+ __entry->orig_start,
show_map_type(__entry->block_start),
- (unsigned long long)__entry->block_len,
+ __entry->block_len,
show_map_flags(__entry->flags),
__entry->refs, __entry->compress_type)
);
TRACE_EVENT(btrfs_handle_em_exist,
- TP_PROTO(const struct extent_map *existing, const struct extent_map *map, u64 start, u64 len),
+ TP_PROTO(struct btrfs_fs_info *fs_info,
+ const struct extent_map *existing, const struct extent_map *map,
+ u64 start, u64 len),
- TP_ARGS(existing, map, start, len),
+ TP_ARGS(fs_info, existing, map, start, len),
- TP_STRUCT__entry(
+ TP_STRUCT__entry_btrfs(
__field( u64, e_start )
__field( u64, e_len )
__field( u64, map_start )
@@ -269,7 +271,7 @@ TRACE_EVENT(btrfs_handle_em_exist,
__field( u64, len )
),
- TP_fast_assign(
+ TP_fast_assign_btrfs(fs_info,
__entry->e_start = existing->start;
__entry->e_len = existing->len;
__entry->map_start = map->start;
@@ -278,15 +280,15 @@ TRACE_EVENT(btrfs_handle_em_exist,
__entry->len = len;
),
- TP_printk("start=%llu len=%llu "
+ TP_printk_btrfs("start=%llu len=%llu "
"existing(start=%llu len=%llu) "
"em(start=%llu len=%llu)",
- (unsigned long long)__entry->start,
- (unsigned long long)__entry->len,
- (unsigned long long)__entry->e_start,
- (unsigned long long)__entry->e_len,
- (unsigned long long)__entry->map_start,
- (unsigned long long)__entry->map_len)
+ __entry->start,
+ __entry->len,
+ __entry->e_start,
+ __entry->e_len,
+ __entry->map_start,
+ __entry->map_len)
);
/* file extent item */
@@ -443,7 +445,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
TP_ARGS(inode, ordered),
TP_STRUCT__entry_btrfs(
- __field( ino_t, ino )
+ __field( u64, ino )
__field( u64, file_offset )
__field( u64, start )
__field( u64, len )
@@ -457,7 +459,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
),
TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
- __entry->ino = inode->i_ino;
+ __entry->ino = btrfs_ino(BTRFS_I(inode));
__entry->file_offset = ordered->file_offset;
__entry->start = ordered->start;
__entry->len = ordered->len;
@@ -477,13 +479,13 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
"bytes_left=%llu flags=%s compress_type=%d "
"refs=%d",
show_root_type(__entry->root_objectid),
- (unsigned long long)__entry->ino,
- (unsigned long long)__entry->file_offset,
- (unsigned long long)__entry->start,
- (unsigned long long)__entry->len,
- (unsigned long long)__entry->disk_len,
- (unsigned long long)__entry->truncated_len,
- (unsigned long long)__entry->bytes_left,
+ __entry->ino,
+ __entry->file_offset,
+ __entry->start,
+ __entry->len,
+ __entry->disk_len,
+ __entry->truncated_len,
+ __entry->bytes_left,
show_ordered_flags(__entry->flags),
__entry->compress_type, __entry->refs)
);
@@ -528,7 +530,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
TP_ARGS(page, inode, wbc),
TP_STRUCT__entry_btrfs(
- __field( ino_t, ino )
+ __field( u64, ino )
__field( pgoff_t, index )
__field( long, nr_to_write )
__field( long, pages_skipped )
@@ -542,7 +544,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
),
TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
- __entry->ino = inode->i_ino;
+ __entry->ino = btrfs_ino(BTRFS_I(inode));
__entry->index = page->index;
__entry->nr_to_write = wbc->nr_to_write;
__entry->pages_skipped = wbc->pages_skipped;
@@ -556,12 +558,12 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
BTRFS_I(inode)->root->root_key.objectid;
),
- TP_printk_btrfs("root=%llu(%s) ino=%lu page_index=%lu "
+ TP_printk_btrfs("root=%llu(%s) ino=%llu page_index=%lu "
"nr_to_write=%ld pages_skipped=%ld range_start=%llu "
"range_end=%llu for_kupdate=%d "
"for_reclaim=%d range_cyclic=%d writeback_index=%lu",
show_root_type(__entry->root_objectid),
- (unsigned long)__entry->ino, __entry->index,
+ __entry->ino, __entry->index,
__entry->nr_to_write, __entry->pages_skipped,
__entry->range_start, __entry->range_end,
__entry->for_kupdate,
@@ -584,7 +586,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
TP_ARGS(page, start, end, uptodate),
TP_STRUCT__entry_btrfs(
- __field( ino_t, ino )
+ __field( u64, ino )
__field( pgoff_t, index )
__field( u64, start )
__field( u64, end )
@@ -593,7 +595,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
),
TP_fast_assign_btrfs(btrfs_sb(page->mapping->host->i_sb),
- __entry->ino = page->mapping->host->i_ino;
+ __entry->ino = btrfs_ino(BTRFS_I(page->mapping->host));
__entry->index = page->index;
__entry->start = start;
__entry->end = end;
@@ -602,12 +604,12 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
BTRFS_I(page->mapping->host)->root->root_key.objectid;
),
- TP_printk_btrfs("root=%llu(%s) ino=%lu page_index=%lu start=%llu "
+ TP_printk_btrfs("root=%llu(%s) ino=%llu page_index=%lu start=%llu "
"end=%llu uptodate=%d",
show_root_type(__entry->root_objectid),
- (unsigned long)__entry->ino, (unsigned long)__entry->index,
- (unsigned long long)__entry->start,
- (unsigned long long)__entry->end, __entry->uptodate)
+ __entry->ino, (unsigned long)__entry->index,
+ __entry->start,
+ __entry->end, __entry->uptodate)
);
TRACE_EVENT(btrfs_sync_file,
@@ -617,8 +619,8 @@ TRACE_EVENT(btrfs_sync_file,
TP_ARGS(file, datasync),
TP_STRUCT__entry_btrfs(
- __field( ino_t, ino )
- __field( ino_t, parent )
+ __field( u64, ino )
+ __field( u64, parent )
__field( int, datasync )
__field( u64, root_objectid )
),
@@ -628,16 +630,17 @@ TRACE_EVENT(btrfs_sync_file,
const struct inode *inode = d_inode(dentry);
TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb));
- __entry->ino = inode->i_ino;
- __entry->parent = d_inode(dentry->d_parent)->i_ino;
+ __entry->ino = btrfs_ino(BTRFS_I(inode));
+ __entry->parent = btrfs_ino(BTRFS_I(d_inode(dentry->d_parent)));
__entry->datasync = datasync;
__entry->root_objectid =
BTRFS_I(inode)->root->root_key.objectid;
),
- TP_printk_btrfs("root=%llu(%s) ino=%ld parent=%ld datasync=%d",
+ TP_printk_btrfs("root=%llu(%s) ino=%llu parent=%llu datasync=%d",
show_root_type(__entry->root_objectid),
- (unsigned long)__entry->ino, (unsigned long)__entry->parent,
+ __entry->ino,
+ __entry->parent,
__entry->datasync)
);
@@ -655,7 +658,7 @@ TRACE_EVENT(btrfs_sync_fs,
__entry->wait = wait;
),
- TP_printk_btrfs("wait = %d", __entry->wait)
+ TP_printk_btrfs("wait=%d", __entry->wait)
);
TRACE_EVENT(btrfs_add_block_group,
@@ -665,8 +668,7 @@ TRACE_EVENT(btrfs_add_block_group,
TP_ARGS(fs_info, block_group, create),
- TP_STRUCT__entry(
- __array( u8, fsid, BTRFS_FSID_SIZE )
+ TP_STRUCT__entry_btrfs(
__field( u64, offset )
__field( u64, size )
__field( u64, flags )
@@ -675,8 +677,7 @@ TRACE_EVENT(btrfs_add_block_group,
__field( int, create )
),
- TP_fast_assign(
- memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE);
+ TP_fast_assign_btrfs(fs_info,
__entry->offset = block_group->key.objectid;
__entry->size = block_group->key.offset;
__entry->flags = block_group->flags;
@@ -686,16 +687,16 @@ TRACE_EVENT(btrfs_add_block_group,
__entry->create = create;
),
- TP_printk("%pU: block_group offset=%llu size=%llu "
+ TP_printk_btrfs("block_group offset=%llu size=%llu "
"flags=%llu(%s) bytes_used=%llu bytes_super=%llu "
- "create=%d", __entry->fsid,
- (unsigned long long)__entry->offset,
- (unsigned long long)__entry->size,
- (unsigned long long)__entry->flags,
+ "create=%d",
+ __entry->offset,
+ __entry->size,
+ __entry->flags,
__print_flags((unsigned long)__entry->flags, "|",
BTRFS_GROUP_FLAGS),
- (unsigned long long)__entry->bytes_used,
- (unsigned long long)__entry->bytes_super, __entry->create)
+ __entry->bytes_used,
+ __entry->bytes_super, __entry->create)
);
#define show_ref_action(action) \
@@ -740,13 +741,13 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s "
"parent=%llu(%s) ref_root=%llu(%s) level=%d "
"type=%s seq=%llu",
- (unsigned long long)__entry->bytenr,
- (unsigned long long)__entry->num_bytes,
+ __entry->bytenr,
+ __entry->num_bytes,
show_ref_action(__entry->action),
show_root_type(__entry->parent),
show_root_type(__entry->ref_root),
__entry->level, show_ref_type(__entry->type),
- (unsigned long long)__entry->seq)
+ __entry->seq)
);
DEFINE_EVENT(btrfs_delayed_tree_ref, add_delayed_tree_ref,
@@ -805,15 +806,15 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s "
"parent=%llu(%s) ref_root=%llu(%s) owner=%llu "
"offset=%llu type=%s seq=%llu",
- (unsigned long long)__entry->bytenr,
- (unsigned long long)__entry->num_bytes,
+ __entry->bytenr,
+ __entry->num_bytes,
show_ref_action(__entry->action),
show_root_type(__entry->parent),
show_root_type(__entry->ref_root),
- (unsigned long long)__entry->owner,
- (unsigned long long)__entry->offset,
+ __entry->owner,
+ __entry->offset,
show_ref_type(__entry->type),
- (unsigned long long)__entry->seq)
+ __entry->seq)
);
DEFINE_EVENT(btrfs_delayed_data_ref, add_delayed_data_ref,
@@ -859,8 +860,8 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
),
TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s is_data=%d",
- (unsigned long long)__entry->bytenr,
- (unsigned long long)__entry->num_bytes,
+ __entry->bytenr,
+ __entry->num_bytes,
show_ref_action(__entry->action),
__entry->is_data)
);
@@ -923,8 +924,8 @@ DECLARE_EVENT_CLASS(btrfs__chunk,
TP_printk_btrfs("root=%llu(%s) offset=%llu size=%llu "
"num_stripes=%d sub_stripes=%d type=%s",
show_root_type(__entry->root_objectid),
- (unsigned long long)__entry->offset,
- (unsigned long long)__entry->size,
+ __entry->offset,
+ __entry->size,
__entry->num_stripes, __entry->sub_stripes,
show_chunk_type(__entry->type))
);
@@ -974,9 +975,9 @@ TRACE_EVENT(btrfs_cow_block,
"(orig_level=%d) cow_buf=%llu (cow_level=%d)",
show_root_type(__entry->root_objectid),
__entry->refs,
- (unsigned long long)__entry->buf_start,
+ __entry->buf_start,
__entry->buf_level,
- (unsigned long long)__entry->cow_start,
+ __entry->cow_start,
__entry->cow_level)
);
@@ -1001,7 +1002,7 @@ TRACE_EVENT(btrfs_space_reservation,
__entry->reserve = reserve;
),
- TP_printk_btrfs("%s: %Lu %s %Lu", __get_str(type), __entry->val,
+ TP_printk_btrfs("%s: %llu %s %llu", __get_str(type), __entry->val,
__entry->reserve ? "reserve" : "release",
__entry->bytes)
);
@@ -1019,29 +1020,27 @@ TRACE_EVENT(btrfs_trigger_flush,
TP_ARGS(fs_info, flags, bytes, flush, reason),
- TP_STRUCT__entry(
- __array( u8, fsid, BTRFS_FSID_SIZE )
+ TP_STRUCT__entry_btrfs(
__field( u64, flags )
__field( u64, bytes )
__field( int, flush )
__string( reason, reason )
),
- TP_fast_assign(
- memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE);
+ TP_fast_assign_btrfs(fs_info,
__entry->flags = flags;
__entry->bytes = bytes;
__entry->flush = flush;
__assign_str(reason, reason)
),
- TP_printk("%pU: %s: flush=%d(%s) flags=%llu(%s) bytes=%llu",
- __entry->fsid, __get_str(reason), __entry->flush,
+ TP_printk_btrfs("%s: flush=%d(%s) flags=%llu(%s) bytes=%llu",
+ __get_str(reason), __entry->flush,
show_flush_action(__entry->flush),
- (unsigned long long)__entry->flags,
+ __entry->flags,
__print_flags((unsigned long)__entry->flags, "|",
BTRFS_GROUP_FLAGS),
- (unsigned long long)__entry->bytes)
+ __entry->bytes)
);
#define show_flush_state(state) \
@@ -1060,29 +1059,27 @@ TRACE_EVENT(btrfs_flush_space,
TP_ARGS(fs_info, flags, num_bytes, state, ret),
- TP_STRUCT__entry(
- __array( u8, fsid, BTRFS_FSID_SIZE )
+ TP_STRUCT__entry_btrfs(
__field( u64, flags )
__field( u64, num_bytes )
__field( int, state )
__field( int, ret )
),
- TP_fast_assign(
- memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE);
+ TP_fast_assign_btrfs(fs_info,
__entry->flags = flags;
__entry->num_bytes = num_bytes;
__entry->state = state;
__entry->ret = ret;
),
- TP_printk("%pU: state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d",
- __entry->fsid, __entry->state,
+ TP_printk_btrfs("state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d",
+ __entry->state,
show_flush_state(__entry->state),
- (unsigned long long)__entry->flags,
+ __entry->flags,
__print_flags((unsigned long)__entry->flags, "|",
BTRFS_GROUP_FLAGS),
- (unsigned long long)__entry->num_bytes, __entry->ret)
+ __entry->num_bytes, __entry->ret)
);
DECLARE_EVENT_CLASS(btrfs__reserved_extent,
@@ -1103,8 +1100,8 @@ DECLARE_EVENT_CLASS(btrfs__reserved_extent,
TP_printk_btrfs("root=%llu(%s) start=%llu len=%llu",
show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
- (unsigned long long)__entry->start,
- (unsigned long long)__entry->len)
+ __entry->start,
+ __entry->len)
);
DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc,
@@ -1140,7 +1137,7 @@ TRACE_EVENT(find_free_extent,
__entry->data = data;
),
- TP_printk_btrfs("root=%Lu(%s) len=%Lu empty_size=%Lu flags=%Lu(%s)",
+ TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)",
show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
__entry->num_bytes, __entry->empty_size, __entry->data,
__print_flags((unsigned long)__entry->data, "|",
@@ -1149,11 +1146,10 @@ TRACE_EVENT(find_free_extent,
DECLARE_EVENT_CLASS(btrfs__reserve_extent,
- TP_PROTO(const struct btrfs_fs_info *fs_info,
- const struct btrfs_block_group_cache *block_group, u64 start,
+ TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start,
u64 len),
- TP_ARGS(fs_info, block_group, start, len),
+ TP_ARGS(block_group, start, len),
TP_STRUCT__entry_btrfs(
__field( u64, bg_objectid )
@@ -1162,15 +1158,15 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent,
__field( u64, len )
),
- TP_fast_assign_btrfs(fs_info,
+ TP_fast_assign_btrfs(block_group->fs_info,
__entry->bg_objectid = block_group->key.objectid;
__entry->flags = block_group->flags;
__entry->start = start;
__entry->len = len;
),
- TP_printk_btrfs("root=%Lu(%s) block_group=%Lu flags=%Lu(%s) "
- "start=%Lu len=%Lu",
+ TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) "
+ "start=%llu len=%llu",
show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
__entry->bg_objectid,
__entry->flags, __print_flags((unsigned long)__entry->flags,
@@ -1180,20 +1176,18 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent,
DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,
- TP_PROTO(const struct btrfs_fs_info *fs_info,
- const struct btrfs_block_group_cache *block_group, u64 start,
+ TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start,
u64 len),
- TP_ARGS(fs_info, block_group, start, len)
+ TP_ARGS(block_group, start, len)
);
DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
- TP_PROTO(const struct btrfs_fs_info *fs_info,
- const struct btrfs_block_group_cache *block_group, u64 start,
+ TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start,
u64 len),
- TP_ARGS(fs_info, block_group, start, len)
+ TP_ARGS(block_group, start, len)
);
TRACE_EVENT(btrfs_find_cluster,
@@ -1221,8 +1215,8 @@ TRACE_EVENT(btrfs_find_cluster,
__entry->min_bytes = min_bytes;
),
- TP_printk_btrfs("block_group=%Lu flags=%Lu(%s) start=%Lu len=%Lu "
- "empty_size=%Lu min_bytes=%Lu", __entry->bg_objectid,
+ TP_printk_btrfs("block_group=%llu flags=%llu(%s) start=%llu len=%llu "
+ "empty_size=%llu min_bytes=%llu", __entry->bg_objectid,
__entry->flags,
__print_flags((unsigned long)__entry->flags, "|",
BTRFS_GROUP_FLAGS), __entry->start,
@@ -1243,7 +1237,7 @@ TRACE_EVENT(btrfs_failed_cluster_setup,
__entry->bg_objectid = block_group->key.objectid;
),
- TP_printk_btrfs("block_group=%Lu", __entry->bg_objectid)
+ TP_printk_btrfs("block_group=%llu", __entry->bg_objectid)
);
TRACE_EVENT(btrfs_setup_cluster,
@@ -1272,8 +1266,8 @@ TRACE_EVENT(btrfs_setup_cluster,
__entry->bitmap = bitmap;
),
- TP_printk_btrfs("block_group=%Lu flags=%Lu(%s) window_start=%Lu "
- "size=%Lu max_size=%Lu bitmap=%d",
+ TP_printk_btrfs("block_group=%llu flags=%llu(%s) window_start=%llu "
+ "size=%llu max_size=%llu bitmap=%d",
__entry->bg_objectid,
__entry->flags,
__print_flags((unsigned long)__entry->flags, "|",
@@ -1476,7 +1470,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
TP_STRUCT__entry_btrfs(
__field( u64, rootid )
- __field( unsigned long, ino )
+ __field( u64, ino )
__field( u64, start )
__field( u64, len )
__field( u64, reserved )
@@ -1485,14 +1479,14 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
__entry->rootid = BTRFS_I(inode)->root->objectid;
- __entry->ino = inode->i_ino;
+ __entry->ino = btrfs_ino(BTRFS_I(inode));
__entry->start = start;
__entry->len = len;
__entry->reserved = reserved;
__entry->op = op;
),
- TP_printk_btrfs("root=%llu ino=%lu start=%llu len=%llu reserved=%llu op=%s",
+ TP_printk_btrfs("root=%llu ino=%llu start=%llu len=%llu reserved=%llu op=%s",
__entry->rootid, __entry->ino, __entry->start, __entry->len,
__entry->reserved,
__print_flags((unsigned long)__entry->op, "",
@@ -1584,12 +1578,14 @@ DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent,
TRACE_EVENT(btrfs_qgroup_account_extent,
- TP_PROTO(const struct btrfs_fs_info *fs_info, u64 bytenr,
+ TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid, u64 bytenr,
u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
- TP_ARGS(fs_info, bytenr, num_bytes, nr_old_roots, nr_new_roots),
+ TP_ARGS(fs_info, transid, bytenr, num_bytes, nr_old_roots,
+ nr_new_roots),
TP_STRUCT__entry_btrfs(
+ __field( u64, transid )
__field( u64, bytenr )
__field( u64, num_bytes )
__field( u64, nr_old_roots )
@@ -1597,43 +1593,49 @@ TRACE_EVENT(btrfs_qgroup_account_extent,
),
TP_fast_assign_btrfs(fs_info,
+ __entry->transid = transid;
__entry->bytenr = bytenr;
__entry->num_bytes = num_bytes;
__entry->nr_old_roots = nr_old_roots;
__entry->nr_new_roots = nr_new_roots;
),
- TP_printk_btrfs("bytenr=%llu num_bytes=%llu nr_old_roots=%llu "
- "nr_new_roots=%llu",
- __entry->bytenr,
- __entry->num_bytes,
- __entry->nr_old_roots,
- __entry->nr_new_roots)
+ TP_printk_btrfs(
+"transid=%llu bytenr=%llu num_bytes=%llu nr_old_roots=%llu nr_new_roots=%llu",
+ __entry->transid,
+ __entry->bytenr,
+ __entry->num_bytes,
+ __entry->nr_old_roots,
+ __entry->nr_new_roots)
);
TRACE_EVENT(qgroup_update_counters,
- TP_PROTO(const struct btrfs_fs_info *fs_info, u64 qgid,
+ TP_PROTO(const struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup,
u64 cur_old_count, u64 cur_new_count),
- TP_ARGS(fs_info, qgid, cur_old_count, cur_new_count),
+ TP_ARGS(fs_info, qgroup, cur_old_count, cur_new_count),
TP_STRUCT__entry_btrfs(
__field( u64, qgid )
+ __field( u64, old_rfer )
+ __field( u64, old_excl )
__field( u64, cur_old_count )
__field( u64, cur_new_count )
),
TP_fast_assign_btrfs(fs_info,
- __entry->qgid = qgid;
+ __entry->qgid = qgroup->qgroupid;
+ __entry->old_rfer = qgroup->rfer;
+ __entry->old_excl = qgroup->excl;
__entry->cur_old_count = cur_old_count;
__entry->cur_new_count = cur_new_count;
),
- TP_printk_btrfs("qgid=%llu cur_old_count=%llu cur_new_count=%llu",
- __entry->qgid,
- __entry->cur_old_count,
- __entry->cur_new_count)
+ TP_printk_btrfs("qgid=%llu old_rfer=%llu old_excl=%llu cur_old_count=%llu cur_new_count=%llu",
+ __entry->qgid, __entry->old_rfer, __entry->old_excl,
+ __entry->cur_old_count, __entry->cur_new_count)
);
TRACE_EVENT(qgroup_update_reserve,
@@ -1765,14 +1767,14 @@ DECLARE_EVENT_CLASS(btrfs__prelim_ref,
),
TP_printk_btrfs("root_id=%llu key=[%llu,%u,%llu] level=%d count=[%d+%d=%d] parent=%llu wanted_disk_byte=%llu nodes=%llu",
- (unsigned long long)__entry->root_id,
- (unsigned long long)__entry->objectid, __entry->type,
- (unsigned long long)__entry->offset, __entry->level,
+ __entry->root_id,
+ __entry->objectid, __entry->type,
+ __entry->offset, __entry->level,
__entry->old_count, __entry->mod_count,
__entry->old_count + __entry->mod_count,
- (unsigned long long)__entry->parent,
- (unsigned long long)__entry->bytenr,
- (unsigned long long)__entry->tree_size)
+ __entry->parent,
+ __entry->bytenr,
+ __entry->tree_size)
);
DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_merge,
@@ -1808,8 +1810,51 @@ TRACE_EVENT(btrfs_inode_mod_outstanding_extents,
TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d",
show_root_type(__entry->root_objectid),
- (unsigned long long)__entry->ino, __entry->mod)
+ __entry->ino, __entry->mod)
);
+
+DECLARE_EVENT_CLASS(btrfs__block_group,
+ TP_PROTO(const struct btrfs_block_group_cache *bg_cache),
+
+ TP_ARGS(bg_cache),
+
+ TP_STRUCT__entry_btrfs(
+ __field( u64, bytenr )
+ __field( u64, len )
+ __field( u64, used )
+ __field( u64, flags )
+ ),
+
+ TP_fast_assign_btrfs(bg_cache->fs_info,
+ __entry->bytenr = bg_cache->key.objectid,
+ __entry->len = bg_cache->key.offset,
+ __entry->used = btrfs_block_group_used(&bg_cache->item);
+ __entry->flags = bg_cache->flags;
+ ),
+
+ TP_printk_btrfs("bg bytenr=%llu len=%llu used=%llu flags=%llu(%s)",
+ __entry->bytenr, __entry->len, __entry->used, __entry->flags,
+ __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS))
+);
+
+DEFINE_EVENT(btrfs__block_group, btrfs_remove_block_group,
+ TP_PROTO(const struct btrfs_block_group_cache *bg_cache),
+
+ TP_ARGS(bg_cache)
+);
+
+DEFINE_EVENT(btrfs__block_group, btrfs_add_unused_block_group,
+ TP_PROTO(const struct btrfs_block_group_cache *bg_cache),
+
+ TP_ARGS(bg_cache)
+);
+
+DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group,
+ TP_PROTO(const struct btrfs_block_group_cache *bg_cache),
+
+ TP_ARGS(bg_cache)
+);
+
#endif /* _TRACE_BTRFS_H */
/* This part must be outside protection */
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index d8c33298c153..5936aac357ab 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -84,20 +84,21 @@ TRACE_EVENT(rcu_grace_period,
);
/*
- * Tracepoint for future grace-period events, including those for no-callbacks
- * CPUs. The caller should pull the data from the rcu_node structure,
- * other than rcuname, which comes from the rcu_state structure, and event,
- * which is one of the following:
+ * Tracepoint for future grace-period events. The caller should pull
+ * the data from the rcu_node structure, other than rcuname, which comes
+ * from the rcu_state structure, and event, which is one of the following:
*
- * "Startleaf": Request a nocb grace period based on leaf-node data.
+ * "Startleaf": Request a grace period based on leaf-node data.
+ * "Prestarted": Someone beat us to the request
* "Startedleaf": Leaf-node start proved sufficient.
* "Startedleafroot": Leaf-node start proved sufficient after checking root.
* "Startedroot": Requested a nocb grace period based on root-node data.
+ * "NoGPkthread": The RCU grace-period kthread has not yet started.
* "StartWait": Start waiting for the requested grace period.
* "ResumeWait": Resume waiting after signal.
* "EndWait": Complete wait.
* "Cleanup": Clean up rcu_node structure after previous GP.
- * "CleanupMore": Clean up, and another no-CB GP is needed.
+ * "CleanupMore": Clean up, and another GP is needed.
*/
TRACE_EVENT(rcu_future_grace_period,
diff --git a/include/uapi/asm-generic/msgbuf.h b/include/uapi/asm-generic/msgbuf.h
index fb306ebdb36f..9fe4881557cb 100644
--- a/include/uapi/asm-generic/msgbuf.h
+++ b/include/uapi/asm-generic/msgbuf.h
@@ -18,31 +18,30 @@
* On big-endian systems, the padding is in the wrong place.
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*/
struct msqid64_ds {
struct ipc64_perm msg_perm;
+#if __BITS_PER_LONG == 64
__kernel_time_t msg_stime; /* last msgsnd time */
-#if __BITS_PER_LONG != 64
- unsigned long __unused1;
-#endif
__kernel_time_t msg_rtime; /* last msgrcv time */
-#if __BITS_PER_LONG != 64
- unsigned long __unused2;
-#endif
__kernel_time_t msg_ctime; /* last change time */
-#if __BITS_PER_LONG != 64
- unsigned long __unused3;
+#else
+ unsigned long msg_stime; /* last msgsnd time */
+ unsigned long msg_stime_high;
+ unsigned long msg_rtime; /* last msgrcv time */
+ unsigned long msg_rtime_high;
+ unsigned long msg_ctime; /* last change time */
+ unsigned long msg_ctime_high;
#endif
- __kernel_ulong_t msg_cbytes; /* current number of bytes on queue */
- __kernel_ulong_t msg_qnum; /* number of messages in queue */
- __kernel_ulong_t msg_qbytes; /* max number of bytes on queue */
+ unsigned long msg_cbytes; /* current number of bytes on queue */
+ unsigned long msg_qnum; /* number of messages in queue */
+ unsigned long msg_qbytes; /* max number of bytes on queue */
__kernel_pid_t msg_lspid; /* pid of last msgsnd */
__kernel_pid_t msg_lrpid; /* last receive pid */
- __kernel_ulong_t __unused4;
- __kernel_ulong_t __unused5;
+ unsigned long __unused4;
+ unsigned long __unused5;
};
#endif /* __ASM_GENERIC_MSGBUF_H */
diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h
index 5e6ea22bd525..f0733a26ebfc 100644
--- a/include/uapi/asm-generic/posix_types.h
+++ b/include/uapi/asm-generic/posix_types.h
@@ -87,6 +87,7 @@ typedef struct {
typedef __kernel_long_t __kernel_off_t;
typedef long long __kernel_loff_t;
typedef __kernel_long_t __kernel_time_t;
+typedef long long __kernel_time64_t;
typedef __kernel_long_t __kernel_clock_t;
typedef int __kernel_timer_t;
typedef int __kernel_clockid_t;
diff --git a/include/uapi/asm-generic/sembuf.h b/include/uapi/asm-generic/sembuf.h
index cbf9cfe977d6..0bae010f1b64 100644
--- a/include/uapi/asm-generic/sembuf.h
+++ b/include/uapi/asm-generic/sembuf.h
@@ -13,23 +13,29 @@
* everyone just ended up making identical copies without specific
* optimizations, so we may just as well all use the same one.
*
- * 64 bit architectures typically define a 64 bit __kernel_time_t,
+ * 64 bit architectures use a 64-bit __kernel_time_t here, while
+ * 32 bit architectures have a pair of unsigned long values.
* so they do not need the first two padding words.
- * On big-endian systems, the padding is in the wrong place.
*
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
+ * On big-endian systems, the padding is in the wrong place for
+ * historic reasons, so user space has to reconstruct a time_t
+ * value using
+ *
+ * user_semid_ds.sem_otime = kernel_semid64_ds.sem_otime +
+ * ((long long)kernel_semid64_ds.sem_otime_high << 32)
+ *
+ * Pad space is left for 2 miscellaneous 32-bit values
*/
struct semid64_ds {
struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
+#if __BITS_PER_LONG == 64
__kernel_time_t sem_otime; /* last semop time */
-#if __BITS_PER_LONG != 64
- unsigned long __unused1;
-#endif
__kernel_time_t sem_ctime; /* last change time */
-#if __BITS_PER_LONG != 64
- unsigned long __unused2;
+#else
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_otime_high;
+ unsigned long sem_ctime; /* last change time */
+ unsigned long sem_ctime_high;
#endif
unsigned long sem_nsems; /* no. of semaphores in array */
unsigned long __unused3;
diff --git a/include/uapi/asm-generic/shmbuf.h b/include/uapi/asm-generic/shmbuf.h
index 2b6c3bb97f97..e504422fc501 100644
--- a/include/uapi/asm-generic/shmbuf.h
+++ b/include/uapi/asm-generic/shmbuf.h
@@ -19,42 +19,41 @@
*
*
* Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
* - 2 miscellaneous 32-bit values
*/
struct shmid64_ds {
struct ipc64_perm shm_perm; /* operation perms */
size_t shm_segsz; /* size of segment (bytes) */
+#if __BITS_PER_LONG == 64
__kernel_time_t shm_atime; /* last attach time */
-#if __BITS_PER_LONG != 64
- unsigned long __unused1;
-#endif
__kernel_time_t shm_dtime; /* last detach time */
-#if __BITS_PER_LONG != 64
- unsigned long __unused2;
-#endif
__kernel_time_t shm_ctime; /* last change time */
-#if __BITS_PER_LONG != 64
- unsigned long __unused3;
+#else
+ unsigned long shm_atime; /* last attach time */
+ unsigned long shm_atime_high;
+ unsigned long shm_dtime; /* last detach time */
+ unsigned long shm_dtime_high;
+ unsigned long shm_ctime; /* last change time */
+ unsigned long shm_ctime_high;
#endif
__kernel_pid_t shm_cpid; /* pid of creator */
__kernel_pid_t shm_lpid; /* pid of last operator */
- __kernel_ulong_t shm_nattch; /* no. of current attaches */
- __kernel_ulong_t __unused4;
- __kernel_ulong_t __unused5;
+ unsigned long shm_nattch; /* no. of current attaches */
+ unsigned long __unused4;
+ unsigned long __unused5;
};
struct shminfo64 {
- __kernel_ulong_t shmmax;
- __kernel_ulong_t shmmin;
- __kernel_ulong_t shmmni;
- __kernel_ulong_t shmseg;
- __kernel_ulong_t shmall;
- __kernel_ulong_t __unused1;
- __kernel_ulong_t __unused2;
- __kernel_ulong_t __unused3;
- __kernel_ulong_t __unused4;
+ unsigned long shmmax;
+ unsigned long shmmin;
+ unsigned long shmmni;
+ unsigned long shmseg;
+ unsigned long shmall;
+ unsigned long __unused1;
+ unsigned long __unused2;
+ unsigned long __unused3;
+ unsigned long __unused4;
};
#endif /* __ASM_GENERIC_SHMBUF_H */
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index 558b902f18d4..80e2a7227205 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -249,7 +249,8 @@ typedef struct siginfo {
#define TRAP_TRACE 2 /* process trace trap */
#define TRAP_BRANCH 3 /* process taken branch trap */
#define TRAP_HWBKPT 4 /* hardware breakpoint/watchpoint */
-#define NSIGTRAP 4
+#define TRAP_UNK 5 /* undiagnosed trap */
+#define NSIGTRAP 5
/*
* There is an additional set of SIGTRAP si_codes used by ptrace
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 8bcb186c6f67..42990676a55e 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -732,9 +732,11 @@ __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc)
__SYSCALL(__NR_pkey_free, sys_pkey_free)
#define __NR_statx 291
__SYSCALL(__NR_statx, sys_statx)
+#define __NR_io_pgetevents 292
+__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
#undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index a04adbc70ddf..ed0185945bb2 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -29,6 +29,7 @@
#include <linux/types.h>
#include <linux/fs.h>
+#include <linux/signal.h>
#include <asm/byteorder.h>
typedef __kernel_ulong_t aio_context_t;
@@ -38,10 +39,8 @@ enum {
IOCB_CMD_PWRITE = 1,
IOCB_CMD_FSYNC = 2,
IOCB_CMD_FDSYNC = 3,
- /* These two are experimental.
- * IOCB_CMD_PREADX = 4,
- * IOCB_CMD_POLL = 5,
- */
+ /* 4 was the experimental IOCB_CMD_PREADX */
+ IOCB_CMD_POLL = 5,
IOCB_CMD_NOOP = 6,
IOCB_CMD_PREADV = 7,
IOCB_CMD_PWRITEV = 8,
@@ -108,5 +107,10 @@ struct iocb {
#undef IFBIG
#undef IFLITTLE
+struct __aio_sigset {
+ sigset_t __user *sigmask;
+ size_t sigsetsize;
+};
+
#endif /* __LINUX__AIO_ABI_H */
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index c8d99b9ca550..5ca1d21fc4a7 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -422,6 +422,21 @@ struct btrfs_ioctl_ino_lookup_args {
char name[BTRFS_INO_LOOKUP_PATH_MAX];
};
+#define BTRFS_INO_LOOKUP_USER_PATH_MAX (4080 - BTRFS_VOL_NAME_MAX - 1)
+struct btrfs_ioctl_ino_lookup_user_args {
+ /* in, inode number containing the subvolume of 'subvolid' */
+ __u64 dirid;
+ /* in */
+ __u64 treeid;
+ /* out, name of the subvolume of 'treeid' */
+ char name[BTRFS_VOL_NAME_MAX + 1];
+ /*
+ * out, constructed path from the directory with which the ioctl is
+ * called to dirid
+ */
+ char path[BTRFS_INO_LOOKUP_USER_PATH_MAX];
+};
+
/* Search criteria for the btrfs SEARCH ioctl family. */
struct btrfs_ioctl_search_key {
/*
@@ -725,6 +740,82 @@ struct btrfs_ioctl_send_args {
__u64 reserved[4]; /* in */
};
+/*
+ * Information about a fs tree root.
+ *
+ * All items are filled by the ioctl
+ */
+struct btrfs_ioctl_get_subvol_info_args {
+ /* Id of this subvolume */
+ __u64 treeid;
+
+ /* Name of this subvolume, used to get the real name at mount point */
+ char name[BTRFS_VOL_NAME_MAX + 1];
+
+ /*
+ * Id of the subvolume which contains this subvolume.
+ * Zero for top-level subvolume or a deleted subvolume.
+ */
+ __u64 parent_id;
+
+ /*
+ * Inode number of the directory which contains this subvolume.
+ * Zero for top-level subvolume or a deleted subvolume
+ */
+ __u64 dirid;
+
+ /* Latest transaction id of this subvolume */
+ __u64 generation;
+
+ /* Flags of this subvolume */
+ __u64 flags;
+
+ /* UUID of this subvolume */
+ __u8 uuid[BTRFS_UUID_SIZE];
+
+ /*
+ * UUID of the subvolume of which this subvolume is a snapshot.
+ * All zero for a non-snapshot subvolume.
+ */
+ __u8 parent_uuid[BTRFS_UUID_SIZE];
+
+ /*
+ * UUID of the subvolume from which this subvolume was received.
+ * All zero for non-received subvolume.
+ */
+ __u8 received_uuid[BTRFS_UUID_SIZE];
+
+ /* Transaction id indicating when change/create/send/receive happened */
+ __u64 ctransid;
+ __u64 otransid;
+ __u64 stransid;
+ __u64 rtransid;
+ /* Time corresponding to c/o/s/rtransid */
+ struct btrfs_ioctl_timespec ctime;
+ struct btrfs_ioctl_timespec otime;
+ struct btrfs_ioctl_timespec stime;
+ struct btrfs_ioctl_timespec rtime;
+
+ /* Must be zero */
+ __u64 reserved[8];
+};
+
+#define BTRFS_MAX_ROOTREF_BUFFER_NUM 255
+struct btrfs_ioctl_get_subvol_rootref_args {
+ /* in/out, minimum id of rootref's treeid to be searched */
+ __u64 min_treeid;
+
+ /* out */
+ struct {
+ __u64 treeid;
+ __u64 dirid;
+ } rootref[BTRFS_MAX_ROOTREF_BUFFER_NUM];
+
+ /* out, number of found items */
+ __u8 num_items;
+ __u8 align[7];
+};
+
/* Error codes as returned by the kernel */
enum btrfs_err_code {
BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
@@ -843,5 +934,11 @@ enum btrfs_err_code {
struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, \
struct btrfs_ioctl_logical_ino_args)
+#define BTRFS_IOC_GET_SUBVOL_INFO _IOR(BTRFS_IOCTL_MAGIC, 60, \
+ struct btrfs_ioctl_get_subvol_info_args)
+#define BTRFS_IOC_GET_SUBVOL_ROOTREF _IOWR(BTRFS_IOCTL_MAGIC, 61, \
+ struct btrfs_ioctl_get_subvol_rootref_args)
+#define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62, \
+ struct btrfs_ioctl_ino_lookup_user_args)
#endif /* _UAPI_LINUX_BTRFS_H */
diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h
index 6f0da42fc5ef..83429a05b698 100644
--- a/include/uapi/linux/signalfd.h
+++ b/include/uapi/linux/signalfd.h
@@ -35,6 +35,10 @@ struct signalfd_siginfo {
__u64 ssi_stime;
__u64 ssi_addr;
__u16 ssi_addr_lsb;
+ __u16 __pad2;
+ __s32 ssi_syscall;
+ __u64 ssi_call_addr;
+ __u32 ssi_arch;
/*
* Pad strcture to 128 bytes. Remember to update the
@@ -45,7 +49,7 @@ struct signalfd_siginfo {
* comes out of a read(2) and we really don't want to have
* a compat on read(2).
*/
- __u8 __pad[46];
+ __u8 __pad[28];
};
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 4c0338ea308a..fcf936656493 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -42,6 +42,13 @@ struct itimerval {
struct timeval it_value; /* current value */
};
+#ifndef __kernel_timespec
+struct __kernel_timespec {
+ __kernel_time64_t tv_sec; /* seconds */
+ long long tv_nsec; /* nanoseconds */
+};
+#endif
+
/*
* legacy timeval structure, only embedded in structures that
* traditionally used 'timeval' to pass time intervals (not absolute
diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h
index cd4f0b897a48..2fce8b6876e9 100644
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -49,11 +49,7 @@ typedef __u32 __bitwise __wsum;
#define __aligned_be64 __be64 __attribute__((aligned(8)))
#define __aligned_le64 __le64 __attribute__((aligned(8)))
-#ifdef __CHECK_POLL
typedef unsigned __bitwise __poll_t;
-#else
-typedef unsigned __poll_t;
-#endif
#endif /* __ASSEMBLY__ */
#endif /* _UAPI_LINUX_TYPES_H */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index a808f29d4c5a..c0d58f390c3b 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -691,7 +691,7 @@ static void __do_notify(struct mqueue_inode_info *info)
wake_up(&info->wait_q);
}
-static int prepare_timeout(const struct timespec __user *u_abs_timeout,
+static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout,
struct timespec64 *ts)
{
if (get_timespec64(ts, u_abs_timeout))
@@ -1128,7 +1128,7 @@ out:
SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
size_t, msg_len, unsigned int, msg_prio,
- const struct timespec __user *, u_abs_timeout)
+ const struct __kernel_timespec __user *, u_abs_timeout)
{
struct timespec64 ts, *p = NULL;
if (u_abs_timeout) {
@@ -1142,7 +1142,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
size_t, msg_len, unsigned int __user *, u_msg_prio,
- const struct timespec __user *, u_abs_timeout)
+ const struct __kernel_timespec __user *, u_abs_timeout)
{
struct timespec64 ts, *p = NULL;
if (u_abs_timeout) {
@@ -1420,6 +1420,47 @@ COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name,
return do_mq_open(u_name, oflag, mode, p);
}
+COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
+ const struct compat_sigevent __user *, u_notification)
+{
+ struct sigevent n, *p = NULL;
+ if (u_notification) {
+ if (get_compat_sigevent(&n, u_notification))
+ return -EFAULT;
+ if (n.sigev_notify == SIGEV_THREAD)
+ n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);
+ p = &n;
+ }
+ return do_mq_notify(mqdes, p);
+}
+
+COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
+ const struct compat_mq_attr __user *, u_mqstat,
+ struct compat_mq_attr __user *, u_omqstat)
+{
+ int ret;
+ struct mq_attr mqstat, omqstat;
+ struct mq_attr *new = NULL, *old = NULL;
+
+ if (u_mqstat) {
+ new = &mqstat;
+ if (get_compat_mq_attr(new, u_mqstat))
+ return -EFAULT;
+ }
+ if (u_omqstat)
+ old = &omqstat;
+
+ ret = do_mq_getsetattr(mqdes, new, old);
+ if (ret || !old)
+ return ret;
+
+ if (put_compat_mq_attr(old, u_omqstat))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
static int compat_prepare_timeout(const struct compat_timespec __user *p,
struct timespec64 *ts)
{
@@ -1459,45 +1500,6 @@ COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes,
}
return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
}
-
-COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
- const struct compat_sigevent __user *, u_notification)
-{
- struct sigevent n, *p = NULL;
- if (u_notification) {
- if (get_compat_sigevent(&n, u_notification))
- return -EFAULT;
- if (n.sigev_notify == SIGEV_THREAD)
- n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);
- p = &n;
- }
- return do_mq_notify(mqdes, p);
-}
-
-COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
- const struct compat_mq_attr __user *, u_mqstat,
- struct compat_mq_attr __user *, u_omqstat)
-{
- int ret;
- struct mq_attr mqstat, omqstat;
- struct mq_attr *new = NULL, *old = NULL;
-
- if (u_mqstat) {
- new = &mqstat;
- if (get_compat_mq_attr(new, u_mqstat))
- return -EFAULT;
- }
- if (u_omqstat)
- old = &omqstat;
-
- ret = do_mq_getsetattr(mqdes, new, old);
- if (ret || !old)
- return ret;
-
- if (put_compat_mq_attr(old, u_omqstat))
- return -EFAULT;
- return 0;
-}
#endif
static const struct inode_operations mqueue_dir_inode_operations = {
diff --git a/ipc/msg.c b/ipc/msg.c
index 56fd1c73eedc..3b6545302598 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -537,6 +537,11 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid,
p->msg_stime = msq->q_stime;
p->msg_rtime = msq->q_rtime;
p->msg_ctime = msq->q_ctime;
+#ifndef CONFIG_64BIT
+ p->msg_stime_high = msq->q_stime >> 32;
+ p->msg_rtime_high = msq->q_rtime >> 32;
+ p->msg_ctime_high = msq->q_ctime >> 32;
+#endif
p->msg_cbytes = msq->q_cbytes;
p->msg_qnum = msq->q_qnum;
p->msg_qbytes = msq->q_qbytes;
@@ -646,9 +651,12 @@ static int copy_compat_msqid_to_user(void __user *buf, struct msqid64_ds *in,
struct compat_msqid64_ds v;
memset(&v, 0, sizeof(v));
to_compat_ipc64_perm(&v.msg_perm, &in->msg_perm);
- v.msg_stime = in->msg_stime;
- v.msg_rtime = in->msg_rtime;
- v.msg_ctime = in->msg_ctime;
+ v.msg_stime = lower_32_bits(in->msg_stime);
+ v.msg_stime_high = upper_32_bits(in->msg_stime);
+ v.msg_rtime = lower_32_bits(in->msg_rtime);
+ v.msg_rtime_high = upper_32_bits(in->msg_rtime);
+ v.msg_ctime = lower_32_bits(in->msg_ctime);
+ v.msg_ctime_high = upper_32_bits(in->msg_ctime);
v.msg_cbytes = in->msg_cbytes;
v.msg_qnum = in->msg_qnum;
v.msg_qbytes = in->msg_qbytes;
@@ -758,7 +766,7 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg,
WRITE_ONCE(msr->r_msg, ERR_PTR(-E2BIG));
} else {
ipc_update_pid(&msq->q_lrpid, task_pid(msr->r_tsk));
- msq->q_rtime = get_seconds();
+ msq->q_rtime = ktime_get_real_seconds();
wake_q_add(wake_q, msr->r_tsk);
WRITE_ONCE(msr->r_msg, msg);
@@ -859,7 +867,7 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext,
}
ipc_update_pid(&msq->q_lspid, task_tgid(current));
- msq->q_stime = get_seconds();
+ msq->q_stime = ktime_get_real_seconds();
if (!pipelined_send(msq, msg, &wake_q)) {
/* no one is waiting for this message, enqueue it */
@@ -1087,7 +1095,7 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
list_del(&msg->m_list);
msq->q_qnum--;
- msq->q_rtime = get_seconds();
+ msq->q_rtime = ktime_get_real_seconds();
ipc_update_pid(&msq->q_lrpid, task_tgid(current));
msq->q_cbytes -= msg->m_ts;
atomic_sub(msg->m_ts, &ns->msg_bytes);
diff --git a/ipc/sem.c b/ipc/sem.c
index 06be75d9217a..cfd94d48a9aa 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -70,6 +70,7 @@
* The worst-case behavior is nevertheless O(N^2) for N wakeups.
*/
+#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
@@ -104,7 +105,7 @@ struct sem {
/* that alter the semaphore */
struct list_head pending_const; /* pending single-sop operations */
/* that do not alter the semaphore*/
- time_t sem_otime; /* candidate for sem_otime */
+ time64_t sem_otime; /* candidate for sem_otime */
} ____cacheline_aligned_in_smp;
/* One sem_array data structure for each set of semaphores in the system. */
@@ -984,10 +985,10 @@ again:
static void set_semotime(struct sem_array *sma, struct sembuf *sops)
{
if (sops == NULL) {
- sma->sems[0].sem_otime = get_seconds();
+ sma->sems[0].sem_otime = ktime_get_real_seconds();
} else {
sma->sems[sops[0].sem_num].sem_otime =
- get_seconds();
+ ktime_get_real_seconds();
}
}
@@ -1214,6 +1215,7 @@ static int semctl_stat(struct ipc_namespace *ns, int semid,
int cmd, struct semid64_ds *semid64)
{
struct sem_array *sma;
+ time64_t semotime;
int id = 0;
int err;
@@ -1257,8 +1259,13 @@ static int semctl_stat(struct ipc_namespace *ns, int semid,
}
kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm);
- semid64->sem_otime = get_semotime(sma);
+ semotime = get_semotime(sma);
+ semid64->sem_otime = semotime;
semid64->sem_ctime = sma->sem_ctime;
+#ifndef CONFIG_64BIT
+ semid64->sem_otime_high = semotime >> 32;
+ semid64->sem_ctime_high = sma->sem_ctime >> 32;
+#endif
semid64->sem_nsems = sma->sem_nsems;
ipc_unlock_object(&sma->sem_perm);
@@ -1704,8 +1711,10 @@ static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in,
struct compat_semid64_ds v;
memset(&v, 0, sizeof(v));
to_compat_ipc64_perm(&v.sem_perm, &in->sem_perm);
- v.sem_otime = in->sem_otime;
- v.sem_ctime = in->sem_ctime;
+ v.sem_otime = lower_32_bits(in->sem_otime);
+ v.sem_otime_high = upper_32_bits(in->sem_otime);
+ v.sem_ctime = lower_32_bits(in->sem_ctime);
+ v.sem_ctime_high = upper_32_bits(in->sem_ctime);
v.sem_nsems = in->sem_nsems;
return copy_to_user(buf, &v, sizeof(v));
} else {
@@ -2168,7 +2177,7 @@ out_free:
}
long ksys_semtimedop(int semid, struct sembuf __user *tsops,
- unsigned int nsops, const struct timespec __user *timeout)
+ unsigned int nsops, const struct __kernel_timespec __user *timeout)
{
if (timeout) {
struct timespec64 ts;
@@ -2180,12 +2189,12 @@ long ksys_semtimedop(int semid, struct sembuf __user *tsops,
}
SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
- unsigned int, nsops, const struct timespec __user *, timeout)
+ unsigned int, nsops, const struct __kernel_timespec __user *, timeout)
{
return ksys_semtimedop(semid, tsops, nsops, timeout);
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
unsigned int nsops,
const struct compat_timespec __user *timeout)
diff --git a/ipc/shm.c b/ipc/shm.c
index d73269381ec7..29978ee76c2e 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1002,6 +1002,11 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid,
tbuf->shm_atime = shp->shm_atim;
tbuf->shm_dtime = shp->shm_dtim;
tbuf->shm_ctime = shp->shm_ctim;
+#ifndef CONFIG_64BIT
+ tbuf->shm_atime_high = shp->shm_atim >> 32;
+ tbuf->shm_dtime_high = shp->shm_dtim >> 32;
+ tbuf->shm_ctime_high = shp->shm_ctim >> 32;
+#endif
tbuf->shm_cpid = pid_vnr(shp->shm_cprid);
tbuf->shm_lpid = pid_vnr(shp->shm_lprid);
tbuf->shm_nattch = shp->shm_nattch;
@@ -1233,9 +1238,12 @@ static int copy_compat_shmid_to_user(void __user *buf, struct shmid64_ds *in,
struct compat_shmid64_ds v;
memset(&v, 0, sizeof(v));
to_compat_ipc64_perm(&v.shm_perm, &in->shm_perm);
- v.shm_atime = in->shm_atime;
- v.shm_dtime = in->shm_dtime;
- v.shm_ctime = in->shm_ctime;
+ v.shm_atime = lower_32_bits(in->shm_atime);
+ v.shm_atime_high = upper_32_bits(in->shm_atime);
+ v.shm_dtime = lower_32_bits(in->shm_dtime);
+ v.shm_dtime_high = upper_32_bits(in->shm_dtime);
+ v.shm_ctime = lower_32_bits(in->shm_ctime);
+ v.shm_ctime_high = upper_32_bits(in->shm_ctime);
v.shm_segsz = in->shm_segsz;
v.shm_nattch = in->shm_nattch;
v.shm_cpid = in->shm_cpid;
diff --git a/ipc/syscall.c b/ipc/syscall.c
index 77a883ef2eca..65d405f1ba0c 100644
--- a/ipc/syscall.c
+++ b/ipc/syscall.c
@@ -30,9 +30,14 @@ SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second,
return ksys_semtimedop(first, (struct sembuf __user *)ptr,
second, NULL);
case SEMTIMEDOP:
- return ksys_semtimedop(first, (struct sembuf __user *)ptr,
- second,
- (const struct timespec __user *)fifth);
+ if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME))
+ return ksys_semtimedop(first, ptr, second,
+ (const struct __kernel_timespec __user *)fifth);
+ else if (IS_ENABLED(CONFIG_COMPAT_32BIT_TIME))
+ return compat_ksys_semtimedop(first, ptr, second,
+ (const struct compat_timespec __user *)fifth);
+ else
+ return -ENOSYS;
case SEMGET:
return ksys_semget(first, second, third);
@@ -130,6 +135,8 @@ COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second,
/* struct sembuf is the same on 32 and 64bit :)) */
return ksys_semtimedop(first, compat_ptr(ptr), second, NULL);
case SEMTIMEDOP:
+ if (!IS_ENABLED(CONFIG_COMPAT_32BIT_TIME))
+ return -ENOSYS;
return compat_ksys_semtimedop(first, compat_ptr(ptr), second,
compat_ptr(fifth));
case SEMGET:
diff --git a/ipc/util.h b/ipc/util.h
index acc5159e96d0..0aba3230d007 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -251,7 +251,7 @@ static inline int compat_ipc_parse_version(int *cmd)
/* for __ARCH_WANT_SYS_IPC */
long ksys_semtimedop(int semid, struct sembuf __user *tsops,
unsigned int nsops,
- const struct timespec __user *timeout);
+ const struct __kernel_timespec __user *timeout);
long ksys_semget(key_t key, int nsems, int semflg);
long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg);
long ksys_msgget(key_t key, int msgflg);
@@ -265,10 +265,10 @@ long ksys_shmdt(char __user *shmaddr);
long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
/* for CONFIG_ARCH_WANT_OLD_COMPAT_IPC */
-#ifdef CONFIG_COMPAT
long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
unsigned int nsops,
const struct compat_timespec __user *timeout);
+#ifdef CONFIG_COMPAT
long compat_ksys_semctl(int semid, int semnum, int cmd, int arg);
long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr);
long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz,
diff --git a/kernel/compat.c b/kernel/compat.c
index 92d8c98c0f57..702aa846ddac 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -121,50 +121,6 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-static int __compat_get_timespec64(struct timespec64 *ts64,
- const struct compat_timespec __user *cts)
-{
- struct compat_timespec ts;
- int ret;
-
- ret = copy_from_user(&ts, cts, sizeof(ts));
- if (ret)
- return -EFAULT;
-
- ts64->tv_sec = ts.tv_sec;
- ts64->tv_nsec = ts.tv_nsec;
-
- return 0;
-}
-
-static int __compat_put_timespec64(const struct timespec64 *ts64,
- struct compat_timespec __user *cts)
-{
- struct compat_timespec ts = {
- .tv_sec = ts64->tv_sec,
- .tv_nsec = ts64->tv_nsec
- };
- return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
-}
-
-int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
-{
- if (COMPAT_USE_64BIT_TIME)
- return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
- else
- return __compat_get_timespec64(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_get_timespec64);
-
-int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
-{
- if (COMPAT_USE_64BIT_TIME)
- return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
- else
- return __compat_put_timespec64(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_put_timespec64);
-
int compat_get_timeval(struct timeval *tv, const void __user *utv)
{
if (COMPAT_USE_64BIT_TIME)
@@ -368,6 +324,14 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
return ret;
}
+/* Todo: Delete these extern declarations when get/put_compat_itimerspec64()
+ * are moved to kernel/time/time.c .
+ */
+extern int __compat_get_timespec64(struct timespec64 *ts64,
+ const struct compat_timespec __user *cts);
+extern int __compat_put_timespec64(const struct timespec64 *ts64,
+ struct compat_timespec __user *cts);
+
int get_compat_itimerspec64(struct itimerspec64 *its,
const struct compat_itimerspec __user *uits)
{
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index e2764d767f18..ca8ac2824f0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -44,23 +44,24 @@ void __delayacct_tsk_init(struct task_struct *tsk)
{
tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
if (tsk->delays)
- spin_lock_init(&tsk->delays->lock);
+ raw_spin_lock_init(&tsk->delays->lock);
}
/*
* Finish delay accounting for a statistic using its timestamps (@start),
* accumalator (@total) and @count
*/
-static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
+ u32 *count)
{
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
if (ns > 0) {
- spin_lock_irqsave(lock, flags);
+ raw_spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- spin_unlock_irqrestore(lock, flags);
+ raw_spin_unlock_irqrestore(lock, flags);
}
}
@@ -127,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
- spin_lock_irqsave(&tsk->delays->lock, flags);
+ raw_spin_lock_irqsave(&tsk->delays->lock, flags);
tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
@@ -137,7 +138,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
- spin_unlock_irqrestore(&tsk->delays->lock, flags);
+ raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
}
@@ -147,10 +148,10 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
__u64 ret;
unsigned long flags;
- spin_lock_irqsave(&tsk->delays->lock, flags);
+ raw_spin_lock_irqsave(&tsk->delays->lock, flags);
ret = nsec_to_clock_t(tsk->delays->blkio_delay +
tsk->delays->swapin_delay);
- spin_unlock_irqrestore(&tsk->delays->lock, flags);
+ raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce359ad..08f5e1b42b43 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5120,6 +5120,8 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
switch (_IOC_NR(cmd)) {
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
case _IOC_NR(PERF_EVENT_IOC_ID):
+ case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
+ case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;
@@ -6668,7 +6670,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (filter->inode) {
+ if (filter->path.dentry) {
event->addr_filters_offs[count] = 0;
restart++;
}
@@ -7333,7 +7335,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
struct file *file, unsigned long offset,
unsigned long size)
{
- if (filter->inode != file_inode(file))
+ if (d_inode(filter->path.dentry) != file_inode(file))
return false;
if (filter->offset > offset + size)
@@ -8686,8 +8688,7 @@ static void free_filters_list(struct list_head *filters)
struct perf_addr_filter *filter, *iter;
list_for_each_entry_safe(filter, iter, filters, entry) {
- if (filter->inode)
- iput(filter->inode);
+ path_put(&filter->path);
list_del(&filter->entry);
kfree(filter);
}
@@ -8784,7 +8785,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
* Adjust base offset if the filter is associated to a binary
* that needs to be mapped:
*/
- if (filter->inode)
+ if (filter->path.dentry)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);
@@ -8858,7 +8859,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
{
struct perf_addr_filter *filter = NULL;
char *start, *orig, *filename = NULL;
- struct path path;
substring_t args[MAX_OPT_ARGS];
int state = IF_STATE_ACTION, token;
unsigned int kernel = 0;
@@ -8971,19 +8971,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail_free_name;
/* look up the path and grab its inode */
- ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ ret = kern_path(filename, LOOKUP_FOLLOW,
+ &filter->path);
if (ret)
goto fail_free_name;
- filter->inode = igrab(d_inode(path.dentry));
- path_put(&path);
kfree(filename);
filename = NULL;
ret = -EINVAL;
- if (!filter->inode ||
- !S_ISREG(filter->inode->i_mode))
- /* free_filters_list() will iput() */
+ if (!filter->path.dentry ||
+ !S_ISREG(d_inode(filter->path.dentry)
+ ->i_mode))
goto fail;
event->addr_filters.nr_file_filters++;
@@ -10521,19 +10520,20 @@ SYSCALL_DEFINE5(perf_event_open,
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
- if (group_leader &&
- (is_software_event(event) != is_software_event(group_leader))) {
- if (is_software_event(event)) {
+ if (group_leader) {
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
/*
- * If event and group_leader are not both a software
- * event, and event is, then group leader is not.
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
*
- * Allow the addition of software events to !software
- * groups, this is safe because software events never
- * fail to schedule.
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
*/
- pmu = group_leader->pmu;
- } else if (is_software_event(group_leader) &&
+ pmu = group_leader->ctx->pmu;
+ } else if (!is_software_event(event) &&
+ is_software_event(group_leader) &&
(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
/*
* In case the group is a pure software group, and we
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index fc4f361a86bb..dd20d0d528d4 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -1,11 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
+ * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl>
*/
#include <linux/slab.h>
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2a8571f72b17..4ca2fd46645d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -76,6 +76,19 @@ static inline void irq_chip_write_msi_msg(struct irq_data *data,
data->chip->irq_write_msi_msg(data, msg);
}
+static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * If the MSI provider has messed with the second message and
+ * not advertized that it is level-capable, signal the breakage.
+ */
+ WARN_ON(!((info->flags & MSI_FLAG_LEVEL_CAPABLE) &&
+ (info->chip->flags & IRQCHIP_SUPPORTS_LEVEL_MSI)) &&
+ (msg[1].address_lo || msg[1].address_hi || msg[1].data));
+}
+
/**
* msi_domain_set_affinity - Generic affinity setter function for MSI domains
* @irq_data: The irq data associated to the interrupt
@@ -89,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
const struct cpumask *mask, bool force)
{
struct irq_data *parent = irq_data->parent_data;
- struct msi_msg msg;
+ struct msi_msg msg[2] = { [1] = { }, };
int ret;
ret = parent->chip->irq_set_affinity(parent, mask, force);
if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
- BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
- irq_chip_write_msi_msg(irq_data, &msg);
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
+ msi_check_level(irq_data->domain, msg);
+ irq_chip_write_msi_msg(irq_data, msg);
}
return ret;
@@ -104,20 +118,21 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
static int msi_domain_activate(struct irq_domain *domain,
struct irq_data *irq_data, bool early)
{
- struct msi_msg msg;
+ struct msi_msg msg[2] = { [1] = { }, };
- BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
- irq_chip_write_msi_msg(irq_data, &msg);
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
+ msi_check_level(irq_data->domain, msg);
+ irq_chip_write_msi_msg(irq_data, msg);
return 0;
}
static void msi_domain_deactivate(struct irq_domain *domain,
struct irq_data *irq_data)
{
- struct msi_msg msg;
+ struct msi_msg msg[2];
- memset(&msg, 0, sizeof(msg));
- irq_chip_write_msi_msg(irq_data, &msg);
+ memset(msg, 0, sizeof(msg));
+ irq_chip_write_msi_msg(irq_data, msg);
}
static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 023386338269..edcac5de7ebc 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -561,20 +561,24 @@ static void print_lock(struct held_lock *hlock)
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
-static void lockdep_print_held_locks(struct task_struct *curr)
+static void lockdep_print_held_locks(struct task_struct *p)
{
- int i, depth = curr->lockdep_depth;
+ int i, depth = READ_ONCE(p->lockdep_depth);
- if (!depth) {
- printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
+ if (!depth)
+ printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
+ else
+ printk("%d lock%s held by %s/%d:\n", depth,
+ depth > 1 ? "s" : "", p->comm, task_pid_nr(p));
+ /*
+ * It's not reliable to print a task's held locks if it's not sleeping
+ * and it's not the current task.
+ */
+ if (p->state == TASK_RUNNING && p != current)
return;
- }
- printk("%d lock%s held by %s/%d:\n",
- depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
-
for (i = 0; i < depth; i++) {
printk(" #%d: ", i);
- print_lock(curr->held_locks + i);
+ print_lock(p->held_locks + i);
}
}
@@ -4451,8 +4455,6 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
void debug_show_all_locks(void)
{
struct task_struct *g, *p;
- int count = 10;
- int unlock = 1;
if (unlikely(!debug_locks)) {
pr_warn("INFO: lockdep is turned off.\n");
@@ -4460,50 +4462,18 @@ void debug_show_all_locks(void)
}
pr_warn("\nShowing all locks held in the system:\n");
- /*
- * Here we try to get the tasklist_lock as hard as possible,
- * if not successful after 2 seconds we ignore it (but keep
- * trying). This is to enable a debug printout even if a
- * tasklist_lock-holding task deadlocks or crashes.
- */
-retry:
- if (!read_trylock(&tasklist_lock)) {
- if (count == 10)
- pr_warn("hm, tasklist_lock locked, retrying... ");
- if (count) {
- count--;
- pr_cont(" #%d", 10-count);
- mdelay(200);
- goto retry;
- }
- pr_cont(" ignoring it.\n");
- unlock = 0;
- } else {
- if (count != 10)
- pr_cont(" locked it.\n");
- }
-
- do_each_thread(g, p) {
- /*
- * It's not reliable to print a task's held locks
- * if it's not sleeping (or if it's not the current
- * task):
- */
- if (p->state == TASK_RUNNING && p != current)
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ if (!p->lockdep_depth)
continue;
- if (p->lockdep_depth)
- lockdep_print_held_locks(p);
- if (!unlock)
- if (read_trylock(&tasklist_lock))
- unlock = 1;
+ lockdep_print_held_locks(p);
touch_nmi_watchdog();
- } while_each_thread(g, p);
+ touch_all_softlockup_watchdogs();
+ }
+ rcu_read_unlock();
pr_warn("\n");
pr_warn("=============================================\n\n");
-
- if (unlock)
- read_unlock(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(debug_show_all_locks);
#endif
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index f046b7ce9dd6..5e10153b4d3c 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -23,13 +23,15 @@ struct mcs_spinlock {
#ifndef arch_mcs_spin_lock_contended
/*
- * Using smp_load_acquire() provides a memory barrier that ensures
- * subsequent operations happen after the lock is acquired.
+ * Using smp_cond_load_acquire() provides the acquire semantics
+ * required so that subsequent operations happen after the
+ * lock is acquired. Additionally, some architectures such as
+ * ARM64 would like to do spin-waiting instead of purely
+ * spinning, and smp_cond_load_acquire() provides that behavior.
*/
#define arch_mcs_spin_lock_contended(l) \
do { \
- while (!(smp_load_acquire(l))) \
- cpu_relax(); \
+ smp_cond_load_acquire(l, VAL); \
} while (0)
#endif
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2048359f33d2..f44f658ae629 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struct mutex *lock)
static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
{
unsigned long curr = (unsigned long)current;
+ unsigned long zero = 0UL;
- if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
return true;
return false;
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index d880296245c5..bfaeb05123ff 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -12,11 +12,11 @@
* GNU General Public License for more details.
*
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
- * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
* (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <waiman.long@hpe.com>
+ * Authors: Waiman Long <longman@redhat.com>
* Peter Zijlstra <peterz@infradead.org>
*/
@@ -33,6 +33,11 @@
#include <asm/qspinlock.h>
/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
* The basic principle of a queue-based spinlock can best be understood
* by studying a classic queue-based spinlock implementation called the
* MCS lock. The paper below provides a good description for this kind
@@ -77,6 +82,18 @@
#endif
/*
+ * The pending bit spinning loop count.
+ * This heuristic is used to limit the number of lockword accesses
+ * made by atomic_cond_read_relaxed when waiting for the lock to
+ * transition out of the "== _Q_PENDING_VAL" state. We don't spin
+ * indefinitely because there's no guarantee that we'll make forward
+ * progress.
+ */
+#ifndef _Q_PENDING_LOOPS
+#define _Q_PENDING_LOOPS 1
+#endif
+
+/*
* Per-CPU queue node structures; we can never have more than 4 nested
* contexts: task, softirq, hardirq, nmi.
*
@@ -114,41 +131,18 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
-/*
- * By using the whole 2nd least significant byte for the pending bit, we
- * can allow better optimization of the lock acquisition for the pending
- * bit holder.
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
*
- * This internal structure is also used by the set_locked function which
- * is not restricted to _Q_PENDING_BITS == 8.
+ * *,1,* -> *,0,*
*/
-struct __qspinlock {
- union {
- atomic_t val;
-#ifdef __LITTLE_ENDIAN
- struct {
- u8 locked;
- u8 pending;
- };
- struct {
- u16 locked_pending;
- u16 tail;
- };
-#else
- struct {
- u16 tail;
- u16 locked_pending;
- };
- struct {
- u8 reserved[2];
- u8 pending;
- u8 locked;
- };
-#endif
- };
-};
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->pending, 0);
+}
-#if _Q_PENDING_BITS == 8
/**
* clear_pending_set_locked - take ownership and clear the pending bit.
* @lock: Pointer to queued spinlock structure
@@ -159,9 +153,7 @@ struct __qspinlock {
*/
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+ WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}
/*
@@ -176,19 +168,28 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
*/
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
- struct __qspinlock *l = (void *)lock;
-
/*
- * Use release semantics to make sure that the MCS node is properly
- * initialized before changing the tail code.
+ * We can use relaxed semantics since the caller ensures that the
+ * MCS node is properly initialized before updating the tail.
*/
- return (u32)xchg_release(&l->tail,
+ return (u32)xchg_relaxed(&lock->tail,
tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#else /* _Q_PENDING_BITS == 8 */
/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
+/**
* clear_pending_set_locked - take ownership and clear the pending bit.
* @lock: Pointer to queued spinlock structure
*
@@ -216,10 +217,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
/*
- * Use release semantics to make sure that the MCS node is
- * properly initialized before changing the tail code.
+ * We can use relaxed semantics since the caller ensures that
+ * the MCS node is properly initialized before updating the
+ * tail.
*/
- old = atomic_cmpxchg_release(&lock->val, val, new);
+ old = atomic_cmpxchg_relaxed(&lock->val, val, new);
if (old == val)
break;
@@ -237,9 +239,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
*/
static __always_inline void set_locked(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}
@@ -294,86 +294,83 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
struct mcs_spinlock *prev, *next, *node;
- u32 new, old, tail;
+ u32 old, tail;
int idx;
BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
if (pv_enabled())
- goto queue;
+ goto pv_queue;
if (virt_spin_lock(lock))
return;
/*
- * wait for in-progress pending->locked hand-overs
+ * Wait for in-progress pending->locked hand-overs with a bounded
+ * number of spins so that we guarantee forward progress.
*
* 0,1,0 -> 0,0,1
*/
if (val == _Q_PENDING_VAL) {
- while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
- cpu_relax();
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(&lock->val,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
}
/*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ /*
* trylock || pending
*
* 0,0,0 -> 0,0,1 ; trylock
* 0,0,1 -> 0,1,1 ; pending
*/
- for (;;) {
+ val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+ if (!(val & ~_Q_LOCKED_MASK)) {
/*
- * If we observe any contention; queue.
+ * We're pending, wait for the owner to go away.
+ *
+ * *,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all
+ * clear_pending_set_locked() implementations imply full
+ * barriers.
*/
- if (val & ~_Q_LOCKED_MASK)
- goto queue;
-
- new = _Q_LOCKED_VAL;
- if (val == new)
- new |= _Q_PENDING_VAL;
+ if (val & _Q_LOCKED_MASK) {
+ atomic_cond_read_acquire(&lock->val,
+ !(VAL & _Q_LOCKED_MASK));
+ }
/*
- * Acquire semantic is required here as the function may
- * return immediately if the lock was free.
+ * take ownership and clear the pending bit.
+ *
+ * *,1,0 -> *,0,1
*/
- old = atomic_cmpxchg_acquire(&lock->val, val, new);
- if (old == val)
- break;
-
- val = old;
- }
-
- /*
- * we won the trylock
- */
- if (new == _Q_LOCKED_VAL)
+ clear_pending_set_locked(lock);
+ qstat_inc(qstat_lock_pending, true);
return;
+ }
/*
- * we're pending, wait for the owner to go away.
- *
- * *,1,1 -> *,1,0
- *
- * this wait loop must be a load-acquire such that we match the
- * store-release that clears the locked bit and create lock
- * sequentiality; this is because not all clear_pending_set_locked()
- * implementations imply full barriers.
- */
- smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
-
- /*
- * take ownership and clear the pending bit.
- *
- * *,1,0 -> *,0,1
+ * If pending was clear but there are waiters in the queue, then
+ * we need to undo our setting of pending before we queue ourselves.
*/
- clear_pending_set_locked(lock);
- return;
+ if (!(val & _Q_PENDING_MASK))
+ clear_pending(lock);
/*
* End of pending bit optimistic spinning and beginning of MCS
* queuing.
*/
queue:
+ qstat_inc(qstat_lock_slowpath, true);
+pv_queue:
node = this_cpu_ptr(&mcs_nodes[0]);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
@@ -400,12 +397,18 @@ queue:
goto release;
/*
+ * Ensure that the initialisation of @node is complete before we
+ * publish the updated tail via xchg_tail() and potentially link
+ * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+ */
+ smp_wmb();
+
+ /*
+ * Publish the updated tail.
* We have already touched the queueing cacheline; don't bother with
* pending stuff.
*
* p,*,* -> n,*,*
- *
- * RELEASE, such that the stores to @node must be complete.
*/
old = xchg_tail(lock, tail);
next = NULL;
@@ -417,14 +420,8 @@ queue:
if (old & _Q_TAIL_MASK) {
prev = decode_tail(old);
- /*
- * We must ensure that the stores to @node are observed before
- * the write to prev->next. The address dependency from
- * xchg_tail is not sufficient to ensure this because the read
- * component of xchg_tail is unordered with respect to the
- * initialisation of @node.
- */
- smp_store_release(&prev->next, node);
+ /* Link @node into the waitqueue. */
+ WRITE_ONCE(prev->next, node);
pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
@@ -453,8 +450,8 @@ queue:
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
- * smp_cond_load_acquire() call. As the next PV queue head hasn't been
- * designated yet, there is no way for the locked value to become
+ * atomic_cond_read_acquire() call. As the next PV queue head hasn't
+ * been designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
*
@@ -464,44 +461,38 @@ queue:
if ((val = pv_wait_head_or_lock(lock, node)))
goto locked;
- val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
+ val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));
locked:
/*
* claim the lock:
*
* n,0,0 -> 0,0,1 : lock, uncontended
- * *,0,0 -> *,0,1 : lock, contended
+ * *,*,0 -> *,*,1 : lock, contended
*
- * If the queue head is the only one in the queue (lock value == tail),
- * clear the tail code and grab the lock. Otherwise, we only need
- * to grab the lock.
+ * If the queue head is the only one in the queue (lock value == tail)
+ * and nobody is pending, clear the tail code and grab the lock.
+ * Otherwise, we only need to grab the lock.
*/
- for (;;) {
- /* In the PV case we might already have _Q_LOCKED_VAL set */
- if ((val & _Q_TAIL_MASK) != tail) {
- set_locked(lock);
- break;
- }
- /*
- * The smp_cond_load_acquire() call above has provided the
- * necessary acquire semantics required for locking. At most
- * two iterations of this loop may be ran.
- */
- old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
- if (old == val)
- goto release; /* No contention */
- val = old;
- }
+ /*
+ * In the PV case we might already have _Q_LOCKED_VAL set.
+ *
+ * The atomic_cond_read_acquire() call above has provided the
+ * necessary acquire semantics required for locking.
+ */
+ if (((val & _Q_TAIL_MASK) == tail) &&
+ atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ goto release; /* No contention */
+
+ /* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+ set_locked(lock);
/*
* contended path; wait for next if not observed yet, release.
*/
- if (!next) {
- while (!(next = READ_ONCE(node->next)))
- cpu_relax();
- }
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 6ee477765e6c..5a0cf5f9008c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -56,11 +56,6 @@ struct pv_node {
};
/*
- * Include queued spinlock statistics code
- */
-#include "qspinlock_stat.h"
-
-/*
* Hybrid PV queued/unfair lock
*
* By replacing the regular queued_spin_trylock() with the function below,
@@ -87,8 +82,6 @@ struct pv_node {
#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
/*
* Stay in unfair lock mode as long as queued mode waiters are
* present in the MCS wait queue but the pending bit isn't set.
@@ -97,7 +90,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
int val = atomic_read(&lock->val);
if (!(val & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
qstat_inc(qstat_pv_lock_stealing, true);
return true;
}
@@ -117,16 +110,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
#if _Q_PENDING_BITS == 8
static __always_inline void set_pending(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->pending, 1);
-}
-
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->pending, 0);
+ WRITE_ONCE(lock->pending, 1);
}
/*
@@ -136,10 +120,8 @@ static __always_inline void clear_pending(struct qspinlock *lock)
*/
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
- return !READ_ONCE(l->locked) &&
- (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+ return !READ_ONCE(lock->locked) &&
+ (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
_Q_LOCKED_VAL) == _Q_PENDING_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
@@ -148,11 +130,6 @@ static __always_inline void set_pending(struct qspinlock *lock)
atomic_or(_Q_PENDING_VAL, &lock->val);
}
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
- atomic_andnot(_Q_PENDING_VAL, &lock->val);
-}
-
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
int val = atomic_read(&lock->val);
@@ -384,7 +361,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
- struct __qspinlock *l = (void *)lock;
/*
* If the vCPU is indeed halted, advance its state to match that of
@@ -413,7 +389,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
* the hash table later on at unlock time, no atomic instruction is
* needed.
*/
- WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+ WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
(void)pv_hash(lock, pn);
}
@@ -428,7 +404,6 @@ static u32
pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
- struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop;
@@ -443,7 +418,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
/*
* Tracking # of slowpath locking operations
*/
- qstat_inc(qstat_pv_lock_slowpath, true);
+ qstat_inc(qstat_lock_slowpath, true);
for (;; waitcnt++) {
/*
@@ -479,13 +454,13 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
- if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
+ if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
/*
* The lock was free and now we own the lock.
* Change the lock value back to _Q_LOCKED_VAL
* and unhash the table.
*/
- WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
goto gotlock;
}
@@ -493,7 +468,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
WRITE_ONCE(pn->state, vcpu_hashed);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
- pv_wait(&l->locked, _Q_SLOW_VAL);
+ pv_wait(&lock->locked, _Q_SLOW_VAL);
/*
* Because of lock stealing, the queue head vCPU may not be
@@ -518,7 +493,6 @@ gotlock:
__visible void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
- struct __qspinlock *l = (void *)lock;
struct pv_node *node;
if (unlikely(locked != _Q_SLOW_VAL)) {
@@ -547,7 +521,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
* Now that we have a reference to the (likely) blocked pv_node,
* release the lock.
*/
- smp_store_release(&l->locked, 0);
+ smp_store_release(&lock->locked, 0);
/*
* At this point the memory pointed at by lock can be freed/reused,
@@ -573,7 +547,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
#ifndef __pv_queued_spin_unlock
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
u8 locked;
/*
@@ -581,7 +554,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
+ locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
if (likely(locked == _Q_LOCKED_VAL))
return;
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 4a30ef63c607..6bd78c0740fc 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -22,13 +22,14 @@
* pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
* pv_latency_kick - average latency (ns) of vCPU kick operation
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
- * pv_lock_slowpath - # of locking operations via the slowpath
* pv_lock_stealing - # of lock stealing operations
* pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
* pv_wait_again - # of wait's after a queue head vCPU kick
* pv_wait_early - # of early vCPU wait's
* pv_wait_head - # of vCPU wait's at the queue head
* pv_wait_node - # of vCPU wait's at a non-head queue node
+ * lock_pending - # of locking operations via pending code
+ * lock_slowpath - # of locking operations via MCS lock queue
*
* Writing to the "reset_counters" file will reset all the above counter
* values.
@@ -46,13 +47,14 @@ enum qlock_stats {
qstat_pv_kick_wake,
qstat_pv_latency_kick,
qstat_pv_latency_wake,
- qstat_pv_lock_slowpath,
qstat_pv_lock_stealing,
qstat_pv_spurious_wakeup,
qstat_pv_wait_again,
qstat_pv_wait_early,
qstat_pv_wait_head,
qstat_pv_wait_node,
+ qstat_lock_pending,
+ qstat_lock_slowpath,
qstat_num, /* Total number of statistical counters */
qstat_reset_cnts = qstat_num,
};
@@ -73,12 +75,13 @@ static const char * const qstat_names[qstat_num + 1] = {
[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
[qstat_pv_latency_kick] = "pv_latency_kick",
[qstat_pv_latency_wake] = "pv_latency_wake",
- [qstat_pv_lock_slowpath] = "pv_lock_slowpath",
[qstat_pv_lock_stealing] = "pv_lock_stealing",
[qstat_pv_wait_again] = "pv_wait_again",
[qstat_pv_wait_early] = "pv_wait_early",
[qstat_pv_wait_head] = "pv_wait_head",
[qstat_pv_wait_node] = "pv_wait_node",
+ [qstat_lock_pending] = "lock_pending",
+ [qstat_lock_slowpath] = "lock_slowpath",
[qstat_reset_cnts] = "reset_counters",
};
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a90336779375..3064c50e181e 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -347,6 +347,15 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
}
}
+static inline bool owner_on_cpu(struct task_struct *owner)
+{
+ /*
+ * As lock holder preemption issue, we both skip spinning if
+ * task is not on cpu or its cpu is preempted
+ */
+ return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+}
+
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
@@ -359,17 +368,10 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
rcu_read_lock();
owner = READ_ONCE(sem->owner);
- if (!owner || !is_rwsem_owner_spinnable(owner)) {
- ret = !owner; /* !owner is spinnable */
- goto done;
+ if (owner) {
+ ret = is_rwsem_owner_spinnable(owner) &&
+ owner_on_cpu(owner);
}
-
- /*
- * As lock holder preemption issue, we both skip spinning if task is not
- * on cpu or its cpu is preempted
- */
- ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-done:
rcu_read_unlock();
return ret;
}
@@ -398,8 +400,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
* abort spinning when need_resched or owner is not running or
* owner's cpu is preempted.
*/
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
+ if (need_resched() || !owner_on_cpu(owner)) {
rcu_read_unlock();
return false;
}
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7a693e31184a..40cea6735c2d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -270,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
}
}
+/* Returns first leaf rcu_node of the specified RCU flavor. */
+#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1])
+
+/* Is this rcu_node a leaf? */
+#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1)
+
/*
* Do a full breadth-first scan of the rcu_node structures for the
* specified rcu_state structure.
@@ -284,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* rcu_node tree with but one rcu_node structure, this loop is a no-op.
*/
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
- for ((rnp) = &(rsp)->node[0]; \
- (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
+ for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++)
/*
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -294,7 +299,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* It is still a leaf node, even if it is also the root node.
*/
#define rcu_for_each_leaf_node(rsp, rnp) \
- for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+ for ((rnp) = rcu_first_leaf_node(rsp); \
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/*
@@ -486,6 +491,7 @@ void rcu_force_quiescent_state(void);
void rcu_bh_force_quiescent_state(void);
void rcu_sched_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
+extern struct workqueue_struct *rcu_par_gp_wq;
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 88cba7c2956c..5aff271adf1e 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -404,24 +404,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
}
/*
- * Scan the specified rcu_segcblist structure for callbacks that need
- * a grace period later than the one specified by "seq". We don't look
- * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
- * have a grace-period sequence number.
- */
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
- unsigned long seq)
-{
- int i;
-
- for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
- if (rsclp->tails[i - 1] != rsclp->tails[i] &&
- ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
- return true;
- return false;
-}
-
-/*
* Merge the source rcu_segcblist structure into the destination
* rcu_segcblist structure, then initialize the source. Any pending
* callbacks from the source get to start over. It is best to
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 581c12b63544..948470cef385 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
- unsigned long seq);
void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 777e7a6a0292..e232846516b3 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -369,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void)
*/
static void rcu_perf_wait_shutdown(void)
{
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters)
return;
while (!torture_must_stop())
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 680c96d8c00f..e628fcfd1bde 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -593,7 +593,12 @@ static void srcu_torture_init(void)
static void srcu_torture_cleanup(void)
{
- cleanup_srcu_struct(&srcu_ctld);
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ if (torture_random(&rand) & 0x800)
+ cleanup_srcu_struct(&srcu_ctld);
+ else
+ cleanup_srcu_struct_quiesced(&srcu_ctld);
srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
}
@@ -1609,6 +1614,9 @@ static enum cpuhp_state rcutor_hp;
static void
rcu_torture_cleanup(void)
{
+ int flags = 0;
+ unsigned long gpnum = 0;
+ unsigned long completed = 0;
int i;
rcutorture_record_test_transition();
@@ -1639,6 +1647,11 @@ rcu_torture_cleanup(void)
fakewriter_tasks = NULL;
}
+ rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed);
+ srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
+ &flags, &gpnum, &completed);
+ pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n",
+ cur_ops->name, gpnum, completed, flags);
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
for (i = 0; i < ncbflooders; i++)
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 76ac5f50b2c7..622792abe41a 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
* Must invoke this after you are finished using a given srcu_struct that
* was initialized via init_srcu_struct(), else you leak memory.
*/
-void cleanup_srcu_struct(struct srcu_struct *sp)
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
{
WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
- flush_work(&sp->srcu_work);
+ if (quiesced)
+ WARN_ON(work_pending(&sp->srcu_work));
+ else
+ flush_work(&sp->srcu_work);
WARN_ON(sp->srcu_gp_running);
WARN_ON(sp->srcu_gp_waiting);
WARN_ON(sp->srcu_cb_head);
WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
/*
* Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index fb560fca9ef4..b4123d7a2cec 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
return SRCU_INTERVAL;
}
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
+/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
{
int cpu;
if (WARN_ON(!srcu_get_delay(sp)))
- return; /* Leakage unless caller handles error. */
+ return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(sp)))
- return; /* Leakage unless caller handles error. */
- flush_delayed_work(&sp->work);
+ return; /* Just leak it! */
+ if (quiesced) {
+ if (WARN_ON(delayed_work_pending(&sp->work)))
+ return; /* Just leak it! */
+ } else {
+ flush_delayed_work(&sp->work);
+ }
for_each_possible_cpu(cpu)
- flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+ if (quiesced) {
+ if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
+ return; /* Just leak it! */
+ } else {
+ flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+ }
if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
WARN_ON(srcu_readers_active(sp))) {
pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
@@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
free_percpu(sp->sda);
sp->sda = NULL;
}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
/*
* Counts the new reader in the appropriate per-CPU element of the
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2a734692a581..aa7cade1b9f3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644);
static ulong jiffies_till_sched_qs = HZ / 10;
module_param(jiffies_till_sched_qs, ulong, 0444);
-static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void);
@@ -711,44 +709,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
}
/*
- * Is there any need for future grace periods?
- * Interrupts must be disabled. If the caller does not hold the root
- * rnp_node structure's ->lock, the results are advisory only.
- */
-static int rcu_future_needs_gp(struct rcu_state *rsp)
-{
- struct rcu_node *rnp = rcu_get_root(rsp);
- int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
- int *fp = &rnp->need_future_gp[idx];
-
- lockdep_assert_irqs_disabled();
- return READ_ONCE(*fp);
-}
-
-/*
- * Does the current CPU require a not-yet-started grace period?
- * The caller must have disabled interrupts to prevent races with
- * normal callback registry.
- */
-static bool
-cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (rcu_gp_in_progress(rsp))
- return false; /* No, a grace period is already in progress. */
- if (rcu_future_needs_gp(rsp))
- return true; /* Yes, a no-CBs CPU needs one. */
- if (!rcu_segcblist_is_enabled(&rdp->cblist))
- return false; /* No, this is a no-CBs (or offline) CPU. */
- if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
- return true; /* Yes, CPU has newly registered callbacks. */
- if (rcu_segcblist_future_gp_needed(&rdp->cblist,
- READ_ONCE(rsp->completed)))
- return true; /* Yes, CBs for future grace period. */
- return false; /* No grace period needed. */
-}
-
-/*
* Enter an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
*
@@ -1234,10 +1194,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
}
/*
- * Has this CPU encountered a cond_resched_rcu_qs() since the
- * beginning of the grace period? For this to be the case,
- * the CPU has to have noticed the current grace period. This
- * might not be the case for nohz_full CPUs looping in the kernel.
+ * Has this CPU encountered a cond_resched() since the beginning
+ * of the grace period? For this to be the case, the CPU has to
+ * have noticed the current grace period. This might not be the
+ * case for nohz_full CPUs looping in the kernel.
*/
jtsq = jiffies_till_sched_qs;
ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
@@ -1642,18 +1602,30 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
return rnp->completed + 1;
/*
+ * If the current rcu_node structure believes that RCU is
+ * idle, and if the rcu_state structure does not yet reflect
+ * the start of a new grace period, then the next grace period
+ * will suffice. The memory barrier is needed to accurately
+ * sample the rsp->gpnum, and pairs with the second lock
+ * acquisition in rcu_gp_init(), which is augmented with
+ * smp_mb__after_unlock_lock() for this purpose.
+ */
+ if (rnp->gpnum == rnp->completed) {
+ smp_mb(); /* See above block comment. */
+ if (READ_ONCE(rsp->gpnum) == rnp->completed)
+ return rnp->completed + 1;
+ }
+
+ /*
* Otherwise, wait for a possible partial grace period and
* then the subsequent full grace period.
*/
return rnp->completed + 2;
}
-/*
- * Trace-event helper function for rcu_start_future_gp() and
- * rcu_nocb_wait_gp().
- */
-static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long c, const char *s)
+/* Trace-event wrapper function for trace_rcu_future_grace_period. */
+static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c, const char *s)
{
trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
rnp->completed, c, rnp->level,
@@ -1661,96 +1633,67 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
}
/*
- * Start some future grace period, as needed to handle newly arrived
+ * Start the specified grace period, as needed to handle newly arrived
* callbacks. The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field. Returns true if there
+ * rcu_node structure's ->need_future_gp[] field. Returns true if there
* is reason to awaken the grace-period kthread.
*
- * The caller must hold the specified rcu_node structure's ->lock.
+ * The caller must hold the specified rcu_node structure's ->lock, which
+ * is why the caller is responsible for waking the grace-period kthread.
*/
-static bool __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long *c_out)
+static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c)
{
- unsigned long c;
bool ret = false;
- struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
-
- raw_lockdep_assert_held_rcu_node(rnp);
-
- /*
- * Pick up grace-period number for new callbacks. If this
- * grace period is already marked as needed, return to the caller.
- */
- c = rcu_cbs_completed(rdp->rsp, rnp);
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
- if (rnp->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
- goto out;
- }
+ struct rcu_state *rsp = rdp->rsp;
+ struct rcu_node *rnp_root;
/*
- * If either this rcu_node structure or the root rcu_node structure
- * believe that a grace period is in progress, then we must wait
- * for the one following, which is in "c". Because our request
- * will be noticed at the end of the current grace period, we don't
- * need to explicitly start one. We only do the lockless check
- * of rnp_root's fields if the current rcu_node structure thinks
- * there is no grace period in flight, and because we hold rnp->lock,
- * the only possible change is when rnp_root's two fields are
- * equal, in which case rnp_root->gpnum might be concurrently
- * incremented. But that is OK, as it will just result in our
- * doing some extra useless work.
+ * Use funnel locking to either acquire the root rcu_node
+ * structure's lock or bail out if the need for this grace period
+ * has already been recorded -- or has already started. If there
+ * is already a grace period in progress in a non-leaf node, no
+ * recording is needed because the end of the grace period will
+ * scan the leaf rcu_node structures. Note that rnp->lock must
+ * not be released.
*/
- if (rnp->gpnum != rnp->completed ||
- READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
- rnp->need_future_gp[c & 0x1]++;
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
- goto out;
+ raw_lockdep_assert_held_rcu_node(rnp);
+ trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf"));
+ for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) {
+ if (rnp_root != rnp)
+ raw_spin_lock_rcu_node(rnp_root);
+ WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum +
+ need_future_gp_mask(), c));
+ if (need_future_gp_element(rnp_root, c) ||
+ ULONG_CMP_GE(rnp_root->gpnum, c) ||
+ (rnp != rnp_root &&
+ rnp_root->gpnum != rnp_root->completed)) {
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted"));
+ goto unlock_out;
+ }
+ need_future_gp_element(rnp_root, c) = true;
+ if (rnp_root != rnp && rnp_root->parent != NULL)
+ raw_spin_unlock_rcu_node(rnp_root);
+ if (!rnp_root->parent)
+ break; /* At root, and perhaps also leaf. */
}
- /*
- * There might be no grace period in progress. If we don't already
- * hold it, acquire the root rcu_node structure's lock in order to
- * start one (if needed).
- */
- if (rnp != rnp_root)
- raw_spin_lock_rcu_node(rnp_root);
-
- /*
- * Get a new grace-period number. If there really is no grace
- * period in progress, it will be smaller than the one we obtained
- * earlier. Adjust callbacks as needed.
- */
- c = rcu_cbs_completed(rdp->rsp, rnp_root);
- if (!rcu_is_nocb_cpu(rdp->cpu))
- (void)rcu_segcblist_accelerate(&rdp->cblist, c);
-
- /*
- * If the needed for the required grace period is already
- * recorded, trace and leave.
- */
- if (rnp_root->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
+ /* If GP already in progress, just leave, otherwise start one. */
+ if (rnp_root->gpnum != rnp_root->completed) {
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot"));
goto unlock_out;
}
-
- /* Record the need for the future grace period. */
- rnp_root->need_future_gp[c & 0x1]++;
-
- /* If a grace period is not already in progress, start one. */
- if (rnp_root->gpnum != rnp_root->completed) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
- } else {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
- ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot"));
+ WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT);
+ if (!rsp->gp_kthread) {
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread"));
+ goto unlock_out;
}
+ trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq"));
+ ret = true; /* Caller must wake GP kthread. */
unlock_out:
if (rnp != rnp_root)
raw_spin_unlock_rcu_node(rnp_root);
-out:
- if (c_out != NULL)
- *c_out = c;
return ret;
}
@@ -1758,16 +1701,16 @@ out:
* Clean up any old requests for the just-ended grace period. Also return
* whether any additional grace periods have been requested.
*/
-static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
- int c = rnp->completed;
- int needmore;
+ unsigned long c = rnp->completed;
+ bool needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- rnp->need_future_gp[c & 0x1] = 0;
- needmore = rnp->need_future_gp[(c + 1) & 0x1];
- trace_rcu_future_gp(rnp, rdp, c,
- needmore ? TPS("CleanupMore") : TPS("Cleanup"));
+ need_future_gp_element(rnp, c) = false;
+ needmore = need_any_future_gp(rnp);
+ trace_rcu_this_gp(rnp, rdp, c,
+ needmore ? TPS("CleanupMore") : TPS("Cleanup"));
return needmore;
}
@@ -1802,6 +1745,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
+ unsigned long c;
bool ret = false;
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1820,8 +1764,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* accelerating callback invocation to an earlier grace-period
* number.
*/
- if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
- ret = rcu_start_future_gp(rnp, rdp, NULL);
+ c = rcu_cbs_completed(rsp, rnp);
+ if (rcu_segcblist_accelerate(&rdp->cblist, c))
+ ret = rcu_start_this_gp(rnp, rdp, c);
/* Trace depending on how much we were able to accelerate. */
if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
@@ -2049,7 +1994,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq_rcu_node(rnp);
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
}
@@ -2108,7 +2053,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
{
unsigned long gp_duration;
bool needgp = false;
- int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
struct swait_queue_head *sq;
@@ -2147,31 +2091,35 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
- nocb += rcu_future_gp_cleanup(rsp, rnp);
+ needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp;
sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq_rcu_node(rnp);
rcu_nocb_gp_cleanup(sq);
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
rcu_gp_slow(rsp, gp_cleanup_delay);
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
- rcu_nocb_gp_set(rnp, nocb);
/* Declare grace period done. */
WRITE_ONCE(rsp->completed, rsp->gpnum);
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->gp_state = RCU_GP_IDLE;
+ /* Check for GP requests since above loop. */
rdp = this_cpu_ptr(rsp->rda);
+ if (need_any_future_gp(rnp)) {
+ trace_rcu_this_gp(rnp, rdp, rsp->completed - 1,
+ TPS("CleanupMore"));
+ needgp = true;
+ }
/* Advance CBs to reduce false positives below. */
- needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
- if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+ if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) {
WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
- trace_rcu_grace_period(rsp->name,
- READ_ONCE(rsp->gpnum),
+ trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
TPS("newreq"));
}
+ WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT);
raw_spin_unlock_irq_rcu_node(rnp);
}
@@ -2202,7 +2150,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
@@ -2247,7 +2195,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum),
TPS("fqsend"));
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
ret = 0; /* Force full wait till next FQS. */
j = jiffies_till_next_fqs;
@@ -2260,7 +2208,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
} else {
/* Deal with stray signal. */
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
@@ -2283,71 +2231,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
/*
- * Start a new RCU grace period if warranted, re-initializing the hierarchy
- * in preparation for detecting the next grace period. The caller must hold
- * the root node's ->lock and hard irqs must be disabled.
- *
- * Note that it is legal for a dying CPU (which is marked as offline) to
- * invoke this function. This can happen when the dying CPU reports its
- * quiescent state.
- *
- * Returns true if the grace-period kthread must be awakened.
- */
-static bool
-rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- raw_lockdep_assert_held_rcu_node(rnp);
- if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
- /*
- * Either we have not yet spawned the grace-period
- * task, this CPU does not need another grace period,
- * or a grace period is already in progress.
- * Either way, don't start a new grace period.
- */
- return false;
- }
- WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
- trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
- TPS("newreq"));
-
- /*
- * We can't do wakeups while holding the rnp->lock, as that
- * could cause possible deadlocks with the rq->lock. Defer
- * the wakeup to our caller.
- */
- return true;
-}
-
-/*
- * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
- * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
- * is invoked indirectly from rcu_advance_cbs(), which would result in
- * endless recursion -- or would do so if it wasn't for the self-deadlock
- * that is encountered beforehand.
- *
- * Returns true if the grace-period kthread needs to be awakened.
- */
-static bool rcu_start_gp(struct rcu_state *rsp)
-{
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_node *rnp = rcu_get_root(rsp);
- bool ret = false;
-
- /*
- * If there is no grace period in progress right now, any
- * callbacks we have up to this point will be satisfied by the
- * next grace period. Also, advancing the callbacks reduces the
- * probability of false positives from cpu_needs_another_gp()
- * resulting in pointless grace periods. So, advance callbacks
- * then start the grace period!
- */
- ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
- ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
- return ret;
-}
-
-/*
* Report a full set of quiescent states to the specified rcu_state data
* structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period
* kthread if another grace period is required. Whether we wake
@@ -2398,7 +2281,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
- WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
rcu_preempt_blocked_readers_cgp(rnp));
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
@@ -2782,7 +2665,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
@@ -2874,22 +2757,27 @@ __rcu_process_callbacks(struct rcu_state *rsp)
unsigned long flags;
bool needwake;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp;
WARN_ON_ONCE(!rdp->beenonline);
/* Update RCU state based on any recent quiescent states. */
rcu_check_quiescent_state(rsp, rdp);
- /* Does this CPU require a not-yet-started grace period? */
- local_irq_save(flags);
- if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
- needwake = rcu_start_gp(rsp);
- raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
- if (needwake)
- rcu_gp_kthread_wake(rsp);
- } else {
- local_irq_restore(flags);
+ /* No grace period and unregistered callbacks? */
+ if (!rcu_gp_in_progress(rsp) &&
+ rcu_segcblist_is_enabled(&rdp->cblist)) {
+ local_irq_save(flags);
+ if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) {
+ local_irq_restore(flags);
+ } else {
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ }
}
/* If there are callbacks ready, invoke them. */
@@ -2973,11 +2861,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
/* Start a new grace period if one not already started. */
if (!rcu_gp_in_progress(rsp)) {
- struct rcu_node *rnp_root = rcu_get_root(rsp);
+ struct rcu_node *rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp_root);
- needwake = rcu_start_gp(rsp);
- raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_lock_rcu_node(rnp);
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock_rcu_node(rnp);
if (needwake)
rcu_gp_kthread_wake(rsp);
} else {
@@ -3368,7 +3256,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
- if (cpu_needs_another_gp(rsp, rdp))
+ if (!rcu_gp_in_progress(rsp) &&
+ rcu_segcblist_is_enabled(&rdp->cblist) &&
+ !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
/* Has another RCU grace period completed? */
@@ -3775,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu)
return 0;
}
+static DEFINE_PER_CPU(int, rcu_cpu_started);
+
/*
* Mark the specified CPU as being online so that subsequent grace periods
* (both expedited and normal) will wait on it. Note that this means that
@@ -3796,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu)
struct rcu_node *rnp;
struct rcu_state *rsp;
+ if (per_cpu(rcu_cpu_started, cpu))
+ return;
+
+ per_cpu(rcu_cpu_started, cpu) = 1;
+
for_each_rcu_flavor(rsp) {
rdp = per_cpu_ptr(rsp->rda, cpu);
rnp = rdp->mynode;
@@ -3852,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu)
preempt_enable();
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_idle_cpu(cpu, rsp);
+
+ per_cpu(rcu_cpu_started, cpu) = 0;
}
/* Migrate the dead CPU's callbacks to the current CPU. */
@@ -3861,6 +3760,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
struct rcu_data *my_rdp;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+ bool needwake;
if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
@@ -3872,12 +3772,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
return;
}
raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
- rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
- rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+ /* Leverage recent GPs and set GP for new callbacks. */
+ needwake = rcu_advance_cbs(rsp, rnp_root, rdp) ||
+ rcu_advance_cbs(rsp, rnp_root, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
!rcu_segcblist_n_cbs(&my_rdp->cblist));
raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
!rcu_segcblist_empty(&rdp->cblist),
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@ -4056,7 +3959,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
init_swait_queue_head(&rsp->gp_wq);
init_swait_queue_head(&rsp->expedited_wq);
- rnp = rsp->level[rcu_num_lvls - 1];
+ rnp = rcu_first_leaf_node(rsp);
for_each_possible_cpu(i) {
while (i > rnp->grphi)
rnp++;
@@ -4168,6 +4071,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
}
struct workqueue_struct *rcu_gp_wq;
+struct workqueue_struct *rcu_par_gp_wq;
void __init rcu_init(void)
{
@@ -4199,6 +4103,8 @@ void __init rcu_init(void)
/* Create workqueue for expedited GPs and for Tree SRCU. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
+ rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_par_gp_wq);
}
#include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f491ab4f2e8e..78e051dffc5b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -58,6 +58,14 @@ struct rcu_dynticks {
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
};
+/* Communicate arguments to a workqueue handler. */
+struct rcu_exp_work {
+ smp_call_func_t rew_func;
+ struct rcu_state *rew_rsp;
+ unsigned long rew_s;
+ struct work_struct rew_work;
+};
+
/* RCU's kthread states for tracing. */
#define RCU_KTHREAD_STOPPED 0
#define RCU_KTHREAD_RUNNING 1
@@ -150,15 +158,32 @@ struct rcu_node {
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- int need_future_gp[2];
- /* Counts of upcoming no-CB GP requests. */
+ u8 need_future_gp[4]; /* Counts of upcoming GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
unsigned long exp_seq_rq;
wait_queue_head_t exp_wq[4];
+ struct rcu_exp_work rew;
+ bool exp_need_flush; /* Need to flush workitem? */
} ____cacheline_internodealigned_in_smp;
+/* Accessors for ->need_future_gp[] array. */
+#define need_future_gp_mask() \
+ (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1)
+#define need_future_gp_element(rnp, c) \
+ ((rnp)->need_future_gp[(c) & need_future_gp_mask()])
+#define need_any_future_gp(rnp) \
+({ \
+ int __i; \
+ bool __nonzero = false; \
+ \
+ for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \
+ __nonzero = __nonzero || \
+ READ_ONCE((rnp)->need_future_gp[__i]); \
+ __nonzero; \
+})
+
/*
* Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
* are indexed relative to this interval rather than the global CPU ID space.
@@ -224,10 +249,6 @@ struct rcu_data {
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
- atomic_long_t exp_workdone0; /* # done by workqueue. */
- atomic_long_t exp_workdone1; /* # done by others #1. */
- atomic_long_t exp_workdone2; /* # done by others #2. */
- atomic_long_t exp_workdone3; /* # done by others #3. */
int exp_dynticks_snap; /* Double-check need for IPI. */
/* 6) Callback offloading. */
@@ -408,7 +429,6 @@ extern struct rcu_state rcu_preempt_state;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
-bool rcu_eqs_special_set(int cpu);
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -438,7 +458,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
#ifdef CONFIG_RCU_BOOST
-static void rcu_preempt_do_callbacks(void);
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -454,7 +473,6 @@ static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index f72eefab8543..d40708e8c5d6 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -20,6 +20,8 @@
* Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
+#include <linux/lockdep.h>
+
/*
* Record the start of an expedited grace period.
*/
@@ -154,15 +156,35 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
* for the current expedited grace period. Works only for preemptible
* RCU -- other RCU implementation use other means.
*
- * Caller must hold the rcu_state's exp_mutex.
+ * Caller must hold the specificed rcu_node structure's ->lock
*/
static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
+ raw_lockdep_assert_held_rcu_node(rnp);
+
return rnp->exp_tasks == NULL &&
READ_ONCE(rnp->expmask) == 0;
}
/*
+ * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
+ * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
+ * itself
+ */
+static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ ret = sync_rcu_preempt_exp_done(rnp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ return ret;
+}
+
+
+/*
* Report the exit from RCU read-side critical section for the last task
* that queued itself during or before the current expedited preemptible-RCU
* grace period. This event is reported either to the rcu_node structure on
@@ -170,8 +192,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
- * structure's ->lock.
+ * Caller must hold the specified rcu_node structure's ->lock.
*/
static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake, unsigned long flags)
@@ -207,8 +228,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Report expedited quiescent state for specified node. This is a
* lock-acquisition wrapper function for __rcu_report_exp_rnp().
- *
- * Caller must hold the rcu_state's exp_mutex.
*/
static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
struct rcu_node *rnp, bool wake)
@@ -221,8 +240,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
/*
* Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure. Caller must hold the rcu_state's
- * exp_mutex.
+ * specified leaf rcu_node structure.
*/
static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
unsigned long mask, bool wake)
@@ -248,14 +266,12 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
}
/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
- unsigned long s)
+static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s)
{
if (rcu_exp_gp_seq_done(rsp, s)) {
trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
/* Ensure test happens before caller kfree(). */
smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(stat);
return true;
}
return false;
@@ -289,7 +305,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
* promoting locality and is not strictly needed for correctness.
*/
for (; rnp != NULL; rnp = rnp->parent) {
- if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+ if (sync_exp_work_done(rsp, s))
return true;
/* Work not done, either wait here or go up. */
@@ -302,8 +318,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
rnp->grplo, rnp->grphi,
TPS("wait"));
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
- sync_exp_work_done(rsp,
- &rdp->exp_workdone2, s));
+ sync_exp_work_done(rsp, s));
return true;
}
rnp->exp_seq_rq = s; /* Followers can wait on us. */
@@ -313,7 +328,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
}
mutex_lock(&rsp->exp_mutex);
fastpath:
- if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+ if (sync_exp_work_done(rsp, s)) {
mutex_unlock(&rsp->exp_mutex);
return true;
}
@@ -362,93 +377,129 @@ static void sync_sched_exp_online_cleanup(int cpu)
}
/*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
*/
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
- smp_call_func_t func)
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
{
int cpu;
unsigned long flags;
+ smp_call_func_t func;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
- struct rcu_node *rnp;
-
- trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
- sync_exp_reset_tree(rsp);
- trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+ struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+ struct rcu_state *rsp = rewp->rew_rsp;
- /* Each pass checks a CPU for identity, offline, and idle. */
- mask_ofl_test = 0;
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
- unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
- int snap;
+ func = rewp->rew_func;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (raw_smp_processor_id() == cpu ||
- !(rnp->qsmaskinitnext & mask)) {
+ /* Each pass checks a CPU for identity, offline, and idle. */
+ mask_ofl_test = 0;
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+ int snap;
+
+ if (raw_smp_processor_id() == cpu ||
+ !(rnp->qsmaskinitnext & mask)) {
+ mask_ofl_test |= mask;
+ } else {
+ snap = rcu_dynticks_snap(rdtp);
+ if (rcu_dynticks_in_eqs(snap))
mask_ofl_test |= mask;
- } else {
- snap = rcu_dynticks_snap(rdtp);
- if (rcu_dynticks_in_eqs(snap))
- mask_ofl_test |= mask;
- else
- rdp->exp_dynticks_snap = snap;
- }
+ else
+ rdp->exp_dynticks_snap = snap;
}
- mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
- /*
- * Need to wait for any blocked tasks as well. Note that
- * additional blocking tasks will also block the expedited
- * GP until such time as the ->expmask bits are cleared.
- */
- if (rcu_preempt_has_tasks(rnp))
- rnp->exp_tasks = rnp->blkd_tasks.next;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
- /* IPI the remaining CPUs for expedited quiescent state. */
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
- unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ /*
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited GP
+ * until such time as the ->expmask bits are cleared.
+ */
+ if (rcu_preempt_has_tasks(rnp))
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (!(mask_ofl_ipi & mask))
- continue;
+ /* IPI the remaining CPUs for expedited quiescent state. */
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (!(mask_ofl_ipi & mask))
+ continue;
retry_ipi:
- if (rcu_dynticks_in_eqs_since(rdp->dynticks,
- rdp->exp_dynticks_snap)) {
- mask_ofl_test |= mask;
- continue;
- }
- ret = smp_call_function_single(cpu, func, rsp, 0);
- if (!ret) {
- mask_ofl_ipi &= ~mask;
- continue;
- }
- /* Failed, raced with CPU hotplug operation. */
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if ((rnp->qsmaskinitnext & mask) &&
- (rnp->expmask & mask)) {
- /* Online, so delay for a bit and try again. */
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
- schedule_timeout_uninterruptible(1);
- goto retry_ipi;
- }
- /* CPU really is offline, so we can ignore it. */
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
+ if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+ rdp->exp_dynticks_snap)) {
+ mask_ofl_test |= mask;
+ continue;
+ }
+ ret = smp_call_function_single(cpu, func, rsp, 0);
+ if (!ret) {
+ mask_ofl_ipi &= ~mask;
+ continue;
+ }
+ /* Failed, raced with CPU hotplug operation. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if ((rnp->qsmaskinitnext & mask) &&
+ (rnp->expmask & mask)) {
+ /* Online, so delay for a bit and try again. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
+ schedule_timeout_uninterruptible(1);
+ goto retry_ipi;
+ }
+ /* CPU really is offline, so we can ignore it. */
+ if (!(rnp->expmask & mask))
+ mask_ofl_ipi &= ~mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ /* Report quiescent states for those that went offline. */
+ mask_ofl_test |= mask_ofl_ipi;
+ if (mask_ofl_test)
+ rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+ smp_call_func_t func)
+{
+ struct rcu_node *rnp;
+
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
+ sync_exp_reset_tree(rsp);
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
+
+ /* Schedule work for each leaf rcu_node structure. */
+ rcu_for_each_leaf_node(rsp, rnp) {
+ rnp->exp_need_flush = false;
+ if (!READ_ONCE(rnp->expmask))
+ continue; /* Avoid early boot non-existent wq. */
+ rnp->rew.rew_func = func;
+ rnp->rew.rew_rsp = rsp;
+ if (!READ_ONCE(rcu_par_gp_wq) ||
+ rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ /* No workqueues yet. */
+ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+ continue;
}
- /* Report quiescent states for those that went offline. */
- mask_ofl_test |= mask_ofl_ipi;
- if (mask_ofl_test)
- rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+ INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
+ rnp->exp_need_flush = true;
}
+
+ /* Wait for workqueue jobs (if any) to complete. */
+ rcu_for_each_leaf_node(rsp, rnp)
+ if (rnp->exp_need_flush)
+ flush_work(&rnp->rew.rew_work);
}
static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -469,9 +520,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
for (;;) {
ret = swait_event_timeout(
rsp->expedited_wq,
- sync_rcu_preempt_exp_done(rnp_root),
+ sync_rcu_preempt_exp_done_unlocked(rnp_root),
jiffies_stall);
- if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
+ if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
return;
WARN_ON(ret < 0); /* workqueues should not be signaled. */
if (rcu_cpu_stall_suppress)
@@ -504,7 +555,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
rcu_for_each_node_breadth_first(rsp, rnp) {
if (rnp == rnp_root)
continue; /* printed unconditionally */
- if (sync_rcu_preempt_exp_done(rnp))
+ if (sync_rcu_preempt_exp_done_unlocked(rnp))
continue;
pr_cont(" l=%u:%d-%d:%#lx/%c",
rnp->level, rnp->grplo, rnp->grphi,
@@ -560,14 +611,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
mutex_unlock(&rsp->exp_wake_mutex);
}
-/* Let the workqueue handler know what it is supposed to do. */
-struct rcu_exp_work {
- smp_call_func_t rew_func;
- struct rcu_state *rew_rsp;
- unsigned long rew_s;
- struct work_struct rew_work;
-};
-
/*
* Common code to drive an expedited grace period forward, used by
* workqueues and mid-boot-time tasks.
@@ -633,7 +676,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
rnp = rcu_get_root(rsp);
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
- sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
+ sync_exp_work_done(rsp, s));
smp_mb(); /* Workqueue actions happen before return. */
/* Let the next expedited grace period start. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 84fbee4686d3..7fd12039e512 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -182,7 +182,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
raw_lockdep_assert_held_rcu_node(rnp);
WARN_ON_ONCE(rdp->mynode != rnp);
- WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
/*
* Decide where to queue the newly blocked task. In theory,
@@ -384,6 +384,50 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
}
/*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ current->rcu_read_lock_nesting++;
+ barrier(); /* critical section after entry code. */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting != 1) {
+ --t->rcu_read_lock_nesting;
+ } else {
+ barrier(); /* critical section before exit code. */
+ t->rcu_read_lock_nesting = INT_MIN;
+ barrier(); /* assign before ->rcu_read_unlock_special load */
+ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
+ rcu_read_unlock_special(t);
+ barrier(); /* ->rcu_read_unlock_special load before assign */
+ t->rcu_read_lock_nesting = 0;
+ }
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ int rrln = READ_ONCE(t->rcu_read_lock_nesting);
+
+ WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ }
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
* Advance a ->blkd_tasks-list pointer to the next entry, instead
* returning NULL if at the end of the list.
*/
@@ -489,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp = t->rcu_blocked_node;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
- WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -685,15 +729,6 @@ static void rcu_preempt_check_callbacks(void)
t->rcu_read_unlock_special.b.need_qs = true;
}
-#ifdef CONFIG_RCU_BOOST
-
-static void rcu_preempt_do_callbacks(void)
-{
- rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
@@ -1140,7 +1175,7 @@ static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
- rcu_preempt_do_callbacks();
+ rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
}
static void rcu_cpu_kthread_setup(unsigned int cpu)
@@ -1607,7 +1642,7 @@ static int rcu_oom_notify(struct notifier_block *self,
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
}
/* Unconditionally decrement: no need to wake ourselves up. */
@@ -1780,19 +1815,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
swake_up_all(sq);
}
-/*
- * Set the root rcu_node structure's ->need_future_gp field
- * based on the sum of those of all rcu_node structures. This does
- * double-count the root rcu_node structure's requests, but this
- * is necessary to handle the possibility of a rcu_nocb_kthread()
- * having awakened during the time that the rcu_node structures
- * were being updated for the end of the previous grace period.
- */
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
- rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
-}
-
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
return &rnp->nocb_gp_wq[rnp->completed & 0x1];
@@ -1966,7 +1988,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
TPS("WakeOvfIsDeferred"));
}
rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2048,7 +2070,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
struct rcu_node *rnp = rdp->mynode;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- needwake = rcu_start_future_gp(rnp, rdp, &c);
+ c = rcu_cbs_completed(rdp->rsp, rnp);
+ needwake = rcu_start_this_gp(rnp, rdp, c);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake(rdp->rsp);
@@ -2057,7 +2080,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
* Wait for the grace period. Do so interruptibly to avoid messing
* up the load average.
*/
- trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
+ trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
swait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
@@ -2065,9 +2088,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
if (likely(d))
break;
WARN_ON(signal_pending(current));
- trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
+ trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
}
- trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
+ trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait"));
smp_mb(); /* Ensure that CB invocation happens after GP end. */
}
@@ -2236,7 +2259,7 @@ static int rcu_nocb_kthread(void *arg)
cl++;
c++;
local_bh_enable();
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
@@ -2292,7 +2315,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
void __init rcu_init_nohz(void)
{
int cpu;
- bool need_rcu_nocb_mask = true;
+ bool need_rcu_nocb_mask = false;
struct rcu_state *rsp;
#if defined(CONFIG_NO_HZ_FULL)
@@ -2315,7 +2338,7 @@ void __init rcu_init_nohz(void)
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
cpumask_and(rcu_nocb_mask, cpu_possible_mask,
rcu_nocb_mask);
}
@@ -2495,10 +2518,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
}
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
-}
-
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
return NULL;
@@ -2587,8 +2606,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
}
/*
- * Bind the grace-period kthread for the sysidle flavor of RCU to the
- * timekeeping CPU.
+ * Bind the RCU grace-period kthreads to the housekeeping CPU.
*/
static void rcu_bind_gp_kthread(void)
{
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 68fa19a5e7bd..4c230a60ece4 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -226,54 +226,6 @@ core_initcall(rcu_set_runtime_mode);
#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
-#ifdef CONFIG_PREEMPT_RCU
-
-/*
- * Preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
- current->rcu_read_lock_nesting++;
- barrier(); /* critical section after entry code. */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-/*
- * Preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting != 1) {
- --t->rcu_read_lock_nesting;
- } else {
- barrier(); /* critical section before exit code. */
- t->rcu_read_lock_nesting = INT_MIN;
- barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
- rcu_read_unlock_special(t);
- barrier(); /* ->rcu_read_unlock_special load before assign */
- t->rcu_read_lock_nesting = 0;
- }
-#ifdef CONFIG_PROVE_LOCKING
- {
- int rrln = READ_ONCE(t->rcu_read_lock_nesting);
-
- WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
- }
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
@@ -624,7 +576,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
* grace period has elapsed, in other words after all currently
* executing rcu-tasks read-side critical sections have elapsed. These
* read-side critical sections are delimited by calls to schedule(),
- * cond_resched_rcu_qs(), idle execution, userspace execution, calls
+ * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
* to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
*
* This is a very specialized primitive, intended only for a few uses in
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 211890edf37e..e9866f86f304 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2194,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
-#ifdef CONFIG_NUMA_BALANCING
- if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
- p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
- p->mm->numa_scan_seq = 0;
- }
-
- if (clone_flags & CLONE_VM)
- p->numa_preferred_nid = current->numa_preferred_nid;
- else
- p->numa_preferred_nid = -1;
-
- p->node_stamp = 0ULL;
- p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
- p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->last_task_numa_placement = 0;
- p->last_sum_exec_runtime = 0;
-
- p->numa_group = NULL;
-#endif /* CONFIG_NUMA_BALANCING */
+ init_numa_balancing(clone_flags, p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4050,6 +4030,23 @@ int idle_cpu(int cpu)
}
/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+ if (!idle_cpu(cpu))
+ return 0;
+
+ if (vcpu_is_preempted(cpu))
+ return 0;
+
+ return 1;
+}
+
+/**
* idle_task - return the idle task for a given CPU.
* @cpu: the processor in question.
*
@@ -5025,20 +5022,6 @@ int __cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_lock);
-int __sched __cond_resched_softirq(void)
-{
- BUG_ON(!in_softirq());
-
- if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
- local_bh_enable();
- preempt_schedule_common();
- local_bh_disable();
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL(__cond_resched_softirq);
-
/**
* yield - yield the current processor to other threads.
*
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e13df951aca7..28592b62b1d5 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -183,22 +183,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- unsigned long util;
- if (rq->rt.rt_nr_running) {
- util = sg_cpu->max;
- } else {
- util = sg_cpu->util_dl;
- if (rq->cfs.h_nr_running)
- util += sg_cpu->util_cfs;
- }
+ if (rq->rt.rt_nr_running)
+ return sg_cpu->max;
/*
+ * Utilization required by DEADLINE must always be granted while, for
+ * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
+ * gracefully reduce the frequency when no tasks show up for longer
+ * periods of time.
+ *
* Ideally we would like to set util_dl as min/guaranteed freq and
* util_cfs + util_dl as requested freq. However, cpufreq is not yet
* ready for such an interface. So, we only do the latter for now.
*/
- return min(util, sg_cpu->max);
+ return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
}
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79f574dba096..e497c05aab7f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+ int mm_users = 0;
+ struct mm_struct *mm = p->mm;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1) {
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ mm->numa_scan_seq = 0;
+ }
+ }
+ p->node_stamp = 0;
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ p->numa_group = NULL;
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ /* New address space, reset the preferred nid */
+ if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid = -1;
+ return;
+ }
+
+ /*
+ * New thread, keep existing numa_preferred_nid which should be copied
+ * already by arch_dup_task_struct but stagger when scans start.
+ */
+ if (mm) {
+ unsigned int delay;
+
+ delay = min_t(unsigned int, task_scan_max(current),
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ delay += 2 * TICK_NSEC;
+ p->node_stamp = delay;
+ }
+}
+
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -5345,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
/*
+ * The code below (indirectly) updates schedutil which looks at
+ * the cfs_rq utilization to select a frequency.
+ * Let's add the task's estimated utilization to the cfs_rq's
+ * estimated utilization, before we update schedutil.
+ */
+ util_est_enqueue(&rq->cfs, p);
+
+ /*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
* passed.
@@ -5385,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
add_nr_running(rq, 1);
- util_est_enqueue(&rq->cfs, p);
hrtick_update(rq);
}
@@ -5858,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
* a cpufreq perspective, it's better to have higher utilisation
* on one CPU.
*/
- if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
- return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+ if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+ return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return this_cpu;
@@ -6102,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
- if (idle_cpu(i)) {
+ if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -6144,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
return prev_cpu;
+ /*
+ * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
+ * last_update_time.
+ */
+ if (!(sd_flag & SD_BALANCE_FORK))
+ sync_entity_load_avg(&p->se);
+
while (sd) {
struct sched_group *group;
struct sched_domain *tmp;
@@ -6224,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
if (cpu == core)
continue;
- if (!idle_cpu(cpu))
+ if (!available_idle_cpu(cpu))
goto unlock;
}
@@ -6256,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
for_each_cpu(cpu, cpu_smt_mask(core)) {
cpumask_clear_cpu(cpu, cpus);
- if (!idle_cpu(cpu))
+ if (!available_idle_cpu(cpu))
idle = false;
}
@@ -6285,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
- if (idle_cpu(cpu))
+ if (available_idle_cpu(cpu))
return cpu;
}
@@ -6348,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
return -1;
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
- if (idle_cpu(cpu))
+ if (available_idle_cpu(cpu))
break;
}
@@ -6368,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
- if (idle_cpu(target))
+ if (available_idle_cpu(target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
- if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
return prev;
/* Check a recently used CPU as a potential idle candidate: */
@@ -6382,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- idle_cpu(recent_used_cpu) &&
+ available_idle_cpu(recent_used_cpu) &&
cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
/*
* Replace recent_used_cpu with prev as it is a potential
@@ -6558,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
- struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+ struct sched_domain *tmp, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
@@ -6581,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
- affine_sd = tmp;
+ if (cpu != prev_cpu)
+ new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+ sd = NULL; /* Prefer wake_affine over balance flags */
break;
}
@@ -6591,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
break;
}
- if (affine_sd) {
- sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu == prev_cpu)
- goto pick_cpu;
-
- new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
- }
-
- if (sd && !(sd_flag & SD_BALANCE_FORK)) {
- /*
- * We're going to need the task's util for capacity_spare_wake
- * in find_idlest_group. Sync it up to prev_cpu's
- * last_update_time.
- */
- sync_entity_load_avg(&p->se);
- }
+ if (unlikely(sd)) {
+ /* Slow path */
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+ /* Fast path */
- if (!sd) {
-pick_cpu:
- if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
- new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- if (want_affine)
- current->recent_used_cpu = cpu;
- }
- } else {
- new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ if (want_affine)
+ current->recent_used_cpu = cpu;
}
rcu_read_unlock();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cb467c221b15..6601baf2361c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1069,6 +1069,12 @@ enum numa_faults_stats {
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+#else
+static inline void
+init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SMP
diff --git a/kernel/signal.c b/kernel/signal.c
index 9c33163a6165..0f865d67415d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1539,7 +1539,6 @@ int send_sig_fault(int sig, int code, void __user *addr
return send_sig_info(info.si_signo, &info, t);
}
-#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR)
int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
{
struct siginfo info;
@@ -1568,9 +1567,7 @@ int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *
return send_sig_info(info.si_signo, &info, t);
}
EXPORT_SYMBOL(send_sig_mceerr);
-#endif
-#ifdef SEGV_BNDERR
int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
{
struct siginfo info;
@@ -1584,7 +1581,6 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
info.si_upper = upper;
return force_sig_info(info.si_signo, &info, current);
}
-#endif
#ifdef SEGV_PKUERR
int force_sig_pkuerr(void __user *addr, u32 pkey)
@@ -2837,8 +2833,19 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
[SIGPOLL] = { NSIGPOLL, SIL_POLL },
[SIGSYS] = { NSIGSYS, SIL_SYS },
};
- if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
+ if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) {
layout = filter[sig].layout;
+ /* Handle the exceptions */
+ if ((sig == SIGBUS) &&
+ (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO))
+ layout = SIL_FAULT_MCEERR;
+ else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR))
+ layout = SIL_FAULT_BNDERR;
+#ifdef SEGV_PKUERR
+ else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR))
+ layout = SIL_FAULT_PKUERR;
+#endif
+ }
else if (si_code <= NSIGPOLL)
layout = SIL_POLL;
} else {
@@ -2848,104 +2855,15 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
layout = SIL_POLL;
else if (si_code < 0)
layout = SIL_RT;
- /* Tests to support buggy kernel ABIs */
-#ifdef TRAP_FIXME
- if ((sig == SIGTRAP) && (si_code == TRAP_FIXME))
- layout = SIL_FAULT;
-#endif
-#ifdef FPE_FIXME
- if ((sig == SIGFPE) && (si_code == FPE_FIXME))
- layout = SIL_FAULT;
-#endif
}
return layout;
}
int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
{
- int err;
-
- if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
+ if (copy_to_user(to, from , sizeof(struct siginfo)))
return -EFAULT;
- if (from->si_code < 0)
- return __copy_to_user(to, from, sizeof(siginfo_t))
- ? -EFAULT : 0;
- /*
- * If you change siginfo_t structure, please be sure
- * this code is fixed accordingly.
- * Please remember to update the signalfd_copyinfo() function
- * inside fs/signalfd.c too, in case siginfo_t changes.
- * It should never copy any pad contained in the structure
- * to avoid security leaks, but must copy the generic
- * 3 ints plus the relevant union member.
- */
- err = __put_user(from->si_signo, &to->si_signo);
- err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user(from->si_code, &to->si_code);
- switch (siginfo_layout(from->si_signo, from->si_code)) {
- case SIL_KILL:
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- break;
- case SIL_TIMER:
- /* Unreached SI_TIMER is negative */
- break;
- case SIL_POLL:
- err |= __put_user(from->si_band, &to->si_band);
- err |= __put_user(from->si_fd, &to->si_fd);
- break;
- case SIL_FAULT:
- err |= __put_user(from->si_addr, &to->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- err |= __put_user(from->si_trapno, &to->si_trapno);
-#endif
-#ifdef __ia64__
- err |= __put_user(from->si_imm, &to->si_imm);
- err |= __put_user(from->si_flags, &to->si_flags);
- err |= __put_user(from->si_isr, &to->si_isr);
-#endif
- /*
- * Other callers might not initialize the si_lsb field,
- * so check explicitly for the right codes here.
- */
-#ifdef BUS_MCEERR_AR
- if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR)
- err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
-#endif
-#ifdef BUS_MCEERR_AO
- if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO)
- err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
-#endif
-#ifdef SEGV_BNDERR
- if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) {
- err |= __put_user(from->si_lower, &to->si_lower);
- err |= __put_user(from->si_upper, &to->si_upper);
- }
-#endif
-#ifdef SEGV_PKUERR
- if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR)
- err |= __put_user(from->si_pkey, &to->si_pkey);
-#endif
- break;
- case SIL_CHLD:
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- err |= __put_user(from->si_status, &to->si_status);
- err |= __put_user(from->si_utime, &to->si_utime);
- err |= __put_user(from->si_stime, &to->si_stime);
- break;
- case SIL_RT:
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- err |= __put_user(from->si_ptr, &to->si_ptr);
- break;
- case SIL_SYS:
- err |= __put_user(from->si_call_addr, &to->si_call_addr);
- err |= __put_user(from->si_syscall, &to->si_syscall);
- err |= __put_user(from->si_arch, &to->si_arch);
- break;
- }
- return err;
+ return 0;
}
#ifdef CONFIG_COMPAT
@@ -2984,27 +2902,28 @@ int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
#ifdef __ARCH_SI_TRAPNO
new.si_trapno = from->si_trapno;
#endif
-#ifdef BUS_MCEERR_AR
- if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR))
- new.si_addr_lsb = from->si_addr_lsb;
-#endif
-#ifdef BUS_MCEERR_AO
- if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO))
- new.si_addr_lsb = from->si_addr_lsb;
+ break;
+ case SIL_FAULT_MCEERR:
+ new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ new.si_trapno = from->si_trapno;
#endif
-#ifdef SEGV_BNDERR
- if ((from->si_signo == SIGSEGV) &&
- (from->si_code == SEGV_BNDERR)) {
- new.si_lower = ptr_to_compat(from->si_lower);
- new.si_upper = ptr_to_compat(from->si_upper);
- }
+ new.si_addr_lsb = from->si_addr_lsb;
+ break;
+ case SIL_FAULT_BNDERR:
+ new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ new.si_trapno = from->si_trapno;
#endif
-#ifdef SEGV_PKUERR
- if ((from->si_signo == SIGSEGV) &&
- (from->si_code == SEGV_PKUERR))
- new.si_pkey = from->si_pkey;
+ new.si_lower = ptr_to_compat(from->si_lower);
+ new.si_upper = ptr_to_compat(from->si_upper);
+ break;
+ case SIL_FAULT_PKUERR:
+ new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ new.si_trapno = from->si_trapno;
#endif
-
+ new.si_pkey = from->si_pkey;
break;
case SIL_CHLD:
new.si_pid = from->si_pid;
@@ -3070,24 +2989,28 @@ int copy_siginfo_from_user32(struct siginfo *to,
#ifdef __ARCH_SI_TRAPNO
to->si_trapno = from.si_trapno;
#endif
-#ifdef BUS_MCEERR_AR
- if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR))
- to->si_addr_lsb = from.si_addr_lsb;
-#endif
-#ifdef BUS_MCEER_AO
- if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO))
- to->si_addr_lsb = from.si_addr_lsb;
+ break;
+ case SIL_FAULT_MCEERR:
+ to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ to->si_trapno = from.si_trapno;
#endif
-#ifdef SEGV_BNDERR
- if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) {
- to->si_lower = compat_ptr(from.si_lower);
- to->si_upper = compat_ptr(from.si_upper);
- }
+ to->si_addr_lsb = from.si_addr_lsb;
+ break;
+ case SIL_FAULT_BNDERR:
+ to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ to->si_trapno = from.si_trapno;
#endif
-#ifdef SEGV_PKUERR
- if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR))
- to->si_pkey = from.si_pkey;
+ to->si_lower = compat_ptr(from.si_lower);
+ to->si_upper = compat_ptr(from.si_upper);
+ break;
+ case SIL_FAULT_PKUERR:
+ to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ to->si_trapno = from.si_trapno;
#endif
+ to->si_pkey = from.si_pkey;
break;
case SIL_CHLD:
to->si_pid = from.si_pid;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 177de3640c78..de2f57fddc04 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -49,8 +49,8 @@
*/
#ifndef __ARCH_IRQ_STAT
-irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
-EXPORT_SYMBOL(irq_stat);
+DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
#endif
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
@@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt)
}
/*
- * Special-case - softirqs can safely be enabled in
- * cond_resched_softirq(), or by __do_softirq(),
+ * Special-case - softirqs can safely be enabled by __do_softirq(),
* without processing still-pending softirqs:
*/
void _local_bh_enable(void)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 64c0291b579c..f89014a2c238 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -37,7 +37,7 @@ struct cpu_stop_done {
struct cpu_stopper {
struct task_struct *thread;
- spinlock_t lock;
+ raw_spinlock_t lock;
bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
@@ -81,13 +81,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
unsigned long flags;
bool enabled;
- spin_lock_irqsave(&stopper->lock, flags);
+ raw_spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
__cpu_stop_queue_work(stopper, work, &wakeq);
else if (work->done)
cpu_stop_signal_done(work->done);
- spin_unlock_irqrestore(&stopper->lock, flags);
+ raw_spin_unlock_irqrestore(&stopper->lock, flags);
wake_up_q(&wakeq);
@@ -237,8 +237,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
DEFINE_WAKE_Q(wakeq);
int err;
retry:
- spin_lock_irq(&stopper1->lock);
- spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_irq(&stopper1->lock);
+ raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
err = -ENOENT;
if (!stopper1->enabled || !stopper2->enabled)
@@ -261,8 +261,8 @@ retry:
__cpu_stop_queue_work(stopper1, work1, &wakeq);
__cpu_stop_queue_work(stopper2, work2, &wakeq);
unlock:
- spin_unlock(&stopper2->lock);
- spin_unlock_irq(&stopper1->lock);
+ raw_spin_unlock(&stopper2->lock);
+ raw_spin_unlock_irq(&stopper1->lock);
if (unlikely(err == -EDEADLK)) {
while (stop_cpus_in_progress)
@@ -457,9 +457,9 @@ static int cpu_stop_should_run(unsigned int cpu)
unsigned long flags;
int run;
- spin_lock_irqsave(&stopper->lock, flags);
+ raw_spin_lock_irqsave(&stopper->lock, flags);
run = !list_empty(&stopper->works);
- spin_unlock_irqrestore(&stopper->lock, flags);
+ raw_spin_unlock_irqrestore(&stopper->lock, flags);
return run;
}
@@ -470,13 +470,13 @@ static void cpu_stopper_thread(unsigned int cpu)
repeat:
work = NULL;
- spin_lock_irq(&stopper->lock);
+ raw_spin_lock_irq(&stopper->lock);
if (!list_empty(&stopper->works)) {
work = list_first_entry(&stopper->works,
struct cpu_stop_work, list);
list_del_init(&work->list);
}
- spin_unlock_irq(&stopper->lock);
+ raw_spin_unlock_irq(&stopper->lock);
if (work) {
cpu_stop_fn_t fn = work->fn;
@@ -550,7 +550,7 @@ static int __init cpu_stop_init(void)
for_each_possible_cpu(cpu) {
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- spin_lock_init(&stopper->lock);
+ raw_spin_lock_init(&stopper->lock);
INIT_LIST_HEAD(&stopper->works);
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 9791364925dc..183169c2a75b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -43,7 +43,9 @@ COND_SYSCALL(io_submit);
COND_SYSCALL_COMPAT(io_submit);
COND_SYSCALL(io_cancel);
COND_SYSCALL(io_getevents);
+COND_SYSCALL(io_pgetevents);
COND_SYSCALL_COMPAT(io_getevents);
+COND_SYSCALL_COMPAT(io_pgetevents);
/* fs/xattr.c */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 84f37420fcf5..f89a78e2792b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -129,31 +129,19 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags)
spin_unlock_irqrestore(&watchdog_lock, *flags);
}
-static int clocksource_watchdog_kthread(void *data);
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
-
/*
* Interval: 0.5sec Threshold: 0.0625s
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
-static void clocksource_watchdog_work(struct work_struct *work)
-{
- /*
- * If kthread_run fails the next watchdog scan over the
- * watchdog_list will find the unstable clock again.
- */
- kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
-}
-
static void __clocksource_unstable(struct clocksource *cs)
{
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
/*
- * If the clocksource is registered clocksource_watchdog_kthread() will
+ * If the clocksource is registered clocksource_watchdog_work() will
* re-rate and re-select.
*/
if (list_empty(&cs->list)) {
@@ -164,7 +152,7 @@ static void __clocksource_unstable(struct clocksource *cs)
if (cs->mark_unstable)
cs->mark_unstable(cs);
- /* kick clocksource_watchdog_kthread() */
+ /* kick clocksource_watchdog_work() */
if (finished_booting)
schedule_work(&watchdog_work);
}
@@ -174,7 +162,7 @@ static void __clocksource_unstable(struct clocksource *cs)
* @cs: clocksource to be marked unstable
*
* This function is called by the x86 TSC code to mark clocksources as unstable;
- * it defers demotion and re-selection to a kthread.
+ * it defers demotion and re-selection to a work.
*/
void clocksource_mark_unstable(struct clocksource *cs)
{
@@ -399,7 +387,9 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
}
}
-static int __clocksource_watchdog_kthread(void)
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
+static int __clocksource_watchdog_work(void)
{
struct clocksource *cs, *tmp;
unsigned long flags;
@@ -424,13 +414,12 @@ static int __clocksource_watchdog_kthread(void)
return select;
}
-static int clocksource_watchdog_kthread(void *data)
+static void clocksource_watchdog_work(struct work_struct *work)
{
mutex_lock(&clocksource_mutex);
- if (__clocksource_watchdog_kthread())
+ if (__clocksource_watchdog_work())
clocksource_select();
mutex_unlock(&clocksource_mutex);
- return 0;
}
static bool clocksource_is_watchdog(struct clocksource *cs)
@@ -449,12 +438,12 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
-static inline int __clocksource_watchdog_kthread(void) { return 0; }
+static inline int __clocksource_watchdog_work(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
void clocksource_mark_unstable(struct clocksource *cs) { }
-static void inline clocksource_watchdog_lock(unsigned long *flags) { }
-static void inline clocksource_watchdog_unlock(unsigned long *flags) { }
+static inline void clocksource_watchdog_lock(unsigned long *flags) { }
+static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
@@ -683,7 +672,7 @@ static int __init clocksource_done_booting(void)
/*
* Run the watchdog first to eliminate unstable clock sources
*/
- __clocksource_watchdog_kthread();
+ __clocksource_watchdog_work();
clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 14e858753d76..055a4a728c00 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1759,8 +1759,10 @@ out:
return ret;
}
-SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
- struct timespec __user *, rmtp)
+#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+
+SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
+ struct __kernel_timespec __user *, rmtp)
{
struct timespec64 tu;
@@ -1775,7 +1777,9 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
-#ifdef CONFIG_COMPAT
+#endif
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 69a937c3cd81..26aa9569e24a 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -59,7 +59,7 @@ SYS_NI(alarm);
*/
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
- const struct timespec __user *, tp)
+ const struct __kernel_timespec __user *, tp)
{
struct timespec64 new_tp;
@@ -90,7 +90,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct timespec __user *, tp)
+ struct __kernel_timespec __user *, tp)
{
int ret;
struct timespec64 kernel_tp;
@@ -104,7 +104,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
return 0;
}
-SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
+SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp)
{
struct timespec64 rtn_tp = {
.tv_sec = 0,
@@ -124,8 +124,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us
}
SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
- const struct timespec __user *, rqtp,
- struct timespec __user *, rmtp)
+ const struct __kernel_timespec __user *, rqtp,
+ struct __kernel_timespec __user *, rmtp)
{
struct timespec64 t;
@@ -158,7 +158,9 @@ COMPAT_SYS_NI(timer_settime);
COMPAT_SYS_NI(timer_gettime);
COMPAT_SYS_NI(getitimer);
COMPAT_SYS_NI(setitimer);
+#endif
+#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 10b7186d0638..e08ce3f27447 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1040,7 +1040,7 @@ void exit_itimers(struct signal_struct *sig)
}
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
- const struct timespec __user *, tp)
+ const struct __kernel_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 new_tp;
@@ -1055,7 +1055,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct timespec __user *,tp)
+ struct __kernel_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 kernel_tp;
@@ -1096,7 +1096,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
}
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
- struct timespec __user *, tp)
+ struct __kernel_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 rtn_tp;
@@ -1113,7 +1113,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
return error;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
struct compat_timespec __user *, tp)
@@ -1148,6 +1148,10 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
return err;
}
+#endif
+
+#ifdef CONFIG_COMPAT
+
COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
struct compat_timex __user *, utp)
{
@@ -1172,6 +1176,10 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
return err;
}
+#endif
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+
COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
@@ -1203,8 +1211,8 @@ static int common_nsleep(const clockid_t which_clock, int flags,
}
SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
- const struct timespec __user *, rqtp,
- struct timespec __user *, rmtp)
+ const struct __kernel_timespec __user *, rqtp,
+ struct __kernel_timespec __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 t;
@@ -1227,7 +1235,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return kc->nsleep(which_clock, flags, &t);
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
+
COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
@@ -1252,6 +1261,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return kc->nsleep(which_clock, flags, &t);
}
+
#endif
static const struct k_clock clock_realtime = {
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 49edc1c4f3e6..78e598334007 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -277,7 +277,8 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
*/
return !curdev ||
newdev->rating > curdev->rating ||
- !cpumask_equal(curdev->cpumask, newdev->cpumask);
+ (!cpumask_equal(curdev->cpumask, newdev->cpumask) &&
+ !tick_check_percpu(curdev, newdev, smp_processor_id()));
}
/*
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 3044d48ebe56..6fa99213fc72 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -407,7 +407,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
}
EXPORT_SYMBOL(mktime64);
-#if __BITS_PER_LONG == 32
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
*
@@ -468,7 +467,6 @@ struct timespec ns_to_timespec(const s64 nsec)
return ts;
}
EXPORT_SYMBOL(ns_to_timespec);
-#endif
/**
* ns_to_timeval - Convert nanoseconds to timeval
@@ -853,9 +851,9 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
}
int get_timespec64(struct timespec64 *ts,
- const struct timespec __user *uts)
+ const struct __kernel_timespec __user *uts)
{
- struct timespec kts;
+ struct __kernel_timespec kts;
int ret;
ret = copy_from_user(&kts, uts, sizeof(kts));
@@ -863,6 +861,11 @@ int get_timespec64(struct timespec64 *ts,
return -EFAULT;
ts->tv_sec = kts.tv_sec;
+
+ /* Zero out the padding for 32 bit systems or in compat mode */
+ if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()))
+ kts.tv_nsec &= 0xFFFFFFFFUL;
+
ts->tv_nsec = kts.tv_nsec;
return 0;
@@ -870,16 +873,61 @@ int get_timespec64(struct timespec64 *ts,
EXPORT_SYMBOL_GPL(get_timespec64);
int put_timespec64(const struct timespec64 *ts,
- struct timespec __user *uts)
+ struct __kernel_timespec __user *uts)
{
- struct timespec kts = {
+ struct __kernel_timespec kts = {
.tv_sec = ts->tv_sec,
.tv_nsec = ts->tv_nsec
};
+
return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(put_timespec64);
+int __compat_get_timespec64(struct timespec64 *ts64,
+ const struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts;
+ int ret;
+
+ ret = copy_from_user(&ts, cts, sizeof(ts));
+ if (ret)
+ return -EFAULT;
+
+ ts64->tv_sec = ts.tv_sec;
+ ts64->tv_nsec = ts.tv_nsec;
+
+ return 0;
+}
+
+int __compat_put_timespec64(const struct timespec64 *ts64,
+ struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts = {
+ .tv_sec = ts64->tv_sec,
+ .tv_nsec = ts64->tv_nsec
+ };
+ return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+
+int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec64);
+
+int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec64);
+
int get_itimerspec64(struct itimerspec64 *it,
const struct itimerspec __user *uit)
{
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49cbceef5deb..4786df904c22 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -705,18 +705,19 @@ static void timekeeping_forward_now(struct timekeeper *tk)
}
/**
- * __getnstimeofday64 - Returns the time of day in a timespec64.
+ * ktime_get_real_ts64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
*
- * Updates the time of day in the timespec.
- * Returns 0 on success, or -ve when suspended (timespec will be undefined).
+ * Returns the time of day in a timespec64 (WARN if suspended).
*/
-int __getnstimeofday64(struct timespec64 *ts)
+void ktime_get_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
u64 nsecs;
+ WARN_ON(timekeeping_suspended);
+
do {
seq = read_seqcount_begin(&tk_core.seq);
@@ -727,28 +728,8 @@ int __getnstimeofday64(struct timespec64 *ts)
ts->tv_nsec = 0;
timespec64_add_ns(ts, nsecs);
-
- /*
- * Do not bail out early, in case there were callers still using
- * the value, even in the face of the WARN_ON.
- */
- if (unlikely(timekeeping_suspended))
- return -EAGAIN;
- return 0;
-}
-EXPORT_SYMBOL(__getnstimeofday64);
-
-/**
- * getnstimeofday64 - Returns the time of day in a timespec64.
- * @ts: pointer to the timespec64 to be set
- *
- * Returns the time of day in a timespec64 (WARN if suspended).
- */
-void getnstimeofday64(struct timespec64 *ts)
-{
- WARN_ON(__getnstimeofday64(ts));
}
-EXPORT_SYMBOL(getnstimeofday64);
+EXPORT_SYMBOL(ktime_get_real_ts64);
ktime_t ktime_get(void)
{
@@ -814,6 +795,25 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);
+ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base, *offset = offsets[offs];
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = ktime_add(tk->tkr_mono.base, *offset);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return base;
+
+}
+EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
+
/**
* ktime_mono_to_any() - convert mononotic time to any other time
* @tmono: time to convert.
@@ -1410,12 +1410,12 @@ int timekeeping_notify(struct clocksource *clock)
}
/**
- * getrawmonotonic64 - Returns the raw monotonic time in a timespec
+ * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
* @ts: pointer to the timespec64 to be set
*
* Returns the raw monotonic time (completely un-modified by ntp)
*/
-void getrawmonotonic64(struct timespec64 *ts)
+void ktime_get_raw_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
@@ -1431,7 +1431,7 @@ void getrawmonotonic64(struct timespec64 *ts)
ts->tv_nsec = 0;
timespec64_add_ns(ts, nsecs);
}
-EXPORT_SYMBOL(getrawmonotonic64);
+EXPORT_SYMBOL(ktime_get_raw_ts64);
/**
@@ -2133,23 +2133,20 @@ unsigned long get_seconds(void)
}
EXPORT_SYMBOL(get_seconds);
-struct timespec64 current_kernel_time64(void)
+void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 now;
unsigned long seq;
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_xtime(tk);
+ *ts = tk_xtime(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
-
- return now;
}
-EXPORT_SYMBOL(current_kernel_time64);
+EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
-struct timespec64 get_monotonic_coarse64(void)
+void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now, mono;
@@ -2162,12 +2159,10 @@ struct timespec64 get_monotonic_coarse64(void)
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
- set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
+ set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
-
- return now;
}
-EXPORT_SYMBOL(get_monotonic_coarse64);
+EXPORT_SYMBOL(ktime_get_coarse_ts64);
/*
* Must hold jiffies_lock
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4a4fd567fb26..cc2d23e6ff61 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1251,18 +1251,18 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
*
* Note: For !irqsafe timers, you must not hold locks that are held in
* interrupt context while calling this function. Even if the lock has
- * nothing to do with the timer in question. Here's why:
+ * nothing to do with the timer in question. Here's why::
*
* CPU0 CPU1
* ---- ----
- * <SOFTIRQ>
- * call_timer_fn();
- * base->running_timer = mytimer;
- * spin_lock_irq(somelock);
+ * <SOFTIRQ>
+ * call_timer_fn();
+ * base->running_timer = mytimer;
+ * spin_lock_irq(somelock);
* <IRQ>
* spin_lock(somelock);
- * del_timer_sync(mytimer);
- * while (base->running_timer == mytimer);
+ * del_timer_sync(mytimer);
+ * while (base->running_timer == mytimer);
*
* Now del_timer_sync() will never return and never release somelock.
* The interrupt on the other CPU is waiting to grab somelock but
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 675c4e9563a9..d647dabdac97 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -28,8 +28,6 @@ struct timer_list_iter {
u64 now;
};
-typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
-
/*
* This allows printing both to /proc/timer_list and
* to the console (on SysRq-Q):
diff --git a/kernel/torture.c b/kernel/torture.c
index 37b94012a3f8..3de1efbecd6a 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -574,7 +574,7 @@ void stutter_wait(const char *title)
{
int spt;
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
spt = READ_ONCE(stutter_pause_test);
for (; spt; spt = READ_ONCE(stutter_pause_test)) {
if (spt == 1) {
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 22fee766081b..80e0b2aca703 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg)
* wants to run, schedule in, but if the CPU is idle,
* we'll keep burning cycles.
*
- * Note the _rcu_qs() version of cond_resched() will
+ * Note the tasks_rcu_qs() version of cond_resched() will
* notify synchronize_rcu_tasks() that this thread has
* passed a quiescent state for rcu_tasks. Otherwise
* this thread will never voluntarily schedule which would
* block synchronize_rcu_tasks() indefinitely.
*/
- cond_resched();
+ cond_resched_tasks_rcu_qs();
}
return 0;
diff --git a/lib/Kconfig b/lib/Kconfig
index 5fe577673b98..7a913937888b 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -429,15 +429,50 @@ config SGL_ALLOC
bool
default n
+config NEED_SG_DMA_LENGTH
+ bool
+
+config NEED_DMA_MAP_STATE
+ bool
+
+config ARCH_DMA_ADDR_T_64BIT
+ def_bool 64BIT || PHYS_ADDR_T_64BIT
+
+config IOMMU_HELPER
+ bool
+
+config ARCH_HAS_SYNC_DMA_FOR_DEVICE
+ bool
+
+config ARCH_HAS_SYNC_DMA_FOR_CPU
+ bool
+ select NEED_DMA_MAP_STATE
+
config DMA_DIRECT_OPS
bool
- depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
- default n
+ depends on HAS_DMA
+
+config DMA_NONCOHERENT_OPS
+ bool
+ depends on HAS_DMA
+ select DMA_DIRECT_OPS
+
+config DMA_NONCOHERENT_MMAP
+ bool
+ depends on DMA_NONCOHERENT_OPS
+
+config DMA_NONCOHERENT_CACHE_SYNC
+ bool
+ depends on DMA_NONCOHERENT_OPS
config DMA_VIRT_OPS
bool
- depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
- default n
+ depends on HAS_DMA
+
+config SWIOTLB
+ bool
+ select DMA_DIRECT_OPS
+ select NEED_DMA_MAP_STATE
config CHECK_SIGNATURE
bool
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c40c7b734cd1..76555479ae36 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1634,7 +1634,7 @@ config PROVIDE_OHCI1394_DMA_INIT
config DMA_API_DEBUG
bool "Enable debugging of DMA-API usage"
- depends on HAVE_DMA_API_DEBUG
+ select NEED_DMA_MAP_STATE
help
Enable this option to debug the use of the DMA API by device drivers.
With this option you will be able to detect common bugs in device
@@ -1651,6 +1651,23 @@ config DMA_API_DEBUG
If unsure, say N.
+config DMA_API_DEBUG_SG
+ bool "Debug DMA scatter-gather usage"
+ default y
+ depends on DMA_API_DEBUG
+ help
+ Perform extra checking that callers of dma_map_sg() have respected the
+ appropriate segment length/boundary limits for the given device when
+ preparing DMA scatterlists.
+
+ This is particularly likely to have been overlooked in cases where the
+ dma_map_sg() API is used for general bulk mapping of pages rather than
+ preparing literal scatter-gather descriptors, where there is a risk of
+ unexpected behaviour from DMA API implementations if the scatterlist
+ is technically out-of-spec.
+
+ If unsure, say N.
+
menuconfig RUNTIME_TESTING_MENU
bool "Runtime Testing"
def_bool y
diff --git a/lib/Makefile b/lib/Makefile
index ce20696d5a92..9f18c8152281 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -30,6 +30,7 @@ lib-$(CONFIG_PRINTK) += dump_stack.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
lib-$(CONFIG_DMA_DIRECT_OPS) += dma-direct.o
+lib-$(CONFIG_DMA_NONCOHERENT_OPS) += dma-noncoherent.o
lib-$(CONFIG_DMA_VIRT_OPS) += dma-virt.o
lib-y += kobject.o klist.o
@@ -147,7 +148,7 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
-obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o
+obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o
obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 7f5cdc1e6b29..c007d25bee09 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -41,6 +41,11 @@
#define HASH_FN_SHIFT 13
#define HASH_FN_MASK (HASH_SIZE - 1)
+/* allow architectures to override this if absolutely required */
+#ifndef PREALLOC_DMA_DEBUG_ENTRIES
+#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
+#endif
+
enum {
dma_debug_single,
dma_debug_page,
@@ -127,7 +132,7 @@ static u32 min_free_entries;
static u32 nr_total_entries;
/* number of preallocated entries requested by kernel cmdline */
-static u32 req_entries;
+static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
/* debugfs dentry's for the stuff above */
static struct dentry *dma_debug_dent __read_mostly;
@@ -439,7 +444,6 @@ void debug_dma_dump_mappings(struct device *dev)
spin_unlock_irqrestore(&bucket->lock, flags);
}
}
-EXPORT_SYMBOL(debug_dma_dump_mappings);
/*
* For each mapping (initial cacheline in the case of
@@ -748,7 +752,6 @@ int dma_debug_resize_entries(u32 num_entries)
return ret;
}
-EXPORT_SYMBOL(dma_debug_resize_entries);
/*
* DMA-API debugging init code
@@ -1004,10 +1007,7 @@ void dma_debug_add_bus(struct bus_type *bus)
bus_register_notifier(bus, nb);
}
-/*
- * Let the architectures decide how many entries should be preallocated.
- */
-void dma_debug_init(u32 num_entries)
+static int dma_debug_init(void)
{
int i;
@@ -1015,7 +1015,7 @@ void dma_debug_init(u32 num_entries)
* called to set dma_debug_initialized
*/
if (global_disable)
- return;
+ return 0;
for (i = 0; i < HASH_SIZE; ++i) {
INIT_LIST_HEAD(&dma_entry_hash[i].list);
@@ -1026,17 +1026,14 @@ void dma_debug_init(u32 num_entries)
pr_err("DMA-API: error creating debugfs entries - disabling\n");
global_disable = true;
- return;
+ return 0;
}
- if (req_entries)
- num_entries = req_entries;
-
- if (prealloc_memory(num_entries) != 0) {
+ if (prealloc_memory(nr_prealloc_entries) != 0) {
pr_err("DMA-API: debugging out of memory error - disabled\n");
global_disable = true;
- return;
+ return 0;
}
nr_total_entries = num_free_entries;
@@ -1044,7 +1041,9 @@ void dma_debug_init(u32 num_entries)
dma_debug_initialized = true;
pr_info("DMA-API: debugging enabled by kernel config\n");
+ return 0;
}
+core_initcall(dma_debug_init);
static __init int dma_debug_cmdline(char *str)
{
@@ -1061,16 +1060,10 @@ static __init int dma_debug_cmdline(char *str)
static __init int dma_debug_entries_cmdline(char *str)
{
- int res;
-
if (!str)
return -EINVAL;
-
- res = get_option(&str, &req_entries);
-
- if (!res)
- req_entries = 0;
-
+ if (!get_option(&str, &nr_prealloc_entries))
+ nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
return 0;
}
@@ -1293,6 +1286,32 @@ out:
put_hash_bucket(bucket, &flags);
}
+static void check_sg_segment(struct device *dev, struct scatterlist *sg)
+{
+#ifdef CONFIG_DMA_API_DEBUG_SG
+ unsigned int max_seg = dma_get_max_seg_size(dev);
+ u64 start, end, boundary = dma_get_seg_boundary(dev);
+
+ /*
+ * Either the driver forgot to set dma_parms appropriately, or
+ * whoever generated the list forgot to check them.
+ */
+ if (sg->length > max_seg)
+ err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
+ sg->length, max_seg);
+ /*
+ * In some cases this could potentially be the DMA API
+ * implementation's fault, but it would usually imply that
+ * the scatterlist was built inappropriately to begin with.
+ */
+ start = sg_dma_address(sg);
+ end = start + sg_dma_len(sg) - 1;
+ if ((start ^ end) & ~boundary)
+ err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
+ start, end, boundary);
+#endif
+}
+
void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
size_t size, int direction, dma_addr_t dma_addr,
bool map_single)
@@ -1423,6 +1442,8 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
}
+ check_sg_segment(dev, s);
+
add_dma_entry(entry);
}
}
diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index bbfb229aa067..8be8106270c2 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -34,6 +34,13 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
const char *caller)
{
if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
+ if (!dev->dma_mask) {
+ dev_err(dev,
+ "%s: call on device without dma_mask\n",
+ caller);
+ return false;
+ }
+
if (*dev->dma_mask >= DMA_BIT_MASK(32)) {
dev_err(dev,
"%s: overflow %pad+%zu of device mask %llx\n",
@@ -84,6 +91,13 @@ again:
__free_pages(page, page_order);
page = NULL;
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+ dev->coherent_dma_mask < DMA_BIT_MASK(64) &&
+ !(gfp & (GFP_DMA32 | GFP_DMA))) {
+ gfp |= GFP_DMA32;
+ goto again;
+ }
+
if (IS_ENABLED(CONFIG_ZONE_DMA) &&
dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
!(gfp & GFP_DMA)) {
@@ -121,7 +135,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
free_pages((unsigned long)cpu_addr, page_order);
}
-static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -132,8 +146,8 @@ static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
return dma_addr;
}
-static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
- int nents, enum dma_data_direction dir, unsigned long attrs)
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
{
int i;
struct scatterlist *sg;
@@ -165,10 +179,16 @@ int dma_direct_supported(struct device *dev, u64 mask)
if (mask < DMA_BIT_MASK(32))
return 0;
#endif
+ /*
+ * Various PCI/PCIe bridges have broken support for > 32bit DMA even
+ * if the device itself might support it.
+ */
+ if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32))
+ return 0;
return 1;
}
-static int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
+int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
return dma_addr == DIRECT_MAPPING_ERROR;
}
@@ -180,6 +200,5 @@ const struct dma_map_ops dma_direct_ops = {
.map_sg = dma_direct_map_sg,
.dma_supported = dma_direct_supported,
.mapping_error = dma_direct_mapping_error,
- .is_phys = 1,
};
EXPORT_SYMBOL(dma_direct_ops);
diff --git a/lib/dma-noncoherent.c b/lib/dma-noncoherent.c
new file mode 100644
index 000000000000..79e9a757387f
--- /dev/null
+++ b/lib/dma-noncoherent.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Christoph Hellwig.
+ *
+ * DMA operations that map physical memory directly without providing cache
+ * coherence.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/scatterlist.h>
+
+static void dma_noncoherent_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
+}
+
+static void dma_noncoherent_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
+}
+
+static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t addr;
+
+ addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+ if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ arch_sync_dma_for_device(dev, page_to_phys(page) + offset,
+ size, dir);
+ return addr;
+}
+
+static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs);
+ if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir);
+ return nents;
+}
+
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+static void dma_noncoherent_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
+}
+
+static void dma_noncoherent_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+}
+
+static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir);
+}
+#endif
+
+const struct dma_map_ops dma_noncoherent_ops = {
+ .alloc = arch_dma_alloc,
+ .free = arch_dma_free,
+ .mmap = arch_dma_mmap,
+ .sync_single_for_device = dma_noncoherent_sync_single_for_device,
+ .sync_sg_for_device = dma_noncoherent_sync_sg_for_device,
+ .map_page = dma_noncoherent_map_page,
+ .map_sg = dma_noncoherent_map_sg,
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+ .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu,
+ .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu,
+ .unmap_page = dma_noncoherent_unmap_page,
+ .unmap_sg = dma_noncoherent_unmap_sg,
+#endif
+ .dma_supported = dma_direct_supported,
+ .mapping_error = dma_direct_mapping_error,
+ .cache_sync = arch_dma_cache_sync,
+};
+EXPORT_SYMBOL(dma_noncoherent_ops);
diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
index 23633c0fda4a..92a9f243c0e2 100644
--- a/lib/iommu-helper.c
+++ b/lib/iommu-helper.c
@@ -3,19 +3,8 @@
* IOMMU helper functions for the free area management
*/
-#include <linux/export.h>
#include <linux/bitmap.h>
-#include <linux/bug.h>
-
-int iommu_is_span_boundary(unsigned int index, unsigned int nr,
- unsigned long shift,
- unsigned long boundary_size)
-{
- BUG_ON(!is_power_of_2(boundary_size));
-
- shift = (shift + index) & (boundary_size - 1);
- return shift + nr > boundary_size;
-}
+#include <linux/iommu-helper.h>
unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
unsigned long start, unsigned int nr,
@@ -38,4 +27,3 @@ again:
}
return -1;
}
-EXPORT_SYMBOL(iommu_area_alloc);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index fdae394172fa..7e43cd54c84c 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -573,6 +573,67 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
}
EXPORT_SYMBOL(_copy_to_iter);
+#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
+static int copyout_mcsafe(void __user *to, const void *from, size_t n)
+{
+ if (access_ok(VERIFY_WRITE, to, n)) {
+ kasan_check_read(from, n);
+ n = copy_to_user_mcsafe((__force void *) to, from, n);
+ }
+ return n;
+}
+
+static unsigned long memcpy_mcsafe_to_page(struct page *page, size_t offset,
+ const char *from, size_t len)
+{
+ unsigned long ret;
+ char *to;
+
+ to = kmap_atomic(page);
+ ret = memcpy_mcsafe(to + offset, from, len);
+ kunmap_atomic(to);
+
+ return ret;
+}
+
+size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i)
+{
+ const char *from = addr;
+ unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
+
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
+ if (iter_is_iovec(i))
+ might_fault();
+ iterate_and_advance(i, bytes, v,
+ copyout_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
+ ({
+ rem = memcpy_mcsafe_to_page(v.bv_page, v.bv_offset,
+ (from += v.bv_len) - v.bv_len, v.bv_len);
+ if (rem) {
+ curr_addr = (unsigned long) from;
+ bytes = curr_addr - s_addr - rem;
+ return bytes;
+ }
+ }),
+ ({
+ rem = memcpy_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len,
+ v.iov_len);
+ if (rem) {
+ curr_addr = (unsigned long) from;
+ bytes = curr_addr - s_addr - rem;
+ return bytes;
+ }
+ })
+ )
+
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(_copy_to_iter_mcsafe);
+#endif /* CONFIG_ARCH_HAS_UACCESS_MCSAFE */
+
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
char *to = addr;
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index cc640588f145..04b68d9dffac 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -593,9 +593,8 @@ found:
}
/*
- * Allocates bounce buffer and returns its kernel virtual address.
+ * Allocates bounce buffer and returns its physical address.
*/
-
static phys_addr_t
map_single(struct device *hwdev, phys_addr_t phys, size_t size,
enum dma_data_direction dir, unsigned long attrs)
@@ -614,7 +613,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
}
/*
- * dma_addr is the kernel virtual address of the bounce buffer to unmap.
+ * tlb_addr is the physical address of the bounce buffer to unmap.
*/
void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir,
@@ -692,7 +691,6 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
}
}
-#ifdef CONFIG_DMA_DIRECT_OPS
static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr,
size_t size)
{
@@ -727,7 +725,7 @@ swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle,
out_unmap:
dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
- (unsigned long long)(dev ? dev->coherent_dma_mask : 0),
+ (unsigned long long)dev->coherent_dma_mask,
(unsigned long long)*dma_handle);
/*
@@ -764,7 +762,6 @@ static bool swiotlb_free_buffer(struct device *dev, size_t size,
DMA_ATTR_SKIP_CPU_SYNC);
return true;
}
-#endif
static void
swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
@@ -1045,7 +1042,6 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
}
-#ifdef CONFIG_DMA_DIRECT_OPS
void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp, unsigned long attrs)
{
@@ -1089,4 +1085,3 @@ const struct dma_map_ops swiotlb_dma_ops = {
.unmap_page = swiotlb_unmap_page,
.dma_supported = dma_direct_supported,
};
-#endif /* CONFIG_DMA_DIRECT_OPS */
diff --git a/mm/Kconfig b/mm/Kconfig
index e14c01513bfd..3e0b6e87f65d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -266,7 +266,7 @@ config ARCH_ENABLE_THP_MIGRATION
bool
config PHYS_ADDR_T_64BIT
- def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
+ def_bool 64BIT
config BOUNCE
bool "Enable bounce buffers"
@@ -305,7 +305,7 @@ config KSM
the many instances by a single page with that content, so
saving memory until one or another app needs to modify the content.
Recommended for use with KVM, or with other duplicative applications.
- See Documentation/vm/ksm.txt for more information: KSM is inactive
+ See Documentation/vm/ksm.rst for more information: KSM is inactive
until a program has madvised that an area is MADV_MERGEABLE, and
root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
@@ -530,7 +530,7 @@ config MEM_SOFT_DIRTY
into a page just as regular dirty bit, but unlike the latter
it can be cleared by hands.
- See Documentation/vm/soft-dirty.txt for more details.
+ See Documentation/admin-guide/mm/soft-dirty.rst for more details.
config ZSWAP
bool "Compressed cache for swap pages (EXPERIMENTAL)"
@@ -657,7 +657,8 @@ config IDLE_PAGE_TRACKING
be useful to tune memory cgroup limits and/or for job placement
within a compute cluster.
- See Documentation/vm/idle_page_tracking.txt for more details.
+ See Documentation/admin-guide/mm/idle_page_tracking.rst for
+ more details.
# arch_add_memory() comprehends device memory
config ARCH_HAS_ZONE_DEVICE
diff --git a/mm/cleancache.c b/mm/cleancache.c
index f7b9fdc79d97..126548b5a292 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -3,7 +3,7 @@
*
* This code provides the generic "frontend" layer to call a matching
* "backend" driver implementation of cleancache. See
- * Documentation/vm/cleancache.txt for more information.
+ * Documentation/vm/cleancache.rst for more information.
*
* Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
* Author: Dan Magenheimer
diff --git a/mm/frontswap.c b/mm/frontswap.c
index fec8b5044040..4f5476a0f955 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -3,7 +3,7 @@
*
* This code provides the generic "frontend" layer to call a matching
* "backend" driver implementation of frontswap. See
- * Documentation/vm/frontswap.txt for more information.
+ * Documentation/vm/frontswap.rst for more information.
*
* Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
* Author: Dan Magenheimer
diff --git a/mm/hmm.c b/mm/hmm.c
index 486dc394a5a3..e63e353830e8 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -37,7 +37,7 @@
#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
/*
- * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h
+ * Device private memory see HMM (Documentation/vm/hmm.rst) or hmm.h
*/
DEFINE_STATIC_KEY_FALSE(device_private_key);
EXPORT_SYMBOL(device_private_key);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b9f3dbd885bd..ac5591d8622c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1185,7 +1185,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
* mmu_notifier_invalidate_range_end() happens which can lead to a
* device seeing memory write in different order than CPU.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
@@ -2037,7 +2037,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
* replacing a zero pmd write protected page with a zero pte write
* protected page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
pmdp_huge_clear_flush(vma, haddr, pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 218679138255..129088710510 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3291,7 +3291,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
}
@@ -4357,7 +4357,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* No need to call mmu_notifier_invalidate_range() we are downgrading
* page table protection not changing it to point to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(mm, start, end);
diff --git a/mm/ksm.c b/mm/ksm.c
index e3cbf9a92f3c..7d6558f3bac9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -51,7 +51,9 @@
#define DO_NUMA(x) do { } while (0)
#endif
-/*
+/**
+ * DOC: Overview
+ *
* A few notes about the KSM scanning process,
* to make it easier to understand the data structures below:
*
@@ -67,6 +69,21 @@
* this tree is fully assured to be working (except when pages are unmapped),
* and therefore this tree is called the stable tree.
*
+ * The stable tree node includes information required for reverse
+ * mapping from a KSM page to virtual addresses that map this page.
+ *
+ * In order to avoid large latencies of the rmap walks on KSM pages,
+ * KSM maintains two types of nodes in the stable tree:
+ *
+ * * the regular nodes that keep the reverse mapping structures in a
+ * linked list
+ * * the "chains" that link nodes ("dups") that represent the same
+ * write protected memory content, but each "dup" corresponds to a
+ * different KSM page copy of that content
+ *
+ * Internally, the regular nodes, "dups" and "chains" are represented
+ * using the same :c:type:`struct stable_node` structure.
+ *
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
* be "unchanged for a period of time". The unstable tree sorts these pages
@@ -1049,7 +1066,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* No need to notify as we are downgrading page table to read
* only not changing it to point to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
/*
@@ -1145,7 +1162,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
* No need to notify as we are replacing a read only page with another
* read only page with the same content.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd3df3d101a..1695f38630f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3849,7 +3849,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
if (ret)
goto out_put_css;
- efile.file->f_op->poll(efile.file, &event->pt);
+ vfs_poll(efile.file, &event->pt);
spin_lock(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
diff --git a/mm/mmap.c b/mm/mmap.c
index fc41c0543d7f..d817764a9974 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2828,7 +2828,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long ret = -EINVAL;
struct file *file;
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n",
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
current->comm, current->pid);
if (prot)
diff --git a/mm/rmap.c b/mm/rmap.c
index 8d5337fed37b..6db729dc4c50 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -942,7 +942,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
* downgrading page table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
if (ret)
(*cleaned)++;
@@ -1599,7 +1599,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* point at new page while a device still is using this
* page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
dec_mm_counter(mm, mm_counter_file(page));
}
@@ -1609,7 +1609,7 @@ discard:
* done above for all cases requiring it to happen under page
* table lock before mmu_notifier_invalidate_range_end()
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
diff --git a/mm/util.c b/mm/util.c
index 45fc3169e7b0..c2d0a7cdb189 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -621,7 +621,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 848969fe7979..588bf88c3305 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -231,7 +231,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
static __poll_t
p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
{
- __poll_t ret, n;
+ __poll_t ret;
struct p9_trans_fd *ts = NULL;
if (client && client->status == Connected)
@@ -243,19 +243,9 @@ p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
return EPOLLERR;
}
- if (!ts->rd->f_op->poll)
- ret = DEFAULT_POLLMASK;
- else
- ret = ts->rd->f_op->poll(ts->rd, pt);
-
- if (ts->rd != ts->wr) {
- if (!ts->wr->f_op->poll)
- n = DEFAULT_POLLMASK;
- else
- n = ts->wr->f_op->poll(ts->wr, pt);
- ret = (ret & ~EPOLLOUT) | (n & ~EPOLLIN);
- }
-
+ ret = vfs_poll(ts->rd, pt);
+ if (ts->rd != ts->wr)
+ ret = (ret & ~EPOLLOUT) | (vfs_poll(ts->wr, pt) & ~EPOLLIN);
return ret;
}
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 9b6bc5abe946..55fdba05d7d9 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = atalk_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = atalk_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = atalk_compat_ioctl,
diff --git a/net/atm/common.c b/net/atm/common.c
index fc78a0508ae1..1f2af59935db 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -648,16 +648,11 @@ out:
return error;
}
-__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t vcc_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
- struct atm_vcc *vcc;
- __poll_t mask;
-
- sock_poll_wait(file, sk_sleep(sk), wait);
- mask = 0;
-
- vcc = ATM_SD(sock);
+ struct atm_vcc *vcc = ATM_SD(sock);
+ __poll_t mask = 0;
/* exceptional events */
if (sk->sk_err)
diff --git a/net/atm/common.h b/net/atm/common.h
index 5850649068bb..526796ad230f 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags);
int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len);
-__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
+__poll_t vcc_poll_mask(struct socket *sock, __poll_t events);
int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int vcc_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2cb10af16afc..9f75092fe778 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = pvc_getname,
- .poll = vcc_poll,
+ .poll_mask = vcc_poll_mask,
.ioctl = vcc_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = vcc_compat_ioctl,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 2f91b766ac42..53f4ad7087b1 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = {
.socketpair = sock_no_socketpair,
.accept = svc_accept,
.getname = svc_getname,
- .poll = vcc_poll,
+ .poll_mask = vcc_poll_mask,
.ioctl = svc_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = svc_compat_ioctl,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index c603d33d5410..d1d2442ce573 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = {
.socketpair = sock_no_socketpair,
.accept = ax25_accept,
.getname = ax25_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = ax25_ioctl,
.listen = ax25_listen,
.shutdown = ax25_shutdown,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 3264e1873219..510ab4f55df5 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -437,16 +437,13 @@ static inline __poll_t bt_accept_poll(struct sock *parent)
return 0;
}
-__poll_t bt_sock_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
__poll_t mask = 0;
BT_DBG("sock %p, sk %p", sock, sk);
- poll_wait(file, sk_sleep(sk), wait);
-
if (sk->sk_state == BT_LISTEN)
return bt_accept_poll(sk);
@@ -478,7 +475,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
return mask;
}
-EXPORT_SYMBOL(bt_sock_poll);
+EXPORT_SYMBOL(bt_sock_poll_mask);
int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index b5116fa9835e..00deacdcb51c 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -175,7 +175,6 @@ static const struct proto_ops bnep_sock_ops = {
.getname = sock_no_getname,
.sendmsg = sock_no_sendmsg,
.recvmsg = sock_no_recvmsg,
- .poll = sock_no_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index ce86a7bae844..e08f28fadd65 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -178,7 +178,6 @@ static const struct proto_ops cmtp_sock_ops = {
.getname = sock_no_getname,
.sendmsg = sock_no_sendmsg,
.recvmsg = sock_no_recvmsg,
- .poll = sock_no_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1506e1632394..d6c099861538 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = {
.sendmsg = hci_sock_sendmsg,
.recvmsg = hci_sock_recvmsg,
.ioctl = hci_sock_ioctl,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = hci_sock_setsockopt,
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index 008ba439bd62..1eaac01f85de 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -208,7 +208,6 @@ static const struct proto_ops hidp_sock_ops = {
.getname = sock_no_getname,
.sendmsg = sock_no_sendmsg,
.recvmsg = sock_no_recvmsg,
- .poll = sock_no_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 686bdc6b35b0..742a190034e6 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = {
.getname = l2cap_sock_getname,
.sendmsg = l2cap_sock_sendmsg,
.recvmsg = l2cap_sock_recvmsg,
- .poll = bt_sock_poll,
+ .poll_mask = bt_sock_poll_mask,
.ioctl = bt_sock_ioctl,
.mmap = sock_no_mmap,
.socketpair = sock_no_socketpair,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index d606e9212291..1cf57622473a 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = {
.setsockopt = rfcomm_sock_setsockopt,
.getsockopt = rfcomm_sock_getsockopt,
.ioctl = rfcomm_sock_ioctl,
- .poll = bt_sock_poll,
+ .poll_mask = bt_sock_poll_mask,
.socketpair = sock_no_socketpair,
.mmap = sock_no_mmap
};
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 413b8ee49fec..d60dbc61d170 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = {
.getname = sco_sock_getname,
.sendmsg = sco_sock_sendmsg,
.recvmsg = sco_sock_recvmsg,
- .poll = bt_sock_poll,
+ .poll_mask = bt_sock_poll_mask,
.ioctl = bt_sock_ioctl,
.mmap = sock_no_mmap,
.socketpair = sock_no_socketpair,
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index a6fb1b3bcad9..c7991867d622 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -934,15 +934,11 @@ static int caif_release(struct socket *sock)
}
/* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */
-static __poll_t caif_poll(struct file *file,
- struct socket *sock, poll_table *wait)
+static __poll_t caif_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
- __poll_t mask;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
-
- sock_poll_wait(file, sk_sleep(sk), wait);
- mask = 0;
+ __poll_t mask = 0;
/* exceptional events? */
if (sk->sk_err)
@@ -976,7 +972,7 @@ static const struct proto_ops caif_seqpacket_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = caif_poll,
+ .poll_mask = caif_poll_mask,
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -997,7 +993,7 @@ static const struct proto_ops caif_stream_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = caif_poll,
+ .poll_mask = caif_poll_mask,
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 6ad89f49b341..97fedff3f0c4 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1657,7 +1657,7 @@ static const struct proto_ops bcm_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/can/raw.c b/net/can/raw.c
index 1051eee82581..fd7e2f49ea6a 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = raw_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 9938952c5c78..f19bf3dc2bd6 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -819,9 +819,8 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
/**
* datagram_poll - generic datagram poll
- * @file: file struct
* @sock: socket
- * @wait: poll table
+ * @events to wait for
*
* Datagram poll: Again totally generic. This also handles
* sequenced packet sockets providing the socket receive queue
@@ -831,14 +830,10 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
* and you use a different write policy from sock_writeable()
* then please supply your own write_space callback.
*/
-__poll_t datagram_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+__poll_t datagram_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
- __poll_t mask;
-
- sock_poll_wait(file, sk_sleep(sk), wait);
- mask = 0;
+ __poll_t mask = 0;
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -871,4 +866,4 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
return mask;
}
-EXPORT_SYMBOL(datagram_poll);
+EXPORT_SYMBOL(datagram_poll_mask);
diff --git a/net/core/dev.c b/net/core/dev.c
index 2af787e8b130..983b277a1229 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2884,11 +2884,7 @@ void netdev_rx_csum_fault(struct net_device *dev)
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
-/* Actually, we should eliminate this check as soon as we know, that:
- * 1. IOMMU is present and allows to map all the memory.
- * 2. No high memory really exists on this machine.
- */
-
+/* XXX: check that highmem exists at all on the given machine. */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
@@ -2902,20 +2898,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
return 1;
}
}
-
- if (PCI_DMA_BUS_IS_PHYS) {
- struct device *pdev = dev->dev.parent;
-
- if (!pdev)
- return 0;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- dma_addr_t addr = page_to_phys(skb_frag_page(frag));
-
- if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
- return 1;
- }
- }
#endif
return 0;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 815770333d91..2aed99a541d5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2567,12 +2567,6 @@ int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
}
EXPORT_SYMBOL(sock_no_getname);
-__poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
-{
- return 0;
-}
-EXPORT_SYMBOL(sock_no_poll);
-
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
return -EOPNOTSUPP;
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index f91e3816806b..0ea2ee56ac1b 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -316,8 +316,7 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
void dccp_shutdown(struct sock *sk, int how);
int inet_dccp_listen(struct socket *sock, int backlog);
-__poll_t dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait);
+__poll_t dccp_poll_mask(struct socket *sock, __poll_t events);
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
void dccp_req_err(struct sock *sk, u64 seq);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b08feb219b44..a9e478cd3787 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = {
.accept = inet_accept,
.getname = inet_getname,
/* FIXME: work on tcp_poll to rename it to inet_csk_poll */
- .poll = dccp_poll,
+ .poll_mask = dccp_poll_mask,
.ioctl = inet_ioctl,
/* FIXME: work on inet_listen to rename it to sock_common_listen */
.listen = inet_dccp_listen,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6344f1b18a6a..17fc4e0166ba 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = {
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet6_getname,
- .poll = dccp_poll,
+ .poll_mask = dccp_poll_mask,
.ioctl = inet6_ioctl,
.listen = inet_dccp_listen,
.shutdown = inet_shutdown,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0d56e36a6db7..ca21c1c76da0 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -312,20 +312,11 @@ int dccp_disconnect(struct sock *sk, int flags)
EXPORT_SYMBOL_GPL(dccp_disconnect);
-/*
- * Wait for a DCCP event.
- *
- * Note that we don't need to lock the socket, as the upper poll layers
- * take care of normal races (between the test and the event) and we don't
- * go look at any of the socket buffers directly.
- */
-__poll_t dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+__poll_t dccp_poll_mask(struct socket *sock, __poll_t events)
{
__poll_t mask;
struct sock *sk = sock->sk;
- sock_poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == DCCP_LISTEN)
return inet_csk_listen_poll(sk);
@@ -367,7 +358,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
return mask;
}
-EXPORT_SYMBOL_GPL(dccp_poll);
+EXPORT_SYMBOL_GPL(dccp_poll_mask);
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 7d6ff983ba2c..9a686d890bfa 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer)
}
-static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait)
+static __poll_t dn_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
struct dn_scp *scp = DN_SK(sk);
- __poll_t mask = datagram_poll(file, sock, wait);
+ __poll_t mask = datagram_poll_mask(sock, events);
if (!skb_queue_empty(&scp->other_receive_queue))
mask |= EPOLLRDBAND;
@@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = {
.socketpair = sock_no_socketpair,
.accept = dn_accept,
.getname = dn_getname,
- .poll = dn_poll,
+ .poll_mask = dn_poll_mask,
.ioctl = dn_ioctl,
.listen = dn_listen,
.shutdown = dn_shutdown,
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a60658c85a9a..a0768d2759b8 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = ieee802154_sock_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = ieee802154_sock_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eaed0367e669..8a59428e63ab 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = {
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
- .poll = tcp_poll,
+ .poll_mask = tcp_poll_mask,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
@@ -1018,7 +1018,7 @@ const struct proto_ops inet_dgram_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
- .poll = udp_poll,
+ .poll_mask = udp_poll_mask,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
@@ -1039,7 +1039,7 @@ EXPORT_SYMBOL(inet_dgram_ops);
/*
* For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
- * udp_poll
+ * udp_poll_mask
*/
static const struct proto_ops inet_sockraw_ops = {
.family = PF_INET,
@@ -1050,7 +1050,7 @@ static const struct proto_ops inet_sockraw_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c9d00ef54dec..dec47e6789e7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -494,32 +494,21 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
}
/*
- * Wait for a TCP event.
- *
- * Note that we don't need to lock the socket, as the upper poll layers
- * take care of normal races (between the test and the event) and we don't
- * go look at any of the socket buffers directly.
+ * Socket is not locked. We are protected from async events by poll logic and
+ * correct handling of state changes made by other threads is impossible in
+ * any case.
*/
-__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t tcp_poll_mask(struct socket *sock, __poll_t events)
{
- __poll_t mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+ __poll_t mask = 0;
int state;
- sock_poll_wait(file, sk_sleep(sk), wait);
-
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
- /* Socket is not locked. We are protected from async events
- * by poll logic and correct handling of state changes
- * made by other threads is impossible in any case.
- */
-
- mask = 0;
-
/*
* EPOLLHUP is certainly not done right. But poll() doesn't
* have a notion of HUP in just one direction, and for a
@@ -600,7 +589,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
return mask;
}
-EXPORT_SYMBOL(tcp_poll);
+EXPORT_SYMBOL(tcp_poll_mask);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 051a43ff3fb8..675433eb53a8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2501,7 +2501,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
* udp_poll - wait for a UDP event.
* @file - file struct
* @sock - socket
- * @wait - poll table
+ * @events - events to wait for
*
* This is same as datagram poll, except for the special case of
* blocking sockets. If application is using a blocking fd
@@ -2510,23 +2510,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
* but then block when reading it. Add special case code
* to work around these arguably broken applications.
*/
-__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t udp_poll_mask(struct socket *sock, __poll_t events)
{
- __poll_t mask = datagram_poll(file, sock, wait);
+ __poll_t mask = datagram_poll_mask(sock, events);
struct sock *sk = sock->sk;
if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Check for false positives due to checksum errors */
- if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+ if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) &&
!(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
mask &= ~(EPOLLIN | EPOLLRDNORM);
return mask;
}
-EXPORT_SYMBOL(udp_poll);
+EXPORT_SYMBOL(udp_poll_mask);
int udp_abort(struct sock *sk, int err)
{
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 8da0b513f188..d443c18b45fe 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -571,7 +571,7 @@ const struct proto_ops inet6_stream_ops = {
.socketpair = sock_no_socketpair, /* a do nothing */
.accept = inet_accept, /* ok */
.getname = inet6_getname,
- .poll = tcp_poll, /* ok */
+ .poll_mask = tcp_poll_mask, /* ok */
.ioctl = inet6_ioctl, /* must change */
.listen = inet_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
@@ -601,7 +601,7 @@ const struct proto_ops inet6_dgram_ops = {
.socketpair = sock_no_socketpair, /* a do nothing */
.accept = sock_no_accept, /* a do nothing */
.getname = inet6_getname,
- .poll = udp_poll, /* ok */
+ .poll_mask = udp_poll_mask, /* ok */
.ioctl = inet6_ioctl, /* must change */
.listen = sock_no_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index afc307c89d1a..ce6f0d15b5dd 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1334,7 +1334,7 @@ void raw6_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
-/* Same as inet6_dgram_ops, sans udp_poll. */
+/* Same as inet6_dgram_ops, sans udp_poll_mask. */
const struct proto_ops inet6_sockraw_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
@@ -1344,7 +1344,7 @@ const struct proto_ops inet6_sockraw_ops = {
.socketpair = sock_no_socketpair, /* a do nothing */
.accept = sock_no_accept, /* a do nothing */
.getname = inet6_getname,
- .poll = datagram_poll, /* ok */
+ .poll_mask = datagram_poll_mask, /* ok */
.ioctl = inet6_ioctl, /* must change */
.listen = sock_no_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 893a022f9620..68e86257a549 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1488,14 +1488,11 @@ static inline __poll_t iucv_accept_poll(struct sock *parent)
return 0;
}
-__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
__poll_t mask = 0;
- sock_poll_wait(file, sk_sleep(sk), wait);
-
if (sk->sk_state == IUCV_LISTEN)
return iucv_accept_poll(sk);
@@ -2388,7 +2385,7 @@ static const struct proto_ops iucv_sock_ops = {
.getname = iucv_sock_getname,
.sendmsg = iucv_sock_sendmsg,
.recvmsg = iucv_sock_recvmsg,
- .poll = iucv_sock_poll,
+ .poll_mask = iucv_sock_poll_mask,
.ioctl = sock_no_ioctl,
.mmap = sock_no_mmap,
.socketpair = sock_no_socketpair,
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d3601d421571..84b7d5c6fec8 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
struct list_head *head;
int index = 0;
- /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
- * we set sk_state, otherwise epoll_wait always returns right away with
- * EPOLLHUP
+ /* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state,
+ * so we set sk_state, otherwise epoll_wait always returns right away
+ * with EPOLLHUP
*/
kcm->sk.sk_state = TCP_ESTABLISHED;
@@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = kcm_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = kcm_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 5e1d2946ffbf..8bdc1cbe490a 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = {
/* Now the operations that really occur. */
.release = pfkey_release,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.sendmsg = pfkey_sendmsg,
.recvmsg = pfkey_recvmsg,
};
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index a9c05b2bc1b0..181073bf6925 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = l2tp_ip_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 957369192ca1..336e4c00abbc 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -754,7 +754,7 @@ static const struct proto_ops l2tp_ip6_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = l2tp_ip6_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = inet6_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 830469766c1f..3d8ca1231f8f 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1788,7 +1788,7 @@ static const struct proto_ops pppol2tp_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = pppol2tp_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = pppol2tp_setsockopt,
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 1beeea9549fa..804de8490186 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = {
.socketpair = sock_no_socketpair,
.accept = llc_ui_accept,
.getname = llc_ui_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = llc_ui_ioctl,
.listen = llc_ui_listen,
.shutdown = llc_ui_shutdown,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 393573a99a5a..1189b84413d5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2658,7 +2658,7 @@ static const struct proto_ops netlink_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = netlink_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = netlink_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index c2888c78d4c1..b97eb766a1d5 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = {
.socketpair = sock_no_socketpair,
.accept = nr_accept,
.getname = nr_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = nr_ioctl,
.listen = nr_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ea0c0c6f1874..ab5bb14b49af 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -548,16 +548,13 @@ static inline __poll_t llcp_accept_poll(struct sock *parent)
return 0;
}
-static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
__poll_t mask = 0;
pr_debug("%p\n", sk);
- sock_poll_wait(file, sk_sleep(sk), wait);
-
if (sk->sk_state == LLCP_LISTEN)
return llcp_accept_poll(sk);
@@ -899,7 +896,7 @@ static const struct proto_ops llcp_sock_ops = {
.socketpair = sock_no_socketpair,
.accept = llcp_sock_accept,
.getname = llcp_sock_getname,
- .poll = llcp_sock_poll,
+ .poll_mask = llcp_sock_poll_mask,
.ioctl = sock_no_ioctl,
.listen = llcp_sock_listen,
.shutdown = sock_no_shutdown,
@@ -919,7 +916,7 @@ static const struct proto_ops llcp_rawsock_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = llcp_sock_getname,
- .poll = llcp_sock_poll,
+ .poll_mask = llcp_sock_poll_mask,
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index e2188deb08dc..60c322531c49 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f9cdd27a7f6f..674390b1f084 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4110,12 +4110,11 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
return 0;
}
-static __poll_t packet_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static __poll_t packet_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
- __poll_t mask = datagram_poll(file, sock, wait);
+ __poll_t mask = datagram_poll_mask(sock, events);
spin_lock_bh(&sk->sk_receive_queue.lock);
if (po->rx_ring.pg_vec) {
@@ -4457,7 +4456,7 @@ static const struct proto_ops packet_ops_spkt = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = packet_getname_spkt,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = packet_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -4478,7 +4477,7 @@ static const struct proto_ops packet_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = packet_getname,
- .poll = packet_poll,
+ .poll_mask = packet_poll_mask,
.ioctl = packet_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 30187990257f..c295c4e20f01 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -340,15 +340,12 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
return sizeof(struct sockaddr_pn);
}
-static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
struct pep_sock *pn = pep_sk(sk);
__poll_t mask = 0;
- poll_wait(file, sk_sleep(sk), wait);
-
if (sk->sk_state == TCP_CLOSE)
return EPOLLERR;
if (!skb_queue_empty(&sk->sk_receive_queue))
@@ -448,7 +445,7 @@ const struct proto_ops phonet_dgram_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = pn_socket_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = pn_socket_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -473,7 +470,7 @@ const struct proto_ops phonet_stream_ops = {
.socketpair = sock_no_socketpair,
.accept = pn_socket_accept,
.getname = pn_socket_getname,
- .poll = pn_socket_poll,
+ .poll_mask = pn_socket_poll_mask,
.ioctl = pn_socket_ioctl,
.listen = pn_socket_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 2aa07b547b16..1b5025ea5b04 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1023,7 +1023,7 @@ static const struct proto_ops qrtr_proto_ops = {
.recvmsg = qrtr_recvmsg,
.getname = qrtr_getname,
.ioctl = qrtr_ioctl,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
.getsockopt = sock_no_getsockopt,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 22a7f2b413ac..5b73fea849df 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = {
.socketpair = sock_no_socketpair,
.accept = rose_accept,
.getname = rose_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = rose_ioctl,
.listen = rose_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 2b463047dd7b..3b1ac93efee2 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -734,15 +734,11 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname,
/*
* permit an RxRPC socket to be polled
*/
-static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
struct rxrpc_sock *rx = rxrpc_sk(sk);
- __poll_t mask;
-
- sock_poll_wait(file, sk_sleep(sk), wait);
- mask = 0;
+ __poll_t mask = 0;
/* the socket is readable if there are any messages waiting on the Rx
* queue */
@@ -949,7 +945,7 @@ static const struct proto_ops rxrpc_rpc_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = sock_no_getname,
- .poll = rxrpc_poll,
+ .poll_mask = rxrpc_poll_mask,
.ioctl = sock_no_ioctl,
.listen = rxrpc_listen,
.shutdown = rxrpc_shutdown,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0cd2e764f47f..7339918a805d 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -1010,7 +1010,7 @@ static const struct proto_ops inet6_seqpacket_ops = {
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = sctp_getname,
- .poll = sctp_poll,
+ .poll_mask = sctp_poll_mask,
.ioctl = inet6_ioctl,
.listen = sctp_inet_listen,
.shutdown = inet_shutdown,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 6bf0a9971888..11d93377ba5e 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1016,7 +1016,7 @@ static const struct proto_ops inet_seqpacket_ops = {
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname, /* Semantics are different. */
- .poll = sctp_poll,
+ .poll_mask = sctp_poll_mask,
.ioctl = inet_ioctl,
.listen = sctp_inet_listen,
.shutdown = inet_shutdown, /* Looks harmless. */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ae7e7c606f72..bf747094d26b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7722,14 +7722,12 @@ out:
* here, again, by modeling the current TCP/UDP code. We don't have
* a good way to test with it yet.
*/
-__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t sctp_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
struct sctp_sock *sp = sctp_sk(sk);
__poll_t mask;
- poll_wait(file, sk_sleep(sk), wait);
-
sock_rps_record_flow(sk);
/* A TCP-style listening socket becomes readable when the accept queue
diff --git a/net/socket.c b/net/socket.c
index f10f1d947c78..2d752e9eb3f9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -117,8 +117,10 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
static int sock_mmap(struct file *file, struct vm_area_struct *vma);
static int sock_close(struct inode *inode, struct file *file);
-static __poll_t sock_poll(struct file *file,
- struct poll_table_struct *wait);
+static struct wait_queue_head *sock_get_poll_head(struct file *file,
+ __poll_t events);
+static __poll_t sock_poll_mask(struct file *file, __poll_t);
+static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait);
static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
#ifdef CONFIG_COMPAT
static long compat_sock_ioctl(struct file *file,
@@ -141,6 +143,8 @@ static const struct file_operations socket_file_ops = {
.llseek = no_llseek,
.read_iter = sock_read_iter,
.write_iter = sock_write_iter,
+ .get_poll_head = sock_get_poll_head,
+ .poll_mask = sock_poll_mask,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
@@ -1114,27 +1118,48 @@ out_release:
}
EXPORT_SYMBOL(sock_create_lite);
-/* No kernel lock held - perfect */
-static __poll_t sock_poll(struct file *file, poll_table *wait)
+static struct wait_queue_head *sock_get_poll_head(struct file *file,
+ __poll_t events)
{
- __poll_t busy_flag = 0;
- struct socket *sock;
+ struct socket *sock = file->private_data;
+
+ if (!sock->ops->poll_mask)
+ return NULL;
+ sock_poll_busy_loop(sock, events);
+ return sk_sleep(sock->sk);
+}
+
+static __poll_t sock_poll_mask(struct file *file, __poll_t events)
+{
+ struct socket *sock = file->private_data;
/*
- * We can't return errors to poll, so it's either yes or no.
+ * We need to be sure we are in sync with the socket flags modification.
+ *
+ * This memory barrier is paired in the wq_has_sleeper.
*/
- sock = file->private_data;
+ smp_mb();
+
+ /* this socket can poll_ll so tell the system call */
+ return sock->ops->poll_mask(sock, events) |
+ (sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0);
+}
- if (sk_can_busy_loop(sock->sk)) {
- /* this socket can poll_ll so tell the system call */
- busy_flag = POLL_BUSY_LOOP;
+/* No kernel lock held - perfect */
+static __poll_t sock_poll(struct file *file, poll_table *wait)
+{
+ struct socket *sock = file->private_data;
+ __poll_t events = poll_requested_events(wait), mask = 0;
- /* once, only if requested by syscall */
- if (wait && (wait->_key & POLL_BUSY_LOOP))
- sk_busy_loop(sock->sk, 1);
+ if (sock->ops->poll) {
+ sock_poll_busy_loop(sock, events);
+ mask = sock->ops->poll(file, sock, wait);
+ } else if (sock->ops->poll_mask) {
+ sock_poll_wait(file, sock_get_poll_head(file, events), wait);
+ mask = sock->ops->poll_mask(sock, events);
}
- return busy_flag | sock->ops->poll(file, sock, wait);
+ return mask | sock_poll_busy_flag(sock);
}
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 6be21575503a..3bb45042e833 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -692,10 +692,9 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
}
/**
- * tipc_poll - read and possibly block on pollmask
+ * tipc_poll - read pollmask
* @file: file structure associated with the socket
* @sock: socket for which to calculate the poll bits
- * @wait: ???
*
* Returns pollmask value
*
@@ -709,15 +708,12 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
* imply that the operation will succeed, merely that it should be performed
* and will not block.
*/
-static __poll_t tipc_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
struct tipc_sock *tsk = tipc_sk(sk);
__poll_t revents = 0;
- sock_poll_wait(file, sk_sleep(sk), wait);
-
if (sk->sk_shutdown & RCV_SHUTDOWN)
revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
@@ -3028,7 +3024,7 @@ static const struct proto_ops msg_ops = {
.socketpair = tipc_socketpair,
.accept = sock_no_accept,
.getname = tipc_getname,
- .poll = tipc_poll,
+ .poll_mask = tipc_poll_mask,
.ioctl = tipc_ioctl,
.listen = sock_no_listen,
.shutdown = tipc_shutdown,
@@ -3049,7 +3045,7 @@ static const struct proto_ops packet_ops = {
.socketpair = tipc_socketpair,
.accept = tipc_accept,
.getname = tipc_getname,
- .poll = tipc_poll,
+ .poll_mask = tipc_poll_mask,
.ioctl = tipc_ioctl,
.listen = tipc_listen,
.shutdown = tipc_shutdown,
@@ -3070,7 +3066,7 @@ static const struct proto_ops stream_ops = {
.socketpair = tipc_socketpair,
.accept = tipc_accept,
.getname = tipc_getname,
- .poll = tipc_poll,
+ .poll_mask = tipc_poll_mask,
.ioctl = tipc_ioctl,
.listen = tipc_listen,
.shutdown = tipc_shutdown,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e5473c03d667..95b02a71fd47 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -638,9 +638,8 @@ static int unix_stream_connect(struct socket *, struct sockaddr *,
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
-static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
-static __poll_t unix_dgram_poll(struct file *, struct socket *,
- poll_table *);
+static __poll_t unix_poll_mask(struct socket *, __poll_t);
+static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -681,7 +680,7 @@ static const struct proto_ops unix_stream_ops = {
.socketpair = unix_socketpair,
.accept = unix_accept,
.getname = unix_getname,
- .poll = unix_poll,
+ .poll_mask = unix_poll_mask,
.ioctl = unix_ioctl,
.listen = unix_listen,
.shutdown = unix_shutdown,
@@ -704,7 +703,7 @@ static const struct proto_ops unix_dgram_ops = {
.socketpair = unix_socketpair,
.accept = sock_no_accept,
.getname = unix_getname,
- .poll = unix_dgram_poll,
+ .poll_mask = unix_dgram_poll_mask,
.ioctl = unix_ioctl,
.listen = sock_no_listen,
.shutdown = unix_shutdown,
@@ -726,7 +725,7 @@ static const struct proto_ops unix_seqpacket_ops = {
.socketpair = unix_socketpair,
.accept = unix_accept,
.getname = unix_getname,
- .poll = unix_dgram_poll,
+ .poll_mask = unix_dgram_poll_mask,
.ioctl = unix_ioctl,
.listen = unix_listen,
.shutdown = unix_shutdown,
@@ -2630,13 +2629,10 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return err;
}
-static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
+static __poll_t unix_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk;
- __poll_t mask;
-
- sock_poll_wait(file, sk_sleep(sk), wait);
- mask = 0;
+ __poll_t mask = 0;
/* exceptional events? */
if (sk->sk_err)
@@ -2665,15 +2661,11 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
return mask;
}
-static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events)
{
struct sock *sk = sock->sk, *other;
- unsigned int writable;
- __poll_t mask;
-
- sock_poll_wait(file, sk_sleep(sk), wait);
- mask = 0;
+ int writable;
+ __poll_t mask = 0;
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -2699,7 +2691,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
}
/* No write status requested, avoid expensive OUT tests. */
- if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
+ if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
return mask;
writable = unix_writable(sk);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index c1076c19b858..bb5d5fa68c35 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -850,18 +850,11 @@ static int vsock_shutdown(struct socket *sock, int mode)
return err;
}
-static __poll_t vsock_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events)
{
- struct sock *sk;
- __poll_t mask;
- struct vsock_sock *vsk;
-
- sk = sock->sk;
- vsk = vsock_sk(sk);
-
- poll_wait(file, sk_sleep(sk), wait);
- mask = 0;
+ struct sock *sk = sock->sk;
+ struct vsock_sock *vsk = vsock_sk(sk);
+ __poll_t mask = 0;
if (sk->sk_err)
/* Signify that there has been an error on this socket. */
@@ -1091,7 +1084,7 @@ static const struct proto_ops vsock_dgram_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = vsock_getname,
- .poll = vsock_poll,
+ .poll_mask = vsock_poll_mask,
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = vsock_shutdown,
@@ -1849,7 +1842,7 @@ static const struct proto_ops vsock_stream_ops = {
.socketpair = sock_no_socketpair,
.accept = vsock_accept,
.getname = vsock_getname,
- .poll = vsock_poll,
+ .poll_mask = vsock_poll_mask,
.ioctl = sock_no_ioctl,
.listen = vsock_listen,
.shutdown = vsock_shutdown,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index d49aa79b7997..f93365ae0fdd 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = {
.socketpair = sock_no_socketpair,
.accept = x25_accept,
.getname = x25_getname,
- .poll = datagram_poll,
+ .poll_mask = datagram_poll_mask,
.ioctl = x25_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_x25_ioctl,
diff --git a/scripts/documentation-file-ref-check b/scripts/documentation-file-ref-check
index bc1659900e89..2520bc14ffac 100755
--- a/scripts/documentation-file-ref-check
+++ b/scripts/documentation-file-ref-check
@@ -1,15 +1,116 @@
-#!/bin/sh
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+#
# Treewide grep for references to files under Documentation, and report
# non-existing files in stderr.
-for f in $(git ls-files); do
- for ref in $(grep -ho "Documentation/[A-Za-z0-9_.,~/*+-]*" "$f"); do
- # presume trailing . and , are not part of the name
- ref=${ref%%[.,]}
-
- # use ls to handle wildcards
- if ! ls $ref >/dev/null 2>&1; then
- echo "$f: $ref" >&2
- fi
- done
-done
+use warnings;
+use strict;
+use Getopt::Long qw(:config no_auto_abbrev);
+
+my $scriptname = $0;
+$scriptname =~ s,.*/([^/]+/),$1,;
+
+# Parse arguments
+my $help = 0;
+my $fix = 0;
+
+GetOptions(
+ 'fix' => \$fix,
+ 'h|help|usage' => \$help,
+);
+
+if ($help != 0) {
+ print "$scriptname [--help] [--fix-rst]\n";
+ exit -1;
+}
+
+# Step 1: find broken references
+print "Finding broken references. This may take a while... " if ($fix);
+
+my %broken_ref;
+
+open IN, "git grep 'Documentation/'|"
+ or die "Failed to run git grep";
+while (<IN>) {
+ next if (!m/^([^:]+):(.*)/);
+
+ my $f = $1;
+ my $ln = $2;
+
+ # Makefiles contain nasty expressions to parse docs
+ next if ($f =~ m/Makefile/);
+ # Skip this script
+ next if ($f eq $scriptname);
+
+ if ($ln =~ m,\b(\S*)(Documentation/[A-Za-z0-9\_\.\,\~/\*+-]*),) {
+ my $prefix = $1;
+ my $ref = $2;
+ my $base = $2;
+
+ $ref =~ s/[\,\.]+$//;
+
+ my $fulref = "$prefix$ref";
+
+ $fulref =~ s/^(\<file|ref)://;
+ $fulref =~ s/^[\'\`]+//;
+ $fulref =~ s,^\$\(.*\)/,,;
+ $base =~ s,.*/,,;
+
+ # Remove URL false-positives
+ next if ($fulref =~ m/^http/);
+
+ # Check if exists, evaluating wildcards
+ next if (grep -e, glob("$ref $fulref"));
+
+ if ($fix) {
+ if (!($ref =~ m/(devicetree|scripts|Kconfig|Kbuild)/)) {
+ $broken_ref{$ref}++;
+ }
+ } else {
+ print STDERR "$f: $fulref\n";
+ }
+ }
+}
+
+exit 0 if (!$fix);
+
+# Step 2: Seek for file name alternatives
+print "Auto-fixing broken references. Please double-check the results\n";
+
+foreach my $ref (keys %broken_ref) {
+ my $new =$ref;
+
+ # get just the basename
+ $new =~ s,.*/,,;
+
+ # Seek for the same name on another place, as it may have been moved
+ my $f="";
+
+ $f = qx(find . -iname $new) if ($new);
+
+ # usual reason for breakage: file renamed to .rst
+ if (!$f) {
+ $new =~ s/\.txt$/.rst/;
+ $f=qx(find . -iname $new) if ($new);
+ }
+
+ my @find = split /\s+/, $f;
+
+ if (!$f) {
+ print STDERR "ERROR: Didn't find a replacement for $ref\n";
+ } elsif (scalar(@find) > 1) {
+ print STDERR "WARNING: Won't auto-replace, as found multiple files close to $ref:\n";
+ foreach my $j (@find) {
+ $j =~ s,^./,,;
+ print STDERR " $j\n";
+ }
+ } else {
+ $f = $find[0];
+ $f =~ s,^./,,;
+ print "INFO: Replacing $ref to $f\n";
+ foreach my $j (qx(git grep -l $ref)) {
+ qx(sed "s\@$ref\@$f\@g" -i $j);
+ }
+ }
+}
diff --git a/scripts/faddr2line b/scripts/faddr2line
index 1876a741087c..a0149db00be7 100755
--- a/scripts/faddr2line
+++ b/scripts/faddr2line
@@ -56,7 +56,7 @@ command -v ${SIZE} >/dev/null 2>&1 || die "size isn't installed"
command -v ${NM} >/dev/null 2>&1 || die "nm isn't installed"
usage() {
- echo "usage: faddr2line <object file> <func+offset> <func+offset>..." >&2
+ echo "usage: faddr2line [--list] <object file> <func+offset> <func+offset>..." >&2
exit 1
}
@@ -166,15 +166,25 @@ __faddr2line() {
local file_lines=$(${ADDR2LINE} -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;")
[[ -z $file_lines ]] && return
+ if [[ $LIST = 0 ]]; then
+ echo "$file_lines" | while read -r line
+ do
+ echo $line
+ done
+ DONE=1;
+ return
+ fi
+
# show each line with context
echo "$file_lines" | while read -r line
do
+ echo
echo $line
n=$(echo $line | sed 's/.*:\([0-9]\+\).*/\1/g')
n1=$[$n-5]
n2=$[$n+5]
f=$(echo $line | sed 's/.*at \(.\+\):.*/\1/g')
- awk 'NR>=strtonum("'$n1'") && NR<=strtonum("'$n2'") {printf("%d\t%s\n", NR, $0)}' $f
+ awk 'NR>=strtonum("'$n1'") && NR<=strtonum("'$n2'") { if (NR=='$n') printf(">%d<", NR); else printf(" %d ", NR); printf("\t%s\n", $0)}' $f
done
DONE=1
@@ -185,6 +195,10 @@ __faddr2line() {
[[ $# -lt 2 ]] && usage
objfile=$1
+
+LIST=0
+[[ "$objfile" == "--list" ]] && LIST=1 && shift && objfile=$1
+
[[ ! -f $objfile ]] && die "can't find objfile $objfile"
shift
diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py
new file mode 100755
index 000000000000..7deaef297f52
--- /dev/null
+++ b/scripts/spdxcheck.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: GPL-2.0
+# Copyright Thomas Gleixner <tglx@linutronix.de>
+
+from argparse import ArgumentParser
+from ply import lex, yacc
+import traceback
+import sys
+import git
+import re
+import os
+
+class ParserException(Exception):
+ def __init__(self, tok, txt):
+ self.tok = tok
+ self.txt = txt
+
+class SPDXException(Exception):
+ def __init__(self, el, txt):
+ self.el = el
+ self.txt = txt
+
+class SPDXdata(object):
+ def __init__(self):
+ self.license_files = 0
+ self.exception_files = 0
+ self.licenses = [ ]
+ self.exceptions = { }
+
+# Read the spdx data from the LICENSES directory
+def read_spdxdata(repo):
+
+ # The subdirectories of LICENSES in the kernel source
+ license_dirs = [ "preferred", "other", "exceptions" ]
+ lictree = repo.heads.master.commit.tree['LICENSES']
+
+ spdx = SPDXdata()
+
+ for d in license_dirs:
+ for el in lictree[d].traverse():
+ if not os.path.isfile(el.path):
+ continue
+
+ exception = None
+ for l in open(el.path).readlines():
+ if l.startswith('Valid-License-Identifier:'):
+ lid = l.split(':')[1].strip().upper()
+ if lid in spdx.licenses:
+ raise SPDXException(el, 'Duplicate License Identifier: %s' %lid)
+ else:
+ spdx.licenses.append(lid)
+
+ elif l.startswith('SPDX-Exception-Identifier:'):
+ exception = l.split(':')[1].strip().upper()
+ spdx.exceptions[exception] = []
+
+ elif l.startswith('SPDX-Licenses:'):
+ for lic in l.split(':')[1].upper().strip().replace(' ', '').replace('\t', '').split(','):
+ if not lic in spdx.licenses:
+ raise SPDXException(None, 'Exception %s missing license %s' %(ex, lic))
+ spdx.exceptions[exception].append(lic)
+
+ elif l.startswith("License-Text:"):
+ if exception:
+ if not len(spdx.exceptions[exception]):
+ raise SPDXException(el, 'Exception %s is missing SPDX-Licenses' %excid)
+ spdx.exception_files += 1
+ else:
+ spdx.license_files += 1
+ break
+ return spdx
+
+class id_parser(object):
+
+ reserved = [ 'AND', 'OR', 'WITH' ]
+ tokens = [ 'LPAR', 'RPAR', 'ID', 'EXC' ] + reserved
+
+ precedence = ( ('nonassoc', 'AND', 'OR'), )
+
+ t_ignore = ' \t'
+
+ def __init__(self, spdx):
+ self.spdx = spdx
+ self.lasttok = None
+ self.lastid = None
+ self.lexer = lex.lex(module = self, reflags = re.UNICODE)
+ # Initialize the parser. No debug file and no parser rules stored on disk
+ # The rules are small enough to be generated on the fly
+ self.parser = yacc.yacc(module = self, write_tables = False, debug = False)
+ self.lines_checked = 0
+ self.checked = 0
+ self.spdx_valid = 0
+ self.spdx_errors = 0
+ self.curline = 0
+ self.deepest = 0
+
+ # Validate License and Exception IDs
+ def validate(self, tok):
+ id = tok.value.upper()
+ if tok.type == 'ID':
+ if not id in self.spdx.licenses:
+ raise ParserException(tok, 'Invalid License ID')
+ self.lastid = id
+ elif tok.type == 'EXC':
+ if not self.spdx.exceptions.has_key(id):
+ raise ParserException(tok, 'Invalid Exception ID')
+ if self.lastid not in self.spdx.exceptions[id]:
+ raise ParserException(tok, 'Exception not valid for license %s' %self.lastid)
+ self.lastid = None
+ elif tok.type != 'WITH':
+ self.lastid = None
+
+ # Lexer functions
+ def t_RPAR(self, tok):
+ r'\)'
+ self.lasttok = tok.type
+ return tok
+
+ def t_LPAR(self, tok):
+ r'\('
+ self.lasttok = tok.type
+ return tok
+
+ def t_ID(self, tok):
+ r'[A-Za-z.0-9\-+]+'
+
+ if self.lasttok == 'EXC':
+ print(tok)
+ raise ParserException(tok, 'Missing parentheses')
+
+ tok.value = tok.value.strip()
+ val = tok.value.upper()
+
+ if val in self.reserved:
+ tok.type = val
+ elif self.lasttok == 'WITH':
+ tok.type = 'EXC'
+
+ self.lasttok = tok.type
+ self.validate(tok)
+ return tok
+
+ def t_error(self, tok):
+ raise ParserException(tok, 'Invalid token')
+
+ def p_expr(self, p):
+ '''expr : ID
+ | ID WITH EXC
+ | expr AND expr
+ | expr OR expr
+ | LPAR expr RPAR'''
+ pass
+
+ def p_error(self, p):
+ if not p:
+ raise ParserException(None, 'Unfinished license expression')
+ else:
+ raise ParserException(p, 'Syntax error')
+
+ def parse(self, expr):
+ self.lasttok = None
+ self.lastid = None
+ self.parser.parse(expr, lexer = self.lexer)
+
+ def parse_lines(self, fd, maxlines, fname):
+ self.checked += 1
+ self.curline = 0
+ try:
+ for line in fd:
+ self.curline += 1
+ if self.curline > maxlines:
+ break
+ self.lines_checked += 1
+ if line.find("SPDX-License-Identifier:") < 0:
+ continue
+ expr = line.split(':')[1].replace('*/', '').strip()
+ self.parse(expr)
+ self.spdx_valid += 1
+ #
+ # Should we check for more SPDX ids in the same file and
+ # complain if there are any?
+ #
+ break
+
+ except ParserException as pe:
+ if pe.tok:
+ col = line.find(expr) + pe.tok.lexpos
+ tok = pe.tok.value
+ sys.stdout.write('%s: %d:%d %s: %s\n' %(fname, self.curline, col, pe.txt, tok))
+ else:
+ sys.stdout.write('%s: %d:0 %s\n' %(fname, self.curline, col, pe.txt))
+ self.spdx_errors += 1
+
+def scan_git_tree(tree):
+ for el in tree.traverse():
+ # Exclude stuff which would make pointless noise
+ # FIXME: Put this somewhere more sensible
+ if el.path.startswith("LICENSES"):
+ continue
+ if el.path.find("license-rules.rst") >= 0:
+ continue
+ if el.path == 'scripts/checkpatch.pl':
+ continue
+ if not os.path.isfile(el.path):
+ continue
+ parser.parse_lines(open(el.path), args.maxlines, el.path)
+
+def scan_git_subtree(tree, path):
+ for p in path.strip('/').split('/'):
+ tree = tree[p]
+ scan_git_tree(tree)
+
+if __name__ == '__main__':
+
+ ap = ArgumentParser(description='SPDX expression checker')
+ ap.add_argument('path', nargs='*', help='Check path or file. If not given full git tree scan. For stdin use "-"')
+ ap.add_argument('-m', '--maxlines', type=int, default=15,
+ help='Maximum number of lines to scan in a file. Default 15')
+ ap.add_argument('-v', '--verbose', action='store_true', help='Verbose statistics output')
+ args = ap.parse_args()
+
+ # Sanity check path arguments
+ if '-' in args.path and len(args.path) > 1:
+ sys.stderr.write('stdin input "-" must be the only path argument\n')
+ sys.exit(1)
+
+ try:
+ # Use git to get the valid license expressions
+ repo = git.Repo(os.getcwd())
+ assert not repo.bare
+
+ # Initialize SPDX data
+ spdx = read_spdxdata(repo)
+
+ # Initilize the parser
+ parser = id_parser(spdx)
+
+ except SPDXException as se:
+ if se.el:
+ sys.stderr.write('%s: %s\n' %(se.el.path, se.txt))
+ else:
+ sys.stderr.write('%s\n' %se.txt)
+ sys.exit(1)
+
+ except Exception as ex:
+ sys.stderr.write('FAIL: %s\n' %ex)
+ sys.stderr.write('%s\n' %traceback.format_exc())
+ sys.exit(1)
+
+ try:
+ if len(args.path) and args.path[0] == '-':
+ parser.parse_lines(sys.stdin, args.maxlines, '-')
+ else:
+ if args.path:
+ for p in args.path:
+ if os.path.isfile(p):
+ parser.parse_lines(open(p), args.maxlines, p)
+ elif os.path.isdir(p):
+ scan_git_subtree(repo.head.reference.commit.tree, p)
+ else:
+ sys.stderr.write('path %s does not exist\n' %p)
+ sys.exit(1)
+ else:
+ # Full git tree scan
+ scan_git_tree(repo.head.commit.tree)
+
+ if args.verbose:
+ sys.stderr.write('\n')
+ sys.stderr.write('License files: %12d\n' %spdx.license_files)
+ sys.stderr.write('Exception files: %12d\n' %spdx.exception_files)
+ sys.stderr.write('License IDs %12d\n' %len(spdx.licenses))
+ sys.stderr.write('Exception IDs %12d\n' %len(spdx.exceptions))
+ sys.stderr.write('\n')
+ sys.stderr.write('Files checked: %12d\n' %parser.checked)
+ sys.stderr.write('Lines checked: %12d\n' %parser.lines_checked)
+ sys.stderr.write('Files with SPDX: %12d\n' %parser.spdx_valid)
+ sys.stderr.write('Files with errors: %12d\n' %parser.spdx_errors)
+
+ sys.exit(0)
+
+ except Exception as ex:
+ sys.stderr.write('FAIL: %s\n' %ex)
+ sys.stderr.write('%s\n' %traceback.format_exc())
+ sys.exit(1)
diff --git a/security/commoncap.c b/security/commoncap.c
index 1ce701fcb3f3..f4c33abd9959 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -919,6 +919,8 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
int cap_inode_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
+ struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
+
/* Ignore non-security xattrs */
if (strncmp(name, XATTR_SECURITY_PREFIX,
sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
@@ -931,7 +933,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
if (strcmp(name, XATTR_NAME_CAPS) == 0)
return 0;
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
@@ -949,6 +951,8 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
*/
int cap_inode_removexattr(struct dentry *dentry, const char *name)
{
+ struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
+
/* Ignore non-security xattrs */
if (strncmp(name, XATTR_SECURITY_PREFIX,
sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
@@ -964,7 +968,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
return 0;
}
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index a46fba322340..facf9cdd577d 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -200,7 +200,8 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry,
int size;
bool ima_present = false;
- if (!(inode->i_opflags & IOP_XATTR))
+ if (!(inode->i_opflags & IOP_XATTR) ||
+ inode->i_sb->s_user_ns != &init_user_ns)
return -EOPNOTSUPP;
desc = init_desc(type);
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 578793e97431..fb00a2fca990 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -198,7 +198,6 @@
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
-
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
@@ -207,13 +206,19 @@
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
-
+#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
+#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */
-
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
+#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */
+#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
+#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -274,9 +279,10 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
-#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
-#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
-#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
@@ -334,6 +340,7 @@
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
/*
* BUG word(s)
@@ -363,5 +370,6 @@
#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h
index a3a4427441bf..70fe61295733 100644
--- a/tools/include/linux/compiler-gcc.h
+++ b/tools/include/linux/compiler-gcc.h
@@ -21,6 +21,9 @@
/* &a[0] degrades to a pointer: a different type from an array */
#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
+#ifndef __pure
+#define __pure __attribute__((pure))
+#endif
#define noinline __attribute__((noinline))
#ifndef __packed
#define __packed __attribute__((packed))
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index af5f8c2df87a..db9f15f5db04 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -207,4 +207,16 @@ struct prctl_mm_map {
# define PR_SVE_VL_LEN_MASK 0xffff
# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */
+/* Per task speculation control */
+#define PR_GET_SPECULATION_CTRL 52
+#define PR_SET_SPECULATION_CTRL 53
+/* Speculation control variants */
+# define PR_SPEC_STORE_BYPASS 0
+/* Return and control values for PR_SET/GET_SPECULATION_CTRL */
+# define PR_SPEC_NOT_AFFECTED 0
+# define PR_SPEC_PRCTL (1UL << 0)
+# define PR_SPEC_ENABLE (1UL << 1)
+# define PR_SPEC_DISABLE (1UL << 2)
+# define PR_SPEC_FORCE_DISABLE (1UL << 3)
+
#endif /* _LINUX_PRCTL_H */
diff --git a/tools/lib/api/fs/tracing_path.c b/tools/lib/api/fs/tracing_path.c
index 7b7fd0b18551..120037496f77 100644
--- a/tools/lib/api/fs/tracing_path.c
+++ b/tools/lib/api/fs/tracing_path.c
@@ -13,11 +13,9 @@
#include "tracing_path.h"
-
-char tracing_mnt[PATH_MAX] = "/sys/kernel/debug";
-char tracing_path[PATH_MAX] = "/sys/kernel/debug/tracing";
-char tracing_events_path[PATH_MAX] = "/sys/kernel/debug/tracing/events";
-
+static char tracing_mnt[PATH_MAX] = "/sys/kernel/debug";
+static char tracing_path[PATH_MAX] = "/sys/kernel/debug/tracing";
+static char tracing_events_path[PATH_MAX] = "/sys/kernel/debug/tracing/events";
static void __tracing_path_set(const char *tracing, const char *mountpoint)
{
@@ -76,7 +74,7 @@ char *get_tracing_file(const char *name)
{
char *file;
- if (asprintf(&file, "%s/%s", tracing_path, name) < 0)
+ if (asprintf(&file, "%s/%s", tracing_path_mount(), name) < 0)
return NULL;
return file;
@@ -87,6 +85,34 @@ void put_tracing_file(char *file)
free(file);
}
+char *get_events_file(const char *name)
+{
+ char *file;
+
+ if (asprintf(&file, "%s/events/%s", tracing_path_mount(), name) < 0)
+ return NULL;
+
+ return file;
+}
+
+void put_events_file(char *file)
+{
+ free(file);
+}
+
+DIR *tracing_events__opendir(void)
+{
+ DIR *dir = NULL;
+ char *path = get_tracing_file("events");
+
+ if (path) {
+ dir = opendir(path);
+ put_events_file(path);
+ }
+
+ return dir;
+}
+
int tracing_path__strerror_open_tp(int err, char *buf, size_t size,
const char *sys, const char *name)
{
@@ -129,7 +155,7 @@ int tracing_path__strerror_open_tp(int err, char *buf, size_t size,
snprintf(buf, size,
"Error:\tNo permissions to read %s/%s\n"
"Hint:\tTry 'sudo mount -o remount,mode=755 %s'\n",
- tracing_events_path, filename, tracing_mnt);
+ tracing_events_path, filename, tracing_path_mount());
}
break;
default:
diff --git a/tools/lib/api/fs/tracing_path.h b/tools/lib/api/fs/tracing_path.h
index 0066f06cc381..a19136b086dc 100644
--- a/tools/lib/api/fs/tracing_path.h
+++ b/tools/lib/api/fs/tracing_path.h
@@ -3,9 +3,9 @@
#define __API_FS_TRACING_PATH_H
#include <linux/types.h>
+#include <dirent.h>
-extern char tracing_path[];
-extern char tracing_events_path[];
+DIR *tracing_events__opendir(void);
void tracing_path_set(const char *mountpoint);
const char *tracing_path_mount(void);
@@ -13,5 +13,10 @@ const char *tracing_path_mount(void);
char *get_tracing_file(const char *name);
void put_tracing_file(char *file);
+char *get_events_file(const char *name);
+void put_events_file(char *file);
+
+#define zput_events_file(ptr) ({ free(*ptr); *ptr = NULL; })
+
int tracing_path__strerror_open_tp(int err, char *buf, size_t size, const char *sys, const char *name);
#endif /* __API_FS_TRACING_PATH_H */
diff --git a/tools/lib/symbol/kallsyms.c b/tools/lib/symbol/kallsyms.c
index 689b6a130dd7..96d830545bbb 100644
--- a/tools/lib/symbol/kallsyms.c
+++ b/tools/lib/symbol/kallsyms.c
@@ -10,6 +10,12 @@ u8 kallsyms2elf_type(char type)
return (type == 't' || type == 'w') ? STT_FUNC : STT_OBJECT;
}
+bool kallsyms__is_function(char symbol_type)
+{
+ symbol_type = toupper(symbol_type);
+ return symbol_type == 'T' || symbol_type == 'W';
+}
+
int kallsyms__parse(const char *filename, void *arg,
int (*process_symbol)(void *arg, const char *name,
char type, u64 start))
diff --git a/tools/lib/symbol/kallsyms.h b/tools/lib/symbol/kallsyms.h
index bc40101d72c1..72ab9870454b 100644
--- a/tools/lib/symbol/kallsyms.h
+++ b/tools/lib/symbol/kallsyms.h
@@ -20,6 +20,8 @@ static inline u8 kallsyms2elf_binding(char type)
u8 kallsyms2elf_type(char type);
+bool kallsyms__is_function(char symbol_type);
+
int kallsyms__parse(const char *filename, void *arg,
int (*process_symbol)(void *arg, const char *name,
char type, u64 start));
diff --git a/tools/memory-model/Documentation/cheatsheet.txt b/tools/memory-model/Documentation/cheatsheet.txt
index 956b1ae4aafb..33ba98d72b16 100644
--- a/tools/memory-model/Documentation/cheatsheet.txt
+++ b/tools/memory-model/Documentation/cheatsheet.txt
@@ -1,6 +1,6 @@
Prior Operation Subsequent Operation
--------------- ---------------------------
- C Self R W RWM Self R W DR DW RMW SV
+ C Self R W RMW Self R W DR DW RMW SV
-- ---- - - --- ---- - - -- -- --- --
Store, e.g., WRITE_ONCE() Y Y
@@ -14,7 +14,7 @@ smp_wmb() Y W Y Y W
smp_mb() & synchronize_rcu() CP Y Y Y Y Y Y Y Y
Successful full non-void RMW CP Y Y Y Y Y Y Y Y Y Y Y
smp_mb__before_atomic() CP Y Y Y a a a a Y
-smp_mb__after_atomic() CP a a Y Y Y Y Y
+smp_mb__after_atomic() CP a a Y Y Y Y Y Y
Key: C: Ordering is cumulative
@@ -26,4 +26,5 @@ Key: C: Ordering is cumulative
DR: Dependent read (address dependency)
DW: Dependent write (address, data, or control dependency)
RMW: Atomic read-modify-write operation
- SV Same-variable access
+ SELF: Orders self, as opposed to accesses before and/or after
+ SV: Orders later accesses to the same variable
diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt
index a727c82bd434..1b09f3175a1f 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -27,7 +27,7 @@ Explanation of the Linux-Kernel Memory Consistency Model
19. AND THEN THERE WAS ALPHA
20. THE HAPPENS-BEFORE RELATION: hb
21. THE PROPAGATES-BEFORE RELATION: pb
- 22. RCU RELATIONS: link, gp-link, rscs-link, and rcu-path
+ 22. RCU RELATIONS: rcu-link, gp, rscs, rcu-fence, and rb
23. ODDS AND ENDS
@@ -1451,8 +1451,8 @@ they execute means that it cannot have cycles. This requirement is
the content of the LKMM's "propagation" axiom.
-RCU RELATIONS: link, gp-link, rscs-link, and rcu-path
------------------------------------------------------
+RCU RELATIONS: rcu-link, gp, rscs, rcu-fence, and rb
+----------------------------------------------------
RCU (Read-Copy-Update) is a powerful synchronization mechanism. It
rests on two concepts: grace periods and read-side critical sections.
@@ -1509,8 +1509,8 @@ y, which occurs before the end of the critical section, did not
propagate to P1 before the end of the grace period, violating the
Guarantee.
-In the kernel's implementations of RCU, the business about stores
-propagating to every CPU is realized by placing strong fences at
+In the kernel's implementations of RCU, the requirements for stores
+to propagate to every CPU are fulfilled by placing strong fences at
suitable places in the RCU-related code. Thus, if a critical section
starts before a grace period does then the critical section's CPU will
execute an smp_mb() fence after the end of the critical section and
@@ -1523,72 +1523,124 @@ executes.
What exactly do we mean by saying that a critical section "starts
before" or "ends after" a grace period? Some aspects of the meaning
are pretty obvious, as in the example above, but the details aren't
-entirely clear. The LKMM formalizes this notion by means of a
-relation with the unfortunately generic name "link". It is a very
-general relation; among other things, X ->link Z includes cases where
-X happens-before or is equal to some event Y which is equal to or
-comes before Z in the coherence order. Taking Y = Z, this says that
-X ->rfe Z implies X ->link Z, and taking Y = X, it says that X ->fr Z
-and X ->co Z each imply X ->link Z.
-
-The formal definition of the link relation is more than a little
+entirely clear. The LKMM formalizes this notion by means of the
+rcu-link relation. rcu-link encompasses a very general notion of
+"before": Among other things, X ->rcu-link Z includes cases where X
+happens-before or is equal to some event Y which is equal to or comes
+before Z in the coherence order. When Y = Z this says that X ->rfe Z
+implies X ->rcu-link Z. In addition, when Y = X it says that X ->fr Z
+and X ->co Z each imply X ->rcu-link Z.
+
+The formal definition of the rcu-link relation is more than a little
obscure, and we won't give it here. It is closely related to the pb
relation, and the details don't matter unless you want to comb through
a somewhat lengthy formal proof. Pretty much all you need to know
-about link is the information in the preceding paragraph.
-
-The LKMM goes on to define the gp-link and rscs-link relations. They
-bring grace periods and read-side critical sections into the picture,
-in the following way:
-
- E ->gp-link F means there is a synchronize_rcu() fence event S
- and an event X such that E ->po S, either S ->po X or S = X,
- and X ->link F. In other words, E and F are connected by a
- grace period followed by an instance of link.
-
- E ->rscs-link F means there is a critical section delimited by
- an rcu_read_lock() fence L and an rcu_read_unlock() fence U,
- and an event X such that E ->po U, either L ->po X or L = X,
- and X ->link F. Roughly speaking, this says that some event
- in the same critical section as E is connected by link to F.
-
-If we think of the link relation as standing for an extended "before",
-then E ->gp-link F says that E executes before a grace period which
-ends before F executes. (In fact it says more than this, because it
-includes cases where E executes before a grace period and some store
-propagates to F's CPU before F executes and doesn't propagate to some
-other CPU until after the grace period ends.) Similarly,
-E ->rscs-link F says that E is part of (or before the start of) a
-critical section which starts before F executes.
+about rcu-link is the information in the preceding paragraph.
+
+The LKMM also defines the gp and rscs relations. They bring grace
+periods and read-side critical sections into the picture, in the
+following way:
+
+ E ->gp F means there is a synchronize_rcu() fence event S such
+ that E ->po S and either S ->po F or S = F. In simple terms,
+ there is a grace period po-between E and F.
+
+ E ->rscs F means there is a critical section delimited by an
+ rcu_read_lock() fence L and an rcu_read_unlock() fence U, such
+ that E ->po U and either L ->po F or L = F. You can think of
+ this as saying that E and F are in the same critical section
+ (in fact, it also allows E to be po-before the start of the
+ critical section and F to be po-after the end).
+
+If we think of the rcu-link relation as standing for an extended
+"before", then X ->gp Y ->rcu-link Z says that X executes before a
+grace period which ends before Z executes. (In fact it covers more
+than this, because it also includes cases where X executes before a
+grace period and some store propagates to Z's CPU before Z executes
+but doesn't propagate to some other CPU until after the grace period
+ends.) Similarly, X ->rscs Y ->rcu-link Z says that X is part of (or
+before the start of) a critical section which starts before Z
+executes.
+
+The LKMM goes on to define the rcu-fence relation as a sequence of gp
+and rscs links separated by rcu-link links, in which the number of gp
+links is >= the number of rscs links. For example:
+
+ X ->gp Y ->rcu-link Z ->rscs T ->rcu-link U ->gp V
+
+would imply that X ->rcu-fence V, because this sequence contains two
+gp links and only one rscs link. (It also implies that X ->rcu-fence T
+and Z ->rcu-fence V.) On the other hand:
+
+ X ->rscs Y ->rcu-link Z ->rscs T ->rcu-link U ->gp V
+
+does not imply X ->rcu-fence V, because the sequence contains only
+one gp link but two rscs links.
+
+The rcu-fence relation is important because the Grace Period Guarantee
+means that rcu-fence acts kind of like a strong fence. In particular,
+if W is a write and we have W ->rcu-fence Z, the Guarantee says that W
+will propagate to every CPU before Z executes.
+
+To prove this in full generality requires some intellectual effort.
+We'll consider just a very simple case:
+
+ W ->gp X ->rcu-link Y ->rscs Z.
+
+This formula means that there is a grace period G and a critical
+section C such that:
+
+ 1. W is po-before G;
+
+ 2. X is equal to or po-after G;
+
+ 3. X comes "before" Y in some sense;
+
+ 4. Y is po-before the end of C;
+
+ 5. Z is equal to or po-after the start of C.
+
+From 2 - 4 we deduce that the grace period G ends before the critical
+section C. Then the second part of the Grace Period Guarantee says
+not only that G starts before C does, but also that W (which executes
+on G's CPU before G starts) must propagate to every CPU before C
+starts. In particular, W propagates to every CPU before Z executes
+(or finishes executing, in the case where Z is equal to the
+rcu_read_lock() fence event which starts C.) This sort of reasoning
+can be expanded to handle all the situations covered by rcu-fence.
+
+Finally, the LKMM defines the RCU-before (rb) relation in terms of
+rcu-fence. This is done in essentially the same way as the pb
+relation was defined in terms of strong-fence. We will omit the
+details; the end result is that E ->rb F implies E must execute before
+F, just as E ->pb F does (and for much the same reasons).
Putting this all together, the LKMM expresses the Grace Period
-Guarantee by requiring that there are no cycles consisting of gp-link
-and rscs-link connections in which the number of gp-link instances is
->= the number of rscs-link instances. It does this by defining the
-rcu-path relation to link events E and F whenever it is possible to
-pass from E to F by a sequence of gp-link and rscs-link connections
-with at least as many of the former as the latter. The LKMM's "rcu"
-axiom then says that there are no events E such that E ->rcu-path E.
-
-Justifying this axiom takes some intellectual effort, but it is in
-fact a valid formalization of the Grace Period Guarantee. We won't
-attempt to go through the detailed argument, but the following
-analysis gives a taste of what is involved. Suppose we have a
-violation of the first part of the Guarantee: A critical section
-starts before a grace period, and some store propagates to the
-critical section's CPU before the end of the critical section but
-doesn't propagate to some other CPU until after the end of the grace
-period.
+Guarantee by requiring that the rb relation does not contain a cycle.
+Equivalently, this "rcu" axiom requires that there are no events E and
+F with E ->rcu-link F ->rcu-fence E. Or to put it a third way, the
+axiom requires that there are no cycles consisting of gp and rscs
+alternating with rcu-link, where the number of gp links is >= the
+number of rscs links.
+
+Justifying the axiom isn't easy, but it is in fact a valid
+formalization of the Grace Period Guarantee. We won't attempt to go
+through the detailed argument, but the following analysis gives a
+taste of what is involved. Suppose we have a violation of the first
+part of the Guarantee: A critical section starts before a grace
+period, and some store propagates to the critical section's CPU before
+the end of the critical section but doesn't propagate to some other
+CPU until after the end of the grace period.
Putting symbols to these ideas, let L and U be the rcu_read_lock() and
rcu_read_unlock() fence events delimiting the critical section in
question, and let S be the synchronize_rcu() fence event for the grace
period. Saying that the critical section starts before S means there
are events E and F where E is po-after L (which marks the start of the
-critical section), E is "before" F in the sense of the link relation,
-and F is po-before the grace period S:
+critical section), E is "before" F in the sense of the rcu-link
+relation, and F is po-before the grace period S:
- L ->po E ->link F ->po S.
+ L ->po E ->rcu-link F ->po S.
Let W be the store mentioned above, let Z come before the end of the
critical section and witness that W propagates to the critical
@@ -1600,16 +1652,19 @@ some event X which is po-after S. Symbolically, this amounts to:
The fr link from Y to W indicates that W has not propagated to Y's CPU
at the time that Y executes. From this, it can be shown (see the
-discussion of the link relation earlier) that X and Z are connected by
-link, yielding:
+discussion of the rcu-link relation earlier) that X and Z are related
+by rcu-link, yielding:
+
+ S ->po X ->rcu-link Z ->po U.
+
+The formulas say that S is po-between F and X, hence F ->gp X. They
+also say that Z comes before the end of the critical section and E
+comes after its start, hence Z ->rscs E. From all this we obtain:
- S ->po X ->link Z ->po U.
+ F ->gp X ->rcu-link Z ->rscs E ->rcu-link F,
-These formulas say that S is po-between F and X, hence F ->gp-link Z
-via X. They also say that Z comes before the end of the critical
-section and E comes after its start, hence Z ->rscs-link F via E. But
-now we have a forbidden cycle: F ->gp-link Z ->rscs-link F. Thus the
-"rcu" axiom rules out this violation of the Grace Period Guarantee.
+a forbidden cycle. Thus the "rcu" axiom rules out this violation of
+the Grace Period Guarantee.
For something a little more down-to-earth, let's see how the axiom
works out in practice. Consider the RCU code example from above, this
@@ -1635,18 +1690,18 @@ time with statement labels added to the memory access instructions:
}
-If r2 = 0 at the end then P0's store at X overwrites the value
-that P1's load at Z reads from, so we have Z ->fre X and thus
-Z ->link X. In addition, there is a synchronize_rcu() between Y and
-Z, so therefore we have Y ->gp-link X.
+If r2 = 0 at the end then P0's store at X overwrites the value that
+P1's load at Z reads from, so we have Z ->fre X and thus Z ->rcu-link X.
+In addition, there is a synchronize_rcu() between Y and Z, so therefore
+we have Y ->gp Z.
If r1 = 1 at the end then P1's load at Y reads from P0's store at W,
-so we have W ->link Y. In addition, W and X are in the same critical
-section, so therefore we have X ->rscs-link Y.
+so we have W ->rcu-link Y. In addition, W and X are in the same critical
+section, so therefore we have X ->rscs W.
-This gives us a cycle, Y ->gp-link X ->rscs-link Y, with one gp-link
-and one rscs-link, violating the "rcu" axiom. Hence the outcome is
-not allowed by the LKMM, as we would expect.
+Then X ->rscs W ->rcu-link Y ->gp Z ->rcu-link X is a forbidden cycle,
+violating the "rcu" axiom. Hence the outcome is not allowed by the
+LKMM, as we would expect.
For contrast, let's see what can happen in a more complicated example:
@@ -1682,15 +1737,11 @@ For contrast, let's see what can happen in a more complicated example:
}
If r0 = r1 = r2 = 1 at the end, then similar reasoning to before shows
-that W ->rscs-link Y via X, Y ->gp-link U via Z, and U ->rscs-link W
-via V. And just as before, this gives a cycle:
-
- W ->rscs-link Y ->gp-link U ->rscs-link W.
-
-However, this cycle has fewer gp-link instances than rscs-link
-instances, and consequently the outcome is not forbidden by the LKMM.
-The following instruction timing diagram shows how it might actually
-occur:
+that W ->rscs X ->rcu-link Y ->gp Z ->rcu-link U ->rscs V ->rcu-link W.
+However this cycle is not forbidden, because the sequence of relations
+contains fewer instances of gp (one) than of rscs (two). Consequently
+the outcome is allowed by the LKMM. The following instruction timing
+diagram shows how it might actually occur:
P0 P1 P2
-------------------- -------------------- --------------------
diff --git a/tools/memory-model/Documentation/references.txt b/tools/memory-model/Documentation/references.txt
index ba2e34c2ec3f..b177f3e4a614 100644
--- a/tools/memory-model/Documentation/references.txt
+++ b/tools/memory-model/Documentation/references.txt
@@ -63,15 +63,22 @@ o Shaked Flur, Susmit Sarkar, Christopher Pulte, Kyndylan Nienhuis,
Principles of Programming Languages (POPL 2017). ACM, New York,
NY, USA, 429–442.
+o Christopher Pulte, Shaked Flur, Will Deacon, Jon French,
+ Susmit Sarkar, and Peter Sewell. 2018. "Simplifying ARM concurrency:
+ multicopy-atomic axiomatic and operational models for ARMv8". In
+ Proceedings of the ACM on Programming Languages, Volume 2, Issue
+ POPL, Article No. 19. ACM, New York, NY, USA.
+
Linux-kernel memory model
=========================
-o Andrea Parri, Alan Stern, Luc Maranget, Paul E. McKenney,
- and Jade Alglave. 2017. "A formal model of
- Linux-kernel memory ordering - companion webpage".
- http://moscova.inria.fr/∼maranget/cats7/linux/. (2017). [Online;
- accessed 30-January-2017].
+o Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and
+ Alan Stern. 2018. "Frightening small children and disconcerting
+ grown-ups: Concurrency in the Linux kernel". In Proceedings of
+ the 23rd International Conference on Architectural Support for
+ Programming Languages and Operating Systems (ASPLOS 2018). ACM,
+ New York, NY, USA, 405-418. Webpage: http://diy.inria.fr/linux/.
o Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and
Alan Stern. 2017. "A formal kernel memory-ordering model (part 1)"
diff --git a/tools/memory-model/README b/tools/memory-model/README
index 0b3a5f3c9ccd..734f7feaa5dc 100644
--- a/tools/memory-model/README
+++ b/tools/memory-model/README
@@ -20,7 +20,7 @@ that litmus test to be exercised within the Linux kernel.
REQUIREMENTS
============
-Version 7.48 of the "herd7" and "klitmus7" tools must be downloaded
+Version 7.49 of the "herd7" and "klitmus7" tools must be downloaded
separately:
https://github.com/herd/herdtools7
diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell
index 432c7cf71b23..64f5740e0e75 100644
--- a/tools/memory-model/linux-kernel.bell
+++ b/tools/memory-model/linux-kernel.bell
@@ -5,10 +5,10 @@
* Copyright (C) 2017 Alan Stern <stern@rowland.harvard.edu>,
* Andrea Parri <parri.andrea@gmail.com>
*
- * An earlier version of this file appears in the companion webpage for
+ * An earlier version of this file appeared in the companion webpage for
* "Frightening small children and disconcerting grown-ups: Concurrency
* in the Linux kernel" by Alglave, Maranget, McKenney, Parri, and Stern,
- * which is to appear in ASPLOS 2018.
+ * which appeared in ASPLOS 2018.
*)
"Linux-kernel memory consistency model"
diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat
index df97db03b6c2..59b5cbe6b624 100644
--- a/tools/memory-model/linux-kernel.cat
+++ b/tools/memory-model/linux-kernel.cat
@@ -5,10 +5,10 @@
* Copyright (C) 2017 Alan Stern <stern@rowland.harvard.edu>,
* Andrea Parri <parri.andrea@gmail.com>
*
- * An earlier version of this file appears in the companion webpage for
+ * An earlier version of this file appeared in the companion webpage for
* "Frightening small children and disconcerting grown-ups: Concurrency
* in the Linux kernel" by Alglave, Maranget, McKenney, Parri, and Stern,
- * which is to appear in ASPLOS 2018.
+ * which appeared in ASPLOS 2018.
*)
"Linux-kernel memory consistency model"
@@ -100,22 +100,29 @@ let rscs = po ; crit^-1 ; po?
* one but two non-rf relations, but only in conjunction with an RCU
* read-side critical section.
*)
-let link = hb* ; pb* ; prop
+let rcu-link = hb* ; pb* ; prop
-(* Chains that affect the RCU grace-period guarantee *)
-let gp-link = gp ; link
-let rscs-link = rscs ; link
+(*
+ * Any sequence containing at least as many grace periods as RCU read-side
+ * critical sections (joined by rcu-link) acts as a generalized strong fence.
+ *)
+let rec rcu-fence = gp |
+ (gp ; rcu-link ; rscs) |
+ (rscs ; rcu-link ; gp) |
+ (gp ; rcu-link ; rcu-fence ; rcu-link ; rscs) |
+ (rscs ; rcu-link ; rcu-fence ; rcu-link ; gp) |
+ (rcu-fence ; rcu-link ; rcu-fence)
+
+(* rb orders instructions just as pb does *)
+let rb = prop ; rcu-fence ; hb* ; pb*
+
+irreflexive rb as rcu
(*
- * A cycle containing at least as many grace periods as RCU read-side
- * critical sections is forbidden.
+ * The happens-before, propagation, and rcu constraints are all
+ * expressions of temporal ordering. They could be replaced by
+ * a single constraint on an "executes-before" relation, xb:
+ *
+ * let xb = hb | pb | rb
+ * acyclic xb as executes-before
*)
-let rec rcu-path =
- gp-link |
- (gp-link ; rscs-link) |
- (rscs-link ; gp-link) |
- (rcu-path ; rcu-path) |
- (gp-link ; rcu-path ; rscs-link) |
- (rscs-link ; rcu-path ; gp-link)
-
-irreflexive rcu-path as rcu
diff --git a/tools/memory-model/linux-kernel.def b/tools/memory-model/linux-kernel.def
index 397e4e67e8c8..6fa3eb28d40b 100644
--- a/tools/memory-model/linux-kernel.def
+++ b/tools/memory-model/linux-kernel.def
@@ -1,9 +1,9 @@
// SPDX-License-Identifier: GPL-2.0+
//
-// An earlier version of this file appears in the companion webpage for
+// An earlier version of this file appeared in the companion webpage for
// "Frightening small children and disconcerting grown-ups: Concurrency
// in the Linux kernel" by Alglave, Maranget, McKenney, Parri, and Stern,
-// which is to appear in ASPLOS 2018.
+// which appeared in ASPLOS 2018.
// ONCE
READ_ONCE(X) __load{once}(X)
@@ -14,14 +14,15 @@ smp_store_release(X,V) { __store{release}(*X,V); }
smp_load_acquire(X) __load{acquire}(*X)
rcu_assign_pointer(X,V) { __store{release}(X,V); }
rcu_dereference(X) __load{once}(X)
+smp_store_mb(X,V) { __store{once}(X,V); __fence{mb}; }
// Fences
-smp_mb() { __fence{mb} ; }
-smp_rmb() { __fence{rmb} ; }
-smp_wmb() { __fence{wmb} ; }
-smp_mb__before_atomic() { __fence{before-atomic} ; }
-smp_mb__after_atomic() { __fence{after-atomic} ; }
-smp_mb__after_spinlock() { __fence{after-spinlock} ; }
+smp_mb() { __fence{mb}; }
+smp_rmb() { __fence{rmb}; }
+smp_wmb() { __fence{wmb}; }
+smp_mb__before_atomic() { __fence{before-atomic}; }
+smp_mb__after_atomic() { __fence{after-atomic}; }
+smp_mb__after_spinlock() { __fence{after-spinlock}; }
// Exchange
xchg(X,V) __xchg{mb}(X,V)
@@ -34,26 +35,27 @@ cmpxchg_acquire(X,V,W) __cmpxchg{acquire}(X,V,W)
cmpxchg_release(X,V,W) __cmpxchg{release}(X,V,W)
// Spinlocks
-spin_lock(X) { __lock(X) ; }
-spin_unlock(X) { __unlock(X) ; }
+spin_lock(X) { __lock(X); }
+spin_unlock(X) { __unlock(X); }
spin_trylock(X) __trylock(X)
+spin_is_locked(X) __islocked(X)
// RCU
rcu_read_lock() { __fence{rcu-lock}; }
-rcu_read_unlock() { __fence{rcu-unlock};}
+rcu_read_unlock() { __fence{rcu-unlock}; }
synchronize_rcu() { __fence{sync-rcu}; }
synchronize_rcu_expedited() { __fence{sync-rcu}; }
// Atomic
atomic_read(X) READ_ONCE(*X)
-atomic_set(X,V) { WRITE_ONCE(*X,V) ; }
+atomic_set(X,V) { WRITE_ONCE(*X,V); }
atomic_read_acquire(X) smp_load_acquire(X)
atomic_set_release(X,V) { smp_store_release(X,V); }
-atomic_add(V,X) { __atomic_op(X,+,V) ; }
-atomic_sub(V,X) { __atomic_op(X,-,V) ; }
-atomic_inc(X) { __atomic_op(X,+,1) ; }
-atomic_dec(X) { __atomic_op(X,-,1) ; }
+atomic_add(V,X) { __atomic_op(X,+,V); }
+atomic_sub(V,X) { __atomic_op(X,-,V); }
+atomic_inc(X) { __atomic_op(X,+,1); }
+atomic_dec(X) { __atomic_op(X,-,1); }
atomic_add_return(V,X) __atomic_op_return{mb}(X,+,V)
atomic_add_return_relaxed(V,X) __atomic_op_return{once}(X,+,V)
diff --git a/tools/memory-model/litmus-tests/.gitignore b/tools/memory-model/litmus-tests/.gitignore
new file mode 100644
index 000000000000..6e2ddc54152f
--- /dev/null
+++ b/tools/memory-model/litmus-tests/.gitignore
@@ -0,0 +1 @@
+*.litmus.out
diff --git a/tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus b/tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus
index 50d5db9ea983..98a3716efa37 100644
--- a/tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus
+++ b/tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus
@@ -7,7 +7,7 @@ C IRIW+mbonceonces+OnceOnce
* between each pairs of reads. In other words, is smp_mb() sufficient to
* cause two different reading processes to agree on the order of a pair
* of writes, where each write is to a different variable by a different
- * process?
+ * process? This litmus test exercises LKMM's "propagation" rule.
*)
{}
diff --git a/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus
new file mode 100644
index 000000000000..50f4d62bbf0e
--- /dev/null
+++ b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus
@@ -0,0 +1,35 @@
+C MP+polockmbonce+poacquiresilsil
+
+(*
+ * Result: Never
+ *
+ * Do spinlocks combined with smp_mb__after_spinlock() provide order
+ * to outside observers using spin_is_locked() to sense the lock-held
+ * state, ordered by acquire? Note that when the first spin_is_locked()
+ * returns false and the second true, we know that the smp_load_acquire()
+ * executed before the lock was acquired (loosely speaking).
+ *)
+
+{
+}
+
+P0(spinlock_t *lo, int *x)
+{
+ spin_lock(lo);
+ smp_mb__after_spinlock();
+ WRITE_ONCE(*x, 1);
+ spin_unlock(lo);
+}
+
+P1(spinlock_t *lo, int *x)
+{
+ int r1;
+ int r2;
+ int r3;
+
+ r1 = smp_load_acquire(x);
+ r2 = spin_is_locked(lo);
+ r3 = spin_is_locked(lo);
+}
+
+exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1)
diff --git a/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus
new file mode 100644
index 000000000000..abf81e7a0895
--- /dev/null
+++ b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus
@@ -0,0 +1,34 @@
+C MP+polockonce+poacquiresilsil
+
+(*
+ * Result: Sometimes
+ *
+ * Do spinlocks provide order to outside observers using spin_is_locked()
+ * to sense the lock-held state, ordered by acquire? Note that when the
+ * first spin_is_locked() returns false and the second true, we know that
+ * the smp_load_acquire() executed before the lock was acquired (loosely
+ * speaking).
+ *)
+
+{
+}
+
+P0(spinlock_t *lo, int *x)
+{
+ spin_lock(lo);
+ WRITE_ONCE(*x, 1);
+ spin_unlock(lo);
+}
+
+P1(spinlock_t *lo, int *x)
+{
+ int r1;
+ int r2;
+ int r3;
+
+ r1 = smp_load_acquire(x);
+ r2 = spin_is_locked(lo);
+ r3 = spin_is_locked(lo);
+}
+
+exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1)
diff --git a/tools/memory-model/litmus-tests/README b/tools/memory-model/litmus-tests/README
index 04096fb8b8d9..17eb9a8c222d 100644
--- a/tools/memory-model/litmus-tests/README
+++ b/tools/memory-model/litmus-tests/README
@@ -23,7 +23,8 @@ IRIW+mbonceonces+OnceOnce.litmus
between each pairs of reads. In other words, is smp_mb()
sufficient to cause two different reading processes to agree on
the order of a pair of writes, where each write is to a different
- variable by a different process?
+ variable by a different process? This litmus test is forbidden
+ by LKMM's propagation rule.
IRIW+poonceonces+OnceOnce.litmus
Test of independent reads from independent writes with nothing
@@ -63,6 +64,16 @@ LB+poonceonces.litmus
MP+onceassign+derefonce.litmus
As below, but with rcu_assign_pointer() and an rcu_dereference().
+MP+polockmbonce+poacquiresilsil.litmus
+ Protect the access with a lock and an smp_mb__after_spinlock()
+ in one process, and use an acquire load followed by a pair of
+ spin_is_locked() calls in the other process.
+
+MP+polockonce+poacquiresilsil.litmus
+ Protect the access with a lock in one process, and use an
+ acquire load followed by a pair of spin_is_locked() calls
+ in the other process.
+
MP+polocks.litmus
As below, but with the second access of the writer process
and the first access of reader process protected by a lock.
@@ -109,8 +120,10 @@ S+wmbonceonce+poacquireonce.litmus
WRC+poonceonces+Once.litmus
WRC+pooncerelease+rmbonceonce+Once.litmus
- These two are members of an extension of the MP litmus-test class
- in which the first write is moved to a separate process.
+ These two are members of an extension of the MP litmus-test
+ class in which the first write is moved to a separate process.
+ The second is forbidden because smp_store_release() is
+ A-cumulative in LKMM.
Z6.0+pooncelock+pooncelock+pombonce.litmus
Is the ordering provided by a spin_unlock() and a subsequent
diff --git a/tools/memory-model/litmus-tests/WRC+pooncerelease+rmbonceonce+Once.litmus b/tools/memory-model/litmus-tests/WRC+pooncerelease+rmbonceonce+Once.litmus
index 97fcbffde9a0..ad3448b941e6 100644
--- a/tools/memory-model/litmus-tests/WRC+pooncerelease+rmbonceonce+Once.litmus
+++ b/tools/memory-model/litmus-tests/WRC+pooncerelease+rmbonceonce+Once.litmus
@@ -5,7 +5,9 @@ C WRC+pooncerelease+rmbonceonce+Once
*
* This litmus test is an extension of the message-passing pattern, where
* the first write is moved to a separate process. Because it features
- * a release and a read memory barrier, it should be forbidden.
+ * a release and a read memory barrier, it should be forbidden. More
+ * specifically, this litmus test is forbidden because smp_store_release()
+ * is A-cumulative in LKMM.
*)
{}
diff --git a/tools/memory-model/lock.cat b/tools/memory-model/lock.cat
index ba4a4ec6d313..305ded17e741 100644
--- a/tools/memory-model/lock.cat
+++ b/tools/memory-model/lock.cat
@@ -4,46 +4,72 @@
* Copyright (C) 2017 Alan Stern <stern@rowland.harvard.edu>
*)
-(* Generate coherence orders and handle lock operations *)
+(*
+ * Generate coherence orders and handle lock operations
+ *
+ * Warning: spin_is_locked() crashes herd7 versions strictly before 7.48.
+ * spin_is_locked() is functional from herd7 version 7.49.
+ *)
include "cross.cat"
-(* From lock reads to their partner lock writes *)
-let lk-rmw = ([LKR] ; po-loc ; [LKW]) \ (po ; po)
-let rmw = rmw | lk-rmw
-
(*
- * A paired LKR must always see an unlocked value; spin_lock() calls nested
- * inside a critical section (for the same lock) always deadlock.
+ * The lock-related events generated by herd are as follows:
+ *
+ * LKR Lock-Read: the read part of a spin_lock() or successful
+ * spin_trylock() read-modify-write event pair
+ * LKW Lock-Write: the write part of a spin_lock() or successful
+ * spin_trylock() RMW event pair
+ * UL Unlock: a spin_unlock() event
+ * LF Lock-Fail: a failed spin_trylock() event
+ * RL Read-Locked: a spin_is_locked() event which returns True
+ * RU Read-Unlocked: a spin_is_locked() event which returns False
+ *
+ * LKR and LKW events always come paired, like all RMW event sequences.
+ *
+ * LKR, LF, RL, and RU are read events; LKR has Acquire ordering.
+ * LKW and UL are write events; UL has Release ordering.
+ * LKW, LF, RL, and RU have no ordering properties.
*)
-empty ([LKW] ; po-loc ; [domain(lk-rmw)]) \ (po-loc ; [UL] ; po-loc)
- as lock-nest
-(* The litmus test is invalid if an LKW event is not part of an RMW pair *)
-flag ~empty LKW \ range(lk-rmw) as unpaired-LKW
+(* Backward compatibility *)
+let RL = try RL with emptyset
+let RU = try RU with emptyset
-(* This will be allowed if we implement spin_is_locked() *)
-flag ~empty LKR \ domain(lk-rmw) as unpaired-LKR
+(* Treat RL as a kind of LF: a read with no ordering properties *)
+let LF = LF | RL
-(* There should be no R or W accesses to spinlocks *)
-let ALL-LOCKS = LKR | LKW | UL | LF
+(* There should be no ordinary R or W accesses to spinlocks *)
+let ALL-LOCKS = LKR | LKW | UL | LF | RU
flag ~empty [M \ IW] ; loc ; [ALL-LOCKS] as mixed-lock-accesses
+(* Link Lock-Reads to their RMW-partner Lock-Writes *)
+let lk-rmw = ([LKR] ; po-loc ; [LKW]) \ (po ; po)
+let rmw = rmw | lk-rmw
+
+(* The litmus test is invalid if an LKR/LKW event is not part of an RMW pair *)
+flag ~empty LKW \ range(lk-rmw) as unpaired-LKW
+flag ~empty LKR \ domain(lk-rmw) as unpaired-LKR
+
+(*
+ * An LKR must always see an unlocked value; spin_lock() calls nested
+ * inside a critical section (for the same lock) always deadlock.
+ *)
+empty ([LKW] ; po-loc ; [LKR]) \ (po-loc ; [UL] ; po-loc) as lock-nest
+
(* The final value of a spinlock should not be tested *)
flag ~empty [FW] ; loc ; [ALL-LOCKS] as lock-final
-
(*
* Put lock operations in their appropriate classes, but leave UL out of W
* until after the co relation has been generated.
*)
-let R = R | LKR | LF
+let R = R | LKR | LF | RU
let W = W | LKW
let Release = Release | UL
let Acquire = Acquire | LKR
-
(* Match LKW events to their corresponding UL events *)
let critical = ([LKW] ; po-loc ; [UL]) \ (po-loc ; [LKW | UL] ; po-loc)
@@ -53,27 +79,48 @@ flag ~empty UL \ range(critical) as unmatched-unlock
let UNMATCHED-LKW = LKW \ domain(critical)
empty ([UNMATCHED-LKW] ; loc ; [UNMATCHED-LKW]) \ id as unmatched-locks
-
(* rfi for LF events: link each LKW to the LF events in its critical section *)
let rfi-lf = ([LKW] ; po-loc ; [LF]) \ ([LKW] ; po-loc ; [UL] ; po-loc)
(* rfe for LF events *)
let all-possible-rfe-lf =
- (*
- * Given an LF event r, compute the possible rfe edges for that event
- * (all those starting from LKW events in other threads),
- * and then convert that relation to a set of single-edge relations.
- *)
- let possible-rfe-lf r =
- let pair-to-relation p = p ++ 0
- in map pair-to-relation ((LKW * {r}) & loc & ext)
- (* Do this for each LF event r that isn't in rfi-lf *)
- in map possible-rfe-lf (LF \ range(rfi-lf))
+ (*
+ * Given an LF event r, compute the possible rfe edges for that event
+ * (all those starting from LKW events in other threads),
+ * and then convert that relation to a set of single-edge relations.
+ *)
+ let possible-rfe-lf r =
+ let pair-to-relation p = p ++ 0
+ in map pair-to-relation ((LKW * {r}) & loc & ext)
+ (* Do this for each LF event r that isn't in rfi-lf *)
+ in map possible-rfe-lf (LF \ range(rfi-lf))
(* Generate all rf relations for LF events *)
with rfe-lf from cross(all-possible-rfe-lf)
-let rf = rf | rfi-lf | rfe-lf
+let rf-lf = rfe-lf | rfi-lf
+
+(*
+ * RU, i.e., spin_is_locked() returning False, is slightly different.
+ * We rely on the memory model to rule out cases where spin_is_locked()
+ * within one of the lock's critical sections returns False.
+ *)
+
+(* rfi for RU events: an RU may read from the last po-previous UL *)
+let rfi-ru = ([UL] ; po-loc ; [RU]) \ ([UL] ; po-loc ; [LKW] ; po-loc)
+
+(* rfe for RU events: an RU may read from an external UL or the initial write *)
+let all-possible-rfe-ru =
+ let possible-rfe-ru r =
+ let pair-to-relation p = p ++ 0
+ in map pair-to-relation (((UL | IW) * {r}) & loc & ext)
+ in map possible-rfe-ru RU
+
+(* Generate all rf relations for RU events *)
+with rfe-ru from cross(all-possible-rfe-ru)
+let rf-ru = rfe-ru | rfi-ru
+(* Final rf relation *)
+let rf = rf | rf-lf | rf-ru
(* Generate all co relations, including LKW events but not UL *)
let co0 = co0 | ([IW] ; loc ; [LKW]) |
diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh
new file mode 100644
index 000000000000..af0aa15ab84e
--- /dev/null
+++ b/tools/memory-model/scripts/checkalllitmus.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+#
+# Run herd tests on all .litmus files in the specified directory (which
+# defaults to litmus-tests) and check each file's result against a "Result:"
+# comment within that litmus test. If the verification result does not
+# match that specified in the litmus test, this script prints an error
+# message prefixed with "^^^". It also outputs verification results to
+# a file whose name is that of the specified litmus test, but with ".out"
+# appended.
+#
+# Usage:
+# sh checkalllitmus.sh [ directory ]
+#
+# The LINUX_HERD_OPTIONS environment variable may be used to specify
+# arguments to herd, whose default is defined by the checklitmus.sh script.
+# Thus, one would normally run this in the directory containing the memory
+# model, specifying the pathname of the litmus test to check.
+#
+# This script makes no attempt to run the litmus tests concurrently.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright IBM Corporation, 2018
+#
+# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+litmusdir=${1-litmus-tests}
+if test -d "$litmusdir" -a -r "$litmusdir" -a -x "$litmusdir"
+then
+ :
+else
+ echo ' --- ' error: $litmusdir is not an accessible directory
+ exit 255
+fi
+
+# Find the checklitmus script. If it is not where we expect it, then
+# assume that the caller has the PATH environment variable set
+# appropriately.
+if test -x scripts/checklitmus.sh
+then
+ clscript=scripts/checklitmus.sh
+else
+ clscript=checklitmus.sh
+fi
+
+# Run the script on all the litmus tests in the specified directory
+ret=0
+for i in litmus-tests/*.litmus
+do
+ if ! $clscript $i
+ then
+ ret=1
+ fi
+done
+if test "$ret" -ne 0
+then
+ echo " ^^^ VERIFICATION MISMATCHES"
+else
+ echo All litmus tests verified as was expected.
+fi
+exit $ret
diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh
new file mode 100644
index 000000000000..e2e477472844
--- /dev/null
+++ b/tools/memory-model/scripts/checklitmus.sh
@@ -0,0 +1,86 @@
+#!/bin/sh
+#
+# Run a herd test and check the result against a "Result:" comment within
+# the litmus test. If the verification result does not match that specified
+# in the litmus test, this script prints an error message prefixed with
+# "^^^" and exits with a non-zero status. It also outputs verification
+# results to a file whose name is that of the specified litmus test, but
+# with ".out" appended.
+#
+# Usage:
+# sh checklitmus.sh file.litmus
+#
+# The LINUX_HERD_OPTIONS environment variable may be used to specify
+# arguments to herd, which default to "-conf linux-kernel.cfg". Thus,
+# one would normally run this in the directory containing the memory model,
+# specifying the pathname of the litmus test to check.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright IBM Corporation, 2018
+#
+# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+litmus=$1
+herdoptions=${LINUX_HERD_OPTIONS--conf linux-kernel.cfg}
+
+if test -f "$litmus" -a -r "$litmus"
+then
+ :
+else
+ echo ' --- ' error: \"$litmus\" is not a readable file
+ exit 255
+fi
+if grep -q '^ \* Result: ' $litmus
+then
+ outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'`
+else
+ outcome=specified
+fi
+
+echo Herd options: $herdoptions > $litmus.out
+/usr/bin/time herd7 -o ~/tmp $herdoptions $litmus >> $litmus.out 2>&1
+grep "Herd options:" $litmus.out
+grep '^Observation' $litmus.out
+if grep -q '^Observation' $litmus.out
+then
+ :
+else
+ cat $litmus.out
+ echo ' ^^^ Verification error'
+ echo ' ^^^ Verification error' >> $litmus.out 2>&1
+ exit 255
+fi
+if test "$outcome" = DEADLOCK
+then
+ echo grep 3 and 4
+ if grep '^Observation' $litmus.out | grep -q 'Never 0 0$'
+ then
+ ret=0
+ else
+ echo " ^^^ Unexpected non-$outcome verification"
+ echo " ^^^ Unexpected non-$outcome verification" >> $litmus.out 2>&1
+ ret=1
+ fi
+elif grep '^Observation' $litmus.out | grep -q $outcome || test "$outcome" = Maybe
+then
+ ret=0
+else
+ echo " ^^^ Unexpected non-$outcome verification"
+ echo " ^^^ Unexpected non-$outcome verification" >> $litmus.out 2>&1
+ ret=1
+fi
+tail -2 $litmus.out | head -1
+exit $ret
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
index db11478e30b4..42261a9b280e 100644
--- a/tools/perf/Documentation/Makefile
+++ b/tools/perf/Documentation/Makefile
@@ -47,7 +47,8 @@ man5dir=$(mandir)/man5
man7dir=$(mandir)/man7
ASCIIDOC=asciidoc
-ASCIIDOC_EXTRA = --unsafe
+ASCIIDOC_EXTRA = --unsafe -f asciidoc.conf
+ASCIIDOC_HTML = xhtml11
MANPAGE_XSL = manpage-normal.xsl
XMLTO_EXTRA =
INSTALL?=install
@@ -55,6 +56,14 @@ RM ?= rm -f
DOC_REF = origin/man
HTML_REF = origin/html
+ifdef USE_ASCIIDOCTOR
+ASCIIDOC = asciidoctor
+ASCIIDOC_EXTRA = -a compat-mode
+ASCIIDOC_EXTRA += -I. -rasciidoctor-extensions
+ASCIIDOC_EXTRA += -a mansource="perf" -a manmanual="perf Manual"
+ASCIIDOC_HTML = xhtml5
+endif
+
infodir?=$(prefix)/share/info
MAKEINFO=makeinfo
INSTALL_INFO=install-info
@@ -73,10 +82,12 @@ ifeq ($(_tmp_tool_path),)
missing_tools = $(ASCIIDOC)
endif
+ifndef USE_ASCIIDOCTOR
_tmp_tool_path := $(call get-executable,$(XMLTO))
ifeq ($(_tmp_tool_path),)
missing_tools += $(XMLTO)
endif
+endif
#
# For asciidoc ...
@@ -264,9 +275,17 @@ clean:
$(MAN_HTML): $(OUTPUT)%.html : %.txt
$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
- $(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
+ $(ASCIIDOC) -b $(ASCIIDOC_HTML) -d manpage \
+ $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+ mv $@+ $@
+
+ifdef USE_ASCIIDOCTOR
+$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.txt
+ $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+ $(ASCIIDOC) -b manpage -d manpage \
$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
mv $@+ $@
+endif
$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.xml
$(QUIET_XMLTO)$(RM) $@ && \
@@ -274,7 +293,7 @@ $(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.xml
$(OUTPUT)%.xml : %.txt
$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
- $(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
+ $(ASCIIDOC) -b docbook -d manpage \
$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
mv $@+ $@
@@ -321,13 +340,13 @@ howto-index.txt: howto-index.sh $(wildcard howto/*.txt)
mv $@+ $@
$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt
- $(QUIET_ASCIIDOC)$(ASCIIDOC) -b xhtml11 $*.txt
+ $(QUIET_ASCIIDOC)$(ASCIIDOC) -b $(ASCIIDOC_HTML) $*.txt
WEBDOC_DEST = /pub/software/tools/perf/docs
$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
- sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
+ sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b $(ASCIIDOC_HTML) - >$@+ && \
mv $@+ $@
# UNIMPLEMENTED
diff --git a/tools/perf/Documentation/asciidoctor-extensions.rb b/tools/perf/Documentation/asciidoctor-extensions.rb
new file mode 100644
index 000000000000..d148fe95c0c4
--- /dev/null
+++ b/tools/perf/Documentation/asciidoctor-extensions.rb
@@ -0,0 +1,29 @@
+require 'asciidoctor'
+require 'asciidoctor/extensions'
+
+module Perf
+ module Documentation
+ class LinkPerfProcessor < Asciidoctor::Extensions::InlineMacroProcessor
+ use_dsl
+
+ named :chrome
+
+ def process(parent, target, attrs)
+ if parent.document.basebackend? 'html'
+ %(<a href="#{target}.html">#{target}(#{attrs[1]})</a>\n)
+ elsif parent.document.basebackend? 'manpage'
+ "#{target}(#{attrs[1]})"
+ elsif parent.document.basebackend? 'docbook'
+ "<citerefentry>\n" \
+ "<refentrytitle>#{target}</refentrytitle>" \
+ "<manvolnum>#{attrs[1]}</manvolnum>\n" \
+ "</citerefentry>\n"
+ end
+ end
+ end
+ end
+end
+
+Asciidoctor::Extensions.register do
+ inline_macro Perf::Documentation::LinkPerfProcessor, :linkperf
+end
diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index 73c2650bd0db..f6de0952ff3c 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -48,6 +48,9 @@ OPTIONS
--purge=::
Purge all cached binaries including older caches which have specified
path from the cache.
+-P::
+--purge-all::
+ Purge all cached binaries. This will flush out entire cache.
-M::
--missing=::
List missing build ids in the cache for the specified file.
@@ -59,7 +62,9 @@ OPTIONS
exactly same build-id, that is replaced by new one. It can be used
to update kallsyms and kernel dso to vmlinux in order to support
annotation.
-
+-l::
+--list::
+ List all valid binaries from cache.
-v::
--verbose::
Be more verbose.
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index e6c3b4e555c2..3a822f308e6d 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -116,6 +116,22 @@ Do not aggregate counts across all monitored CPUs.
print counts using a CSV-style output to make it easy to import directly into
spreadsheets. Columns are separated by the string specified in SEP.
+--table:: Display time for each run (-r option), in a table format, e.g.:
+
+ $ perf stat --null -r 5 --table perf bench sched pipe
+
+ Performance counter stats for 'perf bench sched pipe' (5 runs):
+
+ # Table of individual measurements:
+ 5.189 (-0.293) #
+ 5.189 (-0.294) #
+ 5.186 (-0.296) #
+ 5.663 (+0.181) ##
+ 6.186 (+0.703) ####
+
+ # Final result:
+ 5.483 +- 0.198 seconds time elapsed ( +- 3.62% )
+
-G name::
--cgroup name::
monitor only in the container (cgroup) called "name". This option is available only
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index ae7dc46e8f8a..b5ac356ba323 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -885,6 +885,8 @@ endif
# Among the variables below, these:
# perfexecdir
+# perf_include_dir
+# perf_examples_dir
# template_dir
# mandir
# infodir
@@ -904,6 +906,8 @@ bindir = $(abspath $(prefix)/$(bindir_relative))
mandir = share/man
infodir = share/info
perfexecdir = libexec/perf-core
+perf_include_dir = lib/include/perf
+perf_examples_dir = lib/examples/perf
sharedir = $(prefix)/share
template_dir = share/perf-core/templates
STRACE_GROUPS_DIR = share/perf-core/strace/groups
@@ -934,6 +938,8 @@ bindir_SQ = $(subst ','\'',$(bindir))
mandir_SQ = $(subst ','\'',$(mandir))
infodir_SQ = $(subst ','\'',$(infodir))
perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
+perf_include_dir_SQ = $(subst ','\'',$(perf_include_dir))
+perf_examples_dir_SQ = $(subst ','\'',$(perf_examples_dir))
template_dir_SQ = $(subst ','\'',$(template_dir))
htmldir_SQ = $(subst ','\'',$(htmldir))
tipdir_SQ = $(subst ','\'',$(tipdir))
@@ -944,14 +950,20 @@ srcdir_SQ = $(subst ','\'',$(srcdir))
ifneq ($(filter /%,$(firstword $(perfexecdir))),)
perfexec_instdir = $(perfexecdir)
+perf_include_instdir = $(perf_include_dir)
+perf_examples_instdir = $(perf_examples_dir)
STRACE_GROUPS_INSTDIR = $(STRACE_GROUPS_DIR)
tip_instdir = $(tipdir)
else
perfexec_instdir = $(prefix)/$(perfexecdir)
+perf_include_instdir = $(prefix)/$(perf_include_dir)
+perf_examples_instdir = $(prefix)/$(perf_examples_dir)
STRACE_GROUPS_INSTDIR = $(prefix)/$(STRACE_GROUPS_DIR)
tip_instdir = $(prefix)/$(tipdir)
endif
perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
+perf_include_instdir_SQ = $(subst ','\'',$(perf_include_instdir))
+perf_examples_instdir_SQ = $(subst ','\'',$(perf_examples_instdir))
STRACE_GROUPS_INSTDIR_SQ = $(subst ','\'',$(STRACE_GROUPS_INSTDIR))
tip_instdir_SQ = $(subst ','\'',$(tip_instdir))
@@ -999,6 +1011,8 @@ $(call detected_var,ETC_PERFCONFIG_SQ)
$(call detected_var,STRACE_GROUPS_DIR_SQ)
$(call detected_var,prefix_SQ)
$(call detected_var,perfexecdir_SQ)
+$(call detected_var,perf_include_dir_SQ)
+$(call detected_var,perf_examples_dir_SQ)
$(call detected_var,tipdir_SQ)
$(call detected_var,srcdir_SQ)
$(call detected_var,LIBDIR)
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 83e453de36f8..ecc9fc952655 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -767,6 +767,16 @@ ifndef NO_JVMTI
endif
$(call QUIET_INSTALL, libexec) \
$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+ifndef NO_LIBBPF
+ $(call QUIET_INSTALL, lib) \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'
+ $(call QUIET_INSTALL, include/bpf) \
+ $(INSTALL) include/bpf/*.h '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'
+ $(call QUIET_INSTALL, lib) \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf'
+ $(call QUIET_INSTALL, examples/bpf) \
+ $(INSTALL) examples/bpf/*.c '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf'
+endif
$(call QUIET_INSTALL, perf-archive) \
$(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
$(call QUIET_INSTALL, perf-with-kcore) \
diff --git a/tools/perf/arch/arm/tests/dwarf-unwind.c b/tools/perf/arch/arm/tests/dwarf-unwind.c
index 8cb347760233..9a0242e74cfc 100644
--- a/tools/perf/arch/arm/tests/dwarf-unwind.c
+++ b/tools/perf/arch/arm/tests/dwarf-unwind.c
@@ -25,7 +25,7 @@ static int sample_ustack(struct perf_sample *sample,
sp = (unsigned long) regs[PERF_REG_ARM_SP];
- map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp);
+ map = map_groups__find(thread->mg, (u64)sp);
if (!map) {
pr_debug("failed to get stack map\n");
free(buf);
diff --git a/tools/perf/arch/arm64/tests/dwarf-unwind.c b/tools/perf/arch/arm64/tests/dwarf-unwind.c
index e907f0f4c20c..5522ce384723 100644
--- a/tools/perf/arch/arm64/tests/dwarf-unwind.c
+++ b/tools/perf/arch/arm64/tests/dwarf-unwind.c
@@ -25,7 +25,7 @@ static int sample_ustack(struct perf_sample *sample,
sp = (unsigned long) regs[PERF_REG_ARM64_SP];
- map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp);
+ map = map_groups__find(thread->mg, (u64)sp);
if (!map) {
pr_debug("failed to get stack map\n");
free(buf);
diff --git a/tools/perf/arch/powerpc/tests/dwarf-unwind.c b/tools/perf/arch/powerpc/tests/dwarf-unwind.c
index 30cbbd6d5be0..5f39efef0856 100644
--- a/tools/perf/arch/powerpc/tests/dwarf-unwind.c
+++ b/tools/perf/arch/powerpc/tests/dwarf-unwind.c
@@ -26,7 +26,7 @@ static int sample_ustack(struct perf_sample *sample,
sp = (unsigned long) regs[PERF_REG_POWERPC_R1];
- map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp);
+ map = map_groups__find(thread->mg, (u64)sp);
if (!map) {
pr_debug("failed to get stack map\n");
free(buf);
diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
index 0c370f81e002..3598b8b75d27 100644
--- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c
+++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
@@ -248,8 +248,7 @@ int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain)
ip = chain->ips[2];
- thread__find_addr_location(thread, PERF_RECORD_MISC_USER,
- MAP__FUNCTION, ip, &al);
+ thread__find_symbol(thread, PERF_RECORD_MISC_USER, ip, &al);
if (al.map)
dso = al.map->dso;
diff --git a/tools/perf/arch/x86/tests/dwarf-unwind.c b/tools/perf/arch/x86/tests/dwarf-unwind.c
index 95036c7a59e8..7879df34569a 100644
--- a/tools/perf/arch/x86/tests/dwarf-unwind.c
+++ b/tools/perf/arch/x86/tests/dwarf-unwind.c
@@ -26,7 +26,7 @@ static int sample_ustack(struct perf_sample *sample,
sp = (unsigned long) regs[PERF_REG_X86_SP];
- map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp);
+ map = map_groups__find(thread->mg, (u64)sp);
if (!map) {
pr_debug("failed to get stack map\n");
free(buf);
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index f95e6f46ef0d..844b8f335532 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -4,6 +4,8 @@ libperf-y += pmu.o
libperf-y += kvm-stat.o
libperf-y += perf_regs.o
libperf-y += group.o
+libperf-y += machine.o
+libperf-y += event.o
libperf-$(CONFIG_DWARF) += dwarf-regs.o
libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c
new file mode 100644
index 000000000000..675a0213044d
--- /dev/null
+++ b/tools/perf/arch/x86/util/event.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/string.h>
+
+#include "../../util/machine.h"
+#include "../../util/tool.h"
+#include "../../util/map.h"
+#include "../../util/util.h"
+#include "../../util/debug.h"
+
+#if defined(__x86_64__)
+
+int perf_event__synthesize_extra_kmaps(struct perf_tool *tool,
+ perf_event__handler_t process,
+ struct machine *machine)
+{
+ int rc = 0;
+ struct map *pos;
+ struct map_groups *kmaps = &machine->kmaps;
+ struct maps *maps = &kmaps->maps;
+ union perf_event *event = zalloc(sizeof(event->mmap) +
+ machine->id_hdr_size);
+
+ if (!event) {
+ pr_debug("Not enough memory synthesizing mmap event "
+ "for extra kernel maps\n");
+ return -1;
+ }
+
+ for (pos = maps__first(maps); pos; pos = map__next(pos)) {
+ struct kmap *kmap;
+ size_t size;
+
+ if (!__map__is_extra_kernel_map(pos))
+ continue;
+
+ kmap = map__kmap(pos);
+
+ size = sizeof(event->mmap) - sizeof(event->mmap.filename) +
+ PERF_ALIGN(strlen(kmap->name) + 1, sizeof(u64)) +
+ machine->id_hdr_size;
+
+ memset(event, 0, size);
+
+ event->mmap.header.type = PERF_RECORD_MMAP;
+
+ /*
+ * kernel uses 0 for user space maps, see kernel/perf_event.c
+ * __perf_event_mmap
+ */
+ if (machine__is_host(machine))
+ event->header.misc = PERF_RECORD_MISC_KERNEL;
+ else
+ event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
+
+ event->mmap.header.size = size;
+
+ event->mmap.start = pos->start;
+ event->mmap.len = pos->end - pos->start;
+ event->mmap.pgoff = pos->pgoff;
+ event->mmap.pid = machine->pid;
+
+ strlcpy(event->mmap.filename, kmap->name, PATH_MAX);
+
+ if (perf_tool__process_synth_event(tool, event, machine,
+ process) != 0) {
+ rc = -1;
+ break;
+ }
+ }
+
+ free(event);
+ return rc;
+}
+
+#endif
diff --git a/tools/perf/arch/x86/util/machine.c b/tools/perf/arch/x86/util/machine.c
new file mode 100644
index 000000000000..4520ac53caa9
--- /dev/null
+++ b/tools/perf/arch/x86/util/machine.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/string.h>
+#include <stdlib.h>
+
+#include "../../util/machine.h"
+#include "../../util/map.h"
+#include "../../util/symbol.h"
+#include "../../util/sane_ctype.h"
+
+#include <symbol/kallsyms.h>
+
+#if defined(__x86_64__)
+
+struct extra_kernel_map_info {
+ int cnt;
+ int max_cnt;
+ struct extra_kernel_map *maps;
+ bool get_entry_trampolines;
+ u64 entry_trampoline;
+};
+
+static int add_extra_kernel_map(struct extra_kernel_map_info *mi, u64 start,
+ u64 end, u64 pgoff, const char *name)
+{
+ if (mi->cnt >= mi->max_cnt) {
+ void *buf;
+ size_t sz;
+
+ mi->max_cnt = mi->max_cnt ? mi->max_cnt * 2 : 32;
+ sz = sizeof(struct extra_kernel_map) * mi->max_cnt;
+ buf = realloc(mi->maps, sz);
+ if (!buf)
+ return -1;
+ mi->maps = buf;
+ }
+
+ mi->maps[mi->cnt].start = start;
+ mi->maps[mi->cnt].end = end;
+ mi->maps[mi->cnt].pgoff = pgoff;
+ strlcpy(mi->maps[mi->cnt].name, name, KMAP_NAME_LEN);
+
+ mi->cnt += 1;
+
+ return 0;
+}
+
+static int find_extra_kernel_maps(void *arg, const char *name, char type,
+ u64 start)
+{
+ struct extra_kernel_map_info *mi = arg;
+
+ if (!mi->entry_trampoline && kallsyms2elf_binding(type) == STB_GLOBAL &&
+ !strcmp(name, "_entry_trampoline")) {
+ mi->entry_trampoline = start;
+ return 0;
+ }
+
+ if (is_entry_trampoline(name)) {
+ u64 end = start + page_size;
+
+ return add_extra_kernel_map(mi, start, end, 0, name);
+ }
+
+ return 0;
+}
+
+int machine__create_extra_kernel_maps(struct machine *machine,
+ struct dso *kernel)
+{
+ struct extra_kernel_map_info mi = { .cnt = 0, };
+ char filename[PATH_MAX];
+ int ret;
+ int i;
+
+ machine__get_kallsyms_filename(machine, filename, PATH_MAX);
+
+ if (symbol__restricted_filename(filename, "/proc/kallsyms"))
+ return 0;
+
+ ret = kallsyms__parse(filename, &mi, find_extra_kernel_maps);
+ if (ret)
+ goto out_free;
+
+ if (!mi.entry_trampoline)
+ goto out_free;
+
+ for (i = 0; i < mi.cnt; i++) {
+ struct extra_kernel_map *xm = &mi.maps[i];
+
+ xm->pgoff = mi.entry_trampoline;
+ ret = machine__create_extra_kernel_map(machine, kernel, xm);
+ if (ret)
+ goto out_free;
+ }
+
+ machine->trampolines_mapped = mi.cnt;
+out_free:
+ free(mi.maps);
+ return ret;
+}
+
+#endif
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 51709a961496..da5704240239 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -45,6 +45,7 @@ struct perf_annotate {
bool print_line;
bool skip_missing;
bool has_br_stack;
+ bool group_set;
const char *sym_hist_filter;
const char *cpu_list;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
@@ -228,7 +229,7 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel,
*/
if (al->sym != NULL) {
rb_erase(&al->sym->rb_node,
- &al->map->dso->symbols[al->map->type]);
+ &al->map->dso->symbols);
symbol__delete(al->sym);
dso__reset_find_symbol_cache(al->map->dso);
}
@@ -508,6 +509,9 @@ int cmd_annotate(int argc, const char **argv)
"Don't shorten the displayed pathnames"),
OPT_BOOLEAN(0, "skip-missing", &annotate.skip_missing,
"Skip symbols that cannot be annotated"),
+ OPT_BOOLEAN_SET(0, "group", &symbol_conf.event_group,
+ &annotate.group_set,
+ "Show event group information together"),
OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"),
OPT_CALLBACK(0, "symfs", NULL, "directory",
"Look for files with symbols relative to this directory",
@@ -570,6 +574,9 @@ int cmd_annotate(int argc, const char **argv)
annotate.has_br_stack = perf_header__has_feat(&annotate.session->header,
HEADER_BRANCH_STACK);
+ if (annotate.group_set)
+ perf_evlist__force_leader(annotate.session->evlist);
+
ret = symbol__annotation_init();
if (ret < 0)
goto out_delete;
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 41db2cba77eb..115110a4796a 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -25,6 +25,7 @@
#include "util/session.h"
#include "util/symbol.h"
#include "util/time-utils.h"
+#include "util/probe-file.h"
static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid)
{
@@ -239,6 +240,34 @@ out:
return err;
}
+static int build_id_cache__purge_all(void)
+{
+ struct strlist *list;
+ struct str_node *pos;
+ int err = 0;
+ char *buf;
+
+ list = build_id_cache__list_all(false);
+ if (!list) {
+ pr_debug("Failed to get buildids: -%d\n", errno);
+ return -EINVAL;
+ }
+
+ strlist__for_each_entry(pos, list) {
+ buf = build_id_cache__origname(pos->s);
+ err = build_id_cache__remove_s(pos->s);
+ pr_debug("Removing %s (%s): %s\n", buf, pos->s,
+ err ? "FAIL" : "Ok");
+ free(buf);
+ if (err)
+ break;
+ }
+ strlist__delete(list);
+
+ pr_debug("Purged all: %s\n", err ? "FAIL" : "Ok");
+ return err;
+}
+
static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused)
{
char filename[PATH_MAX];
@@ -297,6 +326,26 @@ static int build_id_cache__update_file(const char *filename, struct nsinfo *nsi)
return err;
}
+static int build_id_cache__show_all(void)
+{
+ struct strlist *bidlist;
+ struct str_node *nd;
+ char *buf;
+
+ bidlist = build_id_cache__list_all(true);
+ if (!bidlist) {
+ pr_debug("Failed to get buildids: -%d\n", errno);
+ return -1;
+ }
+ strlist__for_each_entry(nd, bidlist) {
+ buf = build_id_cache__origname(nd->s);
+ fprintf(stdout, "%s %s\n", nd->s, buf);
+ free(buf);
+ }
+ strlist__delete(bidlist);
+ return 0;
+}
+
int cmd_buildid_cache(int argc, const char **argv)
{
struct strlist *list;
@@ -304,6 +353,9 @@ int cmd_buildid_cache(int argc, const char **argv)
int ret = 0;
int ns_id = -1;
bool force = false;
+ bool list_files = false;
+ bool opts_flag = false;
+ bool purge_all = false;
char const *add_name_list_str = NULL,
*remove_name_list_str = NULL,
*purge_name_list_str = NULL,
@@ -327,6 +379,8 @@ int cmd_buildid_cache(int argc, const char **argv)
"file(s) to remove"),
OPT_STRING('p', "purge", &purge_name_list_str, "file list",
"file(s) to remove (remove old caches too)"),
+ OPT_BOOLEAN('P', "purge-all", &purge_all, "purge all cached files"),
+ OPT_BOOLEAN('l', "list", &list_files, "list all cached files"),
OPT_STRING('M', "missing", &missing_filename, "file",
"to find missing build ids in the cache"),
OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
@@ -344,11 +398,20 @@ int cmd_buildid_cache(int argc, const char **argv)
argc = parse_options(argc, argv, buildid_cache_options,
buildid_cache_usage, 0);
- if (argc || (!add_name_list_str && !kcore_filename &&
- !remove_name_list_str && !purge_name_list_str &&
- !missing_filename && !update_name_list_str))
+ opts_flag = add_name_list_str || kcore_filename ||
+ remove_name_list_str || purge_name_list_str ||
+ missing_filename || update_name_list_str ||
+ purge_all;
+
+ if (argc || !(list_files || opts_flag))
usage_with_options(buildid_cache_usage, buildid_cache_options);
+ /* -l is exclusive. It can not be used with other options. */
+ if (list_files && opts_flag) {
+ usage_with_options_msg(buildid_cache_usage,
+ buildid_cache_options, "-l is exclusive.\n");
+ }
+
if (ns_id > 0)
nsi = nsinfo__new(ns_id);
@@ -366,6 +429,11 @@ int cmd_buildid_cache(int argc, const char **argv)
setup_pager();
+ if (list_files) {
+ ret = build_id_cache__show_all();
+ goto out;
+ }
+
if (add_name_list_str) {
list = strlist__new(add_name_list_str, NULL);
if (list) {
@@ -420,6 +488,13 @@ int cmd_buildid_cache(int argc, const char **argv)
}
}
+ if (purge_all) {
+ if (build_id_cache__purge_all()) {
+ pr_warning("Couldn't remove some caches. Error: %s.\n",
+ str_error_r(errno, sbuf, sizeof(sbuf)));
+ }
+ }
+
if (missing_filename)
ret = build_id_cache__fprintf_missing(session, stdout);
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 40fe919bbcf3..a3b346359ba0 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -440,9 +440,7 @@ static int perf_event__inject_buildid(struct perf_tool *tool,
goto repipe;
}
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, sample->ip, &al);
-
- if (al.map != NULL) {
+ if (thread__find_map(thread, sample->cpumode, sample->ip, &al)) {
if (!al.map->dso->hit) {
al.map->dso->hit = 1;
if (map__load(al.map) >= 0) {
diff --git a/tools/perf/builtin-kallsyms.c b/tools/perf/builtin-kallsyms.c
index bcfb363112d3..90d1a2305b72 100644
--- a/tools/perf/builtin-kallsyms.c
+++ b/tools/perf/builtin-kallsyms.c
@@ -27,7 +27,7 @@ static int __cmd_kallsyms(int argc, const char **argv)
for (i = 0; i < argc; ++i) {
struct map *map;
- struct symbol *symbol = machine__find_kernel_function_by_name(machine, argv[i], &map);
+ struct symbol *symbol = machine__find_kernel_symbol_by_name(machine, argv[i], &map);
if (symbol == NULL) {
printf("%s: not found\n", argv[i]);
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index ae11e4c3516a..54d3f21b0e62 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -1004,7 +1004,7 @@ static void __print_slab_result(struct rb_root *root,
if (is_caller) {
addr = data->call_site;
if (!raw_ip)
- sym = machine__find_kernel_function(machine, addr, &map);
+ sym = machine__find_kernel_symbol(machine, addr, &map);
} else
addr = data->ptr;
@@ -1068,7 +1068,7 @@ static void __print_page_alloc_result(struct perf_session *session, int n_lines)
char *caller = buf;
data = rb_entry(next, struct page_stat, node);
- sym = machine__find_kernel_function(machine, data->callsite, &map);
+ sym = machine__find_kernel_symbol(machine, data->callsite, &map);
if (sym)
caller = sym->name;
else
@@ -1110,7 +1110,7 @@ static void __print_page_caller_result(struct perf_session *session, int n_lines
char *caller = buf;
data = rb_entry(next, struct page_stat, node);
- sym = machine__find_kernel_function(machine, data->callsite, &map);
+ sym = machine__find_kernel_symbol(machine, data->callsite, &map);
if (sym)
caller = sym->name;
else
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 0f198f6d9b77..ad978e3ee2b8 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -194,20 +194,11 @@ out:
return err;
}
-/*
- * Events in data file are not collect in groups, but we still want
- * the group display. Set the artificial group and set the leader's
- * forced_leader flag to notify the display code.
- */
static void setup_forced_leader(struct report *report,
struct perf_evlist *evlist)
{
- if (report->group_set && !evlist->nr_groups) {
- struct perf_evsel *leader = perf_evlist__first(evlist);
-
- perf_evlist__set_leader(evlist);
- leader->forced_leader = true;
- }
+ if (report->group_set)
+ perf_evlist__force_leader(evlist);
}
static int process_feature_event(struct perf_tool *tool,
@@ -523,12 +514,9 @@ static void report__warn_kptr_restrict(const struct report *rep)
"As no suitable kallsyms nor vmlinux was found, kernel samples\n"
"can't be resolved.";
- if (kernel_map) {
- const struct dso *kdso = kernel_map->dso;
- if (!RB_EMPTY_ROOT(&kdso->symbols[MAP__FUNCTION])) {
- desc = "If some relocation was applied (e.g. "
- "kexec) symbols may be misresolved.";
- }
+ if (kernel_map && map__has_symbols(kernel_map)) {
+ desc = "If some relocation was applied (e.g. "
+ "kexec) symbols may be misresolved.";
}
ui__warning(
@@ -718,10 +706,7 @@ static size_t maps__fprintf_task(struct maps *maps, int indent, FILE *fp)
static int map_groups__fprintf_task(struct map_groups *mg, int indent, FILE *fp)
{
- int printed = 0, i;
- for (i = 0; i < MAP__NR_TYPES; ++i)
- printed += maps__fprintf_task(&mg->maps[i], indent, fp);
- return printed;
+ return maps__fprintf_task(&mg->maps, indent, fp);
}
static void task__print_level(struct task *task, FILE *fp, int level)
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index e0a9845b6cbc..cefc8813e91e 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -153,8 +153,8 @@ static struct {
.fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
- PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
- PERF_OUTPUT_PERIOD,
+ PERF_OUTPUT_SYM | PERF_OUTPUT_SYMOFFSET |
+ PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD,
.invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
},
@@ -165,8 +165,9 @@ static struct {
.fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
- PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
- PERF_OUTPUT_PERIOD | PERF_OUTPUT_BPF_OUTPUT,
+ PERF_OUTPUT_SYM | PERF_OUTPUT_SYMOFFSET |
+ PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD |
+ PERF_OUTPUT_BPF_OUTPUT,
.invalid_fields = PERF_OUTPUT_TRACE,
},
@@ -185,10 +186,10 @@ static struct {
.fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
- PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
- PERF_OUTPUT_PERIOD | PERF_OUTPUT_ADDR |
- PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT |
- PERF_OUTPUT_PHYS_ADDR,
+ PERF_OUTPUT_SYM | PERF_OUTPUT_SYMOFFSET |
+ PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD |
+ PERF_OUTPUT_ADDR | PERF_OUTPUT_DATA_SRC |
+ PERF_OUTPUT_WEIGHT | PERF_OUTPUT_PHYS_ADDR,
.invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
},
@@ -199,8 +200,8 @@ static struct {
.fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
- PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
- PERF_OUTPUT_PERIOD,
+ PERF_OUTPUT_SYM | PERF_OUTPUT_SYMOFFSET |
+ PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD,
.invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
},
@@ -211,8 +212,8 @@ static struct {
.fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
- PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
- PERF_OUTPUT_SYNTH,
+ PERF_OUTPUT_SYM | PERF_OUTPUT_SYMOFFSET |
+ PERF_OUTPUT_DSO | PERF_OUTPUT_SYNTH,
.invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
},
@@ -544,6 +545,7 @@ static int perf_session__check_output_opt(struct perf_session *session)
if (attr->sample_type & PERF_SAMPLE_CALLCHAIN) {
output[j].fields |= PERF_OUTPUT_IP;
output[j].fields |= PERF_OUTPUT_SYM;
+ output[j].fields |= PERF_OUTPUT_SYMOFFSET;
output[j].fields |= PERF_OUTPUT_DSO;
set_print_ip_opts(attr);
goto out;
@@ -717,8 +719,8 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample,
if (PRINT_FIELD(DSO)) {
memset(&alf, 0, sizeof(alf));
memset(&alt, 0, sizeof(alt));
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, from, &alf);
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt);
+ thread__find_map(thread, sample->cpumode, from, &alf);
+ thread__find_map(thread, sample->cpumode, to, &alt);
}
printed += fprintf(fp, " 0x%"PRIx64, from);
@@ -764,13 +766,8 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample,
from = br->entries[i].from;
to = br->entries[i].to;
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, from, &alf);
- if (alf.map)
- alf.sym = map__find_symbol(alf.map, alf.addr);
-
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt);
- if (alt.map)
- alt.sym = map__find_symbol(alt.map, alt.addr);
+ thread__find_symbol(thread, sample->cpumode, from, &alf);
+ thread__find_symbol(thread, sample->cpumode, to, &alt);
printed += symbol__fprintf_symname_offs(alf.sym, &alf, fp);
if (PRINT_FIELD(DSO)) {
@@ -814,12 +811,12 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample,
from = br->entries[i].from;
to = br->entries[i].to;
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, from, &alf);
- if (alf.map && !alf.map->dso->adjust_symbols)
+ if (thread__find_map(thread, sample->cpumode, from, &alf) &&
+ !alf.map->dso->adjust_symbols)
from = map__map_ip(alf.map, from);
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt);
- if (alt.map && !alt.map->dso->adjust_symbols)
+ if (thread__find_map(thread, sample->cpumode, to, &alt) &&
+ !alt.map->dso->adjust_symbols)
to = map__map_ip(alt.map, to);
printed += fprintf(fp, " 0x%"PRIx64, from);
@@ -882,8 +879,7 @@ static int grab_bb(u8 *buffer, u64 start, u64 end,
return 0;
}
- thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
- if (!al.map || !al.map->dso) {
+ if (!thread__find_map(thread, *cpumode, start, &al) || !al.map->dso) {
pr_debug("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
return 0;
}
@@ -933,10 +929,8 @@ static int ip__fprintf_sym(uint64_t addr, struct thread *thread,
memset(&al, 0, sizeof(al));
- thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al);
- if (!al.map)
- thread__find_addr_map(thread, cpumode, MAP__VARIABLE,
- addr, &al);
+ thread__find_map(thread, cpumode, addr, &al);
+
if ((*lastsym) && al.addr >= (*lastsym)->start && al.addr < (*lastsym)->end)
return 0;
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f17dc601b0f3..a4f662a462c6 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -164,6 +164,7 @@ static bool forever = false;
static bool metric_only = false;
static bool force_metric_only = false;
static bool no_merge = false;
+static bool walltime_run_table = false;
static struct timespec ref_time;
static struct cpu_map *aggr_map;
static aggr_get_id_t aggr_get_id;
@@ -173,6 +174,7 @@ static const char *output_name;
static int output_fd;
static int print_free_counters_hint;
static int print_mixed_hw_group_error;
+static u64 *walltime_run;
struct perf_stat {
bool record;
@@ -569,7 +571,7 @@ static struct perf_evsel *perf_evsel__reset_weak_group(struct perf_evsel *evsel)
return leader;
}
-static int __run_perf_stat(int argc, const char **argv)
+static int __run_perf_stat(int argc, const char **argv, int run_idx)
{
int interval = stat_config.interval;
int times = stat_config.times;
@@ -752,6 +754,9 @@ try_again:
t1 = rdclock();
+ if (walltime_run_table)
+ walltime_run[run_idx] = t1 - t0;
+
update_stats(&walltime_nsecs_stats, t1 - t0);
/*
@@ -766,7 +771,7 @@ try_again:
return WEXITSTATUS(status);
}
-static int run_perf_stat(int argc, const char **argv)
+static int run_perf_stat(int argc, const char **argv, int run_idx)
{
int ret;
@@ -779,7 +784,7 @@ static int run_perf_stat(int argc, const char **argv)
if (sync_run)
sync();
- ret = __run_perf_stat(argc, argv);
+ ret = __run_perf_stat(argc, argv, run_idx);
if (ret)
return ret;
@@ -1764,19 +1769,67 @@ static void print_header(int argc, const char **argv)
}
}
+static int get_precision(double num)
+{
+ if (num > 1)
+ return 0;
+
+ return lround(ceil(-log10(num)));
+}
+
+static void print_table(FILE *output, int precision, double avg)
+{
+ char tmp[64];
+ int idx, indent = 0;
+
+ scnprintf(tmp, 64, " %17.*f", precision, avg);
+ while (tmp[indent] == ' ')
+ indent++;
+
+ fprintf(output, "%*s# Table of individual measurements:\n", indent, "");
+
+ for (idx = 0; idx < run_count; idx++) {
+ double run = (double) walltime_run[idx] / NSEC_PER_SEC;
+ int h, n = 1 + abs((int) (100.0 * (run - avg)/run) / 5);
+
+ fprintf(output, " %17.*f (%+.*f) ",
+ precision, run, precision, run - avg);
+
+ for (h = 0; h < n; h++)
+ fprintf(output, "#");
+
+ fprintf(output, "\n");
+ }
+
+ fprintf(output, "\n%*s# Final result:\n", indent, "");
+}
+
static void print_footer(void)
{
+ double avg = avg_stats(&walltime_nsecs_stats) / NSEC_PER_SEC;
FILE *output = stat_config.output;
int n;
if (!null_run)
fprintf(output, "\n");
- fprintf(output, " %17.9f seconds time elapsed",
- avg_stats(&walltime_nsecs_stats) / NSEC_PER_SEC);
- if (run_count > 1) {
- fprintf(output, " ");
- print_noise_pct(stddev_stats(&walltime_nsecs_stats),
- avg_stats(&walltime_nsecs_stats));
+
+ if (run_count == 1) {
+ fprintf(output, " %17.9f seconds time elapsed", avg);
+ } else {
+ double sd = stddev_stats(&walltime_nsecs_stats) / NSEC_PER_SEC;
+ /*
+ * Display at most 2 more significant
+ * digits than the stddev inaccuracy.
+ */
+ int precision = get_precision(sd) + 2;
+
+ if (walltime_run_table)
+ print_table(output, precision, avg);
+
+ fprintf(output, " %17.*f +- %.*f seconds time elapsed",
+ precision, avg, precision, sd);
+
+ print_noise_pct(sd, avg);
}
fprintf(output, "\n\n");
@@ -1952,6 +2005,8 @@ static const struct option stat_options[] = {
"be more verbose (show counter open errors, etc)"),
OPT_INTEGER('r', "repeat", &run_count,
"repeat command and print average + stddev (max: 100, forever: 0)"),
+ OPT_BOOLEAN(0, "table", &walltime_run_table,
+ "display details about each run (only with -r option)"),
OPT_BOOLEAN('n', "null", &null_run,
"null run - dont start any counters"),
OPT_INCR('d', "detailed", &detailed_run,
@@ -2843,6 +2898,13 @@ int cmd_stat(int argc, const char **argv)
goto out;
}
+ if (walltime_run_table && run_count <= 1) {
+ fprintf(stderr, "--table is only supported with -r\n");
+ parse_options_usage(stat_usage, stat_options, "r", 1);
+ parse_options_usage(NULL, stat_options, "table", 0);
+ goto out;
+ }
+
if (output_fd < 0) {
fprintf(stderr, "argument to --log-fd must be a > 0\n");
parse_options_usage(stat_usage, stat_options, "log-fd", 0);
@@ -2897,6 +2959,14 @@ int cmd_stat(int argc, const char **argv)
run_count = 1;
}
+ if (walltime_run_table) {
+ walltime_run = zalloc(run_count * sizeof(walltime_run[0]));
+ if (!walltime_run) {
+ pr_err("failed to setup -r option");
+ goto out;
+ }
+ }
+
if ((stat_config.aggr_mode == AGGR_THREAD) &&
!target__has_task(&target)) {
if (!target.system_wide || target.cpu_list) {
@@ -3012,7 +3082,7 @@ int cmd_stat(int argc, const char **argv)
fprintf(output, "[ perf stat: executing run #%d ... ]\n",
run_idx + 1);
- status = run_perf_stat(argc, argv);
+ status = run_perf_stat(argc, argv, run_idx);
if (forever && status != -1) {
print_counters(NULL, argc, argv);
perf_stat__reset_stats();
@@ -3060,6 +3130,8 @@ int cmd_stat(int argc, const char **argv)
perf_stat__exit_aggr_mode();
perf_evlist__free_stats(evsel_list);
out:
+ free(walltime_run);
+
if (smi_cost && smi_reset)
sysfs__write_int(FREEZE_ON_SMI_PATH, 0);
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 813698a9b8c7..a827919c6263 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -533,12 +533,8 @@ static const char *cat_backtrace(union perf_event *event,
}
tal.filtered = 0;
- thread__find_addr_location(al.thread, cpumode,
- MAP__FUNCTION, ip, &tal);
-
- if (tal.sym)
- fprintf(f, "..... %016" PRIx64 " %s\n", ip,
- tal.sym->name);
+ if (thread__find_symbol(al.thread, cpumode, ip, &tal))
+ fprintf(f, "..... %016" PRIx64 " %s\n", ip, tal.sym->name);
else
fprintf(f, "..... %016" PRIx64 "\n", ip);
}
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index f39bd60d2708..7a349fcd3864 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -742,7 +742,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
"Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
"Check /proc/sys/kernel/kptr_restrict.\n\n"
"Kernel%s samples will not be resolved.\n",
- al.map && !RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION]) ?
+ al.map && map__has_symbols(al.map) ?
" modules" : "");
if (use_browser <= 0)
sleep(5);
@@ -750,7 +750,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
machine->kptr_restrict_warned = true;
}
- if (al.sym == NULL) {
+ if (al.sym == NULL && al.map != NULL) {
const char *msg = "Kernel samples will not be resolved.\n";
/*
* As we do lazy loading of symtabs we only will know if the
@@ -764,8 +764,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
* invalid --vmlinux ;-)
*/
if (!machine->kptr_restrict_warned && !top->vmlinux_warned &&
- al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
- RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
+ __map__is_kernel(al.map) && map__has_symbols(al.map)) {
if (symbol_conf.vmlinux_name) {
char serr[256];
dso__strerror_load(al.map->dso, serr, sizeof(serr));
@@ -1265,7 +1264,7 @@ int cmd_top(int argc, const char **argv)
.proc_map_timeout = 500,
.overwrite = 1,
},
- .max_stack = sysctl_perf_event_max_stack,
+ .max_stack = sysctl__max_stack(),
.sym_pcnt_filter = 5,
.nr_threads_synthesize = UINT_MAX,
};
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 3ad17ee89403..560aed7da36a 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2024,8 +2024,7 @@ static int trace__pgfault(struct trace *trace,
if (trace->summary_only)
goto out;
- thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
- sample->ip, &al);
+ thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
@@ -2037,12 +2036,10 @@ static int trace__pgfault(struct trace *trace,
fprintf(trace->output, "] => ");
- thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
- sample->addr, &al);
+ thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
if (!al.map) {
- thread__find_addr_location(thread, sample->cpumode,
- MAP__FUNCTION, sample->addr, &al);
+ thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
if (al.map)
map_type = 'x';
@@ -3165,7 +3162,7 @@ int cmd_trace(int argc, const char **argv)
mmap_pages_user_set = false;
if (trace.max_stack == UINT_MAX) {
- trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
+ trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
max_stack_user_set = false;
}
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 9aff89bc7535..10f333e2e825 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -55,22 +55,26 @@ include/uapi/asm-generic/ioctls.h
include/uapi/asm-generic/mman-common.h
'
-check () {
- file=$1
+check_2 () {
+ file1=$1
+ file2=$2
shift
- opts=
- while [ -n "$*" ]; do
- opts="$opts \"$1\""
- shift
- done
+ shift
- cmd="diff $opts ../$file ../../$file > /dev/null"
+ cmd="diff $* $file1 $file2 > /dev/null"
- test -f ../../$file &&
+ test -f $file2 &&
eval $cmd || echo "Warning: Kernel ABI header at 'tools/$file' differs from latest version at '$file'" >&2
}
+check () {
+ file=$1
+
+ shift
+
+ check_2 ../$file ../../$file $*
+}
# Check if we have the kernel headers (tools/perf/../../include), else
# we're probably on a detached tarball, so no point in trying to check
@@ -83,7 +87,7 @@ for i in $HEADERS; do
done
# diff with extra ignore lines
-check arch/x86/lib/memcpy_64.S -I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>"
-check arch/x86/lib/memset_64.S -I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>"
-check include/uapi/asm-generic/mman.h -I "^#include <\(uapi/\)*asm-generic/mman-common.h>"
-check include/uapi/linux/mman.h -I "^#include <\(uapi/\)*asm/mman.h>"
+check arch/x86/lib/memcpy_64.S '-I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>"'
+check arch/x86/lib/memset_64.S '-I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>"'
+check include/uapi/asm-generic/mman.h '-I "^#include <\(uapi/\)*asm-generic/mman-common.h>"'
+check include/uapi/linux/mman.h '-I "^#include <\(uapi/\)*asm/mman.h>"'
diff --git a/tools/perf/examples/bpf/5sec.c b/tools/perf/examples/bpf/5sec.c
new file mode 100644
index 000000000000..b9c203219691
--- /dev/null
+++ b/tools/perf/examples/bpf/5sec.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ Description:
+
+ . Disable strace like syscall tracing (--no-syscalls), or try tracing
+ just some (-e *sleep).
+
+ . Attach a filter function to a kernel function, returning when it should
+ be considered, i.e. appear on the output.
+
+ . Run it system wide, so that any sleep of >= 5 seconds and < than 6
+ seconds gets caught.
+
+ . Ask for callgraphs using DWARF info, so that userspace can be unwound
+
+ . While this is running, run something like "sleep 5s".
+
+ . If we decide to add tv_nsec as well, then it becomes:
+
+ int probe(hrtimer_nanosleep, rqtp->tv_sec rqtp->tv_nsec)(void *ctx, int err, long sec, long nsec)
+
+ I.e. add where it comes from (rqtp->tv_nsec) and where it will be
+ accessible in the function body (nsec)
+
+ # perf trace --no-syscalls -e tools/perf/examples/bpf/5sec.c/call-graph=dwarf/
+ 0.000 perf_bpf_probe:func:(ffffffff9811b5f0) tv_sec=5
+ hrtimer_nanosleep ([kernel.kallsyms])
+ __x64_sys_nanosleep ([kernel.kallsyms])
+ do_syscall_64 ([kernel.kallsyms])
+ entry_SYSCALL_64 ([kernel.kallsyms])
+ __GI___nanosleep (/usr/lib64/libc-2.26.so)
+ rpl_nanosleep (/usr/bin/sleep)
+ xnanosleep (/usr/bin/sleep)
+ main (/usr/bin/sleep)
+ __libc_start_main (/usr/lib64/libc-2.26.so)
+ _start (/usr/bin/sleep)
+ ^C#
+
+ Copyright (C) 2018 Red Hat, Inc., Arnaldo Carvalho de Melo <acme@redhat.com>
+*/
+
+#include <bpf.h>
+
+int probe(hrtimer_nanosleep, rqtp->tv_sec)(void *ctx, int err, long sec)
+{
+ return sec == 5;
+}
+
+license(GPL);
diff --git a/tools/perf/examples/bpf/empty.c b/tools/perf/examples/bpf/empty.c
new file mode 100644
index 000000000000..3776d26db9e7
--- /dev/null
+++ b/tools/perf/examples/bpf/empty.c
@@ -0,0 +1,3 @@
+#include <bpf.h>
+
+license(GPL);
diff --git a/tools/perf/include/bpf/bpf.h b/tools/perf/include/bpf/bpf.h
new file mode 100644
index 000000000000..dd764ad5efdf
--- /dev/null
+++ b/tools/perf/include/bpf/bpf.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _PERF_BPF_H
+#define _PERF_BPF_H
+#define SEC(NAME) __attribute__((section(NAME), used))
+
+#define probe(function, vars) \
+ SEC(#function "=" #function " " #vars) function
+
+#define license(name) \
+char _license[] SEC("license") = #name; \
+int _version SEC("version") = LINUX_VERSION_CODE;
+
+#endif /* _PERF_BPF_H */
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 20a08cb32332..51c81509a315 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -238,7 +238,7 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
(*argc)--;
} else if (strstarts(cmd, CMD_DEBUGFS_DIR)) {
tracing_path_set(cmd + strlen(CMD_DEBUGFS_DIR));
- fprintf(stderr, "dir: %s\n", tracing_path);
+ fprintf(stderr, "dir: %s\n", tracing_path_mount());
if (envchanged)
*envchanged = 1;
} else if (!strcmp(cmd, "--list-cmds")) {
@@ -421,22 +421,11 @@ void pthread__unblock_sigwinch(void)
pthread_sigmask(SIG_UNBLOCK, &set, NULL);
}
-#ifdef _SC_LEVEL1_DCACHE_LINESIZE
-#define cache_line_size(cacheline_sizep) *cacheline_sizep = sysconf(_SC_LEVEL1_DCACHE_LINESIZE)
-#else
-static void cache_line_size(int *cacheline_sizep)
-{
- if (sysfs__read_int("devices/system/cpu/cpu0/cache/index0/coherency_line_size", cacheline_sizep))
- pr_debug("cannot determine cache line size");
-}
-#endif
-
int main(int argc, const char **argv)
{
int err;
const char *cmd;
char sbuf[STRERR_BUFSIZE];
- int value;
/* libsubcmd init */
exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT);
@@ -444,13 +433,6 @@ int main(int argc, const char **argv)
/* The page_size is placed in util object. */
page_size = sysconf(_SC_PAGE_SIZE);
- cache_line_size(&cacheline_size);
-
- if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0)
- sysctl_perf_event_max_stack = value;
-
- if (sysctl__read_int("kernel/perf_event_max_contexts_per_stack", &value) == 0)
- sysctl_perf_event_max_contexts_per_stack = value;
cmd = extract_argv0_path(argv[0]);
if (!cmd)
@@ -458,15 +440,11 @@ int main(int argc, const char **argv)
srandom(time(NULL));
- perf_config__init();
err = perf_config(perf_default_config, NULL);
if (err)
return err;
set_buildid_dir(NULL);
- /* get debugfs/tracefs mount point from /proc/mounts */
- tracing_path_mount();
-
/*
* "perf-xxxx" is the same as "perf xxxx", but we obviously:
*
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index cac8f8889bc3..2bde505e2e7e 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -654,6 +654,15 @@ static int perf_test__list(int argc, const char **argv)
continue;
pr_info("%2d: %s\n", i, t->desc);
+
+ if (t->subtest.get_nr) {
+ int subn = t->subtest.get_nr();
+ int subi;
+
+ for (subi = 0; subi < subn; subi++)
+ pr_info("%2d:%1d: %s\n", i, subi + 1,
+ t->subtest.get_desc(subi));
+ }
}
perf_test__list_shell(argc, argv, i);
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index 99936352df4f..afa4ce21ba7c 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -236,14 +236,13 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode,
pr_debug("Reading object code for memory address: %#"PRIx64"\n", addr);
- thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al);
- if (!al.map || !al.map->dso) {
+ if (!thread__find_map(thread, cpumode, addr, &al) || !al.map->dso) {
if (cpumode == PERF_RECORD_MISC_HYPERVISOR) {
pr_debug("Hypervisor address can not be resolved - skipping\n");
return 0;
}
- pr_debug("thread__find_addr_map failed\n");
+ pr_debug("thread__find_map failed\n");
return -1;
}
diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c
index f7c5b613d667..b889a28fd80b 100644
--- a/tools/perf/tests/hists_common.c
+++ b/tools/perf/tests/hists_common.c
@@ -131,20 +131,20 @@ struct machine *setup_fake_machine(struct machines *machines)
goto out;
/* emulate dso__load() */
- dso__set_loaded(dso, MAP__FUNCTION);
+ dso__set_loaded(dso);
for (k = 0; k < fake_symbols[i].nr_syms; k++) {
struct symbol *sym;
struct fake_sym *fsym = &fake_symbols[i].syms[k];
sym = symbol__new(fsym->start, fsym->length,
- STB_GLOBAL, fsym->name);
+ STB_GLOBAL, STT_FUNC, fsym->name);
if (sym == NULL) {
dso__put(dso);
goto out;
}
- symbols__insert(&dso->symbols[MAP__FUNCTION], sym);
+ symbols__insert(&dso->symbols, sym);
}
dso__put(dso);
diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c
index 868d82b501f4..b1af2499a3c9 100644
--- a/tools/perf/tests/mmap-thread-lookup.c
+++ b/tools/perf/tests/mmap-thread-lookup.c
@@ -188,9 +188,8 @@ static int mmap_events(synth_cb synth)
pr_debug("looking for map %p\n", td->map);
- thread__find_addr_map(thread,
- PERF_RECORD_MISC_USER, MAP__FUNCTION,
- (unsigned long) (td->map + 1), &al);
+ thread__find_map(thread, PERF_RECORD_MISC_USER,
+ (unsigned long) (td->map + 1), &al);
thread__put(thread);
@@ -218,7 +217,7 @@ static int mmap_events(synth_cb synth)
* perf_event__synthesize_threads (global)
*
* We test we can find all memory maps via:
- * thread__find_addr_map
+ * thread__find_map
*
* by using all thread objects.
*/
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 18b06444f230..b9ebe15afb13 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -1309,18 +1309,26 @@ static int test__checkevent_config_cache(struct perf_evlist *evlist)
return 0;
}
+static int test__intel_pt(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+ TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "intel_pt//u") == 0);
+ return 0;
+}
+
static int count_tracepoints(void)
{
struct dirent *events_ent;
DIR *events_dir;
int cnt = 0;
- events_dir = opendir(tracing_events_path);
+ events_dir = tracing_events__opendir();
TEST_ASSERT_VAL("Can't open events dir", events_dir);
while ((events_ent = readdir(events_dir))) {
- char sys_path[PATH_MAX];
+ char *sys_path;
struct dirent *sys_ent;
DIR *sys_dir;
@@ -1331,8 +1339,8 @@ static int count_tracepoints(void)
|| !strcmp(events_ent->d_name, "header_page"))
continue;
- scnprintf(sys_path, PATH_MAX, "%s/%s",
- tracing_events_path, events_ent->d_name);
+ sys_path = get_events_file(events_ent->d_name);
+ TEST_ASSERT_VAL("Can't get sys path", sys_path);
sys_dir = opendir(sys_path);
TEST_ASSERT_VAL("Can't open sys dir", sys_dir);
@@ -1348,6 +1356,7 @@ static int count_tracepoints(void)
}
closedir(sys_dir);
+ put_events_file(sys_path);
}
closedir(events_dir);
@@ -1637,6 +1646,11 @@ static struct evlist_test test__events[] = {
.check = test__checkevent_config_cache,
.id = 51,
},
+ {
+ .name = "intel_pt//u",
+ .check = test__intel_pt,
+ .id = 52,
+ },
};
static struct evlist_test test__events_pmu[] = {
diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
index ee86473643be..650b208f700f 100755
--- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
+++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
@@ -16,18 +16,18 @@ nm -g $libc 2>/dev/null | fgrep -q inet_pton || exit 254
trace_libc_inet_pton_backtrace() {
idx=0
expected[0]="ping[][0-9 \.:]+probe_libc:inet_pton: \([[:xdigit:]]+\)"
- expected[1]=".*inet_pton[[:space:]]\($libc|inlined\)$"
+ expected[1]=".*inet_pton\+0x[[:xdigit:]]+[[:space:]]\($libc|inlined\)$"
case "$(uname -m)" in
s390x)
eventattr='call-graph=dwarf,max-stack=4'
- expected[2]="gaih_inet.*[[:space:]]\($libc|inlined\)$"
- expected[3]="(__GI_)?getaddrinfo[[:space:]]\($libc|inlined\)$"
- expected[4]="main[[:space:]]\(.*/bin/ping.*\)$"
+ expected[2]="gaih_inet.*\+0x[[:xdigit:]]+[[:space:]]\($libc|inlined\)$"
+ expected[3]="(__GI_)?getaddrinfo\+0x[[:xdigit:]]+[[:space:]]\($libc|inlined\)$"
+ expected[4]="main\+0x[[:xdigit:]]+[[:space:]]\(.*/bin/ping.*\)$"
;;
*)
eventattr='max-stack=3'
- expected[2]="getaddrinfo[[:space:]]\($libc\)$"
- expected[3]=".*\(.*/bin/ping.*\)$"
+ expected[2]="getaddrinfo\+0x[[:xdigit:]]+[[:space:]]\($libc\)$"
+ expected[3]=".*\+0x[[:xdigit:]]+[[:space:]]\(.*/bin/ping.*\)$"
;;
esac
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index 1e5adb65632a..7691980b7df1 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -19,8 +19,7 @@ int test__vmlinux_matches_kallsyms(struct test *test __maybe_unused, int subtest
struct symbol *sym;
struct map *kallsyms_map, *vmlinux_map, *map;
struct machine kallsyms, vmlinux;
- enum map_type type = MAP__FUNCTION;
- struct maps *maps = &vmlinux.kmaps.maps[type];
+ struct maps *maps = machine__kernel_maps(&vmlinux);
u64 mem_start, mem_end;
bool header_printed;
@@ -56,7 +55,7 @@ int test__vmlinux_matches_kallsyms(struct test *test __maybe_unused, int subtest
* be compacted against the list of modules found in the "vmlinux"
* code and with the one got from /proc/modules from the "kallsyms" code.
*/
- if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type) <= 0) {
+ if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms") <= 0) {
pr_debug("dso__load_kallsyms ");
goto out;
}
@@ -94,7 +93,7 @@ int test__vmlinux_matches_kallsyms(struct test *test __maybe_unused, int subtest
* maps__reloc_vmlinux will notice and set proper ->[un]map_ip routines
* to fixup the symbols.
*/
- if (machine__load_vmlinux_path(&vmlinux, type) <= 0) {
+ if (machine__load_vmlinux_path(&vmlinux) <= 0) {
pr_debug("Couldn't find a vmlinux that matches the kernel running on this machine, skipping test\n");
err = TEST_SKIP;
goto out;
@@ -108,7 +107,7 @@ int test__vmlinux_matches_kallsyms(struct test *test __maybe_unused, int subtest
* in the kallsyms dso. For the ones that are in both, check its names and
* end addresses too.
*/
- for (nd = rb_first(&vmlinux_map->dso->symbols[type]); nd; nd = rb_next(nd)) {
+ map__for_each_symbol(vmlinux_map, sym, nd) {
struct symbol *pair, *first_pair;
sym = rb_entry(nd, struct symbol, rb_node);
@@ -119,8 +118,7 @@ int test__vmlinux_matches_kallsyms(struct test *test __maybe_unused, int subtest
mem_start = vmlinux_map->unmap_ip(vmlinux_map, sym->start);
mem_end = vmlinux_map->unmap_ip(vmlinux_map, sym->end);
- first_pair = machine__find_kernel_symbol(&kallsyms, type,
- mem_start, NULL);
+ first_pair = machine__find_kernel_symbol(&kallsyms, mem_start, NULL);
pair = first_pair;
if (pair && UM(pair->start) == mem_start) {
@@ -149,7 +147,7 @@ next_pair:
*/
continue;
} else {
- pair = machine__find_kernel_symbol_by_name(&kallsyms, type, sym->name, NULL);
+ pair = machine__find_kernel_symbol_by_name(&kallsyms, sym->name, NULL);
if (pair) {
if (UM(pair->start) == mem_start)
goto next_pair;
@@ -183,7 +181,7 @@ next_pair:
* so use the short name, less descriptive but the same ("[kernel]" in
* both cases.
*/
- pair = map_groups__find_by_name(&kallsyms.kmaps, type,
+ pair = map_groups__find_by_name(&kallsyms.kmaps,
(map->dso->kernel ?
map->dso->short_name :
map->dso->name));
@@ -206,7 +204,7 @@ next_pair:
mem_start = vmlinux_map->unmap_ip(vmlinux_map, map->start);
mem_end = vmlinux_map->unmap_ip(vmlinux_map, map->end);
- pair = map_groups__find(&kallsyms.kmaps, type, mem_start);
+ pair = map_groups__find(&kallsyms.kmaps, mem_start);
if (pair == NULL || pair->priv)
continue;
@@ -228,7 +226,7 @@ next_pair:
header_printed = false;
- maps = &kallsyms.kmaps.maps[type];
+ maps = machine__kernel_maps(&kallsyms);
for (map = maps__first(maps); map; map = map__next(map)) {
if (!map->priv) {
diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh
index 0be4138fbe71..f24722146ebe 100755
--- a/tools/perf/trace/beauty/prctl_option.sh
+++ b/tools/perf/trace/beauty/prctl_option.sh
@@ -1,6 +1,6 @@
#!/bin/sh
-header_dir=$1
+[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/
printf "static const char *prctl_options[] = {\n"
regex='^#define[[:space:]]+PR_([GS]ET\w+)[[:space:]]*([[:xdigit:]]+).*'
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 3781d74088a7..8be40fa903aa 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -695,6 +695,7 @@ static int annotate_browser__run(struct annotate_browser *browser,
"O Bump offset level (jump targets -> +call -> all -> cycle thru)\n"
"s Toggle source code view\n"
"t Circulate percent, total period, samples view\n"
+ "c Show min/max cycle\n"
"/ Search string\n"
"k Toggle line numbers\n"
"P Print to [symbol_name].annotation file.\n"
@@ -791,6 +792,13 @@ show_sup_ins:
notes->options->show_total_period = true;
annotation__update_column_widths(notes);
continue;
+ case 'c':
+ if (notes->options->show_minmax_cycle)
+ notes->options->show_minmax_cycle = false;
+ else
+ notes->options->show_minmax_cycle = true;
+ annotation__update_column_widths(notes);
+ continue;
case K_LEFT:
case K_ESC:
case 'q':
diff --git a/tools/perf/ui/browsers/map.c b/tools/perf/ui/browsers/map.c
index e03fa75f108a..5b8b8c637686 100644
--- a/tools/perf/ui/browsers/map.c
+++ b/tools/perf/ui/browsers/map.c
@@ -104,7 +104,7 @@ int map__browse(struct map *map)
{
struct map_browser mb = {
.b = {
- .entries = &map->dso->symbols[map->type],
+ .entries = &map->dso->symbols,
.refresh = ui_browser__rb_tree_refresh,
.seek = ui_browser__rb_tree_seek,
.write = map_browser__write,
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index 6832fcb2e6ff..c1eb476da91b 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -819,8 +819,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
}
if (h->ms.map == NULL && verbose > 1) {
- __map_groups__fprintf_maps(h->thread->mg,
- MAP__FUNCTION, fp);
+ map_groups__fprintf(h->thread->mg, fp);
fprintf(fp, "%.10s end\n", graph_dotted_line);
}
}
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 8052373bcd6a..5d4c45b76895 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -152,6 +152,8 @@ libperf-y += perf-hooks.o
libperf-$(CONFIG_CXX) += c++/
CFLAGS_config.o += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
+CFLAGS_llvm-utils.o += -DPERF_INCLUDE_DIR="BUILD_STR($(perf_include_dir_SQ))"
+
# avoid compiler warnings in 32-bit mode
CFLAGS_genelf_debug.o += -Wno-packed
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 5d74a30fe00f..71897689dacf 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -760,6 +760,15 @@ static int __symbol__account_cycles(struct annotation *notes,
ch[offset].num_aggr++;
ch[offset].cycles_aggr += cycles;
+ if (cycles > ch[offset].cycles_max)
+ ch[offset].cycles_max = cycles;
+
+ if (ch[offset].cycles_min) {
+ if (cycles && cycles < ch[offset].cycles_min)
+ ch[offset].cycles_min = cycles;
+ } else
+ ch[offset].cycles_min = cycles;
+
if (!have_start && ch[offset].have_start)
return 0;
if (ch[offset].num) {
@@ -953,8 +962,11 @@ void annotation__compute_ipc(struct annotation *notes, size_t size)
if (ch->have_start)
annotation__count_and_fill(notes, ch->start, offset, ch);
al = notes->offsets[offset];
- if (al && ch->num_aggr)
+ if (al && ch->num_aggr) {
al->cycles = ch->cycles_aggr / ch->num_aggr;
+ al->cycles_max = ch->cycles_max;
+ al->cycles_min = ch->cycles_min;
+ }
notes->have_cycles = true;
}
}
@@ -1953,6 +1965,7 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
u64 len;
int width = symbol_conf.show_total_period ? 12 : 8;
int graph_dotted_len;
+ char buf[512];
filename = strdup(dso->long_name);
if (!filename)
@@ -1965,8 +1978,11 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
len = symbol__size(sym);
- if (perf_evsel__is_group_event(evsel))
+ if (perf_evsel__is_group_event(evsel)) {
width *= evsel->nr_members;
+ perf_evsel__group_desc(evsel, buf, sizeof(buf));
+ evsel_name = buf;
+ }
graph_dotted_len = printf(" %-*.*s| Source code & Disassembly of %s for %s (%" PRIu64 " samples)\n",
width, width, symbol_conf.show_total_period ? "Period" :
@@ -2486,13 +2502,38 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati
else
obj__printf(obj, "%*s ", ANNOTATION__IPC_WIDTH - 1, "IPC");
- if (al->cycles)
- obj__printf(obj, "%*" PRIu64 " ",
+ if (!notes->options->show_minmax_cycle) {
+ if (al->cycles)
+ obj__printf(obj, "%*" PRIu64 " ",
ANNOTATION__CYCLES_WIDTH - 1, al->cycles);
- else if (!show_title)
- obj__printf(obj, "%*s", ANNOTATION__CYCLES_WIDTH, " ");
- else
- obj__printf(obj, "%*s ", ANNOTATION__CYCLES_WIDTH - 1, "Cycle");
+ else if (!show_title)
+ obj__printf(obj, "%*s",
+ ANNOTATION__CYCLES_WIDTH, " ");
+ else
+ obj__printf(obj, "%*s ",
+ ANNOTATION__CYCLES_WIDTH - 1,
+ "Cycle");
+ } else {
+ if (al->cycles) {
+ char str[32];
+
+ scnprintf(str, sizeof(str),
+ "%" PRIu64 "(%" PRIu64 "/%" PRIu64 ")",
+ al->cycles, al->cycles_min,
+ al->cycles_max);
+
+ obj__printf(obj, "%*s ",
+ ANNOTATION__MINMAX_CYCLES_WIDTH - 1,
+ str);
+ } else if (!show_title)
+ obj__printf(obj, "%*s",
+ ANNOTATION__MINMAX_CYCLES_WIDTH,
+ " ");
+ else
+ obj__printf(obj, "%*s ",
+ ANNOTATION__MINMAX_CYCLES_WIDTH - 1,
+ "Cycle(min/max)");
+ }
}
obj__printf(obj, " ");
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index f28a9e43421d..5080b6dd98b8 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -61,6 +61,7 @@ bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2);
#define ANNOTATION__IPC_WIDTH 6
#define ANNOTATION__CYCLES_WIDTH 6
+#define ANNOTATION__MINMAX_CYCLES_WIDTH 19
struct annotation_options {
bool hide_src_code,
@@ -69,7 +70,8 @@ struct annotation_options {
show_linenr,
show_nr_jumps,
show_nr_samples,
- show_total_period;
+ show_total_period,
+ show_minmax_cycle;
u8 offset_level;
};
@@ -105,6 +107,8 @@ struct annotation_line {
int jump_sources;
float ipc;
u64 cycles;
+ u64 cycles_max;
+ u64 cycles_min;
size_t privsize;
char *path;
u32 idx;
@@ -186,6 +190,8 @@ struct cyc_hist {
u64 start;
u64 cycles;
u64 cycles_aggr;
+ u64 cycles_max;
+ u64 cycles_min;
u32 num;
u32 num_aggr;
u8 have_start;
@@ -239,6 +245,9 @@ struct annotation {
static inline int annotation__cycles_width(struct annotation *notes)
{
+ if (notes->have_cycles && notes->options->show_minmax_cycle)
+ return ANNOTATION__IPC_WIDTH + ANNOTATION__MINMAX_CYCLES_WIDTH;
+
return notes->have_cycles ? ANNOTATION__IPC_WIDTH + ANNOTATION__CYCLES_WIDTH : 0;
}
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index 857de69a5361..d056447520a2 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -1679,7 +1679,7 @@ struct sym_args {
static bool kern_sym_match(struct sym_args *args, const char *name, char type)
{
/* A function with the same name, and global or the n'th found or any */
- return symbol_type__is_a(type, MAP__FUNCTION) &&
+ return kallsyms__is_function(type) &&
!strcmp(name, args->name) &&
((args->global && isupper(type)) ||
(args->selected && ++(args->cnt) == args->idx) ||
@@ -1784,7 +1784,7 @@ static int find_entire_kern_cb(void *arg, const char *name __maybe_unused,
{
struct sym_args *args = arg;
- if (!symbol_type__is_a(type, MAP__FUNCTION))
+ if (!kallsyms__is_function(type))
return 0;
if (!args->started) {
@@ -1915,7 +1915,7 @@ static void print_duplicate_syms(struct dso *dso, const char *sym_name)
pr_err("Multiple symbols with name '%s'\n", sym_name);
- sym = dso__first_symbol(dso, MAP__FUNCTION);
+ sym = dso__first_symbol(dso);
while (sym) {
if (dso_sym_match(sym, sym_name, &cnt, -1)) {
pr_err("#%d\t0x%"PRIx64"\t%c\t%s\n",
@@ -1945,7 +1945,7 @@ static int find_dso_sym(struct dso *dso, const char *sym_name, u64 *start,
*start = 0;
*size = 0;
- sym = dso__first_symbol(dso, MAP__FUNCTION);
+ sym = dso__first_symbol(dso);
while (sym) {
if (*start) {
if (!*size)
@@ -1972,8 +1972,8 @@ static int find_dso_sym(struct dso *dso, const char *sym_name, u64 *start,
static int addr_filter__entire_dso(struct addr_filter *filt, struct dso *dso)
{
- struct symbol *first_sym = dso__first_symbol(dso, MAP__FUNCTION);
- struct symbol *last_sym = dso__last_symbol(dso, MAP__FUNCTION);
+ struct symbol *first_sym = dso__first_symbol(dso);
+ struct symbol *last_sym = dso__last_symbol(dso);
if (!first_sym || !last_sym) {
pr_err("Failed to determine filter for %s\nNo symbols found.\n",
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 537eadd81914..04b1d53e4bf9 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -47,9 +47,7 @@ int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused,
return -1;
}
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, sample->ip, &al);
-
- if (al.map != NULL)
+ if (thread__find_map(thread, sample->cpumode, sample->ip, &al))
al.map->dso->hit = 1;
thread__put(thread);
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 84eb9393c7db..5ac157056cdf 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -707,6 +707,14 @@ struct perf_config_set *perf_config_set__new(void)
return set;
}
+static int perf_config__init(void)
+{
+ if (config_set == NULL)
+ config_set = perf_config_set__new();
+
+ return config_set == NULL;
+}
+
int perf_config(config_fn_t fn, void *data)
{
int ret = 0;
@@ -714,7 +722,7 @@ int perf_config(config_fn_t fn, void *data)
struct perf_config_section *section;
struct perf_config_item *item;
- if (config_set == NULL)
+ if (config_set == NULL && perf_config__init())
return -1;
perf_config_set__for_each_entry(config_set, section, item) {
@@ -735,12 +743,6 @@ int perf_config(config_fn_t fn, void *data)
return ret;
}
-void perf_config__init(void)
-{
- if (config_set == NULL)
- config_set = perf_config_set__new();
-}
-
void perf_config__exit(void)
{
perf_config_set__delete(config_set);
diff --git a/tools/perf/util/config.h b/tools/perf/util/config.h
index baf82bf227ac..bd0a5897c76a 100644
--- a/tools/perf/util/config.h
+++ b/tools/perf/util/config.h
@@ -38,7 +38,6 @@ struct perf_config_set *perf_config_set__new(void);
void perf_config_set__delete(struct perf_config_set *set);
int perf_config_set__collect(struct perf_config_set *set, const char *file_name,
const char *var, const char *value);
-void perf_config__init(void);
void perf_config__exit(void);
void perf_config__refresh(void);
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index bf16dc9ee507..822ba915d144 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -270,9 +270,7 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u64 address,
thread = etmq->etm->unknown_thread;
}
- thread__find_addr_map(thread, cpumode, MAP__FUNCTION, address, &al);
-
- if (!al.map || !al.map->dso)
+ if (!thread__find_map(thread, cpumode, address, &al) || !al.map->dso)
return 0;
if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR &&
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c
index b0c2b5c5d337..7123746edcf4 100644
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -247,9 +247,9 @@ static int db_ids_from_al(struct db_export *dbe, struct addr_location *al,
*dso_db_id = dso->db_id;
if (!al->sym) {
- al->sym = symbol__new(al->addr, 0, 0, "unknown");
+ al->sym = symbol__new(al->addr, 0, 0, 0, "unknown");
if (al->sym)
- dso__insert_symbol(dso, al->map->type, al->sym);
+ dso__insert_symbol(dso, al->sym);
}
if (al->sym) {
@@ -315,8 +315,7 @@ static struct call_path *call_path_from_sample(struct db_export *dbe,
al.addr = node->ip;
if (al.map && !al.sym)
- al.sym = dso__find_symbol(al.map->dso, MAP__FUNCTION,
- al.addr);
+ al.sym = dso__find_symbol(al.map->dso, al.addr);
db_ids_from_al(dbe, &al, &dso_db_id, &sym_db_id, &offset);
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 36ef45b2e89d..cdfc2e5f55f5 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1014,7 +1014,7 @@ struct map *dso__new_map(const char *name)
struct dso *dso = dso__new(name);
if (dso)
- map = map__new2(0, dso, MAP__FUNCTION);
+ map = map__new2(0, dso);
return map;
}
@@ -1176,19 +1176,19 @@ int dso__name_len(const struct dso *dso)
return dso->short_name_len;
}
-bool dso__loaded(const struct dso *dso, enum map_type type)
+bool dso__loaded(const struct dso *dso)
{
- return dso->loaded & (1 << type);
+ return dso->loaded;
}
-bool dso__sorted_by_name(const struct dso *dso, enum map_type type)
+bool dso__sorted_by_name(const struct dso *dso)
{
- return dso->sorted_by_name & (1 << type);
+ return dso->sorted_by_name;
}
-void dso__set_sorted_by_name(struct dso *dso, enum map_type type)
+void dso__set_sorted_by_name(struct dso *dso)
{
- dso->sorted_by_name |= (1 << type);
+ dso->sorted_by_name = true;
}
struct dso *dso__new(const char *name)
@@ -1196,12 +1196,10 @@ struct dso *dso__new(const char *name)
struct dso *dso = calloc(1, sizeof(*dso) + strlen(name) + 1);
if (dso != NULL) {
- int i;
strcpy(dso->name, name);
dso__set_long_name(dso, dso->name, false);
dso__set_short_name(dso, dso->name, false);
- for (i = 0; i < MAP__NR_TYPES; ++i)
- dso->symbols[i] = dso->symbol_names[i] = RB_ROOT;
+ dso->symbols = dso->symbol_names = RB_ROOT;
dso->data.cache = RB_ROOT;
dso->inlined_nodes = RB_ROOT;
dso->srclines = RB_ROOT;
@@ -1231,8 +1229,6 @@ struct dso *dso__new(const char *name)
void dso__delete(struct dso *dso)
{
- int i;
-
if (!RB_EMPTY_NODE(&dso->rb_node))
pr_err("DSO %s is still in rbtree when being deleted!\n",
dso->long_name);
@@ -1240,8 +1236,7 @@ void dso__delete(struct dso *dso)
/* free inlines first, as they reference symbols */
inlines__tree_delete(&dso->inlined_nodes);
srcline__tree_delete(&dso->srclines);
- for (i = 0; i < MAP__NR_TYPES; ++i)
- symbols__delete(&dso->symbols[i]);
+ symbols__delete(&dso->symbols);
if (dso->short_name_allocated) {
zfree((char **)&dso->short_name);
@@ -1451,9 +1446,7 @@ size_t __dsos__fprintf(struct list_head *head, FILE *fp)
size_t ret = 0;
list_for_each_entry(pos, head, node) {
- int i;
- for (i = 0; i < MAP__NR_TYPES; ++i)
- ret += dso__fprintf(pos, i, fp);
+ ret += dso__fprintf(pos, fp);
}
return ret;
@@ -1467,18 +1460,17 @@ size_t dso__fprintf_buildid(struct dso *dso, FILE *fp)
return fprintf(fp, "%s", sbuild_id);
}
-size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp)
+size_t dso__fprintf(struct dso *dso, FILE *fp)
{
struct rb_node *nd;
size_t ret = fprintf(fp, "dso: %s (", dso->short_name);
if (dso->short_name != dso->long_name)
ret += fprintf(fp, "%s, ", dso->long_name);
- ret += fprintf(fp, "%s, %sloaded, ", map_type__name[type],
- dso__loaded(dso, type) ? "" : "NOT ");
+ ret += fprintf(fp, "%sloaded, ", dso__loaded(dso) ? "" : "NOT ");
ret += dso__fprintf_buildid(dso, fp);
ret += fprintf(fp, ")\n");
- for (nd = rb_first(&dso->symbols[type]); nd; nd = rb_next(nd)) {
+ for (nd = rb_first(&dso->symbols); nd; nd = rb_next(nd)) {
struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
ret += symbol__fprintf(pos, fp);
}
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index c229dbe0277a..ef69de2e69ea 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -140,14 +140,14 @@ struct dso {
struct list_head node;
struct rb_node rb_node; /* rbtree node sorted by long name */
struct rb_root *root; /* root of rbtree that rb_node is in */
- struct rb_root symbols[MAP__NR_TYPES];
- struct rb_root symbol_names[MAP__NR_TYPES];
+ struct rb_root symbols;
+ struct rb_root symbol_names;
struct rb_root inlined_nodes;
struct rb_root srclines;
struct {
u64 addr;
struct symbol *symbol;
- } last_find_result[MAP__NR_TYPES];
+ } last_find_result;
void *a2l;
char *symsrc_filename;
unsigned int a2l_fails;
@@ -164,8 +164,8 @@ struct dso {
u8 short_name_allocated:1;
u8 long_name_allocated:1;
u8 is_64_bit:1;
- u8 sorted_by_name;
- u8 loaded;
+ bool sorted_by_name;
+ bool loaded;
u8 rel;
u8 build_id[BUILD_ID_SIZE];
u64 text_offset;
@@ -202,14 +202,13 @@ struct dso {
* @dso: the 'struct dso *' in which symbols itereated
* @pos: the 'struct symbol *' to use as a loop cursor
* @n: the 'struct rb_node *' to use as a temporary storage
- * @type: the 'enum map_type' type of symbols
*/
-#define dso__for_each_symbol(dso, pos, n, type) \
- symbols__for_each_entry(&(dso)->symbols[(type)], pos, n)
+#define dso__for_each_symbol(dso, pos, n) \
+ symbols__for_each_entry(&(dso)->symbols, pos, n)
-static inline void dso__set_loaded(struct dso *dso, enum map_type type)
+static inline void dso__set_loaded(struct dso *dso)
{
- dso->loaded |= (1 << type);
+ dso->loaded = true;
}
struct dso *dso__new(const char *name);
@@ -231,11 +230,16 @@ static inline void __dso__zput(struct dso **dso)
#define dso__zput(dso) __dso__zput(&dso)
-bool dso__loaded(const struct dso *dso, enum map_type type);
+bool dso__loaded(const struct dso *dso);
-bool dso__sorted_by_name(const struct dso *dso, enum map_type type);
-void dso__set_sorted_by_name(struct dso *dso, enum map_type type);
-void dso__sort_by_name(struct dso *dso, enum map_type type);
+static inline bool dso__has_symbols(const struct dso *dso)
+{
+ return !RB_EMPTY_ROOT(&dso->symbols);
+}
+
+bool dso__sorted_by_name(const struct dso *dso);
+void dso__set_sorted_by_name(struct dso *dso);
+void dso__sort_by_name(struct dso *dso);
void dso__set_build_id(struct dso *dso, void *build_id);
bool dso__build_id_equal(const struct dso *dso, u8 *build_id);
@@ -349,9 +353,8 @@ size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
size_t __dsos__fprintf(struct list_head *head, FILE *fp);
size_t dso__fprintf_buildid(struct dso *dso, FILE *fp);
-size_t dso__fprintf_symbols_by_name(struct dso *dso,
- enum map_type type, FILE *fp);
-size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp);
+size_t dso__fprintf_symbols_by_name(struct dso *dso, FILE *fp);
+size_t dso__fprintf(struct dso *dso, FILE *fp);
static inline bool dso__is_vmlinux(struct dso *dso)
{
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 4c842762e3f2..59f38c7693f8 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -93,6 +93,37 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
}
+static int perf_env__read_arch(struct perf_env *env)
+{
+ struct utsname uts;
+
+ if (env->arch)
+ return 0;
+
+ if (!uname(&uts))
+ env->arch = strdup(uts.machine);
+
+ return env->arch ? 0 : -ENOMEM;
+}
+
+static int perf_env__read_nr_cpus_avail(struct perf_env *env)
+{
+ if (env->nr_cpus_avail == 0)
+ env->nr_cpus_avail = cpu__max_present_cpu();
+
+ return env->nr_cpus_avail ? 0 : -ENOENT;
+}
+
+const char *perf_env__raw_arch(struct perf_env *env)
+{
+ return env && !perf_env__read_arch(env) ? env->arch : "unknown";
+}
+
+int perf_env__nr_cpus_avail(struct perf_env *env)
+{
+ return env && !perf_env__read_nr_cpus_avail(env) ? env->nr_cpus_avail : 0;
+}
+
void cpu_cache_level__free(struct cpu_cache_level *cache)
{
free(cache->type);
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index c4ef2e523367..1f3ccc368530 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -76,4 +76,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env);
void cpu_cache_level__free(struct cpu_cache_level *cache);
const char *perf_env__arch(struct perf_env *env);
+const char *perf_env__raw_arch(struct perf_env *env);
+int perf_env__nr_cpus_avail(struct perf_env *env);
+
#endif /* __PERF_ENV_H */
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 98ff3a6a3d50..0c8ecf0c78a4 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -88,10 +88,10 @@ static const char *perf_ns__name(unsigned int id)
return perf_ns__names[id];
}
-static int perf_tool__process_synth_event(struct perf_tool *tool,
- union perf_event *event,
- struct machine *machine,
- perf_event__handler_t process)
+int perf_tool__process_synth_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct machine *machine,
+ perf_event__handler_t process)
{
struct perf_sample synth_sample = {
.pid = -1,
@@ -464,8 +464,7 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
{
int rc = 0;
struct map *pos;
- struct map_groups *kmaps = &machine->kmaps;
- struct maps *maps = &kmaps->maps[MAP__FUNCTION];
+ struct maps *maps = machine__kernel_maps(machine);
union perf_event *event = zalloc((sizeof(event->mmap) +
machine->id_hdr_size));
if (event == NULL) {
@@ -488,7 +487,7 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
for (pos = maps__first(maps); pos; pos = map__next(pos)) {
size_t size;
- if (__map__is_kernel(pos))
+ if (!__map__is_kmodule(pos))
continue;
size = PERF_ALIGN(pos->dso->long_name_len + 1, sizeof(u64));
@@ -869,7 +868,7 @@ static int find_symbol_cb(void *arg, const char *name, char type,
* Must be a function or at least an alias, as in PARISC64, where "_text" is
* an 'A' to the same address as "_stext".
*/
- if (!(symbol_type__is_a(type, MAP__FUNCTION) ||
+ if (!(kallsyms__is_function(type) ||
type == 'A') || strcmp(name, args->name))
return 0;
@@ -889,9 +888,16 @@ int kallsyms__get_function_start(const char *kallsyms_filename,
return 0;
}
-int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
- perf_event__handler_t process,
- struct machine *machine)
+int __weak perf_event__synthesize_extra_kmaps(struct perf_tool *tool __maybe_unused,
+ perf_event__handler_t process __maybe_unused,
+ struct machine *machine __maybe_unused)
+{
+ return 0;
+}
+
+static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
+ perf_event__handler_t process,
+ struct machine *machine)
{
size_t size;
struct map *map = machine__kernel_map(machine);
@@ -944,6 +950,19 @@ int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
return err;
}
+int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
+ perf_event__handler_t process,
+ struct machine *machine)
+{
+ int err;
+
+ err = __perf_event__synthesize_kernel_mmap(tool, process, machine);
+ if (err < 0)
+ return err;
+
+ return perf_event__synthesize_extra_kmaps(tool, process, machine);
+}
+
int perf_event__synthesize_thread_map2(struct perf_tool *tool,
struct thread_map *threads,
perf_event__handler_t process,
@@ -1489,9 +1508,8 @@ int perf_event__process(struct perf_tool *tool __maybe_unused,
return machine__process_event(machine, event, sample);
}
-void thread__find_addr_map(struct thread *thread, u8 cpumode,
- enum map_type type, u64 addr,
- struct addr_location *al)
+struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr,
+ struct addr_location *al)
{
struct map_groups *mg = thread->mg;
struct machine *machine = mg->machine;
@@ -1505,7 +1523,7 @@ void thread__find_addr_map(struct thread *thread, u8 cpumode,
if (machine == NULL) {
al->map = NULL;
- return;
+ return NULL;
}
if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) {
@@ -1533,10 +1551,10 @@ void thread__find_addr_map(struct thread *thread, u8 cpumode,
!perf_host)
al->filtered |= (1 << HIST_FILTER__HOST);
- return;
+ return NULL;
}
try_again:
- al->map = map_groups__find(mg, type, al->addr);
+ al->map = map_groups__find(mg, al->addr);
if (al->map == NULL) {
/*
* If this is outside of all known maps, and is a negative
@@ -1563,17 +1581,17 @@ try_again:
map__load(al->map);
al->addr = al->map->map_ip(al->map, al->addr);
}
+
+ return al->map;
}
-void thread__find_addr_location(struct thread *thread,
- u8 cpumode, enum map_type type, u64 addr,
- struct addr_location *al)
+struct symbol *thread__find_symbol(struct thread *thread, u8 cpumode,
+ u64 addr, struct addr_location *al)
{
- thread__find_addr_map(thread, cpumode, type, addr, al);
- if (al->map != NULL)
+ al->sym = NULL;
+ if (thread__find_map(thread, cpumode, addr, al))
al->sym = map__find_symbol(al->map, al->addr);
- else
- al->sym = NULL;
+ return al->sym;
}
/*
@@ -1590,7 +1608,7 @@ int machine__resolve(struct machine *machine, struct addr_location *al,
return -1;
dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread->tid);
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, sample->ip, al);
+ thread__find_map(thread, sample->cpumode, sample->ip, al);
dump_printf(" ...... dso: %s\n",
al->map ? al->map->dso->long_name :
al->level == 'H' ? "[hypervisor]" : "<not found>");
@@ -1669,10 +1687,7 @@ bool sample_addr_correlates_sym(struct perf_event_attr *attr)
void thread__resolve(struct thread *thread, struct addr_location *al,
struct perf_sample *sample)
{
- thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, sample->addr, al);
- if (!al->map)
- thread__find_addr_map(thread, sample->cpumode, MAP__VARIABLE,
- sample->addr, al);
+ thread__find_map(thread, sample->cpumode, sample->addr, al);
al->cpu = sample->cpu;
al->sym = NULL;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 0f794744919c..bfa60bcafbde 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -750,6 +750,10 @@ int perf_event__process_exit(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
struct machine *machine);
+int perf_tool__process_synth_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct machine *machine,
+ perf_event__handler_t process);
int perf_event__process(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -796,6 +800,10 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
bool mmap_data,
unsigned int proc_map_timeout);
+int perf_event__synthesize_extra_kmaps(struct perf_tool *tool,
+ perf_event__handler_t process,
+ struct machine *machine);
+
size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_mmap2(union perf_event *event, FILE *fp);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index a59281d64368..e7a4b31a84fb 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1795,3 +1795,18 @@ bool perf_evlist__exclude_kernel(struct perf_evlist *evlist)
return true;
}
+
+/*
+ * Events in data file are not collect in groups, but we still want
+ * the group display. Set the artificial group and set the leader's
+ * forced_leader flag to notify the display code.
+ */
+void perf_evlist__force_leader(struct perf_evlist *evlist)
+{
+ if (!evlist->nr_groups) {
+ struct perf_evsel *leader = perf_evlist__first(evlist);
+
+ perf_evlist__set_leader(evlist);
+ leader->forced_leader = true;
+ }
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 6c41b2f78713..dc66436add98 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -309,4 +309,7 @@ struct perf_evsel *perf_evlist__event2evsel(struct perf_evlist *evlist,
union perf_event *event);
bool perf_evlist__exclude_kernel(struct perf_evlist *evlist);
+
+void perf_evlist__force_leader(struct perf_evlist *evlist);
+
#endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 4cd2cf93f726..150db5ed7400 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2862,7 +2862,7 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
return scnprintf(msg, size,
"Not enough memory to setup event with callchain.\n"
"Hint: Try tweaking /proc/sys/kernel/perf_event_max_stack\n"
- "Hint: Current value: %d", sysctl_perf_event_max_stack);
+ "Hint: Current value: %d", sysctl__max_stack());
break;
case ENODEV:
if (target->cpu_list)
diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c
index c540d47583e7..aafbe54fd3fa 100644
--- a/tools/perf/util/genelf.c
+++ b/tools/perf/util/genelf.c
@@ -114,7 +114,7 @@ gen_build_id(struct buildid_note *note,
fd = open("/dev/urandom", O_RDONLY);
if (fd == -1)
- err(1, "cannot access /dev/urandom for builid");
+ err(1, "cannot access /dev/urandom for buildid");
sret = read(fd, note->build_id, sz);
diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c
index 72db2744876d..7f0c83b6332b 100644
--- a/tools/perf/util/intel-bts.c
+++ b/tools/perf/util/intel-bts.c
@@ -335,8 +335,7 @@ static int intel_bts_get_next_insn(struct intel_bts_queue *btsq, u64 ip)
if (!thread)
return -1;
- thread__find_addr_map(thread, cpumode, MAP__FUNCTION, ip, &al);
- if (!al.map || !al.map->dso)
+ if (!thread__find_map(thread, cpumode, ip, &al) || !al.map->dso)
goto out_put;
len = dso__data_read_addr(al.map->dso, al.map, machine, ip, buf,
diff --git a/tools/perf/util/intel-pt-decoder/insn.h b/tools/perf/util/intel-pt-decoder/insn.h
index e23578c7b1be..2669c9f748e4 100644
--- a/tools/perf/util/intel-pt-decoder/insn.h
+++ b/tools/perf/util/intel-pt-decoder/insn.h
@@ -208,4 +208,22 @@ static inline int insn_offset_immediate(struct insn *insn)
return insn_offset_displacement(insn) + insn->displacement.nbytes;
}
+#define POP_SS_OPCODE 0x1f
+#define MOV_SREG_OPCODE 0x8e
+
+/*
+ * Intel SDM Vol.3A 6.8.3 states;
+ * "Any single-step trap that would be delivered following the MOV to SS
+ * instruction or POP to SS instruction (because EFLAGS.TF is 1) is
+ * suppressed."
+ * This function returns true if @insn is MOV SS or POP SS. On these
+ * instructions, single stepping is suppressed.
+ */
+static inline int insn_masking_exception(struct insn *insn)
+{
+ return insn->opcode.bytes[0] == POP_SS_OPCODE ||
+ (insn->opcode.bytes[0] == MOV_SREG_OPCODE &&
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 2);
+}
+
#endif /* _ASM_X86_INSN_H */
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 0effaff57020..492986a25ef6 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -442,8 +442,7 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn,
}
while (1) {
- thread__find_addr_map(thread, cpumode, MAP__FUNCTION, *ip, &al);
- if (!al.map || !al.map->dso)
+ if (!thread__find_map(thread, cpumode, *ip, &al) || !al.map->dso)
return -EINVAL;
if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR &&
@@ -596,8 +595,7 @@ static int __intel_pt_pgd_ip(uint64_t ip, void *data)
if (!thread)
return -EINVAL;
- thread__find_addr_map(thread, cpumode, MAP__FUNCTION, ip, &al);
- if (!al.map || !al.map->dso)
+ if (!thread__find_map(thread, cpumode, ip, &al) || !al.map->dso)
return -EINVAL;
offset = al.map->map_ip(al.map, ip);
@@ -1565,7 +1563,7 @@ static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip)
if (map__load(map))
return 0;
- start = dso__first_symbol(map->dso, MAP__FUNCTION);
+ start = dso__first_symbol(map->dso);
for (sym = start; sym; sym = dso__next_symbol(sym)) {
if (sym->binding == STB_GLOBAL &&
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index 1cca0a2fa641..976e658e38dc 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -14,11 +14,12 @@
#include "config.h"
#include "util.h"
#include <sys/wait.h>
+#include <subcmd/exec-cmd.h>
#define CLANG_BPF_CMD_DEFAULT_TEMPLATE \
"$CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS "\
"-DLINUX_VERSION_CODE=$LINUX_VERSION_CODE " \
- "$CLANG_OPTIONS $KERNEL_INC_OPTIONS " \
+ "$CLANG_OPTIONS $KERNEL_INC_OPTIONS $PERF_BPF_INC_OPTIONS " \
"-Wno-unused-value -Wno-pointer-sign " \
"-working-directory $WORKING_DIR " \
"-c \"$CLANG_SOURCE\" -target bpf -O2 -o -"
@@ -212,7 +213,7 @@ version_notice(void)
" \t\thttp://llvm.org/apt\n\n"
" \tIf you are using old version of clang, change 'clang-bpf-cmd-template'\n"
" \toption in [llvm] section of ~/.perfconfig to:\n\n"
-" \t \"$CLANG_EXEC $CLANG_OPTIONS $KERNEL_INC_OPTIONS \\\n"
+" \t \"$CLANG_EXEC $CLANG_OPTIONS $KERNEL_INC_OPTIONS $PERF_BPF_INC_OPTIONS \\\n"
" \t -working-directory $WORKING_DIR -c $CLANG_SOURCE \\\n"
" \t -emit-llvm -o - | /path/to/llc -march=bpf -filetype=obj -o -\"\n"
" \t(Replace /path/to/llc with path to your llc)\n\n"
@@ -431,9 +432,11 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf,
const char *clang_opt = llvm_param.clang_opt;
char clang_path[PATH_MAX], abspath[PATH_MAX], nr_cpus_avail_str[64];
char serr[STRERR_BUFSIZE];
- char *kbuild_dir = NULL, *kbuild_include_opts = NULL;
+ char *kbuild_dir = NULL, *kbuild_include_opts = NULL,
+ *perf_bpf_include_opts = NULL;
const char *template = llvm_param.clang_bpf_cmd_template;
- char *command_echo, *command_out;
+ char *command_echo = NULL, *command_out;
+ char *perf_include_dir = system_path(PERF_INCLUDE_DIR);
if (path[0] != '-' && realpath(path, abspath) == NULL) {
err = errno;
@@ -471,12 +474,14 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf,
snprintf(linux_version_code_str, sizeof(linux_version_code_str),
"0x%x", kernel_version);
-
+ if (asprintf(&perf_bpf_include_opts, "-I%s/bpf", perf_include_dir) < 0)
+ goto errout;
force_set_env("NR_CPUS", nr_cpus_avail_str);
force_set_env("LINUX_VERSION_CODE", linux_version_code_str);
force_set_env("CLANG_EXEC", clang_path);
force_set_env("CLANG_OPTIONS", clang_opt);
force_set_env("KERNEL_INC_OPTIONS", kbuild_include_opts);
+ force_set_env("PERF_BPF_INC_OPTIONS", perf_bpf_include_opts);
force_set_env("WORKING_DIR", kbuild_dir ? : ".");
/*
@@ -512,6 +517,8 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf,
free(command_out);
free(kbuild_dir);
free(kbuild_include_opts);
+ free(perf_bpf_include_opts);
+ free(perf_include_dir);
if (!p_obj_buf)
free(obj_buf);
@@ -526,6 +533,8 @@ errout:
free(kbuild_dir);
free(kbuild_include_opts);
free(obj_buf);
+ free(perf_bpf_include_opts);
+ free(perf_include_dir);
if (p_obj_buf)
*p_obj_buf = NULL;
if (p_obj_buf_sz)
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 32d50492505d..e7b4a8b513f2 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -24,6 +24,7 @@
#include "sane_ctype.h"
#include <symbol/kallsyms.h>
+#include <linux/mman.h>
static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock);
@@ -81,8 +82,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
machine->kptr_restrict_warned = false;
machine->comm_exec = false;
machine->kernel_start = 0;
-
- memset(machine->vmlinux_maps, 0, sizeof(machine->vmlinux_maps));
+ machine->vmlinux_map = NULL;
machine->root_dir = strdup(root_dir);
if (machine->root_dir == NULL)
@@ -137,13 +137,11 @@ struct machine *machine__new_kallsyms(void)
struct machine *machine = machine__new_host();
/*
* FIXME:
- * 1) MAP__FUNCTION will go away when we stop loading separate maps for
- * functions and data objects.
- * 2) We should switch to machine__load_kallsyms(), i.e. not explicitely
+ * 1) We should switch to machine__load_kallsyms(), i.e. not explicitely
* ask for not using the kcore parsing code, once this one is fixed
* to create a map per module.
*/
- if (machine && machine__load_kallsyms(machine, "/proc/kallsyms", MAP__FUNCTION) <= 0) {
+ if (machine && machine__load_kallsyms(machine, "/proc/kallsyms") <= 0) {
machine__delete(machine);
machine = NULL;
}
@@ -673,8 +671,7 @@ struct map *machine__findnew_module_map(struct machine *machine, u64 start,
if (kmod_path__parse_name(&m, filename))
return NULL;
- map = map_groups__find_by_name(&machine->kmaps, MAP__FUNCTION,
- m.name);
+ map = map_groups__find_by_name(&machine->kmaps, m.name);
if (map) {
/*
* If the map's dso is an offline module, give dso__load()
@@ -689,7 +686,7 @@ struct map *machine__findnew_module_map(struct machine *machine, u64 start,
if (dso == NULL)
goto out;
- map = map__new2(start, dso, MAP__FUNCTION);
+ map = map__new2(start, dso);
if (map == NULL)
goto out;
@@ -810,8 +807,8 @@ struct process_args {
u64 start;
};
-static void machine__get_kallsyms_filename(struct machine *machine, char *buf,
- size_t bufsz)
+void machine__get_kallsyms_filename(struct machine *machine, char *buf,
+ size_t bufsz)
{
if (machine__is_default_guest(machine))
scnprintf(buf, bufsz, "%s", symbol_conf.default_guest_kallsyms);
@@ -854,65 +851,171 @@ static int machine__get_running_kernel_start(struct machine *machine,
return 0;
}
+int machine__create_extra_kernel_map(struct machine *machine,
+ struct dso *kernel,
+ struct extra_kernel_map *xm)
+{
+ struct kmap *kmap;
+ struct map *map;
+
+ map = map__new2(xm->start, kernel);
+ if (!map)
+ return -1;
+
+ map->end = xm->end;
+ map->pgoff = xm->pgoff;
+
+ kmap = map__kmap(map);
+
+ kmap->kmaps = &machine->kmaps;
+ strlcpy(kmap->name, xm->name, KMAP_NAME_LEN);
+
+ map_groups__insert(&machine->kmaps, map);
+
+ pr_debug2("Added extra kernel map %s %" PRIx64 "-%" PRIx64 "\n",
+ kmap->name, map->start, map->end);
+
+ map__put(map);
+
+ return 0;
+}
+
+static u64 find_entry_trampoline(struct dso *dso)
+{
+ /* Duplicates are removed so lookup all aliases */
+ const char *syms[] = {
+ "_entry_trampoline",
+ "__entry_trampoline_start",
+ "entry_SYSCALL_64_trampoline",
+ };
+ struct symbol *sym = dso__first_symbol(dso);
+ unsigned int i;
+
+ for (; sym; sym = dso__next_symbol(sym)) {
+ if (sym->binding != STB_GLOBAL)
+ continue;
+ for (i = 0; i < ARRAY_SIZE(syms); i++) {
+ if (!strcmp(sym->name, syms[i]))
+ return sym->start;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * These values can be used for kernels that do not have symbols for the entry
+ * trampolines in kallsyms.
+ */
+#define X86_64_CPU_ENTRY_AREA_PER_CPU 0xfffffe0000000000ULL
+#define X86_64_CPU_ENTRY_AREA_SIZE 0x2c000
+#define X86_64_ENTRY_TRAMPOLINE 0x6000
+
+/* Map x86_64 PTI entry trampolines */
+int machine__map_x86_64_entry_trampolines(struct machine *machine,
+ struct dso *kernel)
+{
+ struct map_groups *kmaps = &machine->kmaps;
+ struct maps *maps = &kmaps->maps;
+ int nr_cpus_avail, cpu;
+ bool found = false;
+ struct map *map;
+ u64 pgoff;
+
+ /*
+ * In the vmlinux case, pgoff is a virtual address which must now be
+ * mapped to a vmlinux offset.
+ */
+ for (map = maps__first(maps); map; map = map__next(map)) {
+ struct kmap *kmap = __map__kmap(map);
+ struct map *dest_map;
+
+ if (!kmap || !is_entry_trampoline(kmap->name))
+ continue;
+
+ dest_map = map_groups__find(kmaps, map->pgoff);
+ if (dest_map != map)
+ map->pgoff = dest_map->map_ip(dest_map, map->pgoff);
+ found = true;
+ }
+ if (found || machine->trampolines_mapped)
+ return 0;
+
+ pgoff = find_entry_trampoline(kernel);
+ if (!pgoff)
+ return 0;
+
+ nr_cpus_avail = machine__nr_cpus_avail(machine);
+
+ /* Add a 1 page map for each CPU's entry trampoline */
+ for (cpu = 0; cpu < nr_cpus_avail; cpu++) {
+ u64 va = X86_64_CPU_ENTRY_AREA_PER_CPU +
+ cpu * X86_64_CPU_ENTRY_AREA_SIZE +
+ X86_64_ENTRY_TRAMPOLINE;
+ struct extra_kernel_map xm = {
+ .start = va,
+ .end = va + page_size,
+ .pgoff = pgoff,
+ };
+
+ strlcpy(xm.name, ENTRY_TRAMPOLINE_NAME, KMAP_NAME_LEN);
+
+ if (machine__create_extra_kernel_map(machine, kernel, &xm) < 0)
+ return -1;
+ }
+
+ machine->trampolines_mapped = nr_cpus_avail;
+
+ return 0;
+}
+
+int __weak machine__create_extra_kernel_maps(struct machine *machine __maybe_unused,
+ struct dso *kernel __maybe_unused)
+{
+ return 0;
+}
+
static int
__machine__create_kernel_maps(struct machine *machine, struct dso *kernel)
{
- int type;
+ struct kmap *kmap;
+ struct map *map;
/* In case of renewal the kernel map, destroy previous one */
machine__destroy_kernel_maps(machine);
- for (type = 0; type < MAP__NR_TYPES; ++type) {
- struct kmap *kmap;
- struct map *map;
-
- machine->vmlinux_maps[type] = map__new2(0, kernel, type);
- if (machine->vmlinux_maps[type] == NULL)
- return -1;
+ machine->vmlinux_map = map__new2(0, kernel);
+ if (machine->vmlinux_map == NULL)
+ return -1;
- machine->vmlinux_maps[type]->map_ip =
- machine->vmlinux_maps[type]->unmap_ip =
- identity__map_ip;
- map = __machine__kernel_map(machine, type);
- kmap = map__kmap(map);
- if (!kmap)
- return -1;
+ machine->vmlinux_map->map_ip = machine->vmlinux_map->unmap_ip = identity__map_ip;
+ map = machine__kernel_map(machine);
+ kmap = map__kmap(map);
+ if (!kmap)
+ return -1;
- kmap->kmaps = &machine->kmaps;
- map_groups__insert(&machine->kmaps, map);
- }
+ kmap->kmaps = &machine->kmaps;
+ map_groups__insert(&machine->kmaps, map);
return 0;
}
void machine__destroy_kernel_maps(struct machine *machine)
{
- int type;
-
- for (type = 0; type < MAP__NR_TYPES; ++type) {
- struct kmap *kmap;
- struct map *map = __machine__kernel_map(machine, type);
-
- if (map == NULL)
- continue;
+ struct kmap *kmap;
+ struct map *map = machine__kernel_map(machine);
- kmap = map__kmap(map);
- map_groups__remove(&machine->kmaps, map);
- if (kmap && kmap->ref_reloc_sym) {
- /*
- * ref_reloc_sym is shared among all maps, so free just
- * on one of them.
- */
- if (type == MAP__FUNCTION) {
- zfree((char **)&kmap->ref_reloc_sym->name);
- zfree(&kmap->ref_reloc_sym);
- } else
- kmap->ref_reloc_sym = NULL;
- }
+ if (map == NULL)
+ return;
- map__put(machine->vmlinux_maps[type]);
- machine->vmlinux_maps[type] = NULL;
+ kmap = map__kmap(map);
+ map_groups__remove(&machine->kmaps, map);
+ if (kmap && kmap->ref_reloc_sym) {
+ zfree((char **)&kmap->ref_reloc_sym->name);
+ zfree(&kmap->ref_reloc_sym);
}
+
+ map__zput(machine->vmlinux_map);
}
int machines__create_guest_kernel_maps(struct machines *machines)
@@ -989,32 +1092,31 @@ int machines__create_kernel_maps(struct machines *machines, pid_t pid)
return machine__create_kernel_maps(machine);
}
-int machine__load_kallsyms(struct machine *machine, const char *filename,
- enum map_type type)
+int machine__load_kallsyms(struct machine *machine, const char *filename)
{
struct map *map = machine__kernel_map(machine);
int ret = __dso__load_kallsyms(map->dso, filename, map, true);
if (ret > 0) {
- dso__set_loaded(map->dso, type);
+ dso__set_loaded(map->dso);
/*
* Since /proc/kallsyms will have multiple sessions for the
* kernel, with modules between them, fixup the end of all
* sections.
*/
- __map_groups__fixup_end(&machine->kmaps, type);
+ map_groups__fixup_end(&machine->kmaps);
}
return ret;
}
-int machine__load_vmlinux_path(struct machine *machine, enum map_type type)
+int machine__load_vmlinux_path(struct machine *machine)
{
struct map *map = machine__kernel_map(machine);
int ret = dso__load_vmlinux_path(map->dso, map);
if (ret > 0)
- dso__set_loaded(map->dso, type);
+ dso__set_loaded(map->dso);
return ret;
}
@@ -1055,10 +1157,9 @@ static bool is_kmod_dso(struct dso *dso)
static int map_groups__set_module_path(struct map_groups *mg, const char *path,
struct kmod_path *m)
{
- struct map *map;
char *long_name;
+ struct map *map = map_groups__find_by_name(mg, m->name);
- map = map_groups__find_by_name(mg, MAP__FUNCTION, m->name);
if (map == NULL)
return 0;
@@ -1207,19 +1308,14 @@ static int machine__create_modules(struct machine *machine)
static void machine__set_kernel_mmap(struct machine *machine,
u64 start, u64 end)
{
- int i;
-
- for (i = 0; i < MAP__NR_TYPES; i++) {
- machine->vmlinux_maps[i]->start = start;
- machine->vmlinux_maps[i]->end = end;
-
- /*
- * Be a bit paranoid here, some perf.data file came with
- * a zero sized synthesized MMAP event for the kernel.
- */
- if (start == 0 && end == 0)
- machine->vmlinux_maps[i]->end = ~0ULL;
- }
+ machine->vmlinux_map->start = start;
+ machine->vmlinux_map->end = end;
+ /*
+ * Be a bit paranoid here, some perf.data file came with
+ * a zero sized synthesized MMAP event for the kernel.
+ */
+ if (start == 0 && end == 0)
+ machine->vmlinux_map->end = ~0ULL;
}
int machine__create_kernel_maps(struct machine *machine)
@@ -1234,9 +1330,8 @@ int machine__create_kernel_maps(struct machine *machine)
return -1;
ret = __machine__create_kernel_maps(machine, kernel);
- dso__put(kernel);
if (ret < 0)
- return -1;
+ goto out_put;
if (symbol_conf.use_modules && machine__create_modules(machine) < 0) {
if (machine__is_host(machine))
@@ -1249,9 +1344,10 @@ int machine__create_kernel_maps(struct machine *machine)
if (!machine__get_running_kernel_start(machine, &name, &addr)) {
if (name &&
- maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, name, addr)) {
+ map__set_kallsyms_ref_reloc_sym(machine->vmlinux_map, name, addr)) {
machine__destroy_kernel_maps(machine);
- return -1;
+ ret = -1;
+ goto out_put;
}
/* we have a real start address now, so re-order the kmaps */
@@ -1267,12 +1363,16 @@ int machine__create_kernel_maps(struct machine *machine)
map__put(map);
}
+ if (machine__create_extra_kernel_maps(machine, kernel))
+ pr_debug("Problems creating extra kernel maps, continuing anyway...\n");
+
/* update end address of the kernel map using adjacent module address */
map = map__next(machine__kernel_map(machine));
if (map)
machine__set_kernel_mmap(machine, addr, map->start);
-
- return 0;
+out_put:
+ dso__put(kernel);
+ return ret;
}
static bool machine__uses_kcore(struct machine *machine)
@@ -1287,6 +1387,32 @@ static bool machine__uses_kcore(struct machine *machine)
return false;
}
+static bool perf_event__is_extra_kernel_mmap(struct machine *machine,
+ union perf_event *event)
+{
+ return machine__is(machine, "x86_64") &&
+ is_entry_trampoline(event->mmap.filename);
+}
+
+static int machine__process_extra_kernel_map(struct machine *machine,
+ union perf_event *event)
+{
+ struct map *kernel_map = machine__kernel_map(machine);
+ struct dso *kernel = kernel_map ? kernel_map->dso : NULL;
+ struct extra_kernel_map xm = {
+ .start = event->mmap.start,
+ .end = event->mmap.start + event->mmap.len,
+ .pgoff = event->mmap.pgoff,
+ };
+
+ if (kernel == NULL)
+ return -1;
+
+ strlcpy(xm.name, event->mmap.filename, KMAP_NAME_LEN);
+
+ return machine__create_extra_kernel_map(machine, kernel, &xm);
+}
+
static int machine__process_kernel_mmap_event(struct machine *machine,
union perf_event *event)
{
@@ -1379,9 +1505,9 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
* time /proc/sys/kernel/kptr_restrict was non zero.
*/
if (event->mmap.pgoff != 0) {
- maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps,
- symbol_name,
- event->mmap.pgoff);
+ map__set_kallsyms_ref_reloc_sym(machine->vmlinux_map,
+ symbol_name,
+ event->mmap.pgoff);
}
if (machine__is_default_guest(machine)) {
@@ -1390,6 +1516,8 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
*/
dso__load(kernel, machine__kernel_map(machine));
}
+ } else if (perf_event__is_extra_kernel_mmap(machine, event)) {
+ return machine__process_extra_kernel_map(machine, event);
}
return 0;
out_problem:
@@ -1402,7 +1530,6 @@ int machine__process_mmap2_event(struct machine *machine,
{
struct thread *thread;
struct map *map;
- enum map_type type;
int ret = 0;
if (dump_trace)
@@ -1421,11 +1548,6 @@ int machine__process_mmap2_event(struct machine *machine,
if (thread == NULL)
goto out_problem;
- if (event->header.misc & PERF_RECORD_MISC_MMAP_DATA)
- type = MAP__VARIABLE;
- else
- type = MAP__FUNCTION;
-
map = map__new(machine, event->mmap2.start,
event->mmap2.len, event->mmap2.pgoff,
event->mmap2.maj,
@@ -1433,7 +1555,7 @@ int machine__process_mmap2_event(struct machine *machine,
event->mmap2.ino_generation,
event->mmap2.prot,
event->mmap2.flags,
- event->mmap2.filename, type, thread);
+ event->mmap2.filename, thread);
if (map == NULL)
goto out_problem_map;
@@ -1460,7 +1582,7 @@ int machine__process_mmap_event(struct machine *machine, union perf_event *event
{
struct thread *thread;
struct map *map;
- enum map_type type;
+ u32 prot = 0;
int ret = 0;
if (dump_trace)
@@ -1479,16 +1601,14 @@ int machine__process_mmap_event(struct machine *machine, union perf_event *event
if (thread == NULL)
goto out_problem;
- if (event->header.misc & PERF_RECORD_MISC_MMAP_DATA)
- type = MAP__VARIABLE;
- else
- type = MAP__FUNCTION;
+ if (!(event->header.misc & PERF_RECORD_MISC_MMAP_DATA))
+ prot = PROT_EXEC;
map = map__new(machine, event->mmap.start,
event->mmap.len, event->mmap.pgoff,
- 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, prot, 0,
event->mmap.filename,
- type, thread);
+ thread);
if (map == NULL)
goto out_problem_map;
@@ -1664,7 +1784,7 @@ static void ip__resolve_ams(struct thread *thread,
* Thus, we have to try consecutively until we find a match
* or else, the symbol is unknown
*/
- thread__find_cpumode_addr_location(thread, MAP__FUNCTION, ip, &al);
+ thread__find_cpumode_addr_location(thread, ip, &al);
ams->addr = ip;
ams->al_addr = al.addr;
@@ -1681,15 +1801,7 @@ static void ip__resolve_data(struct thread *thread,
memset(&al, 0, sizeof(al));
- thread__find_addr_location(thread, m, MAP__VARIABLE, addr, &al);
- if (al.map == NULL) {
- /*
- * some shared data regions have execute bit set which puts
- * their mapping in the MAP__FUNCTION type array.
- * Check there as a fallback option before dropping the sample.
- */
- thread__find_addr_location(thread, m, MAP__FUNCTION, addr, &al);
- }
+ thread__find_symbol(thread, m, addr, &al);
ams->addr = addr;
ams->al_addr = al.addr;
@@ -1758,8 +1870,7 @@ static int add_callchain_ip(struct thread *thread,
al.filtered = 0;
al.sym = NULL;
if (!cpumode) {
- thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
- ip, &al);
+ thread__find_cpumode_addr_location(thread, ip, &al);
} else {
if (ip >= PERF_CONTEXT_MAX) {
switch (ip) {
@@ -1784,8 +1895,7 @@ static int add_callchain_ip(struct thread *thread,
}
return 0;
}
- thread__find_addr_location(thread, *cpumode, MAP__FUNCTION,
- ip, &al);
+ thread__find_symbol(thread, *cpumode, ip, &al);
}
if (al.sym != NULL) {
@@ -1810,7 +1920,7 @@ static int add_callchain_ip(struct thread *thread,
}
srcline = callchain_srcline(al.map, al.sym, al.addr);
- return callchain_cursor_append(cursor, al.addr, al.map, al.sym,
+ return callchain_cursor_append(cursor, ip, al.map, al.sym,
branch, flags, nr_loop_iter,
iter_cycles, branch_from, srcline);
}
@@ -2342,6 +2452,20 @@ int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid,
return 0;
}
+/*
+ * Compares the raw arch string. N.B. see instead perf_env__arch() if a
+ * normalized arch is needed.
+ */
+bool machine__is(struct machine *machine, const char *arch)
+{
+ return machine && !strcmp(perf_env__raw_arch(machine->env), arch);
+}
+
+int machine__nr_cpus_avail(struct machine *machine)
+{
+ return machine ? perf_env__nr_cpus_avail(machine->env) : 0;
+}
+
int machine__get_kernel_start(struct machine *machine)
{
struct map *map = machine__kernel_map(machine);
@@ -2358,7 +2482,12 @@ int machine__get_kernel_start(struct machine *machine)
machine->kernel_start = 1ULL << 63;
if (map) {
err = map__load(map);
- if (!err)
+ /*
+ * On x86_64, PTI entry trampolines are less than the
+ * start of kernel text, but still above 2^63. So leave
+ * kernel_start = 1ULL << 63 for x86_64.
+ */
+ if (!err && !machine__is(machine, "x86_64"))
machine->kernel_start = map->start;
}
return err;
@@ -2373,7 +2502,7 @@ char *machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, ch
{
struct machine *machine = vmachine;
struct map *map;
- struct symbol *sym = map_groups__find_symbol(&machine->kmaps, MAP__FUNCTION, *addrp, &map);
+ struct symbol *sym = machine__find_kernel_symbol(machine, *addrp, &map);
if (sym == NULL)
return NULL;
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 66cc200ef86f..1de7660d93e9 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -49,13 +49,14 @@ struct machine {
struct perf_env *env;
struct dsos dsos;
struct map_groups kmaps;
- struct map *vmlinux_maps[MAP__NR_TYPES];
+ struct map *vmlinux_map;
u64 kernel_start;
pid_t *current_tid;
union { /* Tool specific area */
void *priv;
u64 db_id;
};
+ bool trampolines_mapped;
};
static inline struct threads *machine__threads(struct machine *machine, pid_t tid)
@@ -64,16 +65,22 @@ static inline struct threads *machine__threads(struct machine *machine, pid_t ti
return &machine->threads[(unsigned int)tid % THREADS__TABLE_SIZE];
}
+/*
+ * The main kernel (vmlinux) map
+ */
static inline
-struct map *__machine__kernel_map(struct machine *machine, enum map_type type)
+struct map *machine__kernel_map(struct machine *machine)
{
- return machine->vmlinux_maps[type];
+ return machine->vmlinux_map;
}
+/*
+ * kernel (the one returned by machine__kernel_map()) plus kernel modules maps
+ */
static inline
-struct map *machine__kernel_map(struct machine *machine)
+struct maps *machine__kernel_maps(struct machine *machine)
{
- return __machine__kernel_map(machine, MAP__FUNCTION);
+ return &machine->kmaps.maps;
}
int machine__get_kernel_start(struct machine *machine);
@@ -182,6 +189,9 @@ static inline bool machine__is_host(struct machine *machine)
return machine ? machine->pid == HOST_KERNEL_ID : false;
}
+bool machine__is(struct machine *machine, const char *arch);
+int machine__nr_cpus_avail(struct machine *machine);
+
struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid);
struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid);
@@ -190,44 +200,27 @@ struct dso *machine__findnew_dso(struct machine *machine, const char *filename);
size_t machine__fprintf(struct machine *machine, FILE *fp);
static inline
-struct symbol *machine__find_kernel_symbol(struct machine *machine,
- enum map_type type, u64 addr,
+struct symbol *machine__find_kernel_symbol(struct machine *machine, u64 addr,
struct map **mapp)
{
- return map_groups__find_symbol(&machine->kmaps, type, addr, mapp);
+ return map_groups__find_symbol(&machine->kmaps, addr, mapp);
}
static inline
struct symbol *machine__find_kernel_symbol_by_name(struct machine *machine,
- enum map_type type, const char *name,
+ const char *name,
struct map **mapp)
{
- return map_groups__find_symbol_by_name(&machine->kmaps, type, name, mapp);
-}
-
-static inline
-struct symbol *machine__find_kernel_function(struct machine *machine, u64 addr,
- struct map **mapp)
-{
- return machine__find_kernel_symbol(machine, MAP__FUNCTION, addr,
- mapp);
-}
-
-static inline
-struct symbol *machine__find_kernel_function_by_name(struct machine *machine,
- const char *name,
- struct map **mapp)
-{
- return map_groups__find_function_by_name(&machine->kmaps, name, mapp);
+ return map_groups__find_symbol_by_name(&machine->kmaps, name, mapp);
}
struct map *machine__findnew_module_map(struct machine *machine, u64 start,
const char *filename);
int arch__fix_module_text_start(u64 *start, const char *name);
-int machine__load_kallsyms(struct machine *machine, const char *filename,
- enum map_type type);
-int machine__load_vmlinux_path(struct machine *machine, enum map_type type);
+int machine__load_kallsyms(struct machine *machine, const char *filename);
+
+int machine__load_vmlinux_path(struct machine *machine);
size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp,
bool (skip)(struct dso *dso, int parm), int parm);
@@ -276,4 +269,25 @@ int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid,
*/
char *machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp);
+void machine__get_kallsyms_filename(struct machine *machine, char *buf,
+ size_t bufsz);
+
+int machine__create_extra_kernel_maps(struct machine *machine,
+ struct dso *kernel);
+
+/* Kernel-space maps for symbols that are outside the main kernel map and module maps */
+struct extra_kernel_map {
+ u64 start;
+ u64 end;
+ u64 pgoff;
+ char name[KMAP_NAME_LEN];
+};
+
+int machine__create_extra_kernel_map(struct machine *machine,
+ struct dso *kernel,
+ struct extra_kernel_map *xm);
+
+int machine__map_x86_64_entry_trampolines(struct machine *machine,
+ struct dso *kernel);
+
#endif /* __PERF_MACHINE_H */
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 8fe57031e1a8..6ae97eda370b 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -22,11 +22,6 @@
static void __maps__insert(struct maps *maps, struct map *map);
-const char *map_type__name[MAP__NR_TYPES] = {
- [MAP__FUNCTION] = "Functions",
- [MAP__VARIABLE] = "Variables",
-};
-
static inline int is_anon_memory(const char *filename, u32 flags)
{
return flags & MAP_HUGETLB ||
@@ -129,10 +124,8 @@ static inline bool replace_android_lib(const char *filename, char *newfilename)
return false;
}
-void map__init(struct map *map, enum map_type type,
- u64 start, u64 end, u64 pgoff, struct dso *dso)
+void map__init(struct map *map, u64 start, u64 end, u64 pgoff, struct dso *dso)
{
- map->type = type;
map->start = start;
map->end = end;
map->pgoff = pgoff;
@@ -149,7 +142,7 @@ void map__init(struct map *map, enum map_type type,
struct map *map__new(struct machine *machine, u64 start, u64 len,
u64 pgoff, u32 d_maj, u32 d_min, u64 ino,
u64 ino_gen, u32 prot, u32 flags, char *filename,
- enum map_type type, struct thread *thread)
+ struct thread *thread)
{
struct map *map = malloc(sizeof(*map));
struct nsinfo *nsi = NULL;
@@ -173,7 +166,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
map->flags = flags;
nsi = nsinfo__get(thread->nsinfo);
- if ((anon || no_dso) && nsi && type == MAP__FUNCTION) {
+ if ((anon || no_dso) && nsi && (prot & PROT_EXEC)) {
snprintf(newfilename, sizeof(newfilename),
"/tmp/perf-%d.map", nsi->pid);
filename = newfilename;
@@ -203,7 +196,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
if (dso == NULL)
goto out_delete;
- map__init(map, type, start, start + len, pgoff, dso);
+ map__init(map, start, start + len, pgoff, dso);
if (anon || no_dso) {
map->map_ip = map->unmap_ip = identity__map_ip;
@@ -213,8 +206,8 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
* functions still return NULL, and we avoid the
* unnecessary map__load warning.
*/
- if (type != MAP__FUNCTION)
- dso__set_loaded(dso, map->type);
+ if (!(prot & PROT_EXEC))
+ dso__set_loaded(dso);
}
dso->nsinfo = nsi;
dso__put(dso);
@@ -231,7 +224,7 @@ out_delete:
* they are loaded) and for vmlinux, where only after we load all the
* symbols we'll know where it starts and ends.
*/
-struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
+struct map *map__new2(u64 start, struct dso *dso)
{
struct map *map = calloc(1, (sizeof(*map) +
(dso->kernel ? sizeof(struct kmap) : 0)));
@@ -239,7 +232,7 @@ struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
/*
* ->end will be filled after we load all the symbols
*/
- map__init(map, type, start, 0, 0, dso);
+ map__init(map, start, 0, 0, dso);
}
return map;
@@ -256,7 +249,19 @@ struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
*/
bool __map__is_kernel(const struct map *map)
{
- return __machine__kernel_map(map->groups->machine, map->type) == map;
+ return machine__kernel_map(map->groups->machine) == map;
+}
+
+bool __map__is_extra_kernel_map(const struct map *map)
+{
+ struct kmap *kmap = __map__kmap((struct map *)map);
+
+ return kmap && kmap->name[0];
+}
+
+bool map__has_symbols(const struct map *map)
+{
+ return dso__has_symbols(map->dso);
}
static void map__exit(struct map *map)
@@ -279,7 +284,7 @@ void map__put(struct map *map)
void map__fixup_start(struct map *map)
{
- struct rb_root *symbols = &map->dso->symbols[map->type];
+ struct rb_root *symbols = &map->dso->symbols;
struct rb_node *nd = rb_first(symbols);
if (nd != NULL) {
struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
@@ -289,7 +294,7 @@ void map__fixup_start(struct map *map)
void map__fixup_end(struct map *map)
{
- struct rb_root *symbols = &map->dso->symbols[map->type];
+ struct rb_root *symbols = &map->dso->symbols;
struct rb_node *nd = rb_last(symbols);
if (nd != NULL) {
struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
@@ -304,7 +309,7 @@ int map__load(struct map *map)
const char *name = map->dso->long_name;
int nr;
- if (dso__loaded(map->dso, map->type))
+ if (dso__loaded(map->dso))
return 0;
nr = dso__load(map->dso, map);
@@ -348,7 +353,7 @@ struct symbol *map__find_symbol(struct map *map, u64 addr)
if (map__load(map) < 0)
return NULL;
- return dso__find_symbol(map->dso, map->type, addr);
+ return dso__find_symbol(map->dso, addr);
}
struct symbol *map__find_symbol_by_name(struct map *map, const char *name)
@@ -356,10 +361,10 @@ struct symbol *map__find_symbol_by_name(struct map *map, const char *name)
if (map__load(map) < 0)
return NULL;
- if (!dso__sorted_by_name(map->dso, map->type))
- dso__sort_by_name(map->dso, map->type);
+ if (!dso__sorted_by_name(map->dso))
+ dso__sort_by_name(map->dso);
- return dso__find_symbol_by_name(map->dso, map->type, name);
+ return dso__find_symbol_by_name(map->dso, name);
}
struct map *map__clone(struct map *from)
@@ -494,10 +499,7 @@ static void maps__init(struct maps *maps)
void map_groups__init(struct map_groups *mg, struct machine *machine)
{
- int i;
- for (i = 0; i < MAP__NR_TYPES; ++i) {
- maps__init(&mg->maps[i]);
- }
+ maps__init(&mg->maps);
mg->machine = machine;
refcount_set(&mg->refcnt, 1);
}
@@ -525,22 +527,12 @@ static void maps__exit(struct maps *maps)
void map_groups__exit(struct map_groups *mg)
{
- int i;
-
- for (i = 0; i < MAP__NR_TYPES; ++i)
- maps__exit(&mg->maps[i]);
+ maps__exit(&mg->maps);
}
bool map_groups__empty(struct map_groups *mg)
{
- int i;
-
- for (i = 0; i < MAP__NR_TYPES; ++i) {
- if (maps__first(&mg->maps[i]))
- return false;
- }
-
- return true;
+ return !maps__first(&mg->maps);
}
struct map_groups *map_groups__new(struct machine *machine)
@@ -566,10 +558,9 @@ void map_groups__put(struct map_groups *mg)
}
struct symbol *map_groups__find_symbol(struct map_groups *mg,
- enum map_type type, u64 addr,
- struct map **mapp)
+ u64 addr, struct map **mapp)
{
- struct map *map = map_groups__find(mg, type, addr);
+ struct map *map = map_groups__find(mg, addr);
/* Ensure map is loaded before using map->map_ip */
if (map != NULL && map__load(map) >= 0) {
@@ -608,13 +599,10 @@ out:
}
struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
- enum map_type type,
const char *name,
struct map **mapp)
{
- struct symbol *sym = maps__find_symbol_by_name(&mg->maps[type], name, mapp);
-
- return sym;
+ return maps__find_symbol_by_name(&mg->maps, name, mapp);
}
int map_groups__find_ams(struct addr_map_symbol *ams)
@@ -622,8 +610,7 @@ int map_groups__find_ams(struct addr_map_symbol *ams)
if (ams->addr < ams->map->start || ams->addr >= ams->map->end) {
if (ams->map->groups == NULL)
return -1;
- ams->map = map_groups__find(ams->map->groups, ams->map->type,
- ams->addr);
+ ams->map = map_groups__find(ams->map->groups, ams->addr);
if (ams->map == NULL)
return -1;
}
@@ -646,7 +633,7 @@ static size_t maps__fprintf(struct maps *maps, FILE *fp)
printed += fprintf(fp, "Map:");
printed += map__fprintf(pos, fp);
if (verbose > 2) {
- printed += dso__fprintf(pos->dso, pos->type, fp);
+ printed += dso__fprintf(pos->dso, fp);
printed += fprintf(fp, "--\n");
}
}
@@ -656,24 +643,14 @@ static size_t maps__fprintf(struct maps *maps, FILE *fp)
return printed;
}
-size_t __map_groups__fprintf_maps(struct map_groups *mg, enum map_type type,
- FILE *fp)
-{
- size_t printed = fprintf(fp, "%s:\n", map_type__name[type]);
- return printed += maps__fprintf(&mg->maps[type], fp);
-}
-
size_t map_groups__fprintf(struct map_groups *mg, FILE *fp)
{
- size_t printed = 0, i;
- for (i = 0; i < MAP__NR_TYPES; ++i)
- printed += __map_groups__fprintf_maps(mg, i, fp);
- return printed;
+ return maps__fprintf(&mg->maps, fp);
}
static void __map_groups__insert(struct map_groups *mg, struct map *map)
{
- __maps__insert(&mg->maps[map->type], map);
+ __maps__insert(&mg->maps, map);
map->groups = mg;
}
@@ -758,19 +735,18 @@ out:
int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
FILE *fp)
{
- return maps__fixup_overlappings(&mg->maps[map->type], map, fp);
+ return maps__fixup_overlappings(&mg->maps, map, fp);
}
/*
* XXX This should not really _copy_ te maps, but refcount them.
*/
-int map_groups__clone(struct thread *thread,
- struct map_groups *parent, enum map_type type)
+int map_groups__clone(struct thread *thread, struct map_groups *parent)
{
struct map_groups *mg = thread->mg;
int err = -ENOMEM;
struct map *map;
- struct maps *maps = &parent->maps[type];
+ struct maps *maps = &parent->maps;
down_read(&maps->lock);
@@ -877,15 +853,22 @@ struct map *map__next(struct map *map)
return NULL;
}
-struct kmap *map__kmap(struct map *map)
+struct kmap *__map__kmap(struct map *map)
{
- if (!map->dso || !map->dso->kernel) {
- pr_err("Internal error: map__kmap with a non-kernel map\n");
+ if (!map->dso || !map->dso->kernel)
return NULL;
- }
return (struct kmap *)(map + 1);
}
+struct kmap *map__kmap(struct map *map)
+{
+ struct kmap *kmap = __map__kmap(map);
+
+ if (!kmap)
+ pr_err("Internal error: map__kmap with a non-kernel map\n");
+ return kmap;
+}
+
struct map_groups *map__kmaps(struct map *map)
{
struct kmap *kmap = map__kmap(map);
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 0e9bbe01b0ab..97e2a063bd65 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -8,19 +8,11 @@
#include <linux/rbtree.h>
#include <pthread.h>
#include <stdio.h>
+#include <string.h>
#include <stdbool.h>
#include <linux/types.h>
#include "rwsem.h"
-enum map_type {
- MAP__FUNCTION = 0,
- MAP__VARIABLE,
-};
-
-#define MAP__NR_TYPES (MAP__VARIABLE + 1)
-
-extern const char *map_type__name[MAP__NR_TYPES];
-
struct dso;
struct ip_callchain;
struct ref_reloc_sym;
@@ -35,7 +27,6 @@ struct map {
};
u64 start;
u64 end;
- u8 /* enum map_type */ type;
bool erange_warned;
u32 priv;
u32 prot;
@@ -56,9 +47,12 @@ struct map {
refcount_t refcnt;
};
+#define KMAP_NAME_LEN 256
+
struct kmap {
struct ref_reloc_sym *ref_reloc_sym;
struct map_groups *kmaps;
+ char name[KMAP_NAME_LEN];
};
struct maps {
@@ -67,7 +61,7 @@ struct maps {
};
struct map_groups {
- struct maps maps[MAP__NR_TYPES];
+ struct maps maps;
struct machine *machine;
refcount_t refcnt;
};
@@ -85,6 +79,7 @@ static inline struct map_groups *map_groups__get(struct map_groups *mg)
void map_groups__put(struct map_groups *mg);
+struct kmap *__map__kmap(struct map *map);
struct kmap *map__kmap(struct map *map);
struct map_groups *map__kmaps(struct map *map);
@@ -125,7 +120,7 @@ struct thread;
* Note: caller must ensure map->dso is not NULL (map is loaded).
*/
#define map__for_each_symbol(map, pos, n) \
- dso__for_each_symbol(map->dso, pos, n, map->type)
+ dso__for_each_symbol(map->dso, pos, n)
/* map__for_each_symbol_with_name - iterate over the symbols in the given map
* that have the given name
@@ -144,13 +139,13 @@ struct thread;
#define map__for_each_symbol_by_name(map, sym_name, pos) \
__map__for_each_symbol_by_name(map, sym_name, (pos))
-void map__init(struct map *map, enum map_type type,
+void map__init(struct map *map,
u64 start, u64 end, u64 pgoff, struct dso *dso);
struct map *map__new(struct machine *machine, u64 start, u64 len,
u64 pgoff, u32 d_maj, u32 d_min, u64 ino,
u64 ino_gen, u32 prot, u32 flags,
- char *filename, enum map_type type, struct thread *thread);
-struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
+ char *filename, struct thread *thread);
+struct map *map__new2(u64 start, struct dso *dso);
void map__delete(struct map *map);
struct map *map__clone(struct map *map);
@@ -185,8 +180,6 @@ void map__fixup_end(struct map *map);
void map__reloc_vmlinux(struct map *map);
-size_t __map_groups__fprintf_maps(struct map_groups *mg, enum map_type type,
- FILE *fp);
void maps__insert(struct maps *maps, struct map *map);
void maps__remove(struct maps *maps, struct map *map);
struct map *maps__find(struct maps *maps, u64 addr);
@@ -197,34 +190,29 @@ struct symbol *maps__find_symbol_by_name(struct maps *maps, const char *name,
void map_groups__init(struct map_groups *mg, struct machine *machine);
void map_groups__exit(struct map_groups *mg);
int map_groups__clone(struct thread *thread,
- struct map_groups *parent, enum map_type type);
+ struct map_groups *parent);
size_t map_groups__fprintf(struct map_groups *mg, FILE *fp);
-int maps__set_kallsyms_ref_reloc_sym(struct map **maps, const char *symbol_name,
- u64 addr);
+int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name,
+ u64 addr);
static inline void map_groups__insert(struct map_groups *mg, struct map *map)
{
- maps__insert(&mg->maps[map->type], map);
+ maps__insert(&mg->maps, map);
map->groups = mg;
}
static inline void map_groups__remove(struct map_groups *mg, struct map *map)
{
- maps__remove(&mg->maps[map->type], map);
+ maps__remove(&mg->maps, map);
}
-static inline struct map *map_groups__find(struct map_groups *mg,
- enum map_type type, u64 addr)
+static inline struct map *map_groups__find(struct map_groups *mg, u64 addr)
{
- return maps__find(&mg->maps[type], addr);
+ return maps__find(&mg->maps, addr);
}
-static inline struct map *map_groups__first(struct map_groups *mg,
- enum map_type type)
-{
- return maps__first(&mg->maps[type]);
-}
+struct map *map_groups__first(struct map_groups *mg);
static inline struct map *map_groups__next(struct map *map)
{
@@ -232,11 +220,9 @@ static inline struct map *map_groups__next(struct map *map)
}
struct symbol *map_groups__find_symbol(struct map_groups *mg,
- enum map_type type, u64 addr,
- struct map **mapp);
+ u64 addr, struct map **mapp);
struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
- enum map_type type,
const char *name,
struct map **mapp);
@@ -244,24 +230,26 @@ struct addr_map_symbol;
int map_groups__find_ams(struct addr_map_symbol *ams);
-static inline
-struct symbol *map_groups__find_function_by_name(struct map_groups *mg,
- const char *name, struct map **mapp)
-{
- return map_groups__find_symbol_by_name(mg, MAP__FUNCTION, name, mapp);
-}
-
int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
FILE *fp);
-struct map *map_groups__find_by_name(struct map_groups *mg,
- enum map_type type, const char *name);
+struct map *map_groups__find_by_name(struct map_groups *mg, const char *name);
bool __map__is_kernel(const struct map *map);
+bool __map__is_extra_kernel_map(const struct map *map);
static inline bool __map__is_kmodule(const struct map *map)
{
- return !__map__is_kernel(map);
+ return !__map__is_kernel(map) && !__map__is_extra_kernel_map(map);
+}
+
+bool map__has_symbols(const struct map *map);
+
+#define ENTRY_TRAMPOLINE_NAME "__entry_SYSCALL_64_trampoline"
+
+static inline bool is_entry_trampoline(const char *name)
+{
+ return !strcmp(name, ENTRY_TRAMPOLINE_NAME);
}
#endif /* __PERF_MAP_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 2fc4ee8b86c1..15eec49e71a1 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -156,13 +156,12 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
(strcmp(sys_dirent->d_name, ".")) && \
(strcmp(sys_dirent->d_name, "..")))
-static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
+static int tp_event_has_id(const char *dir_path, struct dirent *evt_dir)
{
char evt_path[MAXPATHLEN];
int fd;
- snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path,
- sys_dir->d_name, evt_dir->d_name);
+ snprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path, evt_dir->d_name);
fd = open(evt_path, O_RDONLY);
if (fd < 0)
return -EINVAL;
@@ -171,12 +170,12 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
return 0;
}
-#define for_each_event(sys_dirent, evt_dir, evt_dirent) \
+#define for_each_event(dir_path, evt_dir, evt_dirent) \
while ((evt_dirent = readdir(evt_dir)) != NULL) \
if (evt_dirent->d_type == DT_DIR && \
(strcmp(evt_dirent->d_name, ".")) && \
(strcmp(evt_dirent->d_name, "..")) && \
- (!tp_event_has_id(sys_dirent, evt_dirent)))
+ (!tp_event_has_id(dir_path, evt_dirent)))
#define MAX_EVENT_LENGTH 512
@@ -190,21 +189,21 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
int fd;
u64 id;
char evt_path[MAXPATHLEN];
- char dir_path[MAXPATHLEN];
+ char *dir_path;
- sys_dir = opendir(tracing_events_path);
+ sys_dir = tracing_events__opendir();
if (!sys_dir)
return NULL;
for_each_subsystem(sys_dir, sys_dirent) {
-
- snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
- sys_dirent->d_name);
+ dir_path = get_events_file(sys_dirent->d_name);
+ if (!dir_path)
+ continue;
evt_dir = opendir(dir_path);
if (!evt_dir)
- continue;
+ goto next;
- for_each_event(sys_dirent, evt_dir, evt_dirent) {
+ for_each_event(dir_path, evt_dir, evt_dirent) {
scnprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path,
evt_dirent->d_name);
@@ -218,6 +217,7 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
close(fd);
id = atoll(id_buf);
if (id == config) {
+ put_events_file(dir_path);
closedir(evt_dir);
closedir(sys_dir);
path = zalloc(sizeof(*path));
@@ -242,6 +242,8 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
}
}
closedir(evt_dir);
+next:
+ put_events_file(dir_path);
}
closedir(sys_dir);
@@ -512,14 +514,19 @@ static int add_tracepoint_multi_event(struct list_head *list, int *idx,
struct parse_events_error *err,
struct list_head *head_config)
{
- char evt_path[MAXPATHLEN];
+ char *evt_path;
struct dirent *evt_ent;
DIR *evt_dir;
int ret = 0, found = 0;
- snprintf(evt_path, MAXPATHLEN, "%s/%s", tracing_events_path, sys_name);
+ evt_path = get_events_file(sys_name);
+ if (!evt_path) {
+ tracepoint_error(err, errno, sys_name, evt_name);
+ return -1;
+ }
evt_dir = opendir(evt_path);
if (!evt_dir) {
+ put_events_file(evt_path);
tracepoint_error(err, errno, sys_name, evt_name);
return -1;
}
@@ -545,6 +552,7 @@ static int add_tracepoint_multi_event(struct list_head *list, int *idx,
ret = -1;
}
+ put_events_file(evt_path);
closedir(evt_dir);
return ret;
}
@@ -570,7 +578,7 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx,
DIR *events_dir;
int ret = 0;
- events_dir = opendir(tracing_events_path);
+ events_dir = tracing_events__opendir();
if (!events_dir) {
tracepoint_error(err, errno, sys_name, evt_name);
return -1;
@@ -2092,13 +2100,13 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
DIR *sys_dir, *evt_dir;
struct dirent *sys_dirent, *evt_dirent;
char evt_path[MAXPATHLEN];
- char dir_path[MAXPATHLEN];
+ char *dir_path;
char **evt_list = NULL;
unsigned int evt_i = 0, evt_num = 0;
bool evt_num_known = false;
restart:
- sys_dir = opendir(tracing_events_path);
+ sys_dir = tracing_events__opendir();
if (!sys_dir)
return;
@@ -2113,13 +2121,14 @@ restart:
!strglobmatch(sys_dirent->d_name, subsys_glob))
continue;
- snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
- sys_dirent->d_name);
+ dir_path = get_events_file(sys_dirent->d_name);
+ if (!dir_path)
+ continue;
evt_dir = opendir(dir_path);
if (!evt_dir)
- continue;
+ goto next;
- for_each_event(sys_dirent, evt_dir, evt_dirent) {
+ for_each_event(dir_path, evt_dir, evt_dirent) {
if (event_glob != NULL &&
!strglobmatch(evt_dirent->d_name, event_glob))
continue;
@@ -2133,11 +2142,15 @@ restart:
sys_dirent->d_name, evt_dirent->d_name);
evt_list[evt_i] = strdup(evt_path);
- if (evt_list[evt_i] == NULL)
+ if (evt_list[evt_i] == NULL) {
+ put_events_file(dir_path);
goto out_close_evt_dir;
+ }
evt_i++;
}
closedir(evt_dir);
+next:
+ put_events_file(dir_path);
}
closedir(sys_dir);
@@ -2185,21 +2198,21 @@ int is_valid_tracepoint(const char *event_string)
DIR *sys_dir, *evt_dir;
struct dirent *sys_dirent, *evt_dirent;
char evt_path[MAXPATHLEN];
- char dir_path[MAXPATHLEN];
+ char *dir_path;
- sys_dir = opendir(tracing_events_path);
+ sys_dir = tracing_events__opendir();
if (!sys_dir)
return 0;
for_each_subsystem(sys_dir, sys_dirent) {
-
- snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
- sys_dirent->d_name);
+ dir_path = get_events_file(sys_dirent->d_name);
+ if (!dir_path)
+ continue;
evt_dir = opendir(dir_path);
if (!evt_dir)
- continue;
+ goto next;
- for_each_event(sys_dirent, evt_dir, evt_dirent) {
+ for_each_event(dir_path, evt_dir, evt_dirent) {
snprintf(evt_path, MAXPATHLEN, "%s:%s",
sys_dirent->d_name, evt_dirent->d_name);
if (!strcmp(evt_path, event_string)) {
@@ -2209,6 +2222,8 @@ int is_valid_tracepoint(const char *event_string)
}
}
closedir(evt_dir);
+next:
+ put_events_file(dir_path);
}
closedir(sys_dir);
return 0;
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index e1dbc9821617..3094f11e7d81 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -111,17 +111,6 @@ void exit_probe_symbol_maps(void)
symbol__exit();
}
-static struct symbol *__find_kernel_function_by_name(const char *name,
- struct map **mapp)
-{
- return machine__find_kernel_function_by_name(host_machine, name, mapp);
-}
-
-static struct symbol *__find_kernel_function(u64 addr, struct map **mapp)
-{
- return machine__find_kernel_function(host_machine, addr, mapp);
-}
-
static struct ref_reloc_sym *kernel_get_ref_reloc_sym(void)
{
/* kmap->ref_reloc_sym should be set if host_machine is initialized */
@@ -149,7 +138,7 @@ static int kernel_get_symbol_address_by_name(const char *name, u64 *addr,
if (reloc_sym && strcmp(name, reloc_sym->name) == 0)
*addr = (reloc) ? reloc_sym->addr : reloc_sym->unrelocated_addr;
else {
- sym = __find_kernel_function_by_name(name, &map);
+ sym = machine__find_kernel_symbol_by_name(host_machine, name, &map);
if (!sym)
return -ENOENT;
*addr = map->unmap_ip(map, sym->start) -
@@ -161,8 +150,7 @@ static int kernel_get_symbol_address_by_name(const char *name, u64 *addr,
static struct map *kernel_get_module_map(const char *module)
{
- struct map_groups *grp = &host_machine->kmaps;
- struct maps *maps = &grp->maps[MAP__FUNCTION];
+ struct maps *maps = machine__kernel_maps(host_machine);
struct map *pos;
/* A file path -- this is an offline module */
@@ -341,7 +329,7 @@ static int kernel_get_module_dso(const char *module, struct dso **pdso)
char module_name[128];
snprintf(module_name, sizeof(module_name), "[%s]", module);
- map = map_groups__find_by_name(&host_machine->kmaps, MAP__FUNCTION, module_name);
+ map = map_groups__find_by_name(&host_machine->kmaps, module_name);
if (map) {
dso = map->dso;
goto found;
@@ -2098,7 +2086,7 @@ static int find_perf_probe_point_from_map(struct probe_trace_point *tp,
}
if (addr) {
addr += tp->offset;
- sym = __find_kernel_function(addr, &map);
+ sym = machine__find_kernel_symbol(host_machine, addr, &map);
}
}
@@ -3504,19 +3492,18 @@ int show_available_funcs(const char *target, struct nsinfo *nsi,
(target) ? : "kernel");
goto end;
}
- if (!dso__sorted_by_name(map->dso, map->type))
- dso__sort_by_name(map->dso, map->type);
+ if (!dso__sorted_by_name(map->dso))
+ dso__sort_by_name(map->dso);
/* Show all (filtered) symbols */
setup_pager();
- for (nd = rb_first(&map->dso->symbol_names[map->type]); nd; nd = rb_next(nd)) {
+ for (nd = rb_first(&map->dso->symbol_names); nd; nd = rb_next(nd)) {
struct symbol_name_rb_node *pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
if (strfilter__compare(_filter, pos->sym.name))
printf("%s\n", pos->sym.name);
- }
-
+ }
end:
map__put(map);
exit_probe_symbol_maps();
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index 4ae1123c6794..b76088fadf3d 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -84,8 +84,7 @@ int open_trace_file(const char *trace_file, bool readwrite)
char buf[PATH_MAX];
int ret;
- ret = e_snprintf(buf, PATH_MAX, "%s/%s",
- tracing_path, trace_file);
+ ret = e_snprintf(buf, PATH_MAX, "%s/%s", tracing_path_mount(), trace_file);
if (ret >= 0) {
pr_debug("Opening %s write=%d\n", buf, readwrite);
if (readwrite && !probe_event_dry_run)
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index f4a7a437ee87..b998bb475589 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1973,12 +1973,11 @@ bool perf_session__has_traces(struct perf_session *session, const char *msg)
return false;
}
-int maps__set_kallsyms_ref_reloc_sym(struct map **maps,
- const char *symbol_name, u64 addr)
+int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u64 addr)
{
char *bracket;
- int i;
struct ref_reloc_sym *ref;
+ struct kmap *kmap;
ref = zalloc(sizeof(struct ref_reloc_sym));
if (ref == NULL)
@@ -1996,13 +1995,9 @@ int maps__set_kallsyms_ref_reloc_sym(struct map **maps,
ref->addr = addr;
- for (i = 0; i < MAP__NR_TYPES; ++i) {
- struct kmap *kmap = map__kmap(maps[i]);
-
- if (!kmap)
- continue;
+ kmap = map__kmap(map);
+ if (kmap)
kmap->ref_reloc_sym = ref;
- }
return 0;
}
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 26a68dfd8a4f..4058ade352a5 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -2,7 +2,7 @@
#include <errno.h>
#include <inttypes.h>
#include <regex.h>
-#include <sys/mman.h>
+#include <linux/mman.h>
#include "sort.h"
#include "hist.h"
#include "comm.h"
@@ -282,7 +282,7 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", level);
if (sym && map) {
- if (map->type == MAP__VARIABLE) {
+ if (sym->type == STT_OBJECT) {
ret += repsep_snprintf(bf + ret, size - ret, "%s", sym->name);
ret += repsep_snprintf(bf + ret, size - ret, "+0x%llx",
ip - map->unmap_ip(map, sym->start));
@@ -1211,7 +1211,7 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
/* print [s] for shared data mmaps */
if ((he->cpumode != PERF_RECORD_MISC_KERNEL) &&
- map && (map->type == MAP__VARIABLE) &&
+ map && !(map->prot & PROT_EXEC) &&
(map->flags & MAP_SHARED) &&
(map->maj || map->min || map->ino ||
map->ino_generation))
@@ -2582,7 +2582,7 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
if (sort__mode != SORT_MODE__MEMORY)
return -EINVAL;
- if (sd->entry == &sort_mem_dcacheline && cacheline_size == 0)
+ if (sd->entry == &sort_mem_dcacheline && cacheline_size() == 0)
return -EINVAL;
if (sd->entry == &sort_mem_daddr_sym)
@@ -2628,7 +2628,7 @@ static int setup_sort_list(struct perf_hpp_list *list, char *str,
if (*tok) {
ret = sort_dimension__add(list, tok, evlist, level);
if (ret == -EINVAL) {
- if (!cacheline_size && !strncasecmp(tok, "dcacheline", strlen(tok)))
+ if (!cacheline_size() && !strncasecmp(tok, "dcacheline", strlen(tok)))
pr_err("The \"dcacheline\" --sort key needs to know the cacheline size and it couldn't be determined on this system");
else
pr_err("Invalid --sort key: `%s'", tok);
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 035b62e2c60b..9e6896293bbd 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -186,13 +186,13 @@ static inline float hist_entry__get_percent_limit(struct hist_entry *he)
static inline u64 cl_address(u64 address)
{
/* return the cacheline of the address */
- return (address & ~(cacheline_size - 1));
+ return (address & ~(cacheline_size() - 1));
}
static inline u64 cl_offset(u64 address)
{
/* return the cacheline of the address */
- return (address & (cacheline_size - 1));
+ return (address & (cacheline_size() - 1));
}
enum sort_mode {
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index 3c21fd059b64..09d6746e6ec8 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -103,6 +103,7 @@ static struct symbol *new_inline_sym(struct dso *dso,
inline_sym = symbol__new(base_sym ? base_sym->start : 0,
base_sym ? base_sym->end : 0,
base_sym ? base_sym->binding : 0,
+ base_sym ? base_sym->type : 0,
funcname);
if (inline_sym)
inline_sym->inlined = 1;
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 8f56ba4fd258..36efb986f7fc 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -7,8 +7,7 @@
#include "xyarray.h"
#include "rblist.h"
-struct stats
-{
+struct stats {
double n, mean, M2;
u64 max, min;
};
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 2de770511e70..29770ea61768 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -114,16 +114,9 @@ static inline int elf_sym__is_label(const GElf_Sym *sym)
sym->st_shndx != SHN_ABS;
}
-static bool elf_sym__is_a(GElf_Sym *sym, enum map_type type)
+static bool elf_sym__filter(GElf_Sym *sym)
{
- switch (type) {
- case MAP__FUNCTION:
- return elf_sym__is_function(sym);
- case MAP__VARIABLE:
- return elf_sym__is_object(sym);
- default:
- return false;
- }
+ return elf_sym__is_function(sym) || elf_sym__is_object(sym);
}
static inline const char *elf_sym__name(const GElf_Sym *sym,
@@ -150,17 +143,10 @@ static inline bool elf_sec__is_data(const GElf_Shdr *shdr,
return strstr(elf_sec__name(shdr, secstrs), "data") != NULL;
}
-static bool elf_sec__is_a(GElf_Shdr *shdr, Elf_Data *secstrs,
- enum map_type type)
+static bool elf_sec__filter(GElf_Shdr *shdr, Elf_Data *secstrs)
{
- switch (type) {
- case MAP__FUNCTION:
- return elf_sec__is_text(shdr, secstrs);
- case MAP__VARIABLE:
- return elf_sec__is_data(shdr, secstrs);
- default:
- return false;
- }
+ return elf_sec__is_text(shdr, secstrs) ||
+ elf_sec__is_data(shdr, secstrs);
}
static size_t elf_addr_to_index(Elf *elf, GElf_Addr addr)
@@ -256,7 +242,7 @@ static char *demangle_sym(struct dso *dso, int kmodule, const char *elf_name)
* And always look at the original dso, not at debuginfo packages, that
* have the PLT data stripped out (shdr_rel_plt.sh_type == SHT_NOBITS).
*/
-int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *map)
+int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss)
{
uint32_t nr_rel_entries, idx;
GElf_Sym sym;
@@ -364,12 +350,12 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *
free(demangled);
f = symbol__new(plt_offset, plt_entry_size,
- STB_GLOBAL, sympltname);
+ STB_GLOBAL, STT_FUNC, sympltname);
if (!f)
goto out_elf_end;
plt_offset += plt_entry_size;
- symbols__insert(&dso->symbols[map->type], f);
+ symbols__insert(&dso->symbols, f);
++nr;
}
} else if (shdr_rel_plt.sh_type == SHT_REL) {
@@ -390,12 +376,12 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *
free(demangled);
f = symbol__new(plt_offset, plt_entry_size,
- STB_GLOBAL, sympltname);
+ STB_GLOBAL, STT_FUNC, sympltname);
if (!f)
goto out_elf_end;
plt_offset += plt_entry_size;
- symbols__insert(&dso->symbols[map->type], f);
+ symbols__insert(&dso->symbols, f);
++nr;
}
}
@@ -811,6 +797,110 @@ static u64 ref_reloc(struct kmap *kmap)
void __weak arch__sym_update(struct symbol *s __maybe_unused,
GElf_Sym *sym __maybe_unused) { }
+static int dso__process_kernel_symbol(struct dso *dso, struct map *map,
+ GElf_Sym *sym, GElf_Shdr *shdr,
+ struct map_groups *kmaps, struct kmap *kmap,
+ struct dso **curr_dsop, struct map **curr_mapp,
+ const char *section_name,
+ bool adjust_kernel_syms, bool kmodule, bool *remap_kernel)
+{
+ struct dso *curr_dso = *curr_dsop;
+ struct map *curr_map;
+ char dso_name[PATH_MAX];
+
+ /* Adjust symbol to map to file offset */
+ if (adjust_kernel_syms)
+ sym->st_value -= shdr->sh_addr - shdr->sh_offset;
+
+ if (strcmp(section_name, (curr_dso->short_name + dso->short_name_len)) == 0)
+ return 0;
+
+ if (strcmp(section_name, ".text") == 0) {
+ /*
+ * The initial kernel mapping is based on
+ * kallsyms and identity maps. Overwrite it to
+ * map to the kernel dso.
+ */
+ if (*remap_kernel && dso->kernel) {
+ *remap_kernel = false;
+ map->start = shdr->sh_addr + ref_reloc(kmap);
+ map->end = map->start + shdr->sh_size;
+ map->pgoff = shdr->sh_offset;
+ map->map_ip = map__map_ip;
+ map->unmap_ip = map__unmap_ip;
+ /* Ensure maps are correctly ordered */
+ if (kmaps) {
+ map__get(map);
+ map_groups__remove(kmaps, map);
+ map_groups__insert(kmaps, map);
+ map__put(map);
+ }
+ }
+
+ /*
+ * The initial module mapping is based on
+ * /proc/modules mapped to offset zero.
+ * Overwrite it to map to the module dso.
+ */
+ if (*remap_kernel && kmodule) {
+ *remap_kernel = false;
+ map->pgoff = shdr->sh_offset;
+ }
+
+ *curr_mapp = map;
+ *curr_dsop = dso;
+ return 0;
+ }
+
+ if (!kmap)
+ return 0;
+
+ snprintf(dso_name, sizeof(dso_name), "%s%s", dso->short_name, section_name);
+
+ curr_map = map_groups__find_by_name(kmaps, dso_name);
+ if (curr_map == NULL) {
+ u64 start = sym->st_value;
+
+ if (kmodule)
+ start += map->start + shdr->sh_offset;
+
+ curr_dso = dso__new(dso_name);
+ if (curr_dso == NULL)
+ return -1;
+ curr_dso->kernel = dso->kernel;
+ curr_dso->long_name = dso->long_name;
+ curr_dso->long_name_len = dso->long_name_len;
+ curr_map = map__new2(start, curr_dso);
+ dso__put(curr_dso);
+ if (curr_map == NULL)
+ return -1;
+
+ if (adjust_kernel_syms) {
+ curr_map->start = shdr->sh_addr + ref_reloc(kmap);
+ curr_map->end = curr_map->start + shdr->sh_size;
+ curr_map->pgoff = shdr->sh_offset;
+ } else {
+ curr_map->map_ip = curr_map->unmap_ip = identity__map_ip;
+ }
+ curr_dso->symtab_type = dso->symtab_type;
+ map_groups__insert(kmaps, curr_map);
+ /*
+ * Add it before we drop the referece to curr_map, i.e. while
+ * we still are sure to have a reference to this DSO via
+ * *curr_map->dso.
+ */
+ dsos__add(&map->groups->machine->dsos, curr_dso);
+ /* kmaps already got it */
+ map__put(curr_map);
+ dso__set_loaded(curr_dso);
+ *curr_mapp = curr_map;
+ *curr_dsop = curr_dso;
+ } else
+ *curr_dsop = curr_map->dso;
+
+ return 0;
+}
+
int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
struct symsrc *runtime_ss, int kmodule)
{
@@ -844,7 +934,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
* have the wrong values for the dso maps, so remove them.
*/
if (kmodule && syms_ss->symtab)
- symbols__delete(&dso->symbols[map->type]);
+ symbols__delete(&dso->symbols);
if (!syms_ss->symtab) {
/*
@@ -921,10 +1011,10 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
dso->adjust_symbols = runtime_ss->adjust_symbols || ref_reloc(kmap);
/*
- * Initial kernel and module mappings do not map to the dso. For
- * function mappings, flag the fixups.
+ * Initial kernel and module mappings do not map to the dso.
+ * Flag the fixups.
*/
- if (map->type == MAP__FUNCTION && (dso->kernel || kmodule)) {
+ if (dso->kernel || kmodule) {
remap_kernel = true;
adjust_kernel_syms = dso->adjust_symbols;
}
@@ -936,7 +1026,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
const char *section_name;
bool used_opd = false;
- if (!is_label && !elf_sym__is_a(&sym, map->type))
+ if (!is_label && !elf_sym__filter(&sym))
continue;
/* Reject ARM ELF "mapping symbols": these aren't unique and
@@ -974,7 +1064,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
gelf_getshdr(sec, &shdr);
- if (is_label && !elf_sec__is_a(&shdr, secstrs, map->type))
+ if (is_label && !elf_sec__filter(&shdr, secstrs))
continue;
section_name = elf_sec__name(&shdr, secstrs);
@@ -982,134 +1072,37 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
/* On ARM, symbols for thumb functions have 1 added to
* the symbol address as a flag - remove it */
if ((ehdr.e_machine == EM_ARM) &&
- (map->type == MAP__FUNCTION) &&
+ (GELF_ST_TYPE(sym.st_info) == STT_FUNC) &&
(sym.st_value & 1))
--sym.st_value;
if (dso->kernel || kmodule) {
- char dso_name[PATH_MAX];
-
- /* Adjust symbol to map to file offset */
- if (adjust_kernel_syms)
- sym.st_value -= shdr.sh_addr - shdr.sh_offset;
-
- if (strcmp(section_name,
- (curr_dso->short_name +
- dso->short_name_len)) == 0)
- goto new_symbol;
-
- if (strcmp(section_name, ".text") == 0) {
- /*
- * The initial kernel mapping is based on
- * kallsyms and identity maps. Overwrite it to
- * map to the kernel dso.
- */
- if (remap_kernel && dso->kernel) {
- remap_kernel = false;
- map->start = shdr.sh_addr +
- ref_reloc(kmap);
- map->end = map->start + shdr.sh_size;
- map->pgoff = shdr.sh_offset;
- map->map_ip = map__map_ip;
- map->unmap_ip = map__unmap_ip;
- /* Ensure maps are correctly ordered */
- if (kmaps) {
- map__get(map);
- map_groups__remove(kmaps, map);
- map_groups__insert(kmaps, map);
- map__put(map);
- }
- }
-
- /*
- * The initial module mapping is based on
- * /proc/modules mapped to offset zero.
- * Overwrite it to map to the module dso.
- */
- if (remap_kernel && kmodule) {
- remap_kernel = false;
- map->pgoff = shdr.sh_offset;
- }
-
- curr_map = map;
- curr_dso = dso;
- goto new_symbol;
- }
-
- if (!kmap)
- goto new_symbol;
-
- snprintf(dso_name, sizeof(dso_name),
- "%s%s", dso->short_name, section_name);
-
- curr_map = map_groups__find_by_name(kmaps, map->type, dso_name);
- if (curr_map == NULL) {
- u64 start = sym.st_value;
-
- if (kmodule)
- start += map->start + shdr.sh_offset;
-
- curr_dso = dso__new(dso_name);
- if (curr_dso == NULL)
- goto out_elf_end;
- curr_dso->kernel = dso->kernel;
- curr_dso->long_name = dso->long_name;
- curr_dso->long_name_len = dso->long_name_len;
- curr_map = map__new2(start, curr_dso,
- map->type);
- dso__put(curr_dso);
- if (curr_map == NULL) {
- goto out_elf_end;
- }
- if (adjust_kernel_syms) {
- curr_map->start = shdr.sh_addr +
- ref_reloc(kmap);
- curr_map->end = curr_map->start +
- shdr.sh_size;
- curr_map->pgoff = shdr.sh_offset;
- } else {
- curr_map->map_ip = identity__map_ip;
- curr_map->unmap_ip = identity__map_ip;
- }
- curr_dso->symtab_type = dso->symtab_type;
- map_groups__insert(kmaps, curr_map);
- /*
- * Add it before we drop the referece to curr_map,
- * i.e. while we still are sure to have a reference
- * to this DSO via curr_map->dso.
- */
- dsos__add(&map->groups->machine->dsos, curr_dso);
- /* kmaps already got it */
- map__put(curr_map);
- dso__set_loaded(curr_dso, map->type);
- } else
- curr_dso = curr_map->dso;
-
- goto new_symbol;
- }
-
- if ((used_opd && runtime_ss->adjust_symbols)
- || (!used_opd && syms_ss->adjust_symbols)) {
+ if (dso__process_kernel_symbol(dso, map, &sym, &shdr, kmaps, kmap, &curr_dso, &curr_map,
+ section_name, adjust_kernel_syms, kmodule, &remap_kernel))
+ goto out_elf_end;
+ } else if ((used_opd && runtime_ss->adjust_symbols) ||
+ (!used_opd && syms_ss->adjust_symbols)) {
pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " "
"sh_addr: %#" PRIx64 " sh_offset: %#" PRIx64 "\n", __func__,
(u64)sym.st_value, (u64)shdr.sh_addr,
(u64)shdr.sh_offset);
sym.st_value -= shdr.sh_addr - shdr.sh_offset;
}
-new_symbol:
+
demangled = demangle_sym(dso, kmodule, elf_name);
if (demangled != NULL)
elf_name = demangled;
f = symbol__new(sym.st_value, sym.st_size,
- GELF_ST_BIND(sym.st_info), elf_name);
+ GELF_ST_BIND(sym.st_info),
+ GELF_ST_TYPE(sym.st_info), elf_name);
free(demangled);
if (!f)
goto out_elf_end;
arch__sym_update(f, &sym);
- __symbols__insert(&curr_dso->symbols[curr_map->type], f, dso->kernel);
+ __symbols__insert(&curr_dso->symbols, f, dso->kernel);
nr++;
}
@@ -1117,14 +1110,14 @@ new_symbol:
* For misannotated, zeroed, ASM function sizes.
*/
if (nr > 0) {
- symbols__fixup_end(&dso->symbols[map->type]);
- symbols__fixup_duplicate(&dso->symbols[map->type]);
+ symbols__fixup_end(&dso->symbols);
+ symbols__fixup_duplicate(&dso->symbols);
if (kmap) {
/*
* We need to fixup this here too because we create new
* maps here, for things like vsyscall sections.
*/
- __map_groups__fixup_end(kmaps, map->type);
+ map_groups__fixup_end(kmaps);
}
}
err = nr;
@@ -1393,8 +1386,16 @@ static off_t kcore__write(struct kcore *kcore)
struct phdr_data {
off_t offset;
+ off_t rel;
u64 addr;
u64 len;
+ struct list_head node;
+ struct phdr_data *remaps;
+};
+
+struct sym_data {
+ u64 addr;
+ struct list_head node;
};
struct kcore_copy_info {
@@ -1404,16 +1405,78 @@ struct kcore_copy_info {
u64 last_symbol;
u64 first_module;
u64 last_module_symbol;
- struct phdr_data kernel_map;
- struct phdr_data modules_map;
+ size_t phnum;
+ struct list_head phdrs;
+ struct list_head syms;
};
+#define kcore_copy__for_each_phdr(k, p) \
+ list_for_each_entry((p), &(k)->phdrs, node)
+
+static struct phdr_data *phdr_data__new(u64 addr, u64 len, off_t offset)
+{
+ struct phdr_data *p = zalloc(sizeof(*p));
+
+ if (p) {
+ p->addr = addr;
+ p->len = len;
+ p->offset = offset;
+ }
+
+ return p;
+}
+
+static struct phdr_data *kcore_copy_info__addnew(struct kcore_copy_info *kci,
+ u64 addr, u64 len,
+ off_t offset)
+{
+ struct phdr_data *p = phdr_data__new(addr, len, offset);
+
+ if (p)
+ list_add_tail(&p->node, &kci->phdrs);
+
+ return p;
+}
+
+static void kcore_copy__free_phdrs(struct kcore_copy_info *kci)
+{
+ struct phdr_data *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &kci->phdrs, node) {
+ list_del(&p->node);
+ free(p);
+ }
+}
+
+static struct sym_data *kcore_copy__new_sym(struct kcore_copy_info *kci,
+ u64 addr)
+{
+ struct sym_data *s = zalloc(sizeof(*s));
+
+ if (s) {
+ s->addr = addr;
+ list_add_tail(&s->node, &kci->syms);
+ }
+
+ return s;
+}
+
+static void kcore_copy__free_syms(struct kcore_copy_info *kci)
+{
+ struct sym_data *s, *tmp;
+
+ list_for_each_entry_safe(s, tmp, &kci->syms, node) {
+ list_del(&s->node);
+ free(s);
+ }
+}
+
static int kcore_copy__process_kallsyms(void *arg, const char *name, char type,
u64 start)
{
struct kcore_copy_info *kci = arg;
- if (!symbol_type__is_a(type, MAP__FUNCTION))
+ if (!kallsyms__is_function(type))
return 0;
if (strchr(name, '[')) {
@@ -1438,6 +1501,9 @@ static int kcore_copy__process_kallsyms(void *arg, const char *name, char type,
return 0;
}
+ if (is_entry_trampoline(name) && !kcore_copy__new_sym(kci, start))
+ return -1;
+
return 0;
}
@@ -1487,27 +1553,39 @@ static int kcore_copy__parse_modules(struct kcore_copy_info *kci,
return 0;
}
-static void kcore_copy__map(struct phdr_data *p, u64 start, u64 end, u64 pgoff,
- u64 s, u64 e)
+static int kcore_copy__map(struct kcore_copy_info *kci, u64 start, u64 end,
+ u64 pgoff, u64 s, u64 e)
{
- if (p->addr || s < start || s >= end)
- return;
+ u64 len, offset;
+
+ if (s < start || s >= end)
+ return 0;
- p->addr = s;
- p->offset = (s - start) + pgoff;
- p->len = e < end ? e - s : end - s;
+ offset = (s - start) + pgoff;
+ len = e < end ? e - s : end - s;
+
+ return kcore_copy_info__addnew(kci, s, len, offset) ? 0 : -1;
}
static int kcore_copy__read_map(u64 start, u64 len, u64 pgoff, void *data)
{
struct kcore_copy_info *kci = data;
u64 end = start + len;
+ struct sym_data *sdat;
- kcore_copy__map(&kci->kernel_map, start, end, pgoff, kci->stext,
- kci->etext);
+ if (kcore_copy__map(kci, start, end, pgoff, kci->stext, kci->etext))
+ return -1;
- kcore_copy__map(&kci->modules_map, start, end, pgoff, kci->first_module,
- kci->last_module_symbol);
+ if (kcore_copy__map(kci, start, end, pgoff, kci->first_module,
+ kci->last_module_symbol))
+ return -1;
+
+ list_for_each_entry(sdat, &kci->syms, node) {
+ u64 s = round_down(sdat->addr, page_size);
+
+ if (kcore_copy__map(kci, start, end, pgoff, s, s + len))
+ return -1;
+ }
return 0;
}
@@ -1520,6 +1598,64 @@ static int kcore_copy__read_maps(struct kcore_copy_info *kci, Elf *elf)
return 0;
}
+static void kcore_copy__find_remaps(struct kcore_copy_info *kci)
+{
+ struct phdr_data *p, *k = NULL;
+ u64 kend;
+
+ if (!kci->stext)
+ return;
+
+ /* Find phdr that corresponds to the kernel map (contains stext) */
+ kcore_copy__for_each_phdr(kci, p) {
+ u64 pend = p->addr + p->len - 1;
+
+ if (p->addr <= kci->stext && pend >= kci->stext) {
+ k = p;
+ break;
+ }
+ }
+
+ if (!k)
+ return;
+
+ kend = k->offset + k->len;
+
+ /* Find phdrs that remap the kernel */
+ kcore_copy__for_each_phdr(kci, p) {
+ u64 pend = p->offset + p->len;
+
+ if (p == k)
+ continue;
+
+ if (p->offset >= k->offset && pend <= kend)
+ p->remaps = k;
+ }
+}
+
+static void kcore_copy__layout(struct kcore_copy_info *kci)
+{
+ struct phdr_data *p;
+ off_t rel = 0;
+
+ kcore_copy__find_remaps(kci);
+
+ kcore_copy__for_each_phdr(kci, p) {
+ if (!p->remaps) {
+ p->rel = rel;
+ rel += p->len;
+ }
+ kci->phnum += 1;
+ }
+
+ kcore_copy__for_each_phdr(kci, p) {
+ struct phdr_data *k = p->remaps;
+
+ if (k)
+ p->rel = p->offset - k->offset + k->rel;
+ }
+}
+
static int kcore_copy__calc_maps(struct kcore_copy_info *kci, const char *dir,
Elf *elf)
{
@@ -1555,7 +1691,12 @@ static int kcore_copy__calc_maps(struct kcore_copy_info *kci, const char *dir,
if (kci->first_module && !kci->last_module_symbol)
return -1;
- return kcore_copy__read_maps(kci, elf);
+ if (kcore_copy__read_maps(kci, elf))
+ return -1;
+
+ kcore_copy__layout(kci);
+
+ return 0;
}
static int kcore_copy__copy_file(const char *from_dir, const char *to_dir,
@@ -1678,12 +1819,15 @@ int kcore_copy(const char *from_dir, const char *to_dir)
{
struct kcore kcore;
struct kcore extract;
- size_t count = 2;
int idx = 0, err = -1;
- off_t offset = page_size, sz, modules_offset = 0;
+ off_t offset, sz;
struct kcore_copy_info kci = { .stext = 0, };
char kcore_filename[PATH_MAX];
char extract_filename[PATH_MAX];
+ struct phdr_data *p;
+
+ INIT_LIST_HEAD(&kci.phdrs);
+ INIT_LIST_HEAD(&kci.syms);
if (kcore_copy__copy_file(from_dir, to_dir, "kallsyms"))
return -1;
@@ -1703,20 +1847,17 @@ int kcore_copy(const char *from_dir, const char *to_dir)
if (kcore__init(&extract, extract_filename, kcore.elfclass, false))
goto out_kcore_close;
- if (!kci.modules_map.addr)
- count -= 1;
-
- if (kcore__copy_hdr(&kcore, &extract, count))
+ if (kcore__copy_hdr(&kcore, &extract, kci.phnum))
goto out_extract_close;
- if (kcore__add_phdr(&extract, idx++, offset, kci.kernel_map.addr,
- kci.kernel_map.len))
- goto out_extract_close;
+ offset = gelf_fsize(extract.elf, ELF_T_EHDR, 1, EV_CURRENT) +
+ gelf_fsize(extract.elf, ELF_T_PHDR, kci.phnum, EV_CURRENT);
+ offset = round_up(offset, page_size);
+
+ kcore_copy__for_each_phdr(&kci, p) {
+ off_t offs = p->rel + offset;
- if (kci.modules_map.addr) {
- modules_offset = offset + kci.kernel_map.len;
- if (kcore__add_phdr(&extract, idx, modules_offset,
- kci.modules_map.addr, kci.modules_map.len))
+ if (kcore__add_phdr(&extract, idx++, offs, p->addr, p->len))
goto out_extract_close;
}
@@ -1724,14 +1865,14 @@ int kcore_copy(const char *from_dir, const char *to_dir)
if (sz < 0 || sz > offset)
goto out_extract_close;
- if (copy_bytes(kcore.fd, kci.kernel_map.offset, extract.fd, offset,
- kci.kernel_map.len))
- goto out_extract_close;
+ kcore_copy__for_each_phdr(&kci, p) {
+ off_t offs = p->rel + offset;
- if (modules_offset && copy_bytes(kcore.fd, kci.modules_map.offset,
- extract.fd, modules_offset,
- kci.modules_map.len))
- goto out_extract_close;
+ if (p->remaps)
+ continue;
+ if (copy_bytes(kcore.fd, p->offset, extract.fd, offs, p->len))
+ goto out_extract_close;
+ }
if (kcore_copy__compare_file(from_dir, to_dir, "modules"))
goto out_extract_close;
@@ -1754,6 +1895,9 @@ out_unlink_kallsyms:
if (err)
kcore_copy__unlink(to_dir, "kallsyms");
+ kcore_copy__free_phdrs(&kci);
+ kcore_copy__free_syms(&kci);
+
return err;
}
diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c
index ff48d0d49584..7119df77dc0b 100644
--- a/tools/perf/util/symbol-minimal.c
+++ b/tools/perf/util/symbol-minimal.c
@@ -288,8 +288,7 @@ void symsrc__destroy(struct symsrc *ss)
}
int dso__synthesize_plt_symbols(struct dso *dso __maybe_unused,
- struct symsrc *ss __maybe_unused,
- struct map *map __maybe_unused)
+ struct symsrc *ss __maybe_unused)
{
return 0;
}
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 1466814ebada..8c84437f2a10 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -5,6 +5,7 @@
#include <stdio.h>
#include <string.h>
#include <linux/kernel.h>
+#include <linux/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
@@ -70,18 +71,10 @@ static enum dso_binary_type binary_type_symtab[] = {
#define DSO_BINARY_TYPE__SYMTAB_CNT ARRAY_SIZE(binary_type_symtab)
-bool symbol_type__is_a(char symbol_type, enum map_type map_type)
+static bool symbol_type__filter(char symbol_type)
{
symbol_type = toupper(symbol_type);
-
- switch (map_type) {
- case MAP__FUNCTION:
- return symbol_type == 'T' || symbol_type == 'W';
- case MAP__VARIABLE:
- return symbol_type == 'D';
- default:
- return false;
- }
+ return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D';
}
static int prefix_underscores_count(const char *str)
@@ -228,9 +221,9 @@ void symbols__fixup_end(struct rb_root *symbols)
curr->end = roundup(curr->start, 4096) + 4096;
}
-void __map_groups__fixup_end(struct map_groups *mg, enum map_type type)
+void map_groups__fixup_end(struct map_groups *mg)
{
- struct maps *maps = &mg->maps[type];
+ struct maps *maps = &mg->maps;
struct map *next, *curr;
down_write(&maps->lock);
@@ -256,7 +249,7 @@ out_unlock:
up_write(&maps->lock);
}
-struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name)
+struct symbol *symbol__new(u64 start, u64 len, u8 binding, u8 type, const char *name)
{
size_t namelen = strlen(name) + 1;
struct symbol *sym = calloc(1, (symbol_conf.priv_size +
@@ -274,6 +267,7 @@ struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name)
sym->start = start;
sym->end = len ? start + len : start;
+ sym->type = type;
sym->binding = binding;
sym->namelen = namelen - 1;
@@ -484,45 +478,40 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols,
void dso__reset_find_symbol_cache(struct dso *dso)
{
- enum map_type type;
-
- for (type = MAP__FUNCTION; type <= MAP__VARIABLE; ++type) {
- dso->last_find_result[type].addr = 0;
- dso->last_find_result[type].symbol = NULL;
- }
+ dso->last_find_result.addr = 0;
+ dso->last_find_result.symbol = NULL;
}
-void dso__insert_symbol(struct dso *dso, enum map_type type, struct symbol *sym)
+void dso__insert_symbol(struct dso *dso, struct symbol *sym)
{
- __symbols__insert(&dso->symbols[type], sym, dso->kernel);
+ __symbols__insert(&dso->symbols, sym, dso->kernel);
/* update the symbol cache if necessary */
- if (dso->last_find_result[type].addr >= sym->start &&
- (dso->last_find_result[type].addr < sym->end ||
+ if (dso->last_find_result.addr >= sym->start &&
+ (dso->last_find_result.addr < sym->end ||
sym->start == sym->end)) {
- dso->last_find_result[type].symbol = sym;
+ dso->last_find_result.symbol = sym;
}
}
-struct symbol *dso__find_symbol(struct dso *dso,
- enum map_type type, u64 addr)
+struct symbol *dso__find_symbol(struct dso *dso, u64 addr)
{
- if (dso->last_find_result[type].addr != addr || dso->last_find_result[type].symbol == NULL) {
- dso->last_find_result[type].addr = addr;
- dso->last_find_result[type].symbol = symbols__find(&dso->symbols[type], addr);
+ if (dso->last_find_result.addr != addr || dso->last_find_result.symbol == NULL) {
+ dso->last_find_result.addr = addr;
+ dso->last_find_result.symbol = symbols__find(&dso->symbols, addr);
}
- return dso->last_find_result[type].symbol;
+ return dso->last_find_result.symbol;
}
-struct symbol *dso__first_symbol(struct dso *dso, enum map_type type)
+struct symbol *dso__first_symbol(struct dso *dso)
{
- return symbols__first(&dso->symbols[type]);
+ return symbols__first(&dso->symbols);
}
-struct symbol *dso__last_symbol(struct dso *dso, enum map_type type)
+struct symbol *dso__last_symbol(struct dso *dso)
{
- return symbols__last(&dso->symbols[type]);
+ return symbols__last(&dso->symbols);
}
struct symbol *dso__next_symbol(struct symbol *sym)
@@ -539,24 +528,22 @@ struct symbol *symbol__next_by_name(struct symbol *sym)
}
/*
- * Teturns first symbol that matched with @name.
+ * Returns first symbol that matched with @name.
*/
-struct symbol *dso__find_symbol_by_name(struct dso *dso, enum map_type type,
- const char *name)
+struct symbol *dso__find_symbol_by_name(struct dso *dso, const char *name)
{
- struct symbol *s = symbols__find_by_name(&dso->symbol_names[type], name,
+ struct symbol *s = symbols__find_by_name(&dso->symbol_names, name,
SYMBOL_TAG_INCLUDE__NONE);
if (!s)
- s = symbols__find_by_name(&dso->symbol_names[type], name,
+ s = symbols__find_by_name(&dso->symbol_names, name,
SYMBOL_TAG_INCLUDE__DEFAULT_ONLY);
return s;
}
-void dso__sort_by_name(struct dso *dso, enum map_type type)
+void dso__sort_by_name(struct dso *dso)
{
- dso__set_sorted_by_name(dso, type);
- return symbols__sort_by_name(&dso->symbol_names[type],
- &dso->symbols[type]);
+ dso__set_sorted_by_name(dso);
+ return symbols__sort_by_name(&dso->symbol_names, &dso->symbols);
}
int modules__parse(const char *filename, void *arg,
@@ -621,11 +608,6 @@ out:
return err;
}
-struct process_kallsyms_args {
- struct map *map;
- struct dso *dso;
-};
-
/*
* These are symbols in the kernel image, so make sure that
* sym is from a kernel DSO.
@@ -661,10 +643,10 @@ static int map__process_kallsym_symbol(void *arg, const char *name,
char type, u64 start)
{
struct symbol *sym;
- struct process_kallsyms_args *a = arg;
- struct rb_root *root = &a->dso->symbols[a->map->type];
+ struct dso *dso = arg;
+ struct rb_root *root = &dso->symbols;
- if (!symbol_type__is_a(type, a->map->type))
+ if (!symbol_type__filter(type))
return 0;
/*
@@ -672,7 +654,7 @@ static int map__process_kallsym_symbol(void *arg, const char *name,
* symbols, setting length to 0, and rely on
* symbols__fixup_end() to fix it up.
*/
- sym = symbol__new(start, 0, kallsyms2elf_binding(type), name);
+ sym = symbol__new(start, 0, kallsyms2elf_binding(type), kallsyms2elf_type(type), name);
if (sym == NULL)
return -ENOMEM;
/*
@@ -689,21 +671,18 @@ static int map__process_kallsym_symbol(void *arg, const char *name,
* so that we can in the next step set the symbol ->end address and then
* call kernel_maps__split_kallsyms.
*/
-static int dso__load_all_kallsyms(struct dso *dso, const char *filename,
- struct map *map)
+static int dso__load_all_kallsyms(struct dso *dso, const char *filename)
{
- struct process_kallsyms_args args = { .map = map, .dso = dso, };
- return kallsyms__parse(filename, &args, map__process_kallsym_symbol);
+ return kallsyms__parse(filename, dso, map__process_kallsym_symbol);
}
-static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map)
+static int map_groups__split_kallsyms_for_kcore(struct map_groups *kmaps, struct dso *dso)
{
- struct map_groups *kmaps = map__kmaps(map);
struct map *curr_map;
struct symbol *pos;
int count = 0;
- struct rb_root old_root = dso->symbols[map->type];
- struct rb_root *root = &dso->symbols[map->type];
+ struct rb_root old_root = dso->symbols;
+ struct rb_root *root = &dso->symbols;
struct rb_node *next = rb_first(root);
if (!kmaps)
@@ -723,7 +702,7 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map)
if (module)
*module = '\0';
- curr_map = map_groups__find(kmaps, map->type, pos->start);
+ curr_map = map_groups__find(kmaps, pos->start);
if (!curr_map) {
symbol__delete(pos);
@@ -733,7 +712,7 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map)
pos->start -= curr_map->start - curr_map->pgoff;
if (pos->end)
pos->end -= curr_map->start - curr_map->pgoff;
- symbols__insert(&curr_map->dso->symbols[curr_map->type], pos);
+ symbols__insert(&curr_map->dso->symbols, pos);
++count;
}
@@ -748,22 +727,25 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map)
* kernel range is broken in several maps, named [kernel].N, as we don't have
* the original ELF section names vmlinux have.
*/
-static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta)
+static int map_groups__split_kallsyms(struct map_groups *kmaps, struct dso *dso, u64 delta,
+ struct map *initial_map)
{
- struct map_groups *kmaps = map__kmaps(map);
struct machine *machine;
- struct map *curr_map = map;
+ struct map *curr_map = initial_map;
struct symbol *pos;
int count = 0, moved = 0;
- struct rb_root *root = &dso->symbols[map->type];
+ struct rb_root *root = &dso->symbols;
struct rb_node *next = rb_first(root);
int kernel_range = 0;
+ bool x86_64;
if (!kmaps)
return -1;
machine = kmaps->machine;
+ x86_64 = machine__is(machine, "x86_64");
+
while (next) {
char *module;
@@ -778,7 +760,7 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta)
*module++ = '\0';
if (strcmp(curr_map->dso->short_name, module)) {
- if (curr_map != map &&
+ if (curr_map != initial_map &&
dso->kernel == DSO_TYPE_GUEST_KERNEL &&
machine__is_default_guest(machine)) {
/*
@@ -788,18 +770,16 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta)
* symbols are in its kmap. Mark it as
* loaded.
*/
- dso__set_loaded(curr_map->dso,
- curr_map->type);
+ dso__set_loaded(curr_map->dso);
}
- curr_map = map_groups__find_by_name(kmaps,
- map->type, module);
+ curr_map = map_groups__find_by_name(kmaps, module);
if (curr_map == NULL) {
pr_debug("%s/proc/{kallsyms,modules} "
"inconsistency while looking "
"for \"%s\" module!\n",
machine->root_dir, module);
- curr_map = map;
+ curr_map = initial_map;
goto discard_symbol;
}
@@ -809,11 +789,21 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta)
}
/*
* So that we look just like we get from .ko files,
- * i.e. not prelinked, relative to map->start.
+ * i.e. not prelinked, relative to initial_map->start.
*/
pos->start = curr_map->map_ip(curr_map, pos->start);
pos->end = curr_map->map_ip(curr_map, pos->end);
- } else if (curr_map != map) {
+ } else if (x86_64 && is_entry_trampoline(pos->name)) {
+ /*
+ * These symbols are not needed anymore since the
+ * trampoline maps refer to the text section and it's
+ * symbols instead. Avoid having to deal with
+ * relocations, and the assumption that the first symbol
+ * is the start of kernel text, by simply removing the
+ * symbols at this point.
+ */
+ goto discard_symbol;
+ } else if (curr_map != initial_map) {
char dso_name[PATH_MAX];
struct dso *ndso;
@@ -824,7 +814,7 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta)
}
if (count == 0) {
- curr_map = map;
+ curr_map = initial_map;
goto add_symbol;
}
@@ -843,7 +833,7 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta)
ndso->kernel = dso->kernel;
- curr_map = map__new2(pos->start, ndso, map->type);
+ curr_map = map__new2(pos->start, ndso);
if (curr_map == NULL) {
dso__put(ndso);
return -1;
@@ -858,9 +848,9 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta)
pos->end -= delta;
}
add_symbol:
- if (curr_map != map) {
+ if (curr_map != initial_map) {
rb_erase(&pos->rb_node, root);
- symbols__insert(&curr_map->dso->symbols[curr_map->type], pos);
+ symbols__insert(&curr_map->dso->symbols, pos);
++moved;
} else
++count;
@@ -871,10 +861,10 @@ discard_symbol:
symbol__delete(pos);
}
- if (curr_map != map &&
+ if (curr_map != initial_map &&
dso->kernel == DSO_TYPE_GUEST_KERNEL &&
machine__is_default_guest(kmaps->machine)) {
- dso__set_loaded(curr_map->dso, curr_map->type);
+ dso__set_loaded(curr_map->dso);
}
return count + moved;
@@ -1035,7 +1025,12 @@ out_delete_from:
return ret;
}
-static int do_validate_kcore_modules(const char *filename, struct map *map,
+struct map *map_groups__first(struct map_groups *mg)
+{
+ return maps__first(&mg->maps);
+}
+
+static int do_validate_kcore_modules(const char *filename,
struct map_groups *kmaps)
{
struct rb_root modules = RB_ROOT;
@@ -1046,13 +1041,12 @@ static int do_validate_kcore_modules(const char *filename, struct map *map,
if (err)
return err;
- old_map = map_groups__first(kmaps, map->type);
+ old_map = map_groups__first(kmaps);
while (old_map) {
struct map *next = map_groups__next(old_map);
struct module_info *mi;
- if (old_map == map || old_map->start == map->start) {
- /* The kernel map */
+ if (!__map__is_kmodule(old_map)) {
old_map = next;
continue;
}
@@ -1109,7 +1103,7 @@ static int validate_kcore_modules(const char *kallsyms_filename,
kallsyms_filename))
return -EINVAL;
- if (do_validate_kcore_modules(modules_filename, map, kmaps))
+ if (do_validate_kcore_modules(modules_filename, kmaps))
return -EINVAL;
return 0;
@@ -1138,7 +1132,6 @@ static int validate_kcore_addresses(const char *kallsyms_filename,
struct kcore_mapfn_data {
struct dso *dso;
- enum map_type type;
struct list_head maps;
};
@@ -1147,7 +1140,7 @@ static int kcore_mapfn(u64 start, u64 len, u64 pgoff, void *data)
struct kcore_mapfn_data *md = data;
struct map *map;
- map = map__new2(start, md->dso, md->type);
+ map = map__new2(start, md->dso);
if (map == NULL)
return -ENOMEM;
@@ -1163,13 +1156,13 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
const char *kallsyms_filename)
{
struct map_groups *kmaps = map__kmaps(map);
- struct machine *machine;
struct kcore_mapfn_data md;
struct map *old_map, *new_map, *replacement_map = NULL;
+ struct machine *machine;
bool is_64_bit;
int err, fd;
char kcore_filename[PATH_MAX];
- struct symbol *sym;
+ u64 stext;
if (!kmaps)
return -EINVAL;
@@ -1177,7 +1170,7 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
machine = kmaps->machine;
/* This function requires that the map is the kernel map */
- if (map != machine->vmlinux_maps[map->type])
+ if (!__map__is_kernel(map))
return -EINVAL;
if (!filename_from_kallsyms_filename(kcore_filename, "kcore",
@@ -1189,7 +1182,6 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
return -EINVAL;
md.dso = dso;
- md.type = map->type;
INIT_LIST_HEAD(&md.maps);
fd = open(kcore_filename, O_RDONLY);
@@ -1200,7 +1192,7 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
}
/* Read new maps into temporary lists */
- err = file__read_maps(fd, md.type == MAP__FUNCTION, kcore_mapfn, &md,
+ err = file__read_maps(fd, map->prot & PROT_EXEC, kcore_mapfn, &md,
&is_64_bit);
if (err)
goto out_err;
@@ -1212,7 +1204,7 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
}
/* Remove old maps */
- old_map = map_groups__first(kmaps, map->type);
+ old_map = map_groups__first(kmaps);
while (old_map) {
struct map *next = map_groups__next(old_map);
@@ -1220,14 +1212,15 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
map_groups__remove(kmaps, old_map);
old_map = next;
}
+ machine->trampolines_mapped = false;
- /* Find the kernel map using the first symbol */
- sym = dso__first_symbol(dso, map->type);
- list_for_each_entry(new_map, &md.maps, node) {
- if (sym && sym->start >= new_map->start &&
- sym->start < new_map->end) {
- replacement_map = new_map;
- break;
+ /* Find the kernel map using the '_stext' symbol */
+ if (!kallsyms__get_function_start(kallsyms_filename, "_stext", &stext)) {
+ list_for_each_entry(new_map, &md.maps, node) {
+ if (stext >= new_map->start && stext < new_map->end) {
+ replacement_map = new_map;
+ break;
+ }
}
}
@@ -1256,6 +1249,19 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
map__put(new_map);
}
+ if (machine__is(machine, "x86_64")) {
+ u64 addr;
+
+ /*
+ * If one of the corresponding symbols is there, assume the
+ * entry trampoline maps are too.
+ */
+ if (!kallsyms__get_function_start(kallsyms_filename,
+ ENTRY_TRAMPOLINE_NAME,
+ &addr))
+ machine->trampolines_mapped = true;
+ }
+
/*
* Set the data type and long name so that kcore can be read via
* dso__data_read_addr().
@@ -1268,7 +1274,7 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
close(fd);
- if (map->type == MAP__FUNCTION)
+ if (map->prot & PROT_EXEC)
pr_debug("Using %s for kernel object code\n", kcore_filename);
else
pr_debug("Using %s for kernel data\n", kcore_filename);
@@ -1289,14 +1295,10 @@ out_err:
* If the kernel is relocated at boot time, kallsyms won't match. Compute the
* delta based on the relocation reference symbol.
*/
-static int kallsyms__delta(struct map *map, const char *filename, u64 *delta)
+static int kallsyms__delta(struct kmap *kmap, const char *filename, u64 *delta)
{
- struct kmap *kmap = map__kmap(map);
u64 addr;
- if (!kmap)
- return -1;
-
if (!kmap->ref_reloc_sym || !kmap->ref_reloc_sym->name)
return 0;
@@ -1310,19 +1312,23 @@ static int kallsyms__delta(struct map *map, const char *filename, u64 *delta)
int __dso__load_kallsyms(struct dso *dso, const char *filename,
struct map *map, bool no_kcore)
{
+ struct kmap *kmap = map__kmap(map);
u64 delta = 0;
if (symbol__restricted_filename(filename, "/proc/kallsyms"))
return -1;
- if (dso__load_all_kallsyms(dso, filename, map) < 0)
+ if (!kmap || !kmap->kmaps)
return -1;
- if (kallsyms__delta(map, filename, &delta))
+ if (dso__load_all_kallsyms(dso, filename) < 0)
return -1;
- symbols__fixup_end(&dso->symbols[map->type]);
- symbols__fixup_duplicate(&dso->symbols[map->type]);
+ if (kallsyms__delta(kmap, filename, &delta))
+ return -1;
+
+ symbols__fixup_end(&dso->symbols);
+ symbols__fixup_duplicate(&dso->symbols);
if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
dso->symtab_type = DSO_BINARY_TYPE__GUEST_KALLSYMS;
@@ -1330,9 +1336,9 @@ int __dso__load_kallsyms(struct dso *dso, const char *filename,
dso->symtab_type = DSO_BINARY_TYPE__KALLSYMS;
if (!no_kcore && !dso__load_kcore(dso, map, filename))
- return dso__split_kallsyms_for_kcore(dso, map);
+ return map_groups__split_kallsyms_for_kcore(kmap->kmaps, dso);
else
- return dso__split_kallsyms(dso, map, delta);
+ return map_groups__split_kallsyms(kmap->kmaps, dso, delta, map);
}
int dso__load_kallsyms(struct dso *dso, const char *filename,
@@ -1341,8 +1347,7 @@ int dso__load_kallsyms(struct dso *dso, const char *filename,
return __dso__load_kallsyms(dso, filename, map, false);
}
-static int dso__load_perf_map(const char *map_path, struct dso *dso,
- struct map *map)
+static int dso__load_perf_map(const char *map_path, struct dso *dso)
{
char *line = NULL;
size_t n;
@@ -1379,12 +1384,12 @@ static int dso__load_perf_map(const char *map_path, struct dso *dso,
if (len + 2 >= line_len)
continue;
- sym = symbol__new(start, size, STB_GLOBAL, line + len);
+ sym = symbol__new(start, size, STB_GLOBAL, STT_FUNC, line + len);
if (sym == NULL)
goto out_delete_line;
- symbols__insert(&dso->symbols[map->type], sym);
+ symbols__insert(&dso->symbols, sym);
nr_syms++;
}
@@ -1509,25 +1514,27 @@ int dso__load(struct dso *dso, struct map *map)
pthread_mutex_lock(&dso->lock);
/* check again under the dso->lock */
- if (dso__loaded(dso, map->type)) {
+ if (dso__loaded(dso)) {
ret = 1;
goto out;
}
+ if (map->groups && map->groups->machine)
+ machine = map->groups->machine;
+ else
+ machine = NULL;
+
if (dso->kernel) {
if (dso->kernel == DSO_TYPE_KERNEL)
ret = dso__load_kernel_sym(dso, map);
else if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
ret = dso__load_guest_kernel_sym(dso, map);
+ if (machine__is(machine, "x86_64"))
+ machine__map_x86_64_entry_trampolines(machine, dso);
goto out;
}
- if (map->groups && map->groups->machine)
- machine = map->groups->machine;
- else
- machine = NULL;
-
dso->adjust_symbols = 0;
if (perfmap) {
@@ -1542,7 +1549,7 @@ int dso__load(struct dso *dso, struct map *map)
goto out;
}
- ret = dso__load_perf_map(map_path, dso, map);
+ ret = dso__load_perf_map(map_path, dso);
dso->symtab_type = ret > 0 ? DSO_BINARY_TYPE__JAVA_JIT :
DSO_BINARY_TYPE__NOT_FOUND;
goto out;
@@ -1651,7 +1658,7 @@ int dso__load(struct dso *dso, struct map *map)
if (ret > 0) {
int nr_plt;
- nr_plt = dso__synthesize_plt_symbols(dso, runtime_ss, map);
+ nr_plt = dso__synthesize_plt_symbols(dso, runtime_ss);
if (nr_plt > 0)
ret += nr_plt;
}
@@ -1663,17 +1670,16 @@ out_free:
if (ret < 0 && strstr(dso->name, " (deleted)") != NULL)
ret = 0;
out:
- dso__set_loaded(dso, map->type);
+ dso__set_loaded(dso);
pthread_mutex_unlock(&dso->lock);
nsinfo__mountns_exit(&nsc);
return ret;
}
-struct map *map_groups__find_by_name(struct map_groups *mg,
- enum map_type type, const char *name)
+struct map *map_groups__find_by_name(struct map_groups *mg, const char *name)
{
- struct maps *maps = &mg->maps[type];
+ struct maps *maps = &mg->maps;
struct map *map;
down_read(&maps->lock);
@@ -1720,7 +1726,7 @@ int dso__load_vmlinux(struct dso *dso, struct map *map,
else
dso->binary_type = DSO_BINARY_TYPE__VMLINUX;
dso__set_long_name(dso, vmlinux, vmlinux_allocated);
- dso__set_loaded(dso, map->type);
+ dso__set_loaded(dso);
pr_debug("Using %s for symbols\n", symfs_vmlinux);
}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 70c16741f50a..1a16438eb3ce 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -57,7 +57,8 @@ struct symbol {
u64 start;
u64 end;
u16 namelen;
- u8 binding;
+ u8 type:4;
+ u8 binding:4;
u8 idle:1;
u8 ignore:1;
u8 inlined:1;
@@ -259,17 +260,16 @@ int __dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map,
bool no_kcore);
int dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map);
-void dso__insert_symbol(struct dso *dso, enum map_type type,
+void dso__insert_symbol(struct dso *dso,
struct symbol *sym);
-struct symbol *dso__find_symbol(struct dso *dso, enum map_type type,
- u64 addr);
-struct symbol *dso__find_symbol_by_name(struct dso *dso, enum map_type type,
- const char *name);
+struct symbol *dso__find_symbol(struct dso *dso, u64 addr);
+struct symbol *dso__find_symbol_by_name(struct dso *dso, const char *name);
+
struct symbol *symbol__next_by_name(struct symbol *sym);
-struct symbol *dso__first_symbol(struct dso *dso, enum map_type type);
-struct symbol *dso__last_symbol(struct dso *dso, enum map_type type);
+struct symbol *dso__first_symbol(struct dso *dso);
+struct symbol *dso__last_symbol(struct dso *dso);
struct symbol *dso__next_symbol(struct symbol *sym);
enum dso_type dso__type_fd(int fd);
@@ -288,7 +288,7 @@ void symbol__exit(void);
void symbol__elf_init(void);
int symbol__annotation_init(void);
-struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name);
+struct symbol *symbol__new(u64 start, u64 len, u8 binding, u8 type, const char *name);
size_t __symbol__fprintf_symname_offs(const struct symbol *sym,
const struct addr_location *al,
bool unknown_as_addr,
@@ -300,7 +300,6 @@ size_t __symbol__fprintf_symname(const struct symbol *sym,
bool unknown_as_addr, FILE *fp);
size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp);
size_t symbol__fprintf(struct symbol *sym, FILE *fp);
-bool symbol_type__is_a(char symbol_type, enum map_type map_type);
bool symbol__restricted_filename(const char *filename,
const char *restricted_filename);
int symbol__config_symfs(const struct option *opt __maybe_unused,
@@ -308,8 +307,7 @@ int symbol__config_symfs(const struct option *opt __maybe_unused,
int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
struct symsrc *runtime_ss, int kmodule);
-int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss,
- struct map *map);
+int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss);
char *dso__demangle_sym(struct dso *dso, int kmodule, const char *elf_name);
@@ -317,7 +315,7 @@ void __symbols__insert(struct rb_root *symbols, struct symbol *sym, bool kernel)
void symbols__insert(struct rb_root *symbols, struct symbol *sym);
void symbols__fixup_duplicate(struct rb_root *symbols);
void symbols__fixup_end(struct rb_root *symbols);
-void __map_groups__fixup_end(struct map_groups *mg, enum map_type type);
+void map_groups__fixup_end(struct map_groups *mg);
typedef int (*mapfn_t)(u64 start, u64 len, u64 pgoff, void *data);
int file__read_maps(int fd, bool exe, mapfn_t mapfn, void *data,
diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c
index 6dd2cb88ccbe..ed0205cc7942 100644
--- a/tools/perf/util/symbol_fprintf.c
+++ b/tools/perf/util/symbol_fprintf.c
@@ -58,13 +58,13 @@ size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp)
}
size_t dso__fprintf_symbols_by_name(struct dso *dso,
- enum map_type type, FILE *fp)
+ FILE *fp)
{
size_t ret = 0;
struct rb_node *nd;
struct symbol_name_rb_node *pos;
- for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) {
+ for (nd = rb_first(&dso->symbol_names); nd; nd = rb_next(nd)) {
pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
fprintf(fp, "%s\n", pos->sym.name);
}
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 68b65b10579b..2048d393ece6 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -302,23 +302,20 @@ int thread__insert_map(struct thread *thread, struct map *map)
static int __thread__prepare_access(struct thread *thread)
{
bool initialized = false;
- int i, err = 0;
-
- for (i = 0; i < MAP__NR_TYPES; ++i) {
- struct maps *maps = &thread->mg->maps[i];
- struct map *map;
+ int err = 0;
+ struct maps *maps = &thread->mg->maps;
+ struct map *map;
- down_read(&maps->lock);
+ down_read(&maps->lock);
- for (map = maps__first(maps); map; map = map__next(map)) {
- err = unwind__prepare_access(thread, map, &initialized);
- if (err || initialized)
- break;
- }
-
- up_read(&maps->lock);
+ for (map = maps__first(maps); map; map = map__next(map)) {
+ err = unwind__prepare_access(thread, map, &initialized);
+ if (err || initialized)
+ break;
}
+ up_read(&maps->lock);
+
return err;
}
@@ -335,8 +332,6 @@ static int thread__prepare_access(struct thread *thread)
static int thread__clone_map_groups(struct thread *thread,
struct thread *parent)
{
- int i;
-
/* This is new thread, we share map groups for process. */
if (thread->pid_ == parent->pid_)
return thread__prepare_access(thread);
@@ -348,9 +343,8 @@ static int thread__clone_map_groups(struct thread *thread,
}
/* But this one is new process, copy maps. */
- for (i = 0; i < MAP__NR_TYPES; ++i)
- if (map_groups__clone(thread, parent->mg, i) < 0)
- return -ENOMEM;
+ if (map_groups__clone(thread, parent->mg) < 0)
+ return -ENOMEM;
return 0;
}
@@ -371,8 +365,7 @@ int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp)
return thread__clone_map_groups(thread, parent);
}
-void thread__find_cpumode_addr_location(struct thread *thread,
- enum map_type type, u64 addr,
+void thread__find_cpumode_addr_location(struct thread *thread, u64 addr,
struct addr_location *al)
{
size_t i;
@@ -384,7 +377,7 @@ void thread__find_cpumode_addr_location(struct thread *thread,
};
for (i = 0; i < ARRAY_SIZE(cpumodes); i++) {
- thread__find_addr_location(thread, cpumodes[i], type, addr, al);
+ thread__find_symbol(thread, cpumodes[i], addr, al);
if (al->map)
break;
}
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 14d44c3235b8..07606aa6998d 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -92,16 +92,13 @@ size_t thread__fprintf(struct thread *thread, FILE *fp);
struct thread *thread__main_thread(struct machine *machine, struct thread *thread);
-void thread__find_addr_map(struct thread *thread,
- u8 cpumode, enum map_type type, u64 addr,
- struct addr_location *al);
+struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr,
+ struct addr_location *al);
-void thread__find_addr_location(struct thread *thread,
- u8 cpumode, enum map_type type, u64 addr,
- struct addr_location *al);
+struct symbol *thread__find_symbol(struct thread *thread, u8 cpumode,
+ u64 addr, struct addr_location *al);
-void thread__find_cpumode_addr_location(struct thread *thread,
- enum map_type type, u64 addr,
+void thread__find_cpumode_addr_location(struct thread *thread, u64 addr,
struct addr_location *al);
static inline void *thread__priv(struct thread *thread)
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index d7f2113462fb..c85d0d1a65ed 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -103,11 +103,10 @@ out:
static int record_header_files(void)
{
- char *path;
+ char *path = get_events_file("header_page");
struct stat st;
int err = -EIO;
- path = get_tracing_file("events/header_page");
if (!path) {
pr_debug("can't get tracing/events/header_page");
return -ENOMEM;
@@ -128,9 +127,9 @@ static int record_header_files(void)
goto out;
}
- put_tracing_file(path);
+ put_events_file(path);
- path = get_tracing_file("events/header_event");
+ path = get_events_file("header_event");
if (!path) {
pr_debug("can't get tracing/events/header_event");
err = -ENOMEM;
@@ -154,7 +153,7 @@ static int record_header_files(void)
err = 0;
out:
- put_tracing_file(path);
+ put_events_file(path);
return err;
}
@@ -243,7 +242,7 @@ static int record_ftrace_files(struct tracepoint_path *tps)
char *path;
int ret;
- path = get_tracing_file("events/ftrace");
+ path = get_events_file("ftrace");
if (!path) {
pr_debug("can't get tracing/events/ftrace");
return -ENOMEM;
diff --git a/tools/perf/util/trace-event.c b/tools/perf/util/trace-event.c
index 16a776371d03..1aa368603268 100644
--- a/tools/perf/util/trace-event.c
+++ b/tools/perf/util/trace-event.c
@@ -75,6 +75,7 @@ void trace_event__cleanup(struct trace_event *t)
static struct event_format*
tp_format(const char *sys, const char *name)
{
+ char *tp_dir = get_events_file(sys);
struct pevent *pevent = tevent.pevent;
struct event_format *event = NULL;
char path[PATH_MAX];
@@ -82,8 +83,11 @@ tp_format(const char *sys, const char *name)
char *data;
int err;
- scnprintf(path, PATH_MAX, "%s/%s/%s/format",
- tracing_events_path, sys, name);
+ if (!tp_dir)
+ return ERR_PTR(-errno);
+
+ scnprintf(path, PATH_MAX, "%s/%s/format", tp_dir, name);
+ put_events_file(tp_dir);
err = filename__read_str(path, &data, &size);
if (err)
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 7bdd239c795c..538db4e5d1e6 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -28,10 +28,11 @@ static int __report_module(struct addr_location *al, u64 ip,
{
Dwfl_Module *mod;
struct dso *dso = NULL;
-
- thread__find_addr_location(ui->thread,
- PERF_RECORD_MISC_USER,
- MAP__FUNCTION, ip, al);
+ /*
+ * Some callers will use al->sym, so we can't just use the
+ * cheaper thread__find_map() here.
+ */
+ thread__find_symbol(ui->thread, PERF_RECORD_MISC_USER, ip, al);
if (al->map)
dso = al->map->dso;
@@ -103,19 +104,7 @@ static int access_dso_mem(struct unwind_info *ui, Dwarf_Addr addr,
struct addr_location al;
ssize_t size;
- thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER,
- MAP__FUNCTION, addr, &al);
- if (!al.map) {
- /*
- * We've seen cases (softice) where DWARF unwinder went
- * through non executable mmaps, which we need to lookup
- * in MAP__VARIABLE tree.
- */
- thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER,
- MAP__VARIABLE, addr, &al);
- }
-
- if (!al.map) {
+ if (!thread__find_map(ui->thread, PERF_RECORD_MISC_USER, addr, &al)) {
pr_debug("unwind: no map for %lx\n", (unsigned long)addr);
return -1;
}
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index af873044d33a..6a11bc7e6b27 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -366,19 +366,7 @@ static int read_unwind_spec_debug_frame(struct dso *dso,
static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
{
struct addr_location al;
-
- thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER,
- MAP__FUNCTION, ip, &al);
- if (!al.map) {
- /*
- * We've seen cases (softice) where DWARF unwinder went
- * through non executable mmaps, which we need to lookup
- * in MAP__VARIABLE tree.
- */
- thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER,
- MAP__VARIABLE, ip, &al);
- }
- return al.map;
+ return thread__find_map(ui->thread, PERF_RECORD_MISC_USER, ip, &al);
}
static int
@@ -586,12 +574,9 @@ static int entry(u64 ip, struct thread *thread,
struct unwind_entry e;
struct addr_location al;
- thread__find_addr_location(thread, PERF_RECORD_MISC_USER,
- MAP__FUNCTION, ip, &al);
-
+ e.sym = thread__find_symbol(thread, PERF_RECORD_MISC_USER, ip, &al);
e.ip = al.addr;
e.map = al.map;
- e.sym = al.sym;
pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
al.sym ? al.sym->name : "''",
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 1019bbc5dbd8..eac5b858a371 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -38,11 +38,43 @@ void perf_set_multithreaded(void)
}
unsigned int page_size;
-int cacheline_size;
+
+#ifdef _SC_LEVEL1_DCACHE_LINESIZE
+#define cache_line_size(cacheline_sizep) *cacheline_sizep = sysconf(_SC_LEVEL1_DCACHE_LINESIZE)
+#else
+static void cache_line_size(int *cacheline_sizep)
+{
+ if (sysfs__read_int("devices/system/cpu/cpu0/cache/index0/coherency_line_size", cacheline_sizep))
+ pr_debug("cannot determine cache line size");
+}
+#endif
+
+int cacheline_size(void)
+{
+ static int size;
+
+ if (!size)
+ cache_line_size(&size);
+
+ return size;
+}
int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH;
int sysctl_perf_event_max_contexts_per_stack = PERF_MAX_CONTEXTS_PER_STACK;
+int sysctl__max_stack(void)
+{
+ int value;
+
+ if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0)
+ sysctl_perf_event_max_stack = value;
+
+ if (sysctl__read_int("kernel/perf_event_max_contexts_per_stack", &value) == 0)
+ sysctl_perf_event_max_contexts_per_stack = value;
+
+ return sysctl_perf_event_max_stack;
+}
+
bool test_attr__enabled;
bool perf_host = true;
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index c9626c206208..dc58254a2b69 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -43,7 +43,9 @@ size_t hex_width(u64 v);
int hex2u64(const char *ptr, u64 *val);
extern unsigned int page_size;
-extern int cacheline_size;
+int __pure cacheline_size(void);
+
+int sysctl__max_stack(void);
int fetch_kernel_version(unsigned int *puint,
char *str, size_t str_sz);
diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c
index 0acb1ec0e2f0..741af209b19d 100644
--- a/tools/perf/util/vdso.c
+++ b/tools/perf/util/vdso.c
@@ -139,12 +139,10 @@ static enum dso_type machine__thread_dso_type(struct machine *machine,
struct thread *thread)
{
enum dso_type dso_type = DSO__TYPE_UNKNOWN;
- struct map *map;
- struct dso *dso;
+ struct map *map = map_groups__first(thread->mg);
- map = map_groups__first(thread->mg, MAP__FUNCTION);
for (; map ; map = map_groups__next(map)) {
- dso = map->dso;
+ struct dso *dso = map->dso;
if (!dso || dso->long_name[0] != '/')
continue;
dso_type = dso__type(dso, machine);
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
new file mode 100755
index 000000000000..98f650c9bf54
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+#
+# Invoke a text editor on all console.log files for all runs with diagnostics,
+# that is, on all such files having a console.log.diags counterpart.
+# Note that both console.log.diags and console.log are passed to the
+# editor (currently defaulting to "vi"), allowing the user to get an
+# idea of what to search for in the console.log file.
+#
+# Usage: kvm-find-errors.sh directory
+#
+# The "directory" above should end with the date/time directory, for example,
+# "tools/testing/selftests/rcutorture/res/2018.02.25-14:27:27".
+
+rundir="${1}"
+if test -z "$rundir" -o ! -d "$rundir"
+then
+ echo Usage: $0 directory
+fi
+editor=${EDITOR-vi}
+
+# Find builds with errors
+files=
+for i in ${rundir}/*/Make.out
+do
+ if egrep -q "error:|warning:" < $i
+ then
+ egrep "error:|warning:" < $i > $i.diags
+ files="$files $i.diags $i"
+ fi
+done
+if test -n "$files"
+then
+ $editor $files
+else
+ echo No build errors.
+fi
+if grep -q -e "--buildonly" < ${rundir}/log
+then
+ echo Build-only run, no console logs to check.
+fi
+
+# Find console logs with errors
+files=
+for i in ${rundir}/*/console.log
+do
+ if test -r $i.diags
+ then
+ files="$files $i.diags $i"
+ fi
+done
+if test -n "$files"
+then
+ $editor $files
+else
+ echo No errors in console logs.
+fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
index c2e1bb6d0cba..477ecb1293ab 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
@@ -34,11 +34,15 @@ fi
configfile=`echo $i | sed -e 's/^.*\///'`
ngps=`grep ver: $i/console.log 2> /dev/null | tail -1 | sed -e 's/^.* ver: //' -e 's/ .*$//'`
+stopstate="`grep 'End-test grace-period state: g' $i/console.log 2> /dev/null |
+ tail -1 | sed -e 's/^\[[ 0-9.]*] //' |
+ awk '{ print \"[\" $1 \" \" $5 \" \" $6 \" \" $7 \"]\"; }' |
+ tr -d '\012\015'`"
if test -z "$ngps"
then
- echo "$configfile -------"
+ echo "$configfile ------- " $stopstate
else
- title="$configfile ------- $ngps grace periods"
+ title="$configfile ------- $ngps GPs"
dur=`sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null`
if test -z "$dur"
then
@@ -46,9 +50,9 @@ else
else
ngpsps=`awk -v ngps=$ngps -v dur=$dur '
BEGIN { print ngps / dur }' < /dev/null`
- title="$title ($ngpsps per second)"
+ title="$title ($ngpsps/s)"
fi
- echo $title
+ echo $title $stopstate
nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'`
if test -z "$nclosecalls"
then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index f7e988f369dd..c27e97824163 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -48,10 +48,6 @@ do
cat $i/Make.oldconfig.err
fi
parse-build.sh $i/Make.out $configfile
- if test "$TORTURE_SUITE" != rcuperf
- then
- parse-torture.sh $i/console.log $configfile
- fi
parse-console.sh $i/console.log $configfile
if test -r $i/Warnings
then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 5f8fbb0d7c17..c5b0f94341d9 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -267,5 +267,4 @@ then
echo Unknown PID, cannot kill qemu command
fi
-parse-torture.sh $resdir/console.log $title
parse-console.sh $resdir/console.log $title
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 08aa7d50ae0e..17293436f551 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -24,57 +24,146 @@
#
# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+T=${TMPDIR-/tmp}/parse-console.sh.$$
file="$1"
title="$2"
+trap 'rm -f $T.seq $T.diags' 0
+
. functions.sh
+# Check for presence and readability of console output file
+if test -f "$file" -a -r "$file"
+then
+ :
+else
+ echo $title unreadable console output file: $file
+ exit 1
+fi
if grep -Pq '\x00' < $file
then
print_warning Console output contains nul bytes, old qemu still running?
fi
-egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags
-if test -s $1.diags
+cat /dev/null > $file.diags
+
+# Check for proper termination, except that rcuperf runs don't indicate this.
+if test "$TORTURE_SUITE" != rcuperf
then
- print_warning Assertion failure in $file $title
- # cat $1.diags
+ # check for abject failure
+
+ if grep -q FAILURE $file || grep -q -e '-torture.*!!!' $file
+ then
+ nerrs=`grep --binary-files=text '!!!' $file |
+ tail -1 |
+ awk '
+ {
+ for (i=NF-8;i<=NF;i++)
+ sum+=$i;
+ }
+ END { print sum }'`
+ print_bug $title FAILURE, $nerrs instances
+ exit
+ fi
+
+ grep --binary-files=text 'torture:.*ver:' $file |
+ egrep --binary-files=text -v '\(null\)|rtc: 000000000* ' |
+ sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' |
+ awk '
+ BEGIN {
+ ver = 0;
+ badseq = 0;
+ }
+
+ {
+ if (!badseq && ($5 + 0 != $5 || $5 <= ver)) {
+ badseqno1 = ver;
+ badseqno2 = $5;
+ badseqnr = NR;
+ badseq = 1;
+ }
+ ver = $5
+ }
+
+ END {
+ if (badseq) {
+ if (badseqno1 == badseqno2 && badseqno2 == ver)
+ print "GP HANG at " ver " torture stat " badseqnr;
+ else
+ print "BAD SEQ " badseqno1 ":" badseqno2 " last:" ver " version " badseqnr;
+ }
+ }' > $T.seq
+
+ if grep -q SUCCESS $file
+ then
+ if test -s $T.seq
+ then
+ print_warning $title `cat $T.seq`
+ echo " " $file
+ exit 2
+ fi
+ else
+ if grep -q "_HOTPLUG:" $file
+ then
+ print_warning HOTPLUG FAILURES $title `cat $T.seq`
+ echo " " $file
+ exit 3
+ fi
+ echo $title no success message, `grep --binary-files=text 'ver:' $file | wc -l` successful version messages
+ if test -s $T.seq
+ then
+ print_warning $title `cat $T.seq`
+ fi
+ exit 2
+ fi
+fi | tee -a $file.diags
+
+egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for' < $file |
+grep -v 'ODEBUG: ' |
+grep -v 'Warning: unable to open an initial console' > $T.diags
+if test -s $T.diags
+then
+ print_warning "Assertion failure in $file $title"
+ # cat $T.diags
summary=""
- n_badness=`grep -c Badness $1`
+ n_badness=`grep -c Badness $file`
if test "$n_badness" -ne 0
then
summary="$summary Badness: $n_badness"
fi
- n_warn=`grep -v 'Warning: unable to open an initial console' $1 | egrep -c 'WARNING:|Warn'`
+ n_warn=`grep -v 'Warning: unable to open an initial console' $file | egrep -c 'WARNING:|Warn'`
if test "$n_warn" -ne 0
then
summary="$summary Warnings: $n_warn"
fi
- n_bugs=`egrep -c 'BUG|Oops:' $1`
+ n_bugs=`egrep -c 'BUG|Oops:' $file`
if test "$n_bugs" -ne 0
then
summary="$summary Bugs: $n_bugs"
fi
- n_calltrace=`grep -c 'Call Trace:' $1`
+ n_calltrace=`grep -c 'Call Trace:' $file`
if test "$n_calltrace" -ne 0
then
summary="$summary Call Traces: $n_calltrace"
fi
- n_lockdep=`grep -c =========== $1`
+ n_lockdep=`grep -c =========== $file`
if test "$n_badness" -ne 0
then
summary="$summary lockdep: $n_badness"
fi
- n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $1`
+ n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $file`
if test "$n_stalls" -ne 0
then
summary="$summary Stalls: $n_stalls"
fi
- n_starves=`grep -c 'rcu_.*kthread starved for' $1`
+ n_starves=`grep -c 'rcu_.*kthread starved for' $file`
if test "$n_starves" -ne 0
then
summary="$summary Starves: $n_starves"
fi
print_warning Summary: $summary
-else
- rm $1.diags
+ cat $T.diags >> $file.diags
+fi
+if ! test -s $file.diags
+then
+ rm -f $file.diags
fi
diff --git a/tools/testing/selftests/rcutorture/bin/parse-torture.sh b/tools/testing/selftests/rcutorture/bin/parse-torture.sh
deleted file mode 100755
index 5987e50cfeb4..000000000000
--- a/tools/testing/selftests/rcutorture/bin/parse-torture.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-#
-# Check the console output from a torture run for goodness.
-# The "file" is a pathname on the local system, and "title" is
-# a text string for error-message purposes.
-#
-# The file must contain torture output, but can be interspersed
-# with other dmesg text, as in console-log output.
-#
-# Usage: parse-torture.sh file title
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, you can access it online at
-# http://www.gnu.org/licenses/gpl-2.0.html.
-#
-# Copyright (C) IBM Corporation, 2011
-#
-# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
-
-T=${TMPDIR-/tmp}/parse-torture.sh.$$
-file="$1"
-title="$2"
-
-trap 'rm -f $T.seq' 0
-
-. functions.sh
-
-# check for presence of torture output file.
-
-if test -f "$file" -a -r "$file"
-then
- :
-else
- echo $title unreadable torture output file: $file
- exit 1
-fi
-
-# check for abject failure
-
-if grep -q FAILURE $file || grep -q -e '-torture.*!!!' $file
-then
- nerrs=`grep --binary-files=text '!!!' $file | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'`
- print_bug $title FAILURE, $nerrs instances
- echo " " $url
- exit
-fi
-
-grep --binary-files=text 'torture:.*ver:' $file | egrep --binary-files=text -v '\(null\)|rtc: 000000000* ' | sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' |
-awk '
-BEGIN {
- ver = 0;
- badseq = 0;
- }
-
- {
- if (!badseq && ($5 + 0 != $5 || $5 <= ver)) {
- badseqno1 = ver;
- badseqno2 = $5;
- badseqnr = NR;
- badseq = 1;
- }
- ver = $5
- }
-
-END {
- if (badseq) {
- if (badseqno1 == badseqno2 && badseqno2 == ver)
- print "GP HANG at " ver " torture stat " badseqnr;
- else
- print "BAD SEQ " badseqno1 ":" badseqno2 " last:" ver " version " badseqnr;
- }
- }' > $T.seq
-
-if grep -q SUCCESS $file
-then
- if test -s $T.seq
- then
- print_warning $title $title `cat $T.seq`
- echo " " $file
- exit 2
- fi
-else
- if grep -q "_HOTPLUG:" $file
- then
- print_warning HOTPLUG FAILURES $title `cat $T.seq`
- echo " " $file
- exit 3
- fi
- echo $title no success message, `grep --binary-files=text 'ver:' $file | wc -l` successful version messages
- if test -s $T.seq
- then
- print_warning $title `cat $T.seq`
- fi
- exit 2
-fi
diff --git a/tools/virtio/linux/dma-mapping.h b/tools/virtio/linux/dma-mapping.h
index 1571e24e9494..f91aeb5fe571 100644
--- a/tools/virtio/linux/dma-mapping.h
+++ b/tools/virtio/linux/dma-mapping.h
@@ -6,8 +6,6 @@
# error Virtio userspace code does not support CONFIG_HAS_DMA
#endif
-#define PCI_DMA_BUS_IS_PHYS 1
-
enum dma_data_direction {
DMA_BIDIRECTIONAL = 0,
DMA_TO_DEVICE = 1,
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 7f6a944db23d..8d90de213ce9 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1401,6 +1401,7 @@ static void kvm_send_hwpoison_signal(unsigned long address,
{
siginfo_t info;
+ clear_siginfo(&info);
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_MCEERR_AR;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 6e865e8b5b10..90d30fbe95ae 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -397,7 +397,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
*/
- events = f.file->f_op->poll(f.file, &irqfd->pt);
+ events = vfs_poll(f.file, &irqfd->pt);
if (events & EPOLLIN)
schedule_work(&irqfd->inject);