Diffstat (limited to 'include')
109 files changed, 2355 insertions, 1004 deletions
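Several of the hunks below (aclinuxex.h, bpf.h, dma-fence-chain.h, hid_bpf.h) add the same explanatory comment: the specialized allocator wrappers have to stay macros so that each call site gets its own alloc_tag. The sketch that follows is illustrative only and is not part of the diff; it assumes the CONFIG_MEM_ALLOC_PROFILING behaviour where kmalloc()/kzalloc() are themselves macros expanding through alloc_hooks(), which defines a static alloc_tag at the expansion site. my_zalloc() and my_zalloc_fn() are invented names.

#include <linux/slab.h>

/*
 * Macro wrapper: kzalloc() is expanded separately in every caller, so
 * each caller of my_zalloc() gets its own allocation tag and shows up
 * individually in the allocation profiling output.
 */
#define my_zalloc(_size) \
	kzalloc(_size, GFP_KERNEL)

/*
 * Inline-function wrapper: kzalloc() is expanded exactly once, here, so
 * every caller of my_zalloc_fn() is accounted to this single line.
 */
static inline void *my_zalloc_fn(size_t size)
{
	return kzalloc(size, GFP_KERNEL);
}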
diff --git a/include/acpi/platform/aclinuxex.h b/include/acpi/platform/aclinuxex.h index 62cac266a1c8..eeff40295b4b 100644 --- a/include/acpi/platform/aclinuxex.h +++ b/include/acpi/platform/aclinuxex.h @@ -46,6 +46,9 @@ acpi_status acpi_os_terminate(void); * Interrupts are off during resume, just like they are for boot. * However, boot has (system_state != SYSTEM_RUNNING) * to quiet __might_sleep() in kmalloc() and resume does not. + * + * These specialized allocators have to be macros for their allocations to be + * accounted separately (to have separate alloc_tag). */ #define acpi_os_allocate(_size) \ kmalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) @@ -53,14 +56,14 @@ acpi_status acpi_os_terminate(void); #define acpi_os_allocate_zeroed(_size) \ kzalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) +#define acpi_os_acquire_object(_cache) \ + kmem_cache_zalloc(_cache, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) + static inline void acpi_os_free(void *memory) { kfree(memory); } -#define acpi_os_acquire_object(_cache) \ - kmem_cache_zalloc(_cache, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) - static inline acpi_thread_id acpi_os_get_thread_id(void) { return (acpi_thread_id) (unsigned long)current; diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 6dcf4d576970..594d5905f615 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -144,7 +144,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET -static inline pte_t huge_ptep_get(pte_t *ptep) +static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return ptep_get(ptep); } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 35245e9225a5..677315e51e54 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -141,14 +141,6 @@ * often happens at runtime) */ -#if defined(CONFIG_MEMORY_HOTPLUG) -#define MEM_KEEP(sec) *(.mem##sec) -#define MEM_DISCARD(sec) -#else -#define MEM_KEEP(sec) -#define MEM_DISCARD(sec) *(.mem##sec) -#endif - #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_NO_PATCHABLE #define KEEP_PATCHABLE KEEP(*(__patchable_function_entries)) #define PATCHABLE_DISCARDS @@ -357,7 +349,6 @@ *(.data..decrypted) \ *(.ref.data) \ *(.data..shared_aligned) /* percpu related */ \ - MEM_KEEP(init.data*) \ *(.data.unlikely) \ __start_once = .; \ *(.data.once) \ @@ -542,7 +533,6 @@ /* __*init sections */ \ __init_rodata : AT(ADDR(__init_rodata) - LOAD_OFFSET) { \ *(.ref.rodata) \ - MEM_KEEP(init.rodata) \ } \ \ /* Built-in module parameters. 
*/ \ @@ -593,8 +583,7 @@ *(.text.unknown .text.unknown.*) \ NOINSTR_TEXT \ *(.ref.text) \ - *(.text.asan.* .text.tsan.*) \ - MEM_KEEP(init.text*) \ + *(.text.asan.* .text.tsan.*) /* sched.text is aling to function alignment to secure we have same @@ -701,7 +690,6 @@ #define INIT_DATA \ KEEP(*(SORT(___kentry+*))) \ *(.init.data .init.data.*) \ - MEM_DISCARD(init.data*) \ KERNEL_CTORS() \ MCOUNT_REC() \ *(.init.rodata .init.rodata.*) \ @@ -709,7 +697,6 @@ TRACE_SYSCALLS() \ KPROBE_BLACKLIST() \ ERROR_INJECT_WHITELIST() \ - MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ TIMER_OF_TABLES() \ @@ -727,8 +714,7 @@ #define INIT_TEXT \ *(.init.text .init.text.*) \ - *(.text.startup) \ - MEM_DISCARD(init.text*) + *(.text.startup) #define EXIT_DATA \ *(.exit.data .exit.data.*) \ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e93059f71c71..f0b95c76c707 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -274,6 +274,9 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) return phys_id == PHYS_CPUID_INVALID; } + +int __init acpi_get_madt_revision(void); + /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); /* Processor _CTS control */ diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index abd24016a900..8c61ccd161ba 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -122,7 +122,7 @@ static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag "alloc_tag was not cleared (got tag for %s:%u)\n", ref->ct->filename, ref->ct->lineno); - WARN_ONCE(!tag, "current->alloc_tag not set"); + WARN_ONCE(!tag, "current->alloc_tag not set\n"); } static inline void alloc_tag_sub_check(union codetag_ref *ref) diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index c82d56768101..c6d18f50f671 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -212,6 +212,9 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; } extern const struct bus_type ffa_bus_type; +/* The FF-A 1.0 partition structure lacks the uuid[4] */ +#define FFA_1_0_PARTITON_INFO_SZ (8) + /* FFA transport related */ struct ffa_partition_info { u16 id; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h new file mode 100644 index 000000000000..dd831c269e99 --- /dev/null +++ b/include/linux/bio-integrity.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BIO_INTEGRITY_H +#define _LINUX_BIO_INTEGRITY_H + +#include <linux/bio.h> + +enum bip_flags { + BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ + BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ + BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ + BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ + BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ +}; + +struct bio_integrity_payload { + struct bio *bip_bio; /* parent bio */ + + struct bvec_iter bip_iter; + + unsigned short bip_vcnt; /* # of integrity bio_vecs */ + unsigned short bip_max_vcnt; /* integrity bio_vec slots */ + unsigned short bip_flags; /* control flags */ + + struct bvec_iter bio_iter; /* for rewinding parent bio */ + + struct work_struct bip_work; /* I/O completion */ + + struct bio_vec *bip_vec; + struct bio_vec bip_inline_vecs[];/* embedded bvec array */ +}; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + +#define bip_for_each_vec(bvl, bip, iter) \ + for_each_bvec(bvl, (bip)->bip_vec, 
iter, (bip)->bip_iter) + +#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ + for_each_bio(_bio) \ + bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_opf & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + +static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (bip) + return bip->bip_flags & flag; + + return false; +} + +static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) +{ + return bip->bip_iter.bi_sector; +} + +static inline void bip_set_seed(struct bio_integrity_payload *bip, + sector_t seed) +{ + bip->bip_iter.bi_sector = seed; +} + +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, + unsigned int nr); +int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, + unsigned int offset); +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); +void bio_integrity_unmap_user(struct bio *bio); +bool bio_integrity_prep(struct bio *bio); +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); +void bio_integrity_trim(struct bio *bio); +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); +int bioset_integrity_create(struct bio_set *bs, int pool_size); +void bioset_integrity_free(struct bio_set *bs); +void bio_integrity_init(void); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + return NULL; +} + +static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + return 0; +} + +static inline void bioset_integrity_free(struct bio_set *bs) +{ +} + +static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, + ssize_t len, u32 seed) +{ + return -EINVAL; +} + +static inline void bio_integrity_unmap_user(struct bio *bio) +{ +} + +static inline bool bio_integrity_prep(struct bio *bio) +{ + return true; +} + +static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + gfp_t gfp_mask) +{ + return 0; +} + +static inline void bio_integrity_advance(struct bio *bio, + unsigned int bytes_done) +{ +} + +static inline void bio_integrity_trim(struct bio *bio) +{ +} + +static inline void bio_integrity_init(void) +{ +} + +static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) +{ + return false; +} + +static inline struct bio_integrity_payload * +bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr) +{ + return ERR_PTR(-EINVAL); +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ +#endif /* _LINUX_BIO_INTEGRITY_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 818e93612947..a46e2047bea4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -321,69 +321,6 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) #define bio_for_each_folio_all(fi, bio) \ for (bio_first_folio(&fi, bio, 0); fi.folio; bio_next_folio(&fi, bio)) -enum bip_flags { - BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ - BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ - BIP_IP_CHECKSUM = 1 << 4, /* IP 
checksum */ - BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */ - BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */ -}; - -/* - * bio integrity payload - */ -struct bio_integrity_payload { - struct bio *bip_bio; /* parent bio */ - - struct bvec_iter bip_iter; - - unsigned short bip_vcnt; /* # of integrity bio_vecs */ - unsigned short bip_max_vcnt; /* integrity bio_vec slots */ - unsigned short bip_flags; /* control flags */ - - struct bvec_iter bio_iter; /* for rewinding parent bio */ - - struct work_struct bip_work; /* I/O completion */ - - struct bio_vec *bip_vec; - struct bio_vec bip_inline_vecs[];/* embedded bvec array */ -}; - -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_opf & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - -static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) -{ - struct bio_integrity_payload *bip = bio_integrity(bio); - - if (bip) - return bip->bip_flags & flag; - - return false; -} - -static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) -{ - return bip->bip_iter.bi_sector; -} - -static inline void bip_set_seed(struct bio_integrity_payload *bip, - sector_t seed) -{ - bip->bip_iter.bi_sector = seed; -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); @@ -721,99 +658,6 @@ static inline bool bioset_initialized(struct bio_set *bs) return bs->bio_slab != NULL; } -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -#define bip_for_each_vec(bvl, bip, iter) \ - for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) - -#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ - for_each_bio(_bio) \ - bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) - -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); -void bio_integrity_unmap_free_user(struct bio *bio); -extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); -extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); -extern bool bio_integrity_prep(struct bio *); -extern void bio_integrity_advance(struct bio *, unsigned int); -extern void bio_integrity_trim(struct bio *); -extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); -extern int bioset_integrity_create(struct bio_set *, int); -extern void bioset_integrity_free(struct bio_set *); -extern void bio_integrity_init(void); - -#else /* CONFIG_BLK_DEV_INTEGRITY */ - -static inline void *bio_integrity(struct bio *bio) -{ - return NULL; -} - -static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - return 0; -} - -static inline void bioset_integrity_free (struct bio_set *bs) -{ - return; -} - -static inline bool bio_integrity_prep(struct bio *bio) -{ - return true; -} - -static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask) -{ - return 0; -} - -static inline void bio_integrity_advance(struct bio *bio, - unsigned int bytes_done) -{ - return; -} - -static inline void bio_integrity_trim(struct bio *bio) -{ - return; -} - -static inline void bio_integrity_init(void) -{ - return; -} - -static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) -{ - return false; -} - -static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, - unsigned int nr) -{ - return 
ERR_PTR(-EINVAL); -} - -static inline int bio_integrity_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - return 0; -} - -static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len, u32 seed) -{ - return -EINVAL; -} -static inline void bio_integrity_unmap_free_user(struct bio *bio) -{ -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - /* * Mark a bio as polled. Note that for async polled IO, the caller must * expect -EWOULDBLOCK if we cannot allocate a request (or other resources). diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 804f856ed3e5..de98049b7ded 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -3,6 +3,7 @@ #define _LINUX_BLK_INTEGRITY_H #include <linux/blk-mq.h> +#include <linux/bio-integrity.h> struct request; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 89ba6b16fe8b..8d304b1d16b1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -27,38 +27,61 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); * request flags */ typedef __u32 __bitwise req_flags_t; -/* drive already may have started this one */ -#define RQF_STARTED ((__force req_flags_t)(1 << 1)) -/* request for flush sequence */ -#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) -/* merge of different types, fail separately */ -#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) -/* don't call prep for this one */ -#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) -/* use hctx->sched_tags */ -#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << 8)) -/* use an I/O scheduler for this request */ -#define RQF_USE_SCHED ((__force req_flags_t)(1 << 9)) -/* vaguely specified driver internal error. Ignored by the block layer */ -#define RQF_FAILED ((__force req_flags_t)(1 << 10)) -/* don't warn about errors */ -#define RQF_QUIET ((__force req_flags_t)(1 << 11)) -/* account into disk and partition IO statistics */ -#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) -/* runtime pm request */ -#define RQF_PM ((__force req_flags_t)(1 << 15)) -/* on IO scheduler merge hash */ -#define RQF_HASHED ((__force req_flags_t)(1 << 16)) -/* track IO completion time */ -#define RQF_STATS ((__force req_flags_t)(1 << 17)) -/* Look at ->special_vec for the actual data payload instead of the - bio chain. */ -#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -/* The request completion needs to be signaled to zone write pluging. */ -#define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20)) -/* ->timeout has been called, don't expire again */ -#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) -#define RQF_RESV ((__force req_flags_t)(1 << 23)) +/* Keep rqf_name[] in sync with the definitions below */ +enum { + /* drive already may have started this one */ + __RQF_STARTED, + /* request for flush sequence */ + __RQF_FLUSH_SEQ, + /* merge of different types, fail separately */ + __RQF_MIXED_MERGE, + /* don't call prep for this one */ + __RQF_DONTPREP, + /* use hctx->sched_tags */ + __RQF_SCHED_TAGS, + /* use an I/O scheduler for this request */ + __RQF_USE_SCHED, + /* vaguely specified driver internal error. 
Ignored by block layer */ + __RQF_FAILED, + /* don't warn about errors */ + __RQF_QUIET, + /* account into disk and partition IO statistics */ + __RQF_IO_STAT, + /* runtime pm request */ + __RQF_PM, + /* on IO scheduler merge hash */ + __RQF_HASHED, + /* track IO completion time */ + __RQF_STATS, + /* Look at ->special_vec for the actual data payload instead of the + bio chain. */ + __RQF_SPECIAL_PAYLOAD, + /* request completion needs to be signaled to zone write plugging. */ + __RQF_ZONE_WRITE_PLUGGING, + /* ->timeout has been called, don't expire again */ + __RQF_TIMED_OUT, + __RQF_RESV, + __RQF_BITS +}; + +#define RQF_STARTED ((__force req_flags_t)(1 << __RQF_STARTED)) +#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ)) +#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << __RQF_MIXED_MERGE)) +#define RQF_DONTPREP ((__force req_flags_t)(1 << __RQF_DONTPREP)) +#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << __RQF_SCHED_TAGS)) +#define RQF_USE_SCHED ((__force req_flags_t)(1 << __RQF_USE_SCHED)) +#define RQF_FAILED ((__force req_flags_t)(1 << __RQF_FAILED)) +#define RQF_QUIET ((__force req_flags_t)(1 << __RQF_QUIET)) +#define RQF_IO_STAT ((__force req_flags_t)(1 << __RQF_IO_STAT)) +#define RQF_PM ((__force req_flags_t)(1 << __RQF_PM)) +#define RQF_HASHED ((__force req_flags_t)(1 << __RQF_HASHED)) +#define RQF_STATS ((__force req_flags_t)(1 << __RQF_STATS)) +#define RQF_SPECIAL_PAYLOAD \ + ((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD)) +#define RQF_ZONE_WRITE_PLUGGING \ + ((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING)) +#define RQF_TIMED_OUT ((__force req_flags_t)(1 << __RQF_TIMED_OUT)) +#define RQF_RESV ((__force req_flags_t)(1 << __RQF_RESV)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -278,8 +301,12 @@ enum blk_eh_timer_return { BLK_EH_RESET_TIMER, }; -#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ -#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ +/* Keep alloc_policy_name[] in sync with the definitions below */ +enum { + BLK_TAG_ALLOC_FIFO, /* allocate starting from 0 */ + BLK_TAG_ALLOC_RR, /* allocate starting from last allocated tag */ + BLK_TAG_ALLOC_MAX +}; /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware @@ -644,6 +671,7 @@ struct blk_mq_ops { #endif }; +/* Keep hctx_flag_name[] in sync with the definitions below */ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, @@ -653,27 +681,17 @@ enum { */ BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, - BLK_MQ_F_BLOCKING = 1 << 5, + BLK_MQ_F_BLOCKING = 1 << 4, /* Do not allow an I/O scheduler to be configured. */ - BLK_MQ_F_NO_SCHED = 1 << 6, + BLK_MQ_F_NO_SCHED = 1 << 5, + /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. 
*/ - BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7, - BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, + BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6, + BLK_MQ_F_ALLOC_POLICY_START_BIT = 7, BLK_MQ_F_ALLOC_POLICY_BITS = 1, - - BLK_MQ_S_STOPPED = 0, - BLK_MQ_S_TAG_ACTIVE = 1, - BLK_MQ_S_SCHED_RESTART = 2, - - /* hw queue is inactive after all its CPUs become offline */ - BLK_MQ_S_INACTIVE = 3, - - BLK_MQ_MAX_DEPTH = 10240, - - BLK_MQ_CPU_WORK_BATCH = 8, }; #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ @@ -682,8 +700,19 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +#define BLK_MQ_MAX_DEPTH (10240) #define BLK_MQ_NO_HCTX_IDX (-1U) +enum { + /* Keep hctx_state_name[] in sync with the definitions below */ + BLK_MQ_S_STOPPED, + BLK_MQ_S_TAG_ACTIVE, + BLK_MQ_S_SCHED_RESTART, + /* hw queue is inactive after all its CPUs become offline */ + BLK_MQ_S_INACTIVE, + BLK_MQ_S_MAX +}; + struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 632edd71f8c6..36ed96133217 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -354,6 +354,7 @@ enum req_op { REQ_OP_LAST = (__force blk_opf_t)36, }; +/* Keep cmd_flag_name[] in sync with the definitions below */ enum req_flag_bits { __REQ_FAILFAST_DEV = /* no driver retries of device errors */ REQ_OP_BITS, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b8196e219ac2..e85ec73a07d5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -588,27 +588,28 @@ struct request_queue { }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ -#define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ -#define QUEUE_FLAG_DYING 1 /* queue being torn down */ -#define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ -#define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -#define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ -#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ -#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ -#define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ -#define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ -#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ -#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ -#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ -#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ +enum { + QUEUE_FLAG_DYING, /* queue being torn down */ + QUEUE_FLAG_NOMERGES, /* disable merge attempts */ + QUEUE_FLAG_SAME_COMP, /* complete on same CPU-group */ + QUEUE_FLAG_FAIL_IO, /* fake timeout */ + QUEUE_FLAG_NOXMERGES, /* No extended merges */ + QUEUE_FLAG_SAME_FORCE, /* force complete on same CPU */ + QUEUE_FLAG_INIT_DONE, /* queue is initialized */ + QUEUE_FLAG_STATS, /* track IO start and completion times */ + QUEUE_FLAG_REGISTERED, /* queue has been registered to a disk */ + QUEUE_FLAG_QUIESCED, /* queue has been quiesced */ + QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */ + QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ + QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ + QUEUE_FLAG_MAX +}; #define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void 
blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f1d4a97b9d1..3b94ec161e8c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -275,7 +275,7 @@ struct bpf_map { u32 btf_value_type_id; u32 btf_vmlinux_value_type_id; struct btf *btf; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; @@ -2253,7 +2253,7 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); @@ -2262,6 +2262,10 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags); #else +/* + * These specialized allocators have to be macros for their allocations to be + * accounted separately (to have separate alloc_tag). + */ #define bpf_map_kmalloc_node(_map, _size, _flags, _node) \ kmalloc_node(_size, _flags, _node) #define bpf_map_kzalloc(_map, _size, _flags) \ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e022e40b099e..14acf1bbe0ce 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -53,7 +53,7 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); * filesystem and block layers. Nowadays the basic I/O unit * is the bio, and buffer_heads are used for extracting block * mappings (via a get_block_t call), for tracking state within - * a page (via a page_mapping) and for wrapping bio submission + * a folio (via a folio_mapping) and for wrapping bio submission * for backward compatibility reasons (e.g. submit_bh). */ struct buffer_head { diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 3dde175f4108..108060612bb8 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -4,7 +4,7 @@ #include <linux/bitops.h> #include <linux/cpuhplock.h> -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/smp.h> struct device_node; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index b36690ca0d3f..ae04035b6cbe 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -539,9 +539,6 @@ struct cgroup { /* used to store eBPF programs */ struct cgroup_bpf bpf; - /* If there is block congestion on this cgroup. 
*/ - atomic_t congestion_count; - /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; @@ -681,9 +678,7 @@ struct cftype { __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); -#ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; -#endif }; /* diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2150ca60394b..c60ba0ab1462 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -10,7 +10,6 @@ */ #include <linux/sched.h> -#include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 9aac31d856f3..b0df28ddd394 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -12,7 +12,7 @@ #ifdef CONFIG_GENERIC_CLOCKEVENTS # include <linux/clocksource.h> -# include <linux/cpumask.h> +# include <linux/cpumask_types.h> # include <linux/ktime.h> # include <linux/notifier.h> diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 68a24a3a6979..2594553bb30b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -208,10 +208,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define data_race(expr) \ ({ \ - __unqual_scalar_typeof(({ expr; })) __v = ({ \ - __kcsan_disable_current(); \ - expr; \ - }); \ + __kcsan_disable_current(); \ + __auto_type __v = (expr); \ __kcsan_enable_current(); \ __v; \ }) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index a8926d0a28cd..bdcec1732445 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -16,7 +16,6 @@ #include <linux/node.h> #include <linux/compiler.h> -#include <linux/cpumask.h> #include <linux/cpuhotplug.h> #include <linux/cpuhplock.h> #include <linux/cpu_smt.h> diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h index a3bdc8a98f2c..2c774fb3c091 100644 --- a/include/linux/cpu_cooling.h +++ b/include/linux/cpu_cooling.h @@ -15,7 +15,6 @@ #include <linux/of.h> #include <linux/thermal.h> -#include <linux/cpumask.h> struct cpufreq_policy; diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h index cae324d10965..20b5729903d7 100644 --- a/include/linux/cpu_rmap.h +++ b/include/linux/cpu_rmap.h @@ -7,7 +7,7 @@ * Copyright 2011 Solarflare Communications Inc. */ -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/gfp.h> #include <linux/slab.h> #include <linux/kref.h> diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 954d4adc8f81..099e8b32dd68 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -9,25 +9,13 @@ */ #include <linux/cleanup.h> #include <linux/kernel.h> -#include <linux/threads.h> #include <linux/bitmap.h> +#include <linux/cpumask_types.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/gfp_types.h> #include <linux/numa.h> -/* Don't assign or return these: may not be this big! */ -typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; - -/** - * cpumask_bits - get the bits in a cpumask - * @maskp: the struct cpumask * - * - * You should only assume nr_cpu_ids bits of this mask are valid. This is - * a macro so it's const-correct. - */ -#define cpumask_bits(maskp) ((maskp)->bits) - /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed @@ -925,48 +913,7 @@ static inline unsigned int cpumask_size(void) return bitmap_size(large_cpumask_bits); } -/* - * cpumask_var_t: struct cpumask for stack usage. 
- * - * Oh, the wicked games we play! In order to make kernel coding a - * little more difficult, we typedef cpumask_var_t to an array or a - * pointer: doing &mask on an array is a noop, so it still works. - * - * i.e. - * cpumask_var_t tmpmask; - * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) - * return -ENOMEM; - * - * ... use 'tmpmask' like a normal struct cpumask * ... - * - * free_cpumask_var(tmpmask); - * - * - * However, one notable exception is there. alloc_cpumask_var() allocates - * only nr_cpumask_bits bits (in the other hand, real cpumask_t always has - * NR_CPUS bits). Therefore you don't have to dereference cpumask_var_t. - * - * cpumask_var_t tmpmask; - * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) - * return -ENOMEM; - * - * var = *tmpmask; - * - * This code makes NR_CPUS length memcopy and brings to a memory corruption. - * cpumask_copy() provide safe copy functionality. - * - * Note that there is another evil here: If you define a cpumask_var_t - * as a percpu variable then the way to obtain the address of the cpumask - * structure differently influences what this_cpu_* operation needs to be - * used. Please use this_cpu_cpumask_var_t in those cases. The direct use - * of this_cpu_ptr() or this_cpu_read() will lead to failures when the - * other type of cpumask_var_t implementation is configured. - * - * Please also note that __cpumask_var_read_mostly can be used to declare - * a cpumask_var_t variable itself (not its content) as read mostly. - */ #ifdef CONFIG_CPUMASK_OFFSTACK -typedef struct cpumask *cpumask_var_t; #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly @@ -1013,7 +960,6 @@ static inline bool cpumask_available(cpumask_var_t mask) } #else -typedef struct cpumask cpumask_var_t[1]; #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly diff --git a/include/linux/cpumask_types.h b/include/linux/cpumask_types.h new file mode 100644 index 000000000000..461ed1b6bcdb --- /dev/null +++ b/include/linux/cpumask_types.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_CPUMASK_TYPES_H +#define __LINUX_CPUMASK_TYPES_H + +#include <linux/bitops.h> +#include <linux/threads.h> + +/* Don't assign or return these: may not be this big! */ +typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; + +/** + * cpumask_bits - get the bits in a cpumask + * @maskp: the struct cpumask * + * + * You should only assume nr_cpu_ids bits of this mask are valid. This is + * a macro so it's const-correct. + */ +#define cpumask_bits(maskp) ((maskp)->bits) + +/* + * cpumask_var_t: struct cpumask for stack usage. + * + * Oh, the wicked games we play! In order to make kernel coding a + * little more difficult, we typedef cpumask_var_t to an array or a + * pointer: doing &mask on an array is a noop, so it still works. + * + * i.e. + * cpumask_var_t tmpmask; + * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + * return -ENOMEM; + * + * ... use 'tmpmask' like a normal struct cpumask * ... + * + * free_cpumask_var(tmpmask); + * + * + * However, one notable exception is there. alloc_cpumask_var() allocates + * only nr_cpumask_bits bits (in the other hand, real cpumask_t always has + * NR_CPUS bits). Therefore you don't have to dereference cpumask_var_t. + * + * cpumask_var_t tmpmask; + * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + * return -ENOMEM; + * + * var = *tmpmask; + * + * This code makes NR_CPUS length memcopy and brings to a memory corruption. 
+ * cpumask_copy() provide safe copy functionality. + * + * Note that there is another evil here: If you define a cpumask_var_t + * as a percpu variable then the way to obtain the address of the cpumask + * structure differently influences what this_cpu_* operation needs to be + * used. Please use this_cpu_cpumask_var_t in those cases. The direct use + * of this_cpu_ptr() or this_cpu_read() will lead to failures when the + * other type of cpumask_var_t implementation is configured. + * + * Please also note that __cpumask_var_read_mostly can be used to declare + * a cpumask_var_t variable itself (not its content) as read mostly. + */ +#ifdef CONFIG_CPUMASK_OFFSTACK +typedef struct cpumask *cpumask_var_t; +#else +typedef struct cpumask cpumask_var_t[1]; +#endif /* CONFIG_CPUMASK_OFFSTACK */ + +#endif /* __LINUX_CPUMASK_TYPES_H */ diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 9e8a032c1788..87f788c0d607 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -9,7 +9,9 @@ #include <linux/bitrev.h> u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len); u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len); /** * crc32_le_combine - Combine two crc32 check values into one. For two @@ -37,6 +39,7 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) } u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len); +u32 __pure __crc32c_le_base(u32 crc, unsigned char const *p, size_t len); /** * __crc32c_le_combine - Combine two crc32c check values into one. For two diff --git a/include/linux/damon.h b/include/linux/damon.h index f7da65e1ac04..27c546bfc6d4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -105,6 +105,8 @@ struct damon_target { * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. + * @DAMOS_MIGRATE_HOT: Migrate the regions prioritizing warmer regions. + * @DAMOS_MIGRATE_COLD: Migrate the regions prioritizing colder regions. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions * @@ -122,6 +124,8 @@ enum damos_action { DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, + DAMOS_MIGRATE_HOT, + DAMOS_MIGRATE_COLD, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; @@ -374,6 +378,7 @@ struct damos_access_pattern { * @apply_interval_us: The time between applying the @action. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. + * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. @@ -389,6 +394,10 @@ struct damos_access_pattern { * monitoring context are inactive, DAMON stops monitoring either, and just * repeatedly checks the watermarks. * + * @target_nid is used to set the migration target node for migrate_hot or + * migrate_cold actions, which means it's only meaningful when @action is either + * "migrate_hot" or "migrate_cold". 
+ * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect * &filters @@ -410,6 +419,9 @@ struct damos { /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; + union { + int target_nid; + }; struct list_head filters; struct damos_stat stat; struct list_head list; @@ -726,9 +738,11 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, unsigned long apply_interval_us, struct damos_quota *quota, - struct damos_watermarks *wmarks); + struct damos_watermarks *wmarks, + int target_nid); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); +int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src); struct damon_target *damon_new_target(void); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); @@ -742,6 +756,7 @@ void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); +int damon_commit_ctx(struct damon_ctx *old_ctx, struct damon_ctx *new_ctx); int damon_nr_running_ctxs(void); bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h index ad9e2506c2f4..68c3c1e41014 100644 --- a/include/linux/dma-fence-chain.h +++ b/include/linux/dma-fence-chain.h @@ -85,6 +85,10 @@ dma_fence_chain_contained(struct dma_fence *fence) * dma_fence_chain_alloc * * Returns a new struct dma_fence_chain object or NULL on failure. + * + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. 
*/ #define dma_fence_chain_alloc() \ ((struct dma_fence_chain *)kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL)) diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 6d5edef09d45..354413950d34 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -91,22 +91,19 @@ static inline void fault_config_init(struct fault_config *config, struct kmem_cache; -bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); - #ifdef CONFIG_FAIL_PAGE_ALLOC -bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); +bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); #else -static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return false; } #endif /* CONFIG_FAIL_PAGE_ALLOC */ -int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #ifdef CONFIG_FAILSLAB -extern bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags); +int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #else -static inline bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) +static inline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) { return false; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7f9691d375f0..f53f76e0b17e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -303,6 +303,8 @@ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); +struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else @@ -319,6 +321,11 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } +static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid) +{ + return folio_alloc_noprof(gfp, order); +} #define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ folio_alloc_noprof(gfp, order) #endif @@ -326,6 +333,7 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) #define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) +#define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 9ca96fc90449..d4d063cf63b5 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -228,6 +228,11 @@ static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} static inline int hid_bpf_device_init(struct hid_device *hid) { return 0; } +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. 
+ */ #define call_hid_bpf_rdesc_fixup(_hdev, _rdesc, _size) \ ((u8 *)kmemdup(_rdesc, *(_size), GFP_KERNEL)) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index a3028e400a9c..dd100e849f5e 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -131,22 +131,17 @@ static inline void __kunmap_atomic(const void *addr) preempt_enable(); } -unsigned int __nr_free_highpages(void); -extern atomic_long_t _totalhigh_pages; +unsigned long __nr_free_highpages(void); +unsigned long __totalhigh_pages(void); -static inline unsigned int nr_free_highpages(void) +static inline unsigned long nr_free_highpages(void) { return __nr_free_highpages(); } static inline unsigned long totalhigh_pages(void) { - return (unsigned long)atomic_long_read(&_totalhigh_pages); -} - -static inline void totalhigh_pages_add(long count) -{ - atomic_long_add(count, &_totalhigh_pages); + return __totalhigh_pages(); } static inline bool is_kmap_addr(const void *x) @@ -239,8 +234,8 @@ static inline void __kunmap_atomic(const void *addr) preempt_enable(); } -static inline unsigned int nr_free_highpages(void) { return 0; } -static inline unsigned long totalhigh_pages(void) { return 0UL; } +static inline unsigned long nr_free_highpages(void) { return 0; } +static inline unsigned long totalhigh_pages(void) { return 0; } static inline bool is_kmap_addr(const void *x) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 00341b56d291..930a591b9b61 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -179,7 +179,7 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ -static inline unsigned int nr_free_highpages(void); +static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE @@ -352,6 +352,9 @@ static inline int copy_mc_user_highpage(struct page *to, struct page *from, kunmap_local(vto); kunmap_local(vfrom); + if (ret) + memory_failure_queue(page_to_pfn(from), 0); + return ret; } @@ -368,6 +371,9 @@ static inline int copy_mc_highpage(struct page *to, struct page *from) kunmap_local(vto); kunmap_local(vfrom); + if (ret) + memory_failure_queue(page_to_pfn(from), 0); + return ret; } #else diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2aa986a5cd1b..cff002be83eb 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,7 @@ #include <linux/mm_types.h> #include <linux/fs.h> /* only for vma_is_dax() */ +#include <linux/kobject.h> vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -63,6 +64,7 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; +extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for anonymous THP; all orders up to @@ -126,18 +128,6 @@ static inline bool hugepage_global_always(void) (1<<TRANSPARENT_HUGEPAGE_FLAG); } -static inline bool hugepage_flags_enabled(void) -{ - /* - * We cover both the anon and the file-backed case here; we must return - * true if globally enabled, even when all anon sizes are set to never. - * So we don't need to look at huge_anon_orders_inherit. 
- */ - return hugepage_global_enabled() || - huge_anon_orders_always || - huge_anon_orders_madvise; -} - static inline int highest_order(unsigned long orders) { return fls_long(orders) - 1; @@ -265,12 +255,26 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } +struct thpsize { + struct kobject kobj; + struct list_head node; + int order; +}; + +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) + enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, + MTHP_STAT_SPLIT, + MTHP_STAT_SPLIT_FAILED, + MTHP_STAT_SPLIT_DEFERRED, __MTHP_STAT_COUNT }; @@ -415,6 +419,11 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } +void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, bool freeze, struct folio *folio); +bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct folio *folio); + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool folio_test_pmd_mappable(struct folio *folio) @@ -477,6 +486,16 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) {} +static inline void split_huge_pmd_locked(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + bool freeze, struct folio *folio) {} + +static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp, + struct folio *folio) +{ + return false; +} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) @@ -550,6 +569,16 @@ static inline bool thp_migration_supported(void) { return false; } + +static inline int highest_order(unsigned long orders) +{ + return 0; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2b3c3a404769..c9bf68c239a0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,12 +20,6 @@ struct user_struct; struct mmu_gather; struct node; -#ifndef CONFIG_ARCH_HAS_HUGEPD -typedef struct { unsigned long pd; } hugepd_t; -#define is_hugepd(hugepd) (0) -#define __hugepd(x) ((hugepd_t) { (x) }) -#endif - void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE @@ -616,47 +610,35 @@ static __always_inline \ bool folio_test_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ return test_bit(HPG_##flname, private); \ - } \ -static inline int HPage##uname(struct page *page) \ - { return test_bit(HPG_##flname, &(page->private)); } + } #define SETHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_set_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ set_bit(HPG_##flname, private); \ - } \ -static inline void SetHPage##uname(struct page *page) \ - { set_bit(HPG_##flname, &(page->private)); } + } #define CLEARHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_clear_hugetlb_##flname(struct folio *folio) \ { void *private = 
&folio->private; \ clear_bit(HPG_##flname, private); \ - } \ -static inline void ClearHPage##uname(struct page *page) \ - { clear_bit(HPG_##flname, &(page->private)); } + } #else #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ folio_test_hugetlb_##flname(struct folio *folio) \ - { return 0; } \ -static inline int HPage##uname(struct page *page) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ static inline void \ folio_set_hugetlb_##flname(struct folio *folio) \ - { } \ -static inline void SetHPage##uname(struct page *page) \ { } #define CLEARHPAGEFLAG(uname, flname) \ static inline void \ folio_clear_hugetlb_##flname(struct folio *folio) \ - { } \ -static inline void ClearHPage##uname(struct page *page) \ { } #endif @@ -681,6 +663,7 @@ HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) /* Defines one hugetlb page size */ struct hstate { struct mutex resize_lock; + struct lock_class_key resize_key; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; @@ -698,11 +681,6 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_CGROUP_HUGETLB - /* cgroup control files */ - struct cftype cgroup_files_dfl[8]; - struct cftype cgroup_files_legacy[10]; -#endif char name[HSTATE_NAME_LEN]; }; diff --git a/include/linux/init.h b/include/linux/init.h index 58cef4c2e59a..ee1309473bc6 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -84,11 +84,15 @@ #define __exit __section(".exit.text") __exitused __cold notrace -/* Used for MEMORY_HOTPLUG */ -#define __meminit __section(".meminit.text") __cold notrace \ - __latent_entropy -#define __meminitdata __section(".meminit.data") -#define __meminitconst __section(".meminit.rodata") +#ifdef CONFIG_MEMORY_HOTPLUG +#define __meminit +#define __meminitdata +#define __meminitconst +#else +#define __meminit __init +#define __meminitdata __initdata +#define __meminitconst __initconst +#endif /* For assembly routines */ #define __HEAD .section ".head.text","ax" @@ -99,10 +103,6 @@ #define __INITRODATA .section ".init.rodata","a",%progbits #define __FINITDATA .previous -#define __MEMINIT .section ".meminit.text", "ax" -#define __MEMINITDATA .section ".meminit.data", "aw" -#define __MEMINITRODATA .section ".meminit.rodata", "a" - /* silence warnings when references are OK */ #define __REF .section ".ref.text", "ax" #define __REFDATA .section ".ref.data", "aw" diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 3a36e64119c8..3f30c88e0b4c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -6,13 +6,13 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cleanup.h> -#include <linux/cpumask.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/hrtimer.h> #include <linux/kref.h> +#include <linux/cpumask_types.h> #include <linux/workqueue.h> #include <linux/jump_label.h> @@ -169,7 +169,7 @@ static inline int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev) { - return request_threaded_irq(irq, handler, NULL, flags, name, dev); + return request_threaded_irq(irq, handler, NULL, flags | IRQF_COND_ONESHOT, name, dev); } extern int __must_check diff --git a/include/linux/ioport.h b/include/linux/ioport.h index db7fe25f3370..6e9fb667a1c5 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -188,6 +188,42 @@ 
enum { #define DEFINE_RES_DMA(_dma) \ DEFINE_RES_DMA_NAMED((_dma), NULL) +/** + * typedef resource_alignf - Resource alignment callback + * @data: Private data used by the callback + * @res: Resource candidate range (an empty resource space) + * @size: The minimum size of the empty space + * @align: Alignment from the constraints + * + * Callback allows calculating resource placement and alignment beyond min, + * max, and align fields in the struct resource_constraint. + * + * Return: Start address for the resource. + */ +typedef resource_size_t (*resource_alignf)(void *data, + const struct resource *res, + resource_size_t size, + resource_size_t align); + +/** + * struct resource_constraint - constraints to be met while searching empty + * resource space + * @min: The minimum address for the memory range + * @max: The maximum address for the memory range + * @align: Alignment for the start address of the empty space + * @alignf: Additional alignment constraints callback + * @alignf_data: Data provided for @alignf callback + * + * Contains the range and alignment constraints that have to be met during + * find_resource_space(). @alignf can be NULL indicating no alignment beyond + * @align is necessary. + */ +struct resource_constraint { + resource_size_t min, max, align; + resource_alignf alignf; + void *alignf_data; +}; + /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; @@ -207,10 +243,7 @@ extern void arch_remove_reservations(struct resource *avail); extern int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), + resource_alignf alignf, void *alignf_data); struct resource *lookup_resource(struct resource *root, resource_size_t start); int adjust_resource(struct resource *res, resource_size_t start, @@ -264,6 +297,9 @@ static inline bool resource_union(const struct resource *r1, const struct resour return true; } +int find_resource_space(struct resource *root, struct resource *new, + resource_size_t size, struct resource_constraint *constraint); + /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) #define request_muxed_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), IORESOURCE_MUXED) diff --git a/include/linux/irq.h b/include/linux/irq.h index a217e1029c1d..1f5dbf1f92c9 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1106,6 +1106,7 @@ enum irq_gc_flags { * @irq_flags_to_set: IRQ* flags to set on irq setup * @irq_flags_to_clear: IRQ* flags to clear on irq setup * @gc_flags: Generic chip specific setup flags + * @exit: Function called on each chip when they are destroyed. 
* @gc: Array of pointers to generic interrupt chips */ struct irq_domain_chip_generic { @@ -1114,9 +1115,37 @@ struct irq_domain_chip_generic { unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; + void (*exit)(struct irq_chip_generic *gc); struct irq_chip_generic *gc[]; }; +/** + * struct irq_domain_chip_generic_info - Generic chip information structure + * @name: Name of the generic interrupt chip + * @handler: Interrupt handler used by the generic interrupt chip + * @irqs_per_chip: Number of interrupts each chip handles (max 32) + * @num_ct: Number of irq_chip_type instances associated with each + * chip + * @irq_flags_to_clear: IRQ_* bits to clear in the mapping function + * @irq_flags_to_set: IRQ_* bits to set in the mapping function + * @gc_flags: Generic chip specific setup flags + * @init: Function called on each chip when they are created. + * Allow to do some additional chip initialisation. + * @exit: Function called on each chip when they are destroyed. + * Allow to do some chip cleanup operation. + */ +struct irq_domain_chip_generic_info { + const char *name; + irq_flow_handler_t handler; + unsigned int irqs_per_chip; + unsigned int num_ct; + unsigned int irq_flags_to_clear; + unsigned int irq_flags_to_set; + enum irq_gc_flags gc_flags; + int (*init)(struct irq_chip_generic *gc); + void (*exit)(struct irq_chip_generic *gc); +}; + /* Generic chip callback functions */ void irq_gc_noop(struct irq_data *d); void irq_gc_mask_disable_reg(struct irq_data *d); @@ -1153,6 +1182,20 @@ int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); +#ifdef CONFIG_GENERIC_IRQ_CHIP +int irq_domain_alloc_generic_chips(struct irq_domain *d, + const struct irq_domain_chip_generic_info *info); +void irq_domain_remove_generic_chips(struct irq_domain *d); +#else +static inline int +irq_domain_alloc_generic_chips(struct irq_domain *d, + const struct irq_domain_chip_generic_info *info) +{ + return -EINVAL; +} +static inline void irq_domain_remove_generic_chips(struct irq_domain *d) { } +#endif /* CONFIG_GENERIC_IRQ_CHIP */ + int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, int num_ct, const char *name, irq_flow_handler_t handler, diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 2c63375bbd43..ecabed6d3307 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -25,6 +25,14 @@ struct its_vm { irq_hw_number_t db_lpi_base; unsigned long *db_bitmap; int nr_db_lpis; + /* + * Ensures mutual exclusion between updates to vlpi_count[] + * and map/unmap when using the ITSList mechanism. + * + * The lock order for any sequence involving the ITSList is + * vmapp_lock -> vpe_lock ->vmovp_lock. 
+ */ + raw_spinlock_t vmapp_lock; u32 vlpi_count[GICv4_ITS_LIST_MAX]; }; diff --git a/include/linux/irqchip/irq-partition-percpu.h b/include/linux/irqchip/irq-partition-percpu.h index 2f6ae7551748..b35ee22c278f 100644 --- a/include/linux/irqchip/irq-partition-percpu.h +++ b/include/linux/irqchip/irq-partition-percpu.h @@ -8,7 +8,7 @@ #define __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H #include <linux/fwnode.h> -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/irqdomain.h> struct partition_affinity { diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 21ecf582a0fe..de6105f68fec 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -74,11 +74,24 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, * struct irq_domain_ops - Methods for irq_domain objects * @match: Match an interrupt controller device node to a host, returns * 1 on a match + * @select: Match an interrupt controller fw specification. It is more generic + * than @match as it receives a complete struct irq_fwspec. Therefore, + * @select is preferred if provided. Returns 1 on a match. * @map: Create or update a mapping between a virtual irq number and a hw * irq number. This is called only once for a given mapping. * @unmap: Dispose of such a mapping * @xlate: Given a device tree node and interrupt specifier, decode * the hardware irq number and linux irq type value. + * @alloc: Allocate @nr_irqs interrupts starting from @virq. + * @free: Free @nr_irqs interrupts starting from @virq. + * @activate: Activate one interrupt in HW (@irqd). If @reserve is set, only + * reserve the vector. If unset, assign the vector (called from + * request_irq()). + * @deactivate: Disarm one interrupt (@irqd). + * @translate: Given @fwspec, decode the hardware irq number (@out_hwirq) and + * linux irq type value (@out_type). This is a generalised @xlate + * (over struct irq_fwspec) and is preferred if provided. + * @debug_show: For domains to show specific data for an interrupt in debugfs. * * Functions below are provided by the driver and called whenever a new mapping * is created or an old mapping is disposed. The driver can then proceed to @@ -131,6 +144,9 @@ struct irq_domain_chip_generic; * Optional elements: * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy * to swap it for the of_node via the irq_domain_get_of_node accessor + * @bus_token: @fwnode's device_node might be used for several irq domains. But + * in connection with @bus_token, the pair shall be unique in a + * system. * @gc: Pointer to a list of generic chips. There is a helper function for * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. @@ -141,9 +157,12 @@ struct irq_domain_chip_generic; * purposes related to the irq domain. * @parent: Pointer to parent irq_domain to support hierarchy irq_domains * @msi_parent_ops: Pointer to MSI parent domain methods for per device domain init + * @exit: Function called when the domain is destroyed * * Revmap data, used internally by the irq domain code: - * @revmap_size: Size of the linear map table @revmap[] + * @hwirq_max: Top limit for the HW irq number. Especially to avoid + * conflicts/failures with reserved HW irqs. Can be ~0. 
+ * @revmap_size: Size of the linear map table @revmap * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map * @revmap: Linear table of irq_data pointers */ @@ -169,6 +188,7 @@ struct irq_domain { #ifdef CONFIG_GENERIC_MSI_IRQ const struct msi_parent_ops *msi_parent_ops; #endif + void (*exit)(struct irq_domain *d); /* reverse map data. The linear map gets appended to the irq_domain */ irq_hw_number_t hwirq_max; @@ -182,7 +202,7 @@ enum { /* Irq domain is hierarchical */ IRQ_DOMAIN_FLAG_HIERARCHY = (1 << 0), - /* Irq domain name was allocated in __irq_domain_add() */ + /* Irq domain name was allocated internally */ IRQ_DOMAIN_NAME_ALLOCATED = (1 << 1), /* Irq domain is an IPI domain with virq per cpu */ @@ -208,6 +228,9 @@ enum { /* Irq domain is a MSI device domain */ IRQ_DOMAIN_FLAG_MSI_DEVICE = (1 << 9), + /* Irq domain must destroy generic chips when removed */ + IRQ_DOMAIN_FLAG_DESTROY_GC = (1 << 10), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -257,10 +280,51 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) } void irq_domain_free_fwnode(struct fwnode_handle *fwnode); -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, - irq_hw_number_t hwirq_max, int direct_max, - const struct irq_domain_ops *ops, - void *host_data); + +struct irq_domain_chip_generic_info; + +/** + * struct irq_domain_info - Domain information structure + * @fwnode: firmware node for the interrupt controller + * @domain_flags: Additional flags to add to the domain flags + * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller + * @direct_max: Maximum value of direct maps; + * Use ~0 for no limit; 0 for no direct mapping + * @bus_token: Domain bus token + * @ops: Domain operation callbacks + * @host_data: Controller private data pointer + * @dgc_info: Generic chip information structure pointer used to + * create generic chips for the domain if not NULL. + * @init: Function called when the domain is created. + * Allow to do some additional domain initialisation. + * @exit: Function called when the domain is destroyed. + * Allow to do some additional cleanup operation.
+ */ +struct irq_domain_info { + struct fwnode_handle *fwnode; + unsigned int domain_flags; + unsigned int size; + irq_hw_number_t hwirq_max; + int direct_max; + enum irq_domain_bus_token bus_token; + const struct irq_domain_ops *ops; + void *host_data; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + /** + * @parent: Pointer to the parent irq domain used in a hierarchy domain + */ + struct irq_domain *parent; +#endif + struct irq_domain_chip_generic_info *dgc_info; + int (*init)(struct irq_domain *d); + void (*exit)(struct irq_domain *d); +}; + +struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info); +struct irq_domain *devm_irq_domain_instantiate(struct device *dev, + const struct irq_domain_info *info); + struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, @@ -293,7 +357,7 @@ static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) extern const struct fwnode_operations irqchip_fwnode_ops; -static inline bool is_fwnode_irqchip(struct fwnode_handle *fwnode) +static inline bool is_fwnode_irqchip(const struct fwnode_handle *fwnode) { return fwnode && fwnode->ops == &irqchip_fwnode_ops; } @@ -350,7 +414,17 @@ static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_no const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data); + struct irq_domain_info info = { + .fwnode = of_node_to_fwnode(of_node), + .size = size, + .hwirq_max = size, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; } #ifdef CONFIG_IRQ_DOMAIN_NOMAP @@ -359,7 +433,17 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(of_node_to_fwnode(of_node), 0, max_irq, max_irq, ops, host_data); + struct irq_domain_info info = { + .fwnode = of_node_to_fwnode(of_node), + .hwirq_max = max_irq, + .direct_max = max_irq, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; } extern unsigned int irq_create_direct_mapping(struct irq_domain *host); @@ -369,7 +453,16 @@ static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(of_node_to_fwnode(of_node), 0, ~0, 0, ops, host_data); + struct irq_domain_info info = { + .fwnode = of_node_to_fwnode(of_node), + .hwirq_max = ~0U, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; } static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle *fwnode, @@ -377,14 +470,33 @@ static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle * const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(fwnode, size, size, 0, ops, host_data); + struct irq_domain_info info = { + .fwnode = fwnode, + .size = size, + .hwirq_max = size, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? 
NULL : d; } static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fwnode, const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(fwnode, 0, ~0, 0, ops, host_data); + struct irq_domain_info info = { + .fwnode = fwnode, + .hwirq_max = ~0, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; } extern void irq_domain_remove(struct irq_domain *host); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b900c642210c..5157d92b6f23 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1595,6 +1595,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh); */ extern struct kmem_cache *jbd2_handle_cache; +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ #define jbd2_alloc_handle(_gfp_flags) \ ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags)) @@ -1609,6 +1614,11 @@ static inline void jbd2_free_handle(handle_t *handle) */ extern struct kmem_cache *jbd2_inode_cache; +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ #define jbd2_alloc_inode(_gfp_flags) \ ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags)) diff --git a/include/linux/jhash.h b/include/linux/jhash.h index ab7f8c152b89..fa26a2dd3b52 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -31,7 +31,7 @@ /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) -/* __jhash_mix -- mix 3 32-bit values reversibly. */ +/* __jhash_mix - mix 3 32-bit values reversibly. */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ @@ -60,7 +60,7 @@ /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key - * @initval: the previous hash, or an arbitray value + * @initval: the previous hash, or an arbitrary value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. @@ -110,7 +110,7 @@ static inline u32 jhash(const void *key, u32 length, u32 initval) /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key - * @initval: the previous hash, or an arbitray value + * @initval: the previous hash, or an arbitrary value * * Returns the hash value of the key. */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 9c042c6384bb..b97ce2df376f 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -5,7 +5,6 @@ #include <linux/smp.h> #include <linux/threads.h> #include <linux/percpu.h> -#include <linux/cpumask.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/vtime.h> diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e0c23a32cdf0..2b1432cc16d5 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -230,6 +230,67 @@ void kmsan_handle_urb(const struct urb *urb, bool is_out); */ void kmsan_unpoison_entry_regs(const struct pt_regs *regs); +/** + * kmsan_get_metadata() - Return a pointer to KMSAN shadow or origins. + * @addr: kernel address. + * @is_origin: whether to return origins or shadow. 
+ * + * Return NULL if metadata cannot be found. + */ +void *kmsan_get_metadata(void *addr, bool is_origin); + +/** + * kmsan_enable_current(): Enable KMSAN for the current task. + * + * Each kmsan_enable_current() current call must be preceded by a + * kmsan_disable_current() call. These call pairs may be nested. + */ +void kmsan_enable_current(void); + +/** + * kmsan_disable_current(): Disable KMSAN for the current task. + * + * Each kmsan_disable_current() current call must be followed by a + * kmsan_enable_current() call. These call pairs may be nested. + */ +void kmsan_disable_current(void); + +/** + * memset_no_sanitize_memory(): Fill memory without KMSAN instrumentation. + * @s: address of kernel memory to fill. + * @c: constant byte to fill the memory with. + * @n: number of bytes to fill. + * + * This is like memset(), but without KMSAN instrumentation. + */ +static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) +{ + return __memset(s, c, n); +} + +extern bool kmsan_enabled; +extern int panic_on_kmsan; + +/* + * KMSAN performs a lot of consistency checks that are currently enabled by + * default. BUG_ON is normally discouraged in the kernel, unless used for + * debugging, but KMSAN itself is a debugging tool, so it makes little sense to + * recover if something goes wrong. + */ +#define KMSAN_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) { \ + WRITE_ONCE(kmsan_enabled, false); \ + if (panic_on_kmsan) { \ + /* Can't call panic() here because */ \ + /* of uaccess checks. */ \ + BUG(); \ + } \ + } \ + __cond; \ + }) + #else static inline void kmsan_init_shadow(void) @@ -329,6 +390,21 @@ static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) { } +static inline void kmsan_enable_current(void) +{ +} + +static inline void kmsan_disable_current(void) +{ +} + +static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) +{ + return memset(s, c, n); +} + +#define KMSAN_WARN_ON WARN_ON + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h index 929287981afe..dfc59918b3c0 100644 --- a/include/linux/kmsan_types.h +++ b/include/linux/kmsan_types.h @@ -31,7 +31,7 @@ struct kmsan_context_state { struct kmsan_ctx { struct kmsan_context_state cstate; int kmsan_in_runtime; - bool allow_reporting; + unsigned int depth; }; #endif /* _LINUX_KMSAN_TYPES_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 692c01e41a18..689e8be873a7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -378,8 +378,10 @@ struct kvm_vcpu { bool dy_eligible; } spin_loop; #endif + bool wants_to_run; bool preempted; bool ready; + bool scheduled_out; struct kvm_vcpu_arch arch; struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; @@ -1494,8 +1496,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); -void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); @@ -1955,8 +1955,6 @@ struct _kvm_stats_desc { HALT_POLL_HIST_COUNT), \ STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) -extern struct dentry *kvm_debugfs_dir; - ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, @@ -2096,6 +2094,7 @@ int 
kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); +int kvm_init_irq_routing(struct kvm *kvm); int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); @@ -2105,6 +2104,11 @@ void kvm_free_irq_routing(struct kvm *kvm); static inline void kvm_free_irq_routing(struct kvm *kvm) {} +static inline int kvm_init_irq_routing(struct kvm *kvm) +{ + return 0; +} + #endif int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); @@ -2441,4 +2445,45 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, } #endif /* CONFIG_KVM_PRIVATE_MEM */ +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); +bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); +#endif + +/** + * kvm_gmem_populate() - Populate/prepare a GPA range with guest data + * + * @kvm: KVM instance + * @gfn: starting GFN to be populated + * @src: userspace-provided buffer containing data to copy into GFN range + * (passed to @post_populate, and incremented on each iteration + * if not NULL) + * @npages: number of pages to copy from userspace-buffer + * @post_populate: callback to issue for each gmem page that backs the GPA + * range + * @opaque: opaque data to pass to @post_populate callback + * + * This is primarily intended for cases where a gmem-backed GPA range needs + * to be initialized with userspace-provided data prior to being mapped into + * the guest as a private page. This should be called with the slots->lock + * held so that caller-enforced invariants regarding the expected memory + * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES. + * + * Returns the number of pages that were populated. + */ +typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, int order, void *opaque); + +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, + kvm_gmem_populate_cb post_populate, void *opaque); + +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +#endif + +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range); +#endif + #endif diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 792b67ceb631..5099a8ccd5f4 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -50,7 +50,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct list_head list; int shrinker_id; bool memcg_aware; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 45cac33334c8..fc4d75c6cec3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -316,8 +316,6 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, for (; i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) -int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); - #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 030d34e9d117..7e2eb091049a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -69,18 +69,6 @@ struct mem_cgroup_id { refcount_t ref; }; -/* - * Per memcg event counter is incremented at every pagein/pageout. With THP, - * it will be incremented by the number of pages. 
This counter is used - * to trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. - */ -enum mem_cgroup_events_target { - MEM_CGROUP_TARGET_THRESH, - MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_NTARGETS, -}; - struct memcg_vmstats_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; @@ -96,23 +84,33 @@ struct mem_cgroup_reclaim_iter { * per-node information in memory controller. */ struct mem_cgroup_per_node { - struct lruvec lruvec; + /* Keep the read-only fields at the start */ + struct mem_cgroup *memcg; /* Back pointer, we cannot */ + /* use container_of */ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; struct lruvec_stats *lruvec_stats; - - unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; - - struct mem_cgroup_reclaim_iter iter; - struct shrinker_info __rcu *shrinker_info; +#ifdef CONFIG_MEMCG_V1 + /* + * Memcg-v1 only stuff in middle as buffer between read mostly fields + * and update often fields to avoid false sharing. If v1 stuff is + * not present, an explicit padding is needed. + */ + struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; - struct mem_cgroup *memcg; /* Back pointer, we cannot */ - /* use container_of */ +#else + CACHELINE_PADDING(_pad1_); +#endif + + /* Fields which get updated often at the end. */ + struct lruvec lruvec; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; + struct mem_cgroup_reclaim_iter iter; }; struct mem_cgroup_threshold { @@ -194,14 +192,10 @@ struct mem_cgroup { struct page_counter memsw; /* v1 only */ }; - /* Legacy consumer-oriented counters */ - struct page_counter kmem; /* v1 only */ - struct page_counter tcpmem; /* v1 only */ - /* Range enforcement for interrupt charges */ struct work_struct high_work; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP unsigned long zswap_max; /* @@ -211,8 +205,6 @@ struct mem_cgroup { bool zswap_writeback; #endif - unsigned long soft_limit; - /* vmpressure notifications */ struct vmpressure vmpressure; @@ -221,13 +213,7 @@ struct mem_cgroup { */ bool oom_group; - /* protected by memcg_oom_lock */ - bool oom_lock; - int under_oom; - - int swappiness; - /* OOM-Killer disable */ - int oom_kill_disable; + int swappiness; /* memory.events and memory.events.local */ struct cgroup_file events_file; @@ -236,29 +222,6 @@ struct mem_cgroup { /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; - /* protect arrays of thresholds */ - struct mutex thresholds_lock; - - /* thresholds for memory usage. RCU-protected */ - struct mem_cgroup_thresholds thresholds; - - /* thresholds for mem+swap usage. RCU-protected */ - struct mem_cgroup_thresholds memsw_thresholds; - - /* For oom notifier event fd */ - struct list_head oom_notify; - - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? 
- */ - unsigned long move_charge_at_immigrate; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; - unsigned long move_lock_flags; - - CACHELINE_PADDING(_pad1_); - /* memory.stat */ struct memcg_vmstats *vmstats; @@ -273,11 +236,6 @@ struct mem_cgroup { */ unsigned long socket_pressure; - /* Legacy tcp memory accounting */ - bool tcpmem_active; - int tcpmem_pressure; - -#ifdef CONFIG_MEMCG_KMEM int kmemcg_id; /* * memcg->objcg is wiped out as a part of the objcg repaprenting @@ -288,15 +246,6 @@ struct mem_cgroup { struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; -#endif - - CACHELINE_PADDING(_pad2_); - - /* - * set > 0 if pages under this cgroup are moving to other cgroup. - */ - atomic_t moving_account; - struct task_struct *move_lock_task; struct memcg_vmstats_percpu __percpu *vmstats_percpu; @@ -306,10 +255,6 @@ struct mem_cgroup { struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif - /* List of events which userspace want to receive */ - struct list_head event_list; - spinlock_t event_list_lock; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif @@ -319,6 +264,56 @@ struct mem_cgroup { struct lru_gen_mm_list mm_list; #endif +#ifdef CONFIG_MEMCG_V1 + /* Legacy consumer-oriented counters */ + struct page_counter kmem; /* v1 only */ + struct page_counter tcpmem; /* v1 only */ + + unsigned long soft_limit; + + /* protected by memcg_oom_lock */ + bool oom_lock; + int under_oom; + + /* OOM-Killer disable */ + int oom_kill_disable; + + /* protect arrays of thresholds */ + struct mutex thresholds_lock; + + /* thresholds for memory usage. RCU-protected */ + struct mem_cgroup_thresholds thresholds; + + /* thresholds for mem+swap usage. RCU-protected */ + struct mem_cgroup_thresholds memsw_thresholds; + + /* For oom notifier event fd */ + struct list_head oom_notify; + + /* + * Should we move charges of a task when a task is moved into this + * mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + unsigned long move_lock_flags; + + /* Legacy tcp memory accounting */ + bool tcpmem_active; + int tcpmem_pressure; + + /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + struct task_struct *move_lock_task; + + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; +#endif /* CONFIG_MEMCG_V1 */ + struct mem_cgroup_per_node *nodeinfo[]; }; @@ -443,11 +438,6 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return __folio_memcg(folio); } -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return folio_memcg(page_folio(page)); -} - /** * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. * @folio: Pointer to the folio. @@ -540,7 +530,6 @@ retry: return memcg; } -#ifdef CONFIG_MEMCG_KMEM /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. 
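With page_memcg() removed above, a caller does the page-to-folio conversion itself. A minimal sketch, not taken from the patch (the function name is made up; only page_folio() and folio_memcg() are assumed):

/*
 * Illustrative only: equivalent of the removed page_memcg() helper,
 * open-coded at the call site.
 */
static inline struct mem_cgroup *example_page_memcg(struct page *page)
{
	struct folio *folio = page_folio(page);	/* page -> containing folio */

	return folio_memcg(folio);	/* NULL if the folio is not charged */
}
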
@@ -556,15 +545,6 @@ static inline bool folio_memcg_kmem(struct folio *folio) return folio->memcg_data & MEMCG_DATA_KMEM; } - -#else -static inline bool folio_memcg_kmem(struct folio *folio) -{ - return false; -} - -#endif - static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); @@ -949,51 +929,13 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); -static inline void mem_cgroup_enter_user_fault(void) -{ - WARN_ON(current->in_user_fault); - current->in_user_fault = 1; -} - -static inline void mem_cgroup_exit_user_fault(void) -{ - WARN_ON(!current->in_user_fault); - current->in_user_fault = 0; -} - -static inline bool task_in_memcg_oom(struct task_struct *p) -{ - return p->memcg_in_oom; -} - -bool mem_cgroup_oom_synchronize(bool wait); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); -void folio_memcg_lock(struct folio *folio); -void folio_memcg_unlock(struct folio *folio); - void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val); -/* try to stablize folio_memcg() for all the pages in a memcg */ -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - rcu_read_lock(); - - if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) - return true; - - rcu_read_unlock(); - return false; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) @@ -1014,7 +956,7 @@ static inline void mod_memcg_page_state(struct page *page, return; rcu_read_lock(); - memcg = page_memcg(page); + memcg = folio_memcg(page_folio(page)); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); @@ -1120,10 +1062,6 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, int old_order, int new_order); -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned); - #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1133,11 +1071,6 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return NULL; } -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return NULL; -} - static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { WARN_ON_ONCE(!rcu_read_lock_held()); @@ -1439,48 +1372,10 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } -static inline void folio_memcg_lock(struct folio *folio) -{ -} - -static inline void folio_memcg_unlock(struct folio *folio) -{ -} - -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - /* to match folio_memcg_rcu() */ - rcu_read_lock(); - return true; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } -static inline void mem_cgroup_enter_user_fault(void) -{ -} - -static inline void mem_cgroup_exit_user_fault(void) -{ -} - -static inline bool task_in_memcg_oom(struct task_struct *p) -{ - return false; -} - -static inline bool mem_cgroup_oom_synchronize(bool wait) -{ - return false; -} - static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { @@ -1574,14 +1469,6 @@ 
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } - -static inline -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - return 0; -} #endif /* CONFIG_MEMCG */ /* @@ -1589,7 +1476,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, * if MEMCG_DATA_OBJEXTS is set. */ struct slabobj_ext { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -1636,7 +1523,7 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, spin_unlock_irqrestore(&lruvec->lru_lock, flags); } -/* Test requires a stable page->memcg binding, see page_memcg() */ +/* Test requires a stable folio->memcg binding, see folio_memcg() */ static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec) { @@ -1734,8 +1621,10 @@ void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { +#ifdef CONFIG_MEMCG_V1 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return !!memcg->tcpmem_pressure; +#endif /* CONFIG_MEMCG_V1 */ do { if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; @@ -1762,7 +1651,7 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, } #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); @@ -1905,9 +1794,9 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); @@ -1932,4 +1821,100 @@ static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) } #endif + +/* Cgroup v1-related declarations */ + +#ifdef CONFIG_MEMCG_V1 +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); + +bool mem_cgroup_oom_synchronize(bool wait); + +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return p->memcg_in_oom; +} + +void folio_memcg_lock(struct folio *folio); +void folio_memcg_unlock(struct folio *folio); + +/* try to stablize folio_memcg() for all the pages in a memcg */ +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + rcu_read_lock(); + + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) + return true; + + rcu_read_unlock(); + return false; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + +static inline void mem_cgroup_enter_user_fault(void) +{ + WARN_ON(current->in_user_fault); + current->in_user_fault = 1; +} + +static inline void mem_cgroup_exit_user_fault(void) +{ + WARN_ON(!current->in_user_fault); + current->in_user_fault = 0; +} + +#else /* CONFIG_MEMCG_V1 */ +static inline +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + return 0; +} + +static inline void folio_memcg_lock(struct folio *folio) +{ +} 
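A hedged sketch of how the !CONFIG_MEMCG_V1 stubs above are meant to be used: callers invoke the v1-only helpers unconditionally and, with v1 disabled, the calls resolve to these no-ops. The caller below is hypothetical, assuming only the memcg1_soft_limit_reclaim() prototype shown earlier:

/*
 * Hypothetical caller: with CONFIG_MEMCG_V1=n this compiles against the
 * inline stub and reclaims nothing; no #ifdef is needed at the call site.
 */
static unsigned long example_soft_limit_pass(pg_data_t *pgdat, int order,
					     gfp_t gfp_mask)
{
	unsigned long total_scanned = 0;

	return memcg1_soft_limit_reclaim(pgdat, order, gfp_mask,
					 &total_scanned);
}
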
+ +static inline void folio_memcg_unlock(struct folio *folio) +{ +} + +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + /* to match folio_memcg_rcu() */ + rcu_read_lock(); + return true; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return false; +} + +static inline bool mem_cgroup_oom_synchronize(bool wait) +{ + return false; +} + +static inline void mem_cgroup_enter_user_fault(void) +{ +} + +static inline void mem_cgroup_exit_user_fault(void) +{ +} + +#endif /* CONFIG_MEMCG_V1 */ + #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/memfd.h b/include/linux/memfd.h index e7abf6fa4c52..3f2cf339ceaf 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -6,11 +6,16 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); +struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { return -EINVAL; } +static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) +{ + return ERR_PTR(-EINVAL); +} #endif #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 0d70788558f4..0dc0cf2863e2 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -38,6 +38,7 @@ struct access_coordinate; #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; extern struct memory_dev_type *default_dram_type; +extern nodemask_t default_dram_nodes; struct memory_dev_type *alloc_memory_type(int adistance); void put_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); @@ -76,6 +77,7 @@ static inline bool node_is_toptier(int node) #define numa_demotion_enabled false #define default_dram_type NULL +#define default_dram_nodes NODE_MASK_NONE /* * CONFIG_NUMA implementation returns non NULL error. 
*/ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7a9ff464608d..ebe876930e78 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -175,8 +175,8 @@ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); -extern void __offline_isolated_pages(unsigned long start_pfn, - unsigned long end_pfn); +extern unsigned long __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 2ce13e8a309b..644be30b69c8 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -63,8 +63,6 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION void putback_movable_pages(struct list_head *l); -int migrate_folio_extra(struct address_space *mapping, struct folio *dst, - struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free, @@ -78,7 +76,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); -void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); @@ -142,9 +139,16 @@ const struct movable_operations *page_movable_ops(struct page *page) } #ifdef CONFIG_NUMA_BALANCING +int migrate_misplaced_folio_prepare(struct folio *folio, + struct vm_area_struct *vma, int node); int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int node); #else +static inline int migrate_misplaced_folio_prepare(struct folio *folio, + struct vm_area_struct *vma, int node) +{ + return -EAGAIN; /* can't migrate now */ +} static inline int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int node) { diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index f37cc03f9369..265c4328b36a 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -7,16 +7,11 @@ * on most operations but not ->writepage as the potential stall time * is too significant * MIGRATE_SYNC will block when migrating pages - * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages - * with the CPU. Instead, page copy happens outside the migratepage() - * callback and is likely using a DMA engine. See migrate_vma() and HMM - * (mm/hmm.c) for users of this mode. */ enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC, - MIGRATE_SYNC_NO_COPY, }; enum migrate_reason { @@ -29,6 +24,7 @@ enum migrate_reason { MR_CONTIG_RANGE, MR_LONGTERM_PIN, MR_DEMOTION, + MR_DAMON, MR_TYPES }; diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index d52daf45861b..43a7b9dcf15e 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -7,46 +7,89 @@ #include <linux/types.h> /** - * struct min_heap - Data structure to hold a min-heap. 
- * @data: Start of array holding the heap elements. + * Data structure to hold a min-heap. * @nr: Number of elements currently in the heap. * @size: Maximum number of elements that can be held in current storage. + * @data: Pointer to the start of array holding the heap elements. + * @preallocated: Start of the static preallocated array holding the heap elements. */ -struct min_heap { - void *data; - int nr; - int size; -}; +#define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ +struct _name { \ + int nr; \ + int size; \ + _type *data; \ + _type preallocated[_nr]; \ +} + +#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0) + +typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char; + +#define __minheap_cast(_heap) (typeof((_heap)->data[0]) *) +#define __minheap_obj_size(_heap) sizeof((_heap)->data[0]) /** * struct min_heap_callbacks - Data/functions to customise the min_heap. - * @elem_size: The nr of each element in bytes. * @less: Partial order function for this heap. * @swp: Swap elements function. */ struct min_heap_callbacks { - int elem_size; - bool (*less)(const void *lhs, const void *rhs); - void (*swp)(void *lhs, void *rhs); + bool (*less)(const void *lhs, const void *rhs, void *args); + void (*swp)(void *lhs, void *rhs, void *args); }; +/* Initialize a min-heap. */ +static __always_inline +void __min_heap_init(min_heap_char *heap, void *data, int size) +{ + heap->nr = 0; + heap->size = size; + if (data) + heap->data = data; + else + heap->data = heap->preallocated; +} + +#define min_heap_init(_heap, _data, _size) \ + __min_heap_init((min_heap_char *)_heap, _data, _size) + +/* Get the minimum element from the heap. */ +static __always_inline +void *__min_heap_peek(struct min_heap_char *heap) +{ + return heap->nr ? heap->data : NULL; +} + +#define min_heap_peek(_heap) \ + (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) + +/* Check if the heap is full. */ +static __always_inline +bool __min_heap_full(min_heap_char *heap) +{ + return heap->nr == heap->size; +} + +#define min_heap_full(_heap) \ + __min_heap_full((min_heap_char *)_heap) + /* Sift the element at pos down the heap. */ static __always_inline -void min_heapify(struct min_heap *heap, int pos, - const struct min_heap_callbacks *func) +void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *left, *right; void *data = heap->data; - void *root = data + pos * func->elem_size; + void *root = data + pos * elem_size; int i = pos, j; /* Find the sift-down path all the way to the leaves. */ for (;;) { if (i * 2 + 2 >= heap->nr) break; - left = data + (i * 2 + 1) * func->elem_size; - right = data + (i * 2 + 2) * func->elem_size; - i = func->less(left, right) ? i * 2 + 1 : i * 2 + 2; + left = data + (i * 2 + 1) * elem_size; + right = data + (i * 2 + 2) * elem_size; + i = func->less(left, right, args) ? i * 2 + 1 : i * 2 + 2; } /* Special case for the last leaf with no sibling. */ @@ -54,83 +97,140 @@ void min_heapify(struct min_heap *heap, int pos, i = i * 2 + 1; /* Backtrack to the correct location. */ - while (i != pos && func->less(root, data + i * func->elem_size)) + while (i != pos && func->less(root, data + i * elem_size, args)) i = (i - 1) / 2; /* Shift the element into its correct place. 
*/ j = i; while (i != pos) { i = (i - 1) / 2; - func->swp(data + i * func->elem_size, data + j * func->elem_size); + func->swp(data + i * elem_size, data + j * elem_size, args); + } +} + +#define min_heap_sift_down(_heap, _pos, _func, _args) \ + __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) + +/* Sift up ith element from the heap, O(log2(nr)). */ +static __always_inline +void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + size_t parent; + + while (idx) { + parent = (idx - 1) / 2; + if (func->less(data + parent * elem_size, data + idx * elem_size, args)) + break; + func->swp(data + parent * elem_size, data + idx * elem_size, args); + idx = parent; } } +#define min_heap_sift_up(_heap, _idx, _func, _args) \ + __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + /* Floyd's approach to heapification that is O(nr). */ static __always_inline -void min_heapify_all(struct min_heap *heap, - const struct min_heap_callbacks *func) +void __min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { int i; for (i = heap->nr / 2 - 1; i >= 0; i--) - min_heapify(heap, i, func); + __min_heap_sift_down(heap, i, elem_size, func, args); } +#define min_heapify_all(_heap, _func, _args) \ + __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline -void min_heap_pop(struct min_heap *heap, - const struct min_heap_callbacks *func) +bool __min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) - return; + return false; /* Place last element at the root (position 0) and then sift down. */ heap->nr--; - memcpy(data, data + (heap->nr * func->elem_size), func->elem_size); - min_heapify(heap, 0, func); + memcpy(data, data + (heap->nr * elem_size), elem_size); + __min_heap_sift_down(heap, 0, elem_size, func, args); + + return true; } +#define min_heap_pop(_heap, _func, _args) \ + __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + /* * Remove the minimum element and then push the given element. The * implementation performs 1 sift (O(log2(nr))) and is therefore more * efficient than a pop followed by a push that does 2. */ static __always_inline -void min_heap_pop_push(struct min_heap *heap, - const void *element, - const struct min_heap_callbacks *func) +void __min_heap_pop_push(min_heap_char *heap, + const void *element, size_t elem_size, + const struct min_heap_callbacks *func, + void *args) { - memcpy(heap->data, element, func->elem_size); - min_heapify(heap, 0, func); + memcpy(heap->data, element, elem_size); + __min_heap_sift_down(heap, 0, elem_size, func, args); } +#define min_heap_pop_push(_heap, _element, _func, _args) \ + __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) + /* Push an element on to the heap, O(log2(nr)). 
*/ static __always_inline -void min_heap_push(struct min_heap *heap, const void *element, - const struct min_heap_callbacks *func) +bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; - void *child, *parent; int pos; if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap")) - return; + return false; /* Place at the end of data. */ pos = heap->nr; - memcpy(data + (pos * func->elem_size), element, func->elem_size); + memcpy(data + (pos * elem_size), element, elem_size); heap->nr++; /* Sift child at pos up. */ - for (; pos > 0; pos = (pos - 1) / 2) { - child = data + (pos * func->elem_size); - parent = data + ((pos - 1) / 2) * func->elem_size; - if (func->less(parent, child)) - break; - func->swp(parent, child); - } + __min_heap_sift_up(heap, elem_size, pos, func, args); + + return true; } +#define min_heap_push(_heap, _element, _func, _args) \ + __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) + +/* Remove ith element from the heap, O(log2(nr)). */ +static __always_inline +bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + + if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) + return false; + + /* Place last element at the root (position 0) and then sift down. */ + heap->nr--; + if (idx == heap->nr) + return true; + func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); + __min_heap_sift_up(heap, elem_size, idx, func, args); + __min_heap_sift_down(heap, idx, elem_size, func, args); + + return true; +} + +#define min_heap_del(_heap, _idx, _func, _args) \ + __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + #endif /* _LINUX_MIN_HEAP_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index ab3d78116043..7d044e737dba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1202,8 +1202,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for - * debugging purposes - it does not include PTE-mapped sub-pages; look - * at folio_mapcount() or page_mapcount() instead. + * debugging purposes or implementation of other core folio_*() primitives. */ static inline int folio_entire_mapcount(const struct folio *folio) { @@ -1211,40 +1210,6 @@ static inline int folio_entire_mapcount(const struct folio *folio) return atomic_read(&folio->_entire_mapcount) + 1; } -/* - * The atomic page->_mapcount, starts from -1: so that transitions - * both from it and to it can be tracked, using atomic_inc_and_test - * and atomic_add_negative(-1). - */ -static inline void page_mapcount_reset(struct page *page) -{ - atomic_set(&(page)->_mapcount, -1); -} - -/** - * page_mapcount() - Number of times this precise page is mapped. - * @page: The page. - * - * The number of times this page is mapped. If this page is part of - * a large folio, it includes the number of times this page is mapped - * as part of that folio. - * - * Will report 0 for pages which cannot be mapped into userspace, eg - * slab, page tables and similar. 
- */ -static inline int page_mapcount(struct page *page) -{ - int mapcount = atomic_read(&page->_mapcount) + 1; - - /* Handle page_has_type() pages */ - if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) - mapcount = 0; - if (unlikely(PageCompound(page))) - mapcount += folio_entire_mapcount(page_folio(page)); - - return mapcount; -} - static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); @@ -1326,6 +1291,7 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); +int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); @@ -1612,17 +1578,19 @@ static inline void put_page(struct page *page) * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() - * provides safe operation for get_user_pages(), page_mkclean() and + * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); +void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); +void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) { @@ -1953,8 +1921,8 @@ static inline struct folio *pfn_folio(unsigned long pfn) * * For more information, please see Documentation/core-api/pin_user_pages.rst. * - * Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. + * Return: True, if it is likely that the folio has been "dma-pinned". + * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { @@ -1973,11 +1941,6 @@ static inline bool folio_maybe_dma_pinned(struct folio *folio) GUP_PIN_COUNTING_BIAS; } -static inline bool page_maybe_dma_pinned(struct page *page) -{ - return folio_maybe_dma_pinned(page_folio(page)); -} - /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. @@ -2295,19 +2258,6 @@ static inline void *folio_address(const struct folio *folio) return page_address(&folio->page); } -extern pgoff_t __page_file_index(struct page *page); - -/* - * Return the pagecache index of the passed page. 
Regular pagecache pages - * use ->index whereas swapcache pages use swp_offset(->private) - */ -static inline pgoff_t page_index(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_index(page); - return page->index; -} - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not @@ -2550,6 +2500,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, + struct folio **folios, unsigned int max_folios, + pgoff_t *offset); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); @@ -4038,7 +3991,6 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); -struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { @@ -4059,12 +4011,6 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i) } #endif -#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM) -void add_to_kill_ksm(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr); -#endif - #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); @@ -4105,10 +4051,10 @@ enum mf_result { enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, - MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, + MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, @@ -4122,13 +4068,12 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, + MF_MSG_ALREADY_POISONED, MF_MSG_UNKNOWN, }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) -extern void clear_huge_page(struct page *page, - unsigned long addr_hint, - unsigned int pages_per_huge_page); +void folio_zero_user(struct folio *folio, unsigned long addr_hint); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a199c48bc462..485424979254 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -46,9 +46,7 @@ struct mem_cgroup; * which is guaranteed to be aligned. If you use the same storage as * page->mapping, you must restore it to NULL before freeing the page. * - * If your page will not be mapped to userspace, you can also use the four - * bytes in the mapcount union, but you must call page_mapcount_reset() - * before freeing it. + * The mapcount field must not be used for own purposes. * * If you want to use the refcount field, it must be used in such a way * that other CPUs temporarily incrementing and then decrementing the @@ -152,18 +150,31 @@ struct page { union { /* This union is 4 bytes in size. */ /* - * If the page can be mapped to userspace, encodes the number - * of times this page is referenced by a page table. 
+ * For head pages of typed folios, the value stored here + * allows for determining what this page is used for. The + * tail pages of typed folios will not store a type + * (page_type == _mapcount == -1). + * + * See page-flags.h for a list of page types which are currently + * stored here. + * + * Owners of typed folios may reuse the lower 16 bit of the + * head page page_type field after setting the page type, + * but must reset these 16 bit to -1 before clearing the + * page type. */ - atomic_t _mapcount; + unsigned int page_type; /* - * If the page is neither PageSlab nor mappable to userspace, - * the value stored here may help determine what this page - * is used for. See page-flags.h for a list of page types - * which are currently stored here. + * For pages that are part of non-typed folios for which mappings + * are tracked via the RMAP, encodes the number of times this page + * is directly referenced by a page table. + * + * Note that the mapcount is always initialized to -1, so that + * transitions both from it and to it can be tracked, using + * atomic_inc_and_test() and atomic_add_negative(-1). */ - unsigned int page_type; + atomic_t _mapcount; }; /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1dc6248feb83..41458892bc8a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,6 +220,8 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, + NR_MEMMAP, /* page metadata allocated through buddy allocator */ + NR_MEMMAP_BOOT, /* page metadata allocated through boot allocator */ NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/msi.h b/include/linux/msi.h index dc27cf3903d5..944979763825 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -19,13 +19,9 @@ */ #include <linux/irqdomain_defs.h> -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/msi_api.h> -#include <linux/xarray.h> -#include <linux/mutex.h> -#include <linux/list.h> #include <linux/irq.h> -#include <linux/bits.h> #include <asm/msi.h> @@ -81,7 +77,6 @@ extern int pci_msi_ignore_mask; /* Helper functions */ struct msi_desc; struct pci_dev; -struct platform_msi_priv_data; struct device_attribute; struct irq_domain; struct irq_affinity_desc; @@ -228,22 +223,6 @@ struct msi_dev_domain { struct irq_domain *domain; }; -/** - * msi_device_data - MSI per device data - * @properties: MSI properties which are interesting to drivers - * @platform_data: Platform-MSI specific data - * @mutex: Mutex protecting the MSI descriptor store - * @__domains: Internal data for per device MSI domains - * @__iter_idx: Index to search the next entry for iterators - */ -struct msi_device_data { - unsigned long properties; - struct platform_msi_priv_data *platform_data; - struct mutex mutex; - struct msi_dev_domain __domains[MSI_MAX_DEVICE_IRQDOMAINS]; - unsigned long __iter_idx; -}; - int msi_setup_device_data(struct device *dev); void msi_lock_descs(struct device *dev); @@ -556,6 +535,8 @@ enum { MSI_FLAG_USE_DEV_FWNODE = (1 << 7), /* Set parent->dev into domain->pm_dev on device domain creation */ MSI_FLAG_PARENT_PM_DEV = (1 << 8), + /* Support for parent mask/unmask */ + MSI_FLAG_PCI_MSI_MASK_PARENT = (1 << 9), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), @@ -639,35 +620,6 @@ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid); struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); -struct irq_domain 
*platform_msi_create_irq_domain(struct fwnode_handle *fwnode, - struct msi_domain_info *info, - struct irq_domain *parent); - -/* When an MSI domain is used as an intermediate domain */ -int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, - int nvec, msi_alloc_info_t *args); -int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, - int virq, int nvec, msi_alloc_info_t *args); -void msi_domain_depopulate_descs(struct device *dev, int virq, int nvec); - -struct irq_domain * -__platform_msi_create_device_domain(struct device *dev, - unsigned int nvec, - bool is_tree, - irq_write_msi_msg_t write_msi_msg, - const struct irq_domain_ops *ops, - void *host_data); - -#define platform_msi_create_device_domain(dev, nvec, write, ops, data) \ - __platform_msi_create_device_domain(dev, nvec, false, write, ops, data) -#define platform_msi_create_device_tree_domain(dev, nvec, write, ops, data) \ - __platform_msi_create_device_domain(dev, nvec, true, write, ops, data) - -int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs); -void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nvec); -void *platform_msi_get_host_data(struct irq_domain *domain); /* Per device platform MSI */ int platform_device_msi_init_and_alloc_irqs(struct device *dev, unsigned int nvec, irq_write_msi_msg_t write_msi_msg); diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 947410faf9e2..35ca19ae21ae 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -308,32 +308,32 @@ static inline uint8_t cfi_read_query(struct map_info *map, uint32_t addr) { map_word val = map_read(map, addr); - if (map_bankwidth_is_1(map)) { + if (map_bankwidth_is_1(map)) return val.x[0]; - } else if (map_bankwidth_is_2(map)) { + if (map_bankwidth_is_2(map)) return cfi16_to_cpu(map, val.x[0]); - } else { - /* No point in a 64-bit byteswap since that would just be - swapping the responses from different chips, and we are - only interested in one chip (a representative sample) */ - return cfi32_to_cpu(map, val.x[0]); - } + /* + * No point in a 64-bit byteswap since that would just be + * swapping the responses from different chips, and we are + * only interested in one chip (a representative sample) + */ + return cfi32_to_cpu(map, val.x[0]); } static inline uint16_t cfi_read_query16(struct map_info *map, uint32_t addr) { map_word val = map_read(map, addr); - if (map_bankwidth_is_1(map)) { + if (map_bankwidth_is_1(map)) return val.x[0] & 0xff; - } else if (map_bankwidth_is_2(map)) { + if (map_bankwidth_is_2(map)) return cfi16_to_cpu(map, val.x[0]); - } else { - /* No point in a 64-bit byteswap since that would just be - swapping the responses from different chips, and we are - only interested in one chip (a representative sample) */ - return cfi32_to_cpu(map, val.x[0]); - } + /* + * No point in a 64-bit byteswap since that would just be + * swapping the responses from different chips, and we are + * only interested in one chip (a representative sample) + */ + return cfi32_to_cpu(map, val.x[0]); } void cfi_udelay(int us); diff --git a/include/linux/node.h b/include/linux/node.h index dfc004e4bee7..9a881c2208b3 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -16,7 +16,6 @@ #define _LINUX_NODE_H_ #include <linux/device.h> -#include <linux/cpumask.h> #include <linux/list.h> /** diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 
89ea1ebd975a..9f6acadfe0c8 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -620,7 +620,7 @@ enum { * * Structure used between LLDD and nvmet-fc layer to represent the exchange * context for a FC-NVME FCP I/O operation (e.g. a nvme sqe, the sqe-related - * memory transfers, and its assocated cqe transfer). + * memory transfers, and its associated cqe transfer). * * The structure is allocated by the LLDD whenever a FCP CMD IU is received * from the FC link. The address of the structure is passed to the nvmet-fc diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b9e914e1face..5769fe6e4950 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -30,16 +30,11 @@ * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) - * - Pages not added to the page allocator when onlining a section because - * they were excluded via the online_page_callback() or because they are - * PG_hwpoison. * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). - * - Pages part of an offline section (struct pages of offline sections should - * not be trusted as they will be initialized when first onlined). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. PMEM, DAX, HMM) @@ -616,11 +611,6 @@ PAGEFLAG_FALSE(Uncached, uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) -#define MAGIC_HWPOISON 0x48575053U /* HWPS */ -extern void SetPageHWPoisonTakenOff(struct page *page); -extern void ClearPageHWPoisonTakenOff(struct page *page); -extern bool take_page_off_buddy(struct page *page); -extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 @@ -655,27 +645,28 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* - * On an anonymous page mapped into a user virtual memory area, - * page->mapping points to its anon_vma, not to a struct address_space; + * On an anonymous folio mapped into a user virtual memory area, + * folio->mapping points to its anon_vma, not to a struct address_space; * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON - * bit; and then page->mapping points, not to an anon_vma, but to a private + * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable - * page and then page->mapping points to a struct movable_operations. + * page and then folio->mapping points to a struct movable_operations. * - * Please note that, confusingly, "page_mapping" refers to the inode - * address_space which maps the page from disk; whereas "page_mapped" - * refers to user virtual address space into which the page is mapped. 
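The rewritten comment above (together with the PAGE_MAPPING_* definitions that follow) describes how the low bits of folio->mapping are overloaded. The helpers below are an editor's illustration of those bit tests; they mirror, but are not copied from, the real folio_test_anon()/folio_test_ksm() definitions, and PAGE_MAPPING_FLAGS is the existing mask from this header.

/* Editor's illustration of the mapping-pointer bit tests described above. */
static inline bool example_folio_is_anon(const struct folio *folio)
{
	return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
}

static inline bool example_folio_is_ksm(const struct folio *folio)
{
	/* KSM folios carry both the ANON and MOVABLE bits. */
	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
	       (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE);
}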
+ * Please note that, confusingly, "folio_mapping" refers to the inode + * address_space which maps the folio from disk; whereas "folio_mapped" + * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its - * internal states, the page->mapping does not exist as such, nor do these - * flags below. So in order to avoid testing non-existent bits, please - * make sure that PageSlab(page) actually evaluates to false before calling - * the following functions (e.g., PageAnon). See mm/slab.h. + * internal states, the folio->mapping does not exist as such, nor do + * these flags below. So in order to avoid testing non-existent bits, + * please make sure that folio_test_slab(folio) actually evaluates to + * false before calling the following functions (e.g., folio_test_anon). + * See mm/slab.h. */ #define PAGE_MAPPING_ANON 0x1 #define PAGE_MAPPING_MOVABLE 0x2 @@ -945,22 +936,28 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) */ enum pagetype { - PG_buddy = 0x00000080, - PG_offline = 0x00000100, - PG_table = 0x00000200, - PG_guard = 0x00000400, - PG_hugetlb = 0x00000800, - PG_slab = 0x00001000, - - PAGE_TYPE_BASE = 0xf0000000, - /* Reserve 0x0000007f to catch underflows of _mapcount */ - PAGE_MAPCOUNT_RESERVE = -128, + PG_buddy = 0x40000000, + PG_offline = 0x20000000, + PG_table = 0x10000000, + PG_guard = 0x08000000, + PG_hugetlb = 0x04000000, + PG_slab = 0x02000000, + PG_zsmalloc = 0x01000000, + + PAGE_TYPE_BASE = 0x80000000, + + /* + * Reserve 0xffff0000 - 0xfffffffe to catch _mapcount underflows and + * allow owners that set a type to reuse the lower 16 bit for their own + * purposes. + */ + PAGE_MAPCOUNT_RESERVE = ~0x0000ffff, }; #define PageType(page, flag) \ - ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + ((READ_ONCE(page->page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) #define folio_test_type(folio, flag) \ - ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + ((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) static inline int page_type_has_type(unsigned int page_type) { @@ -969,7 +966,7 @@ static inline int page_type_has_type(unsigned int page_type) static inline int page_has_type(const struct page *page) { - return page_type_has_type(page->page_type); + return page_type_has_type(READ_ONCE(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ @@ -1018,15 +1015,22 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy) * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * + * When a memory block gets onlined, all pages are initialized with a + * refcount of 1 and PageOffline(). generic_online_page() will + * take care of clearing PageOffline(). + * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() - * pages (now with a reference count of zero) are treated like free pages, - * allowing the containing memory block to get offlined. A driver that + * pages (now with a reference count of zero) are treated like free (unmanaged) + * pages, allowing the containing memory block to get offlined. 
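Stepping back to the renumbered enum pagetype above: page_type starts out as all-ones (_mapcount == -1) and, assuming the set/clear helpers keep their existing convention of clearing a bit to set a type, the new PageType() test works out as in this editor's sketch.

static inline void example_page_type_arithmetic(void)
{
	unsigned int page_type = -1;	/* fresh page: _mapcount == -1 */

	page_type &= ~PG_buddy;		/* set PG_buddy: 0xffffffff -> 0xbfffffff */
	/*
	 * PageType(page, PG_buddy) now evaluates
	 *   (0xbfffffff & (PAGE_TYPE_BASE | PG_buddy))
	 *   == (0xbfffffff & 0xc0000000) == 0x80000000 == PAGE_TYPE_BASE -> true
	 * Only the high bits are inspected, so an owner may keep a 16-bit
	 * value in the low bits (e.g. 0xbfff0005) without breaking the test,
	 * provided it restores them to all-ones before clearing the type.
	 */
	page_type |= PG_buddy;		/* clear the type: back to 0xffffffff */
	(void)page_type;
}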
A driver that * relies on this feature is aware that re-onlining the memory block will - * require to re-set the pages PageOffline() and not giving them to the - * buddy via online_page_callback_t. + * require not giving them to the buddy via generic_online_page(). + * + * Memory offlining code will not adjust the managed page count for any + * PageOffline() pages, treating them like they were never exposed to the + * buddy using generic_online_page(). * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random @@ -1070,6 +1074,8 @@ FOLIO_TYPE_OPS(hugetlb, hugetlb) FOLIO_TEST_FLAG_FALSE(hugetlb) #endif +PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 8cd858d912c4..904c52f97284 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -81,4 +81,8 @@ static inline void page_counter_reset_watermark(struct page_counter *counter) counter->watermark = page_counter_read(counter); } +void page_counter_calculate_protection(struct page_counter *root, + struct page_counter *counter, + bool recursive_protection); + #endif /* _LINUX_PAGE_COUNTER_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a0a026d2d244..483a191bb4df 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -208,7 +208,8 @@ enum mapping_flags { AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ AS_STABLE_WRITES, /* must wait for writeback before modifying folio contents */ - AS_UNMOVABLE, /* The mapping cannot be moved, ever */ + AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping, + including to move the mapping */ }; /** @@ -309,20 +310,20 @@ static inline void mapping_clear_stable_writes(struct address_space *mapping) clear_bit(AS_STABLE_WRITES, &mapping->flags); } -static inline void mapping_set_unmovable(struct address_space *mapping) +static inline void mapping_set_inaccessible(struct address_space *mapping) { /* - * It's expected unmovable mappings are also unevictable. Compaction + * It's expected inaccessible mappings are also unevictable. Compaction * migrate scanner (isolate_migratepages_block()) relies on this to * reduce page locking. */ set_bit(AS_UNEVICTABLE, &mapping->flags); - set_bit(AS_UNMOVABLE, &mapping->flags); + set_bit(AS_INACCESSIBLE, &mapping->flags); } -static inline bool mapping_unmovable(struct address_space *mapping) +static inline bool mapping_inaccessible(struct address_space *mapping) { - return test_bit(AS_UNMOVABLE, &mapping->flags); + return test_bit(AS_INACCESSIBLE, &mapping->flags); } static inline gfp_t mapping_gfp_mask(struct address_space * mapping) @@ -433,7 +434,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -struct address_space *page_mapping(struct page *); struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); @@ -799,7 +799,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -#define swapcache_index(folio) __page_file_index(&(folio)->page) +extern pgoff_t __folio_swap_cache_index(struct folio *folio); /** * folio_index - File index of a folio. 
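The AS_UNMOVABLE to AS_INACCESSIBLE rename a few hunks above comes with the mapping_set_inaccessible() helper; the sketch below shows how a backing store whose folios must never be read, written or migrated directly (guest-private memory is the kind of user this targets) would flag its mapping at inode-setup time. The function name and the GFP mask are the editor's assumptions.

/* Editor's sketch: marking a mapping that must never be touched directly. */
static void example_init_inaccessible_mapping(struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;

	mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
	/* Sets AS_INACCESSIBLE and, per the helper's comment, AS_UNEVICTABLE too. */
	mapping_set_inaccessible(mapping);
}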
@@ -814,9 +814,9 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, */ static inline pgoff_t folio_index(struct folio *folio) { - if (unlikely(folio_test_swapcache(folio))) - return swapcache_index(folio); - return folio->index; + if (unlikely(folio_test_swapcache(folio))) + return __folio_swap_cache_index(folio); + return folio->index; } /** @@ -939,11 +939,6 @@ static inline loff_t page_offset(struct page *page) return ((loff_t)page->index) << PAGE_SHIFT; } -static inline loff_t page_file_offset(struct page *page) -{ - return ((loff_t)page_index(page)) << PAGE_SHIFT; -} - /** * folio_pos - Returns the byte position of this folio in its file. * @folio: The folio. @@ -953,18 +948,6 @@ static inline loff_t folio_pos(struct folio *folio) return page_offset(&folio->page); } -/** - * folio_file_pos - Returns the byte position of this folio in its file. - * @folio: The folio. - * - * This differs from folio_pos() for folios which belong to a swap file. - * NFS is the only filesystem today which needs to use folio_file_pos(). - */ -static inline loff_t folio_file_pos(struct folio *folio) -{ - return page_file_offset(&folio->page); -} - /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ @@ -1318,8 +1301,7 @@ void page_cache_sync_readahead(struct address_space *mapping, * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. - * @folio: The folio at @index which triggered the readahead call. - * @index: Index of first page to be read. + * @folio: The folio which triggered the readahead call. * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which @@ -1330,9 +1312,9 @@ void page_cache_sync_readahead(struct address_space *mapping, static inline void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, - struct folio *folio, pgoff_t index, unsigned long req_count) + struct folio *folio, unsigned long req_count) { - DEFINE_READAHEAD(ractl, file, ra, mapping, index); + DEFINE_READAHEAD(ractl, file, ra, mapping, folio->index); page_cache_async_ra(&ractl, folio, req_count); } diff --git a/include/linux/panic.h b/include/linux/panic.h index 6717b15e798c..3130e0b5116b 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -77,9 +77,10 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout) #define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1) struct taint_flag { - char c_true; /* character printed when tainted */ - char c_false; /* character printed when not tainted */ - bool module; /* also show as a per-module taint flag */ + char c_true; /* character printed when tainted */ + char c_false; /* character printed when not tainted */ + bool module; /* also show as a per-module taint flag */ + const char *desc; /* verbose description of the set taint flag */ }; extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT]; @@ -90,6 +91,7 @@ enum lockdep_ok { }; extern const char *print_tainted(void); +extern const char *print_tainted_verbose(void); extern void add_taint(unsigned flag, enum lockdep_ok); extern int test_taint(unsigned flag); extern unsigned long get_taint(void); diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index acc5f96161fe..85bdf2adb760 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -197,6 +197,8 @@ struct 
pci_epc_features { #define to_pci_epc(device) container_of((device), struct pci_epc, dev) +#ifdef CONFIG_PCI_ENDPOINT + #define pci_epc_create(dev, ops) \ __pci_epc_create((dev), (ops), THIS_MODULE) #define devm_pci_epc_create(dev, ops) \ @@ -226,7 +228,8 @@ void pci_epc_linkup(struct pci_epc *epc); void pci_epc_linkdown(struct pci_epc *epc); void pci_epc_init_notify(struct pci_epc *epc); void pci_epc_notify_pending_init(struct pci_epc *epc, struct pci_epf *epf); -void pci_epc_bme_notify(struct pci_epc *epc); +void pci_epc_deinit_notify(struct pci_epc *epc); +void pci_epc_bus_master_enable_notify(struct pci_epc *epc); void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf, enum pci_epc_interface_type type); int pci_epc_write_header(struct pci_epc *epc, u8 func_no, u8 vfunc_no, @@ -272,4 +275,14 @@ void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, phys_addr_t *phys_addr, size_t size); void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr, void __iomem *virt_addr, size_t size); + +#else +static inline void pci_epc_init_notify(struct pci_epc *epc) +{ +} + +static inline void pci_epc_deinit_notify(struct pci_epc *epc) +{ +} +#endif /* CONFIG_PCI_ENDPOINT */ #endif /* __LINUX_PCI_EPC_H */ diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index adee6a1b35db..0639d4dc8986 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -70,16 +70,18 @@ struct pci_epf_ops { /** * struct pci_epc_event_ops - Callbacks for capturing the EPC events - * @core_init: Callback for the EPC initialization complete event + * @epc_init: Callback for the EPC initialization complete event + * @epc_deinit: Callback for the EPC deinitialization event * @link_up: Callback for the EPC link up event * @link_down: Callback for the EPC link down event - * @bme: Callback for the EPC BME (Bus Master Enable) event + * @bus_master_enable: Callback for the EPC Bus Master Enable event */ struct pci_epc_event_ops { - int (*core_init)(struct pci_epf *epf); + int (*epc_init)(struct pci_epf *epf); + void (*epc_deinit)(struct pci_epf *epf); int (*link_up)(struct pci_epf *epf); int (*link_down)(struct pci_epf *epf); - int (*bme)(struct pci_epf *epf); + int (*bus_master_enable)(struct pci_epf *epf); }; /** diff --git a/include/linux/pci.h b/include/linux/pci.h index cafc5ab1cbcb..9e36b6c1810e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -367,10 +367,11 @@ struct pci_dev { this is D0-D3, D0 being fully functional, and D3 being off. 
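Because struct pci_epc_event_ops is renamed member-by-member here (core_init becomes epc_init, bme becomes bus_master_enable, and epc_deinit is new), the sketch below shows how a hypothetical endpoint-function driver would fill it in after this change. The driver and function names are invented; a real driver would typically point epf->event_ops at such a structure from its probe routine.

/* Editor's sketch of an EPF driver wired to the renamed callbacks. */
static int example_epf_epc_init(struct pci_epf *epf)
{
	/* write the configuration-space header, program BARs, ... */
	return 0;
}

static void example_epf_epc_deinit(struct pci_epf *epf)
{
	/* tear down whatever example_epf_epc_init() set up */
}

static int example_epf_bus_master_enable(struct pci_epf *epf)
{
	/* the host set Bus Master Enable; DMA may start now */
	return 0;
}

static const struct pci_epc_event_ops example_epf_event_ops = {
	.epc_init		= example_epf_epc_init,
	.epc_deinit		= example_epf_epc_deinit,
	.bus_master_enable	= example_epf_bus_master_enable,
};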
*/ u8 pm_cap; /* PM capability offset */ - unsigned int imm_ready:1; /* Supports Immediate Readiness */ unsigned int pme_support:5; /* Bitmask of states from which PME# can be generated */ unsigned int pme_poll:1; /* Poll device's PME status bit */ + unsigned int pinned:1; /* Whether this dev is pinned */ + unsigned int imm_ready:1; /* Supports Immediate Readiness */ unsigned int d1_support:1; /* Low power state D1 is supported */ unsigned int d2_support:1; /* Low power state D2 is supported */ unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ @@ -1549,10 +1550,7 @@ int __must_check pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res, resource_size_t size, resource_size_t align, resource_size_t min, unsigned long type_mask, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), + resource_alignf alignf, void *alignf_data); @@ -2300,6 +2298,8 @@ int pcim_iomap_regions(struct pci_dev *pdev, int mask, const char *name); int pcim_iomap_regions_request_all(struct pci_dev *pdev, int mask, const char *name); void pcim_iounmap_regions(struct pci_dev *pdev, int mask); +void __iomem *pcim_iomap_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long len); extern int pci_pci_problems; #define PCIPCI_FAIL 1 /* No PCI PCI DMA */ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index ec3573119923..8efce7414fad 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -475,6 +475,12 @@ do { \ raw_cpu_cmpxchg(pcp, oval, nval); \ }) +#define __this_cpu_try_cmpxchg(pcp, ovalp, nval) \ +({ \ + __this_cpu_preempt_check("try_cmpxchg"); \ + raw_cpu_try_cmpxchg(pcp, ovalp, nval); \ +}) + #define __this_cpu_sub(pcp, val) __this_cpu_add(pcp, -(typeof(pcp))(val)) #define __this_cpu_inc(pcp) __this_cpu_add(pcp, 1) #define __this_cpu_dec(pcp) __this_cpu_sub(pcp, 1) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 03053de557cf..4b2047b78b67 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -6,7 +6,6 @@ #include <linux/mmdebug.h> #include <linux/preempt.h> #include <linux/smp.h> -#include <linux/cpumask.h> #include <linux/pfn.h> #include <linux/init.h> #include <linux/cleanup.h> diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 9cacadbd61f8..18cd0c0c73d9 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -15,7 +15,7 @@ extern struct page_ext_operations page_alloc_tagging_ops; static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) { - return (void *)page_ext + page_alloc_tagging_ops.offset; + return (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); } static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) @@ -71,6 +71,7 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) static inline void pgalloc_tag_split(struct page *page, unsigned int nr) { int i; + struct page_ext *first_page_ext; struct page_ext *page_ext; union codetag_ref *ref; struct alloc_tag *tag; @@ -78,7 +79,7 @@ static inline void pgalloc_tag_split(struct page *page, unsigned int nr) if (!mem_alloc_profiling_enabled()) return; - page_ext = page_ext_get(page); + first_page_ext = page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; @@ -94,7 +95,7 @@ static inline void pgalloc_tag_split(struct page *page, unsigned int nr) page_ext = page_ext_next(page_ext); } out: - page_ext_put(page_ext); + page_ext_put(first_page_ext); } static inline 
struct alloc_tag *pgalloc_tag_get(struct page *page) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 18019f037bae..2a6a3cccfc36 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -729,13 +729,18 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ -#ifndef __HAVE_ARCH_UPDATE_MMU_TLB +#ifndef update_mmu_tlb_range +static inline void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) +{ +} +#endif + static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + update_mmu_tlb_range(vma, address, ptep, 1); } -#define __HAVE_ARCH_UPDATE_MMU_TLB -#endif /* * Some architectures may be able to avoid expensive synchronization @@ -1084,6 +1089,15 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + +} +#else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be @@ -1092,12 +1106,17 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ -static inline void arch_do_swap_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, - pte_t pte, pte_t oldpte) -{ - +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + for (int i = 0; i < nr; i++) { + arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, + pte_advance_pfn(pte, i), + pte_advance_pfn(oldpte, i)); + } } #endif @@ -1888,9 +1907,12 @@ typedef unsigned int pgtbl_mod_mask; #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif +#ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif +#define __pte_leaf_size(x,y) pte_leaf_size(y) +#endif /* * We always define pmd_pfn for all archs as it's used in lots of generic diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 015751b64746..858c8e7851fb 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -16,7 +16,7 @@ #include <linux/of.h> #include <linux/notifier.h> #include <linux/spinlock.h> -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/time64.h> /* diff --git a/include/linux/poison.h b/include/linux/poison.h index 9c1a035af97c..331a9a996fa8 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -49,12 +49,6 @@ /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc -/********** arch/ia64/hp/common/sba_iommu.c **********/ -/* - * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a - * value of "SBAIOMMU POISON\0" for spill-over poisoning. 
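Since the generic update_mmu_tlb() above now funnels into update_mmu_tlb_range(), an architecture that can flush a contiguous span would supply its own definition ahead of the generic fallback. The override below is an editor's sketch of that pattern; using flush_tlb_range() as the backing primitive is an assumption about the architecture.

/* Editor's sketch: an architecture providing a ranged TLB update. */
static inline void update_mmu_tlb_range(struct vm_area_struct *vma,
		unsigned long address, pte_t *ptep, unsigned int nr)
{
	/* One ranged flush instead of nr single-entry flushes. */
	flush_tlb_range(vma, address, address + nr * PAGE_SIZE);
}
#define update_mmu_tlb_range update_mmu_tlb_range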
- */ - /********** fs/jbd/journal.c **********/ #define JBD_POISON_FREE 0x5b #define JBD2_POISON_FREE 0x5c diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index c852cc882501..72dc7e45c90c 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -309,18 +309,11 @@ struct power_supply { #endif #ifdef CONFIG_LEDS_TRIGGERS - struct led_trigger *charging_full_trig; - char *charging_full_trig_name; + struct led_trigger *trig; struct led_trigger *charging_trig; - char *charging_trig_name; struct led_trigger *full_trig; - char *full_trig_name; - struct led_trigger *online_trig; - char *online_trig_name; struct led_trigger *charging_blink_full_solid_trig; - char *charging_blink_full_solid_trig_name; struct led_trigger *charging_orange_full_green_trig; - char *charging_orange_full_green_trig_name; #endif }; @@ -743,7 +736,7 @@ struct power_supply_battery_info { int overvoltage_limit_uv; int constant_charge_current_max_ua; int constant_charge_voltage_max_uv; - struct power_supply_maintenance_charge_table *maintenance_charge; + const struct power_supply_maintenance_charge_table *maintenance_charge; int maintenance_charge_size; int alert_low_temp_charge_current_ua; int alert_low_temp_charge_voltage_uv; @@ -762,9 +755,9 @@ struct power_supply_battery_info { int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX]; struct power_supply_resistance_temp_table *resist_table; int resist_table_size; - struct power_supply_vbat_ri_table *vbat2ri_discharging; + const struct power_supply_vbat_ri_table *vbat2ri_discharging; int vbat2ri_discharging_size; - struct power_supply_vbat_ri_table *vbat2ri_charging; + const struct power_supply_vbat_ri_table *vbat2ri_charging; int vbat2ri_charging_size; int bti_resistance_ohm; int bti_resistance_tolerance; @@ -817,7 +810,7 @@ power_supply_temp2resist_simple(struct power_supply_resistance_temp_table *table int table_len, int temp); extern int power_supply_vbat2ri(struct power_supply_battery_info *info, int vbat_uv, bool charging); -extern struct power_supply_maintenance_charge_table * +extern const struct power_supply_maintenance_charge_table * power_supply_get_maintenance_charging_setting(struct power_supply_battery_info *info, int index); extern bool power_supply_battery_bti_in_range(struct power_supply_battery_info *info, int resistance); @@ -831,7 +824,7 @@ extern int power_supply_set_battery_charged(struct power_supply *psy); static inline bool power_supply_supports_maintenance_charging(struct power_supply_battery_info *info) { - struct power_supply_maintenance_charge_table *mt; + const struct power_supply_maintenance_charge_table *mt; mt = power_supply_get_maintenance_charging_setting(info, 0); diff --git a/include/linux/profile.h b/include/linux/profile.h index 04ae5ebcb637..2fb487f61d12 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -4,7 +4,6 @@ #include <linux/kernel.h> #include <linux/init.h> -#include <linux/cpumask.h> #include <linux/cache.h> #include <asm/errno.h> diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 3705c2044fc0..903ddfea8585 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -658,6 +658,7 @@ struct sev_data_snp_launch_update { * @id_auth_paddr: system physical address of ID block authentication structure * @id_block_en: indicates whether ID block is present * @auth_key_en: indicates whether author key is present in authentication structure + * @vcek_disabled: indicates whether use of VCEK is allowed for attestation reports * @rsvd: reserved * 
@host_data: host-supplied data for guest, not interpreted by firmware */ @@ -667,7 +668,8 @@ struct sev_data_snp_launch_finish { u64 id_auth_paddr; u8 id_block_en:1; u8 auth_key_en:1; - u64 rsvd:62; + u8 vcek_disabled:1; + u64 rsvd:61; u8 host_data[32]; } __packed; diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index be450a3477be..13f6f00aecf9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -29,7 +29,6 @@ #include <linux/lockdep.h> #include <linux/cleanup.h> #include <asm/processor.h> -#include <linux/cpumask.h> #include <linux/context_tracking_irq.h> #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7229b9baf20d..0978c64f49d8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -200,6 +200,9 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio, /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + /* When (un)mapping zeropages, we should never touch ref+mapcount. */ + VM_WARN_ON_FOLIO(is_zero_folio(folio), folio); + /* * TODO: we get driver-allocated folios that have nothing to do with * the rmap using vm_insert_page(); therefore, we cannot assume that @@ -241,7 +244,7 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, - unsigned long address); + unsigned long address, rmap_t flags); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ @@ -681,16 +684,6 @@ struct page_vma_mapped_walk { unsigned int flags; }; -#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ - struct page_vma_mapped_walk name = { \ - .pfn = page_to_pfn(_page), \ - .nr_pages = compound_nr(_page), \ - .pgoff = page_to_pgoff(_page), \ - .vma = _vma, \ - .address = _address, \ - .flags = _flags, \ - } - #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = folio_pfn(_folio), \ @@ -710,6 +703,30 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) spin_unlock(pvmw->ptl); } +/** + * page_vma_mapped_walk_restart - Restart the page table walk. + * @pvmw: Pointer to struct page_vma_mapped_walk. + * + * It restarts the page table walk when changes occur in the page + * table, such as splitting a PMD. Ensures that the PTL held during + * the previous walk is released and resets the state to allow for + * a new walk starting at the current address stored in pvmw->address. 
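Below is a sketch of the call pattern the new restart helper is meant for, loosely modelled on the rmap unmap/migrate loops; the surrounding function, and the in-place PMD split it alludes to, are the editor's assumptions rather than part of this hunk.

/* Editor's sketch: rescanning a mapping after a PMD has been split. */
static void example_fixup_folio_ptes(struct folio *folio,
				     struct vm_area_struct *vma,
				     unsigned long address)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);

	while (page_vma_mapped_walk(&pvmw)) {
		if (!pvmw.pte) {
			/*
			 * PMD-mapped: split it in place while the PTL is
			 * still held (as the rmap callers do), then restart
			 * so the walk yields PTEs from the same address.
			 */
			/* ... split the PMD here ... */
			page_vma_mapped_walk_restart(&pvmw);
			continue;
		}
		/* ... operate on the PTE at pvmw.address ... */
	}
}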
+ */ +static inline void +page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw) +{ + WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte); + + if (likely(pvmw->ptl)) + spin_unlock(pvmw->ptl); + else + WARN_ON_ONCE(1); + + pvmw->ptl = NULL; + pvmw->pmd = NULL; + pvmw->pte = NULL; +} + bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); /* @@ -730,8 +747,6 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); -unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); - /* * rmap_walk_control: To control rmap traversing for specific needs * @@ -787,8 +802,4 @@ static inline int folio_mkclean(struct folio *folio) } #endif /* CONFIG_MMU */ -static inline int page_mkclean(struct page *page) -{ - return folio_mkclean(page_folio(page)); -} #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index d662cf136021..c09cdcc99471 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -36,6 +36,11 @@ struct sbitmap_word { * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; + + /** + * @swap_lock: serializes simultaneous updates of ->word and ->cleared + */ + spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index e330ee0205c0..f8d150343d42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -13,7 +13,7 @@ #include <asm/processor.h> #include <linux/thread_info.h> #include <linux/preempt.h> -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/cache.h> #include <linux/irqflags_types.h> @@ -942,7 +942,7 @@ struct task_struct { #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN @@ -1458,17 +1458,18 @@ struct task_struct { unsigned int kcov_softirq; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; +#endif +#ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; -#endif -#ifdef CONFIG_MEMCG_KMEM + /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif @@ -1617,7 +1618,7 @@ static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); + BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } @@ -1791,7 +1792,8 @@ static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpuma } static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - if (!cpumask_test_cpu(0, new_mask)) + /* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */ + if ((*cpumask_bits(new_mask) & 1) == 0) return -EINVAL; return 0; } diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 8bd4fda6e027..2fb266ea69fa 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -7,7 +7,6 @@ #include <linux/string_helpers.h> #include <linux/bug.h> #include <linux/mutex.h> -#include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/fs.h> #include <linux/cred.h> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 
3fb18f7eb73e..1d06b1e5408a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,12 +113,21 @@ int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge); #else static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags) { return false; } +static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge) +{ + return 0; +} #endif #ifdef CONFIG_SHMEM diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9c29bdd5596d..29c3ea5b6e93 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3429,6 +3429,10 @@ static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask, } #define __dev_alloc_pages(...) alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__)) +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). + */ #define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order) /** @@ -3445,6 +3449,10 @@ static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) } #define __dev_alloc_page(...) alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__)) +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). + */ #define dev_alloc_page() dev_alloc_pages(0) /** diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c9efda9df285..d9b03e0746e7 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -414,6 +414,11 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. 
+ */ #define sk_psock_init_link() \ ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \ GFP_ATOMIC | __GFP_NOWARN)) diff --git a/include/linux/slab.h b/include/linux/slab.h index d99afce36098..eb2bf4629157 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -41,7 +41,7 @@ enum _slab_flag_bits { #ifdef CONFIG_FAILSLAB _SLAB_FAILSLAB, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG _SLAB_ACCOUNT, #endif #ifdef CONFIG_KASAN_GENERIC @@ -171,7 +171,7 @@ enum _slab_flag_bits { # define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif /* Account to memcg */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else # define SLAB_ACCOUNT __SLAB_FLAG_UNUSED @@ -407,7 +407,7 @@ enum kmalloc_cache_type { #ifndef CONFIG_ZONE_DMA KMALLOC_DMA = KMALLOC_NORMAL, #endif -#ifndef CONFIG_MEMCG_KMEM +#ifndef CONFIG_MEMCG KMALLOC_CGROUP = KMALLOC_NORMAL, #endif KMALLOC_RANDOM_START = KMALLOC_NORMAL, @@ -420,7 +420,7 @@ enum kmalloc_cache_type { #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG KMALLOC_CGROUP, #endif NR_KMALLOC_TYPES @@ -436,7 +436,7 @@ extern kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES]; #define KMALLOC_NOT_NORMAL_BITS \ (__GFP_RECLAIMABLE | \ (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ - (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) + (IS_ENABLED(CONFIG_MEMCG) ? __GFP_ACCOUNT : 0)) extern unsigned long random_kmalloc_seed; @@ -464,7 +464,7 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigne */ if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) return KMALLOC_DMA; - if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) + if (!IS_ENABLED(CONFIG_MEMCG) || (flags & __GFP_RECLAIMABLE)) return KMALLOC_RECLAIM; else return KMALLOC_CGROUP; diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h index 8c9ca857ccf6..c06d17599ae7 100644 --- a/include/linux/soc/apple/rtkit.h +++ b/include/linux/soc/apple/rtkit.h @@ -69,7 +69,7 @@ struct apple_rtkit; * Initializes the internal state required to handle RTKit. This * should usually be called within _probe. * - * @dev: Pointer to the device node this coprocessor is assocated with + * @dev: Pointer to the device node this coprocessor is associated with * @cookie: opaque cookie passed to all functions defined in rtkit_ops * @mbox_name: mailbox name used to communicate with the co-processor * @mbox_idx: mailbox index to be used if mbox_name is NULL @@ -83,7 +83,7 @@ struct apple_rtkit *devm_apple_rtkit_init(struct device *dev, void *cookie, * Non-devm version of devm_apple_rtkit_init. Must be freed with * apple_rtkit_free. * - * @dev: Pointer to the device node this coprocessor is assocated with + * @dev: Pointer to the device node this coprocessor is associated with * @cookie: opaque cookie passed to all functions defined in rtkit_ops * @mbox_name: mailbox name used to communicate with the co-processor * @mbox_idx: mailbox index to be used if mbox_name is NULL diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 6f6cb5fc1242..835bbb2d1f88 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -378,6 +378,20 @@ static inline void smp_mb__after_srcu_read_unlock(void) /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */ } +/** + * smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock + * + * Converts the preceding srcu_read_lock into a two-way memory barrier. 
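A minimal usage sketch for the barrier helper being added here; the srcu_struct and the reader function are the editor's assumptions.

#include <linux/srcu.h>

DEFINE_SRCU(example_srcu);

/* Editor's sketch: ordering a reader's first accesses after srcu_read_lock(). */
static void example_reader(void)
{
	int idx;

	idx = srcu_read_lock(&example_srcu);
	/* Upgrade the lock to a full barrier; a no-op on current SRCU. */
	smp_mb__after_srcu_read_lock();
	/* Loads issued here cannot be reordered before the lock above. */
	srcu_read_unlock(&example_srcu, idx);
}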
+ * + * Call this after srcu_read_lock, to guarantee that all memory operations + * that occur after smp_mb__after_srcu_read_lock will appear to happen after + * the preceding srcu_read_lock. + */ +static inline void smp_mb__after_srcu_read_lock(void) +{ + /* __srcu_read_lock has smp_mb() internally so nothing to do here. */ +} + DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct, _T->idx = srcu_read_lock(_T->lock), srcu_read_unlock(_T->lock, _T->idx), diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index ea7a74ea7389..3132262a404d 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -3,7 +3,7 @@ #define _LINUX_STOP_MACHINE #include <linux/cpu.h> -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/smp.h> #include <linux/list.h> diff --git a/include/linux/swap.h b/include/linux/swap.h index e685e93ba354..ba7ea95d1c57 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -405,10 +405,13 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) +#define MIN_SWAPPINESS 0 +#define MAX_SWAPPINESS 200 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, @@ -478,7 +481,7 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); -extern void swap_free(swp_entry_t); +extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); @@ -556,7 +559,7 @@ static inline int swapcache_prepare(swp_entry_t swp) return 0; } -static inline void swap_free(swp_entry_t swp) +static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } @@ -604,6 +607,11 @@ static inline void free_swap_and_cache(swp_entry_t entry) free_swap_and_cache_nr(entry, 1); } +static inline void swap_free(swp_entry_t entry) +{ + swap_free_nr(entry, 1); +} + #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index a5c560a2f8c2..cb468e418ea1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -334,7 +334,7 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -359,7 +359,7 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *pte) { } + unsigned long addr, pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index 
8d8fac1626bd..cdb58d61c152 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -521,6 +521,6 @@ static inline struct switchtec_dev *to_stdev(struct device *dev) return container_of(dev, struct switchtec_dev, dev); } -extern struct class *switchtec_class; +extern const struct class switchtec_class; #endif diff --git a/include/linux/torture.h b/include/linux/torture.h index 1541454da03e..c2e979f82f8d 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -14,7 +14,7 @@ #include <linux/cache.h> #include <linux/spinlock.h> #include <linux/threads.h> -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/seqlock.h> #include <linux/lockdep.h> #include <linux/completion.h> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 689b6d71590e..6be396bb4297 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -16,7 +16,6 @@ #include <linux/srcu.h> #include <linux/errno.h> #include <linux/types.h> -#include <linux/cpumask.h> #include <linux/rcupdate.h> #include <linux/tracepoint-defs.h> #include <linux/static_call.h> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 735eae6e272c..16b0cfa80502 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -624,4 +624,8 @@ static inline void lruvec_stat_sub_folio(struct folio *folio, { lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } + +void __meminit mod_node_early_perpage_metadata(int nid, long delta); +void __meminit store_early_perpage_metadata(void); + #endif /* _LINUX_VMSTAT_H */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index d9968bfc8eac..4eb8f9563136 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -12,7 +12,7 @@ #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/rcupdate.h> #include <linux/workqueue_types.h> diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a85b941db97..6cecb4a4f68b 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -35,7 +35,8 @@ void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); -bool is_zswap_enabled(void); +bool zswap_is_enabled(void); +bool zswap_never_enabled(void); #else struct zswap_lruvec_state {}; @@ -60,11 +61,16 @@ static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} static inline void zswap_folio_swapin(struct folio *folio) {} -static inline bool is_zswap_enabled(void) +static inline bool zswap_is_enabled(void) { return false; } +static inline bool zswap_never_enabled(void) +{ + return true; +} + #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 7c47151d5c72..e5f7ee0864e7 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -356,10 +356,9 @@ TRACE_EVENT(aer_event, #define MF_PAGE_TYPE \ EM ( MF_MSG_KERNEL, "reserved kernel page" ) \ EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ - EM ( MF_MSG_SLAB, "kernel slab page" ) \ - EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ + EM ( MF_MSG_GET_HWPOISON, "get hwpoison page" ) \ EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ EM ( 
MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ @@ -373,6 +372,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_BUDDY, "free buddy page" ) \ EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ + EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h index 5ccc0d91b220..b108176deb22 100644 --- a/include/trace/events/firewire.h +++ b/include/trace/events/firewire.h @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later // Copyright (c) 2024 Takashi Sakamoto +#undef TRACE_SYSTEM #define TRACE_SYSTEM firewire #if !defined(_FIREWIRE_TRACE_EVENT_H) || defined(TRACE_HEADER_MULTI_READ) @@ -11,7 +12,7 @@ #include <linux/firewire-constants.h> -#include "../../../drivers/firewire/packet-header-definitions.h" +// Some macros are defined in 'drivers/firewire/packet-header-definitions.h'. // The content of TP_printk field is preprocessed, then put to the module binary. #define ASYNC_HEADER_GET_DESTINATION(header) \ @@ -366,6 +367,544 @@ TRACE_EVENT(bus_reset_handle, ) ); +// Some macros are defined in 'drivers/firewire/phy-packet-definitions.h'. + +// The content of TP_printk field is preprocessed, then put to the module binary. + +#define PHY_PACKET_SELF_ID_GET_PHY_ID(quads) \ + ((((const u32 *)quads)[0] & SELF_ID_PHY_ID_MASK) >> SELF_ID_PHY_ID_SHIFT) + +#define PHY_PACKET_SELF_ID_GET_LINK_ACTIVE(quads) \ + ((((const u32 *)quads)[0] & SELF_ID_ZERO_LINK_ACTIVE_MASK) >> SELF_ID_ZERO_LINK_ACTIVE_SHIFT) + +#define PHY_PACKET_SELF_ID_GET_GAP_COUNT(quads) \ + ((((const u32 *)quads)[0] & SELF_ID_ZERO_GAP_COUNT_MASK) >> SELF_ID_ZERO_GAP_COUNT_SHIFT) + +#define PHY_PACKET_SELF_ID_GET_SCODE(quads) \ + ((((const u32 *)quads)[0] & SELF_ID_ZERO_SCODE_MASK) >> SELF_ID_ZERO_SCODE_SHIFT) + +#define PHY_PACKET_SELF_ID_GET_CONTENDER(quads) \ + ((((const u32 *)quads)[0] & SELF_ID_ZERO_CONTENDER_MASK) >> SELF_ID_ZERO_CONTENDER_SHIFT) + +#define PHY_PACKET_SELF_ID_GET_POWER_CLASS(quads) \ + ((((const u32 *)quads)[0] & SELF_ID_ZERO_POWER_CLASS_MASK) >> SELF_ID_ZERO_POWER_CLASS_SHIFT) + +#define PHY_PACKET_SELF_ID_GET_INITIATED_RESET(quads) \ + ((((const u32 *)quads)[0] & SELF_ID_ZERO_INITIATED_RESET_MASK) >> SELF_ID_ZERO_INITIATED_RESET_SHIFT) + +TRACE_EVENT(self_id_sequence, + TP_PROTO(unsigned int card_index, const u32 *self_id_sequence, unsigned int quadlet_count, unsigned int generation), + TP_ARGS(card_index, self_id_sequence, quadlet_count, generation), + TP_STRUCT__entry( + __field(u8, card_index) + __field(u8, generation) + __dynamic_array(u8, port_status, self_id_sequence_get_port_capacity(quadlet_count)) + __dynamic_array(u32, self_id_sequence, quadlet_count) + ), + TP_fast_assign( + __entry->card_index = card_index; + __entry->generation = generation; + { + u8 *port_status = __get_dynamic_array(port_status); + unsigned int port_index; + + for (port_index = 0; port_index < __get_dynamic_array_len(port_status); ++port_index) { + port_status[port_index] = + self_id_sequence_get_port_status(self_id_sequence, + quadlet_count, port_index); + } + } + memcpy(__get_dynamic_array(self_id_sequence), self_id_sequence, + __get_dynamic_array_len(self_id_sequence)); + ), + TP_printk( + "card_index=%u generation=%u phy_id=0x%02x link_active=%s gap_count=%u scode=%u contender=%s power_class=%u initiated_reset=%s port_status=%s self_id_sequence=%s", + __entry->card_index, + __entry->generation, + 
PHY_PACKET_SELF_ID_GET_PHY_ID(__get_dynamic_array(self_id_sequence)), + PHY_PACKET_SELF_ID_GET_LINK_ACTIVE(__get_dynamic_array(self_id_sequence)) ? "true" : "false", + PHY_PACKET_SELF_ID_GET_GAP_COUNT(__get_dynamic_array(self_id_sequence)), + PHY_PACKET_SELF_ID_GET_SCODE(__get_dynamic_array(self_id_sequence)), + PHY_PACKET_SELF_ID_GET_CONTENDER(__get_dynamic_array(self_id_sequence)) ? "true" : "false", + PHY_PACKET_SELF_ID_GET_POWER_CLASS(__get_dynamic_array(self_id_sequence)), + PHY_PACKET_SELF_ID_GET_INITIATED_RESET(__get_dynamic_array(self_id_sequence)) ? "true" : "false", + __print_array(__get_dynamic_array(port_status), __get_dynamic_array_len(port_status), 1), + __print_array(__get_dynamic_array(self_id_sequence), + __get_dynamic_array_len(self_id_sequence) / QUADLET_SIZE, QUADLET_SIZE) + ) +); + +#undef PHY_PACKET_SELF_ID_GET_PHY_ID +#undef PHY_PACKET_SELF_ID_GET_LINK_ACTIVE +#undef PHY_PACKET_SELF_ID_GET_GAP_COUNT +#undef PHY_PACKET_SELF_ID_GET_SCODE +#undef PHY_PACKET_SELF_ID_GET_CONTENDER +#undef PHY_PACKET_SELF_ID_GET_POWER_CLASS +#undef PHY_PACKET_SELF_ID_GET_INITIATED_RESET + +TRACE_EVENT_CONDITION(isoc_outbound_allocate, + TP_PROTO(const struct fw_iso_context *ctx, unsigned int channel, unsigned int scode), + TP_ARGS(ctx, channel, scode), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_TRANSMIT), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + __field(u8, channel) + __field(u8, scode) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + __entry->channel = channel; + __entry->scode = scode; + ), + TP_printk( + "context=0x%llx card_index=%u channel=%u scode=%u", + __entry->context, + __entry->card_index, + __entry->channel, + __entry->scode + ) +); + +TRACE_EVENT_CONDITION(isoc_inbound_single_allocate, + TP_PROTO(const struct fw_iso_context *ctx, unsigned int channel, unsigned int header_size), + TP_ARGS(ctx, channel, header_size), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + __field(u8, channel) + __field(u8, header_size) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + __entry->channel = channel; + __entry->header_size = header_size; + ), + TP_printk( + "context=0x%llx card_index=%u channel=%u header_size=%u", + __entry->context, + __entry->card_index, + __entry->channel, + __entry->header_size + ) +); + +TRACE_EVENT_CONDITION(isoc_inbound_multiple_allocate, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + ), + TP_printk( + "context=0x%llx card_index=%u", + __entry->context, + __entry->card_index + ) +); + +DECLARE_EVENT_CLASS(isoc_destroy_template, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + ), + TP_printk( + "context=0x%llx card_index=%u", + __entry->context, + __entry->card_index + ) +) + +DEFINE_EVENT_CONDITION(isoc_destroy_template, isoc_outbound_destroy, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_TRANSMIT) +); + +DEFINE_EVENT_CONDITION(isoc_destroy_template, 
isoc_inbound_single_destroy, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE) +); + +DEFINE_EVENT_CONDITION(isoc_destroy_template, isoc_inbound_multiple_destroy, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL) +); + +TRACE_EVENT(isoc_inbound_multiple_channels, + TP_PROTO(const struct fw_iso_context *ctx, u64 channels), + TP_ARGS(ctx, channels), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + __field(u64, channels) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + __entry->channels = channels; + ), + TP_printk( + "context=0x%llx card_index=%u channels=0x%016llx", + __entry->context, + __entry->card_index, + __entry->channels + ) +); + +TRACE_EVENT_CONDITION(isoc_outbound_start, + TP_PROTO(const struct fw_iso_context *ctx, int cycle_match), + TP_ARGS(ctx, cycle_match), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_TRANSMIT), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + __field(bool, cycle_match) + __field(u16, cycle) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + __entry->cycle_match = cycle_match < 0 ? false : true; + __entry->cycle = __entry->cycle_match ? (u16)cycle_match : 0; + ), + TP_printk( + "context=0x%llx card_index=%u cycle_match=%s cycle=0x%04x", + __entry->context, + __entry->card_index, + __entry->cycle_match ? "true" : "false", + __entry->cycle + ) +); + +DECLARE_EVENT_CLASS(isoc_inbound_start_template, + TP_PROTO(const struct fw_iso_context *ctx, int cycle_match, unsigned int sync, unsigned int tags), + TP_ARGS(ctx, cycle_match, sync, tags), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + __field(bool, cycle_match) + __field(u16, cycle) + __field(u8, sync) + __field(u8, tags) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + __entry->cycle_match = cycle_match < 0 ? false : true; + __entry->cycle = __entry->cycle_match ? (u16)cycle_match : 0; + __entry->sync = sync; + __entry->tags = tags; + ), + TP_printk( + "context=0x%llx card_index=%u cycle_match=%s cycle=0x%04x sync=%u tags=%s", + __entry->context, + __entry->card_index, + __entry->cycle_match ? 
"true" : "false", + __entry->cycle, + __entry->sync, + __print_flags(__entry->tags, "|", + { FW_ISO_CONTEXT_MATCH_TAG0, "0" }, + { FW_ISO_CONTEXT_MATCH_TAG1, "1" }, + { FW_ISO_CONTEXT_MATCH_TAG2, "2" }, + { FW_ISO_CONTEXT_MATCH_TAG3, "3" } + ) + ) +); + +DEFINE_EVENT_CONDITION(isoc_inbound_start_template, isoc_inbound_single_start, + TP_PROTO(const struct fw_iso_context *ctx, int cycle_match, unsigned int sync, unsigned int tags), + TP_ARGS(ctx, cycle_match, sync, tags), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE) +); + +DEFINE_EVENT_CONDITION(isoc_inbound_start_template, isoc_inbound_multiple_start, + TP_PROTO(const struct fw_iso_context *ctx, int cycle_match, unsigned int sync, unsigned int tags), + TP_ARGS(ctx, cycle_match, sync, tags), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL) +); + +DECLARE_EVENT_CLASS(isoc_stop_template, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + ), + TP_printk( + "context=0x%llx card_index=%u", + __entry->context, + __entry->card_index + ) +) + +DEFINE_EVENT_CONDITION(isoc_stop_template, isoc_outbound_stop, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_TRANSMIT) +); + +DEFINE_EVENT_CONDITION(isoc_stop_template, isoc_inbound_single_stop, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE) +); + +DEFINE_EVENT_CONDITION(isoc_stop_template, isoc_inbound_multiple_stop, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL) +); + +DECLARE_EVENT_CLASS(isoc_flush_template, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + ), + TP_printk( + "context=0x%llx card_index=%u", + __entry->context, + __entry->card_index + ) +); + +DEFINE_EVENT_CONDITION(isoc_flush_template, isoc_outbound_flush, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_TRANSMIT) +); + +DEFINE_EVENT_CONDITION(isoc_flush_template, isoc_inbound_single_flush, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE) +); + +DEFINE_EVENT_CONDITION(isoc_flush_template, isoc_inbound_multiple_flush, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL) +); + +DECLARE_EVENT_CLASS(isoc_flush_completions_template, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + ), + TP_printk( + "context=0x%llx card_index=%u", + __entry->context, + __entry->card_index + ) +); + +DEFINE_EVENT_CONDITION(isoc_flush_completions_template, isoc_outbound_flush_completions, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_TRANSMIT) +); + +DEFINE_EVENT_CONDITION(isoc_flush_completions_template, isoc_inbound_single_flush_completions, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == 
FW_ISO_CONTEXT_RECEIVE) +); + +DEFINE_EVENT_CONDITION(isoc_flush_completions_template, isoc_inbound_multiple_flush_completions, + TP_PROTO(const struct fw_iso_context *ctx), + TP_ARGS(ctx), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL) +); + +#define TP_STRUCT__entry_iso_packet(ctx, buffer_offset, packet) \ + TP_STRUCT__entry( \ + __field(u64, context) \ + __field(u8, card_index) \ + __field(u32, buffer_offset) \ + __field(bool, interrupt) \ + __field(bool, skip) \ + __field(u8, sy) \ + __field(u8, tag) \ + __dynamic_array(u32, header, packet->header_length / QUADLET_SIZE) \ + ) + +#define TP_fast_assign_iso_packet(ctx, buffer_offset, packet) \ + TP_fast_assign( \ + __entry->context = (uintptr_t)ctx; \ + __entry->card_index = ctx->card->index; \ + __entry->buffer_offset = buffer_offset; \ + __entry->interrupt = packet->interrupt; \ + __entry->skip = packet->skip; \ + __entry->sy = packet->sy; \ + __entry->tag = packet->tag; \ + memcpy(__get_dynamic_array(header), packet->header, \ + __get_dynamic_array_len(header)); \ + ) + +TRACE_EVENT_CONDITION(isoc_outbound_queue, + TP_PROTO(const struct fw_iso_context *ctx, unsigned long buffer_offset, const struct fw_iso_packet *packet), + TP_ARGS(ctx, buffer_offset, packet), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_TRANSMIT), + TP_STRUCT__entry_iso_packet(ctx, buffer_offset, packet), + TP_fast_assign_iso_packet(ctx, buffer_offset, packet), + TP_printk( + "context=0x%llx card_index=%u buffer_offset=0x%x interrupt=%s skip=%s sy=%d tag=%u header=%s", + __entry->context, + __entry->card_index, + __entry->buffer_offset, + __entry->interrupt ? "true" : "false", + __entry->skip ? "true" : "false", + __entry->sy, + __entry->tag, + __print_array(__get_dynamic_array(header), + __get_dynamic_array_len(header) / QUADLET_SIZE, QUADLET_SIZE) + ) +); + +TRACE_EVENT_CONDITION(isoc_inbound_single_queue, + TP_PROTO(const struct fw_iso_context *ctx, unsigned long buffer_offset, const struct fw_iso_packet *packet), + TP_ARGS(ctx, buffer_offset, packet), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE), + TP_STRUCT__entry_iso_packet(ctx, buffer_offset, packet), + TP_fast_assign_iso_packet(ctx, buffer_offset, packet), + TP_printk( + "context=0x%llx card_index=%u buffer_offset=0x%x interrupt=%s skip=%s", + __entry->context, + __entry->card_index, + __entry->buffer_offset, + __entry->interrupt ? "true" : "false", + __entry->skip ? "true" : "false" + ) +); + +TRACE_EVENT_CONDITION(isoc_inbound_multiple_queue, + TP_PROTO(const struct fw_iso_context *ctx, unsigned long buffer_offset, const struct fw_iso_packet *packet), + TP_ARGS(ctx, buffer_offset, packet), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL), + TP_STRUCT__entry_iso_packet(ctx, buffer_offset, packet), + TP_fast_assign_iso_packet(ctx, buffer_offset, packet), + TP_printk( + "context=0x%llx card_index=%u buffer_offset=0x%x interrupt=%s", + __entry->context, + __entry->card_index, + __entry->buffer_offset, + __entry->interrupt ? 
"true" : "false" + ) +); + +#undef TP_STRUCT__entry_iso_packet +#undef TP_fast_assign_iso_packet + +#ifndef show_cause +enum fw_iso_context_completions_cause { + FW_ISO_CONTEXT_COMPLETIONS_CAUSE_FLUSH = 0, + FW_ISO_CONTEXT_COMPLETIONS_CAUSE_IRQ, + FW_ISO_CONTEXT_COMPLETIONS_CAUSE_HEADER_OVERFLOW, +}; +#define show_cause(cause) \ + __print_symbolic(cause, \ + { FW_ISO_CONTEXT_COMPLETIONS_CAUSE_FLUSH, "FLUSH" }, \ + { FW_ISO_CONTEXT_COMPLETIONS_CAUSE_IRQ, "IRQ" }, \ + { FW_ISO_CONTEXT_COMPLETIONS_CAUSE_HEADER_OVERFLOW, "HEADER_OVERFLOW" } \ + ) +#endif + +DECLARE_EVENT_CLASS(isoc_single_completions_template, + TP_PROTO(const struct fw_iso_context *ctx, u16 timestamp, enum fw_iso_context_completions_cause cause, const u32 *header, unsigned int header_length), + TP_ARGS(ctx, timestamp, cause, header, header_length), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + __field(u16, timestamp) + __field(u8, cause) + __dynamic_array(u32, header, header_length / QUADLET_SIZE) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + __entry->timestamp = timestamp; + __entry->cause = cause; + memcpy(__get_dynamic_array(header), header, __get_dynamic_array_len(header)); + ), + TP_printk( + "context=0x%llx card_index=%u timestamp=0x%04x cause=%s header=%s", + __entry->context, + __entry->card_index, + __entry->timestamp, + show_cause(__entry->cause), + __print_array(__get_dynamic_array(header), + __get_dynamic_array_len(header) / QUADLET_SIZE, QUADLET_SIZE) + ) +) + +DEFINE_EVENT_CONDITION(isoc_single_completions_template, isoc_outbound_completions, + TP_PROTO(const struct fw_iso_context *ctx, u16 timestamp, enum fw_iso_context_completions_cause cause, const u32 *header, unsigned int header_length), + TP_ARGS(ctx, timestamp, cause, header, header_length), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_TRANSMIT) +); + +DEFINE_EVENT_CONDITION(isoc_single_completions_template, isoc_inbound_single_completions, + TP_PROTO(const struct fw_iso_context *ctx, u16 timestamp, enum fw_iso_context_completions_cause cause, const u32 *header, unsigned int header_length), + TP_ARGS(ctx, timestamp, cause, header, header_length), + TP_CONDITION(ctx->type == FW_ISO_CONTEXT_RECEIVE) +); + +TRACE_EVENT(isoc_inbound_multiple_completions, + TP_PROTO(const struct fw_iso_context *ctx, unsigned int completed, enum fw_iso_context_completions_cause cause), + TP_ARGS(ctx, completed, cause), + TP_STRUCT__entry( + __field(u64, context) + __field(u8, card_index) + __field(u16, completed) + __field(u8, cause) + ), + TP_fast_assign( + __entry->context = (uintptr_t)ctx; + __entry->card_index = ctx->card->index; + __entry->completed = completed; + __entry->cause = cause; + ), + TP_printk( + "context=0x%llx card_index=%u completed=%u cause=%s", + __entry->context, + __entry->card_index, + __entry->completed, + show_cause(__entry->cause) + ) +); + #undef QUADLET_SIZE #endif // _FIREWIRE_TRACE_EVENT_H diff --git a/include/trace/events/firewire_ohci.h b/include/trace/events/firewire_ohci.h new file mode 100644 index 000000000000..4f9a7f2577f3 --- /dev/null +++ b/include/trace/events/firewire_ohci.h @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2024 Takashi Sakamoto + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM firewire_ohci + +#if !defined(_FIREWIRE_OHCI_TRACE_EVENT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _FIREWIRE_OHCI_TRACE_EVENT_H + +#include <linux/tracepoint.h> + +// Some macros and helper functions are defined in 
'drivers/firewire/ohci.c'. + +TRACE_EVENT(irqs, + TP_PROTO(unsigned int card_index, u32 events), + TP_ARGS(card_index, events), + TP_STRUCT__entry( + __field(u8, card_index) + __field(u32, events) + ), + TP_fast_assign( + __entry->card_index = card_index; + __entry->events = events; + ), + TP_printk( + "card_index=%u events=%s", + __entry->card_index, + __print_flags(__entry->events, "|", + { OHCI1394_selfIDComplete, "selfIDComplete" }, + { OHCI1394_RQPkt, "RQPkt" }, + { OHCI1394_RSPkt, "RSPkt" }, + { OHCI1394_reqTxComplete, "reqTxComplete" }, + { OHCI1394_respTxComplete, "respTxComplete" }, + { OHCI1394_isochRx, "isochRx" }, + { OHCI1394_isochTx, "isochTx" }, + { OHCI1394_postedWriteErr, "postedWriteErr" }, + { OHCI1394_cycleTooLong, "cycleTooLong" }, + { OHCI1394_cycle64Seconds, "cycle64Seconds" }, + { OHCI1394_cycleInconsistent, "cycleInconsistent" }, + { OHCI1394_regAccessFail, "regAccessFail" }, + { OHCI1394_unrecoverableError, "unrecoverableError" }, + { OHCI1394_busReset, "busReset" } + ) + ) +); + +#define QUADLET_SIZE 4 + +#define SELF_ID_COUNT_IS_ERROR(reg) \ + (!!(((reg) & OHCI1394_SelfIDCount_selfIDError_MASK) >> OHCI1394_SelfIDCount_selfIDError_SHIFT)) + +#define SELF_ID_COUNT_GET_GENERATION(reg) \ + (((reg) & OHCI1394_SelfIDCount_selfIDGeneration_MASK) >> OHCI1394_SelfIDCount_selfIDGeneration_SHIFT) + +#define SELF_ID_RECEIVE_Q0_GET_GENERATION(quadlet) \ + (((quadlet) & OHCI1394_SELF_ID_RECEIVE_Q0_GENERATION_MASK) >> OHCI1394_SELF_ID_RECEIVE_Q0_GENERATION_SHIFT) + +#define SELF_ID_RECEIVE_Q0_GET_TIMESTAMP(quadlet) \ + (((quadlet) & OHCI1394_SELF_ID_RECEIVE_Q0_TIMESTAMP_MASK) >> OHCI1394_SELF_ID_RECEIVE_Q0_TIMESTAMP_SHIFT) + +TRACE_EVENT(self_id_complete, + TP_PROTO(unsigned int card_index, u32 reg, const __le32 *self_id_receive, bool has_be_header_quirk), + TP_ARGS(card_index, reg, self_id_receive, has_be_header_quirk), + TP_STRUCT__entry( + __field(u8, card_index) + __field(u32, reg) + __dynamic_array(u32, self_id_receive, ohci1394_self_id_count_get_size(reg)) + ), + TP_fast_assign( + __entry->card_index = card_index; + __entry->reg = reg; + { + u32 *ptr = __get_dynamic_array(self_id_receive); + int i; + + for (i = 0; i < __get_dynamic_array_len(self_id_receive) / QUADLET_SIZE; ++i) + ptr[i] = cond_le32_to_cpu(self_id_receive[i], has_be_header_quirk); + } + ), + TP_printk( + "card_index=%u is_error=%s generation_at_bus_reset=%u generation_at_completion=%u timestamp=0x%04x packet_data=%s", + __entry->card_index, + SELF_ID_COUNT_IS_ERROR(__entry->reg) ? 
"true" : "false", + SELF_ID_COUNT_GET_GENERATION(__entry->reg), + SELF_ID_RECEIVE_Q0_GET_GENERATION(((const u32 *)__get_dynamic_array(self_id_receive))[0]), + SELF_ID_RECEIVE_Q0_GET_TIMESTAMP(((const u32 *)__get_dynamic_array(self_id_receive))[0]), + __print_array(((const u32 *)__get_dynamic_array(self_id_receive)) + 1, + (__get_dynamic_array_len(self_id_receive) / QUADLET_SIZE) - 1, QUADLET_SIZE) + ) +); + +#undef SELF_ID_COUNT_IS_ERROR +#undef SELF_ID_COUNT_GET_GENERATION +#undef SELF_ID_RECEIVE_Q0_GET_GENERATION +#undef SELF_ID_RECEIVE_Q0_GET_TIMESTAMP + +#undef QUADLET_SIZE + +#endif // _FIREWIRE_OHCI_TRACE_EVENT_H + +#include <trace/define_trace.h> diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 8a829e0f6e55..b37eb0a7060f 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -36,7 +36,7 @@ TRACE_EVENT(kmem_cache_alloc, __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; - __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? + __entry->accounted = IS_ENABLED(CONFIG_MEMCG) ? ((gfp_flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)) : false; ), @@ -87,7 +87,7 @@ TRACE_EVENT(kmalloc, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node, - (IS_ENABLED(CONFIG_MEMCG_KMEM) && + (IS_ENABLED(CONFIG_MEMCG) && (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false") ); diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 0190ef725b43..cd01dd7b3640 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -22,7 +22,8 @@ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ EM( MR_CONTIG_RANGE, "contig_range") \ EM( MR_LONGTERM_PIN, "longterm_pin") \ - EMe(MR_DEMOTION, "demotion") + EM( MR_DEMOTION, "demotion") \ + EMe(MR_DAMON, "damon") /* * First define the enums in the above macros to be exported to userspace diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 191a7e88a8ab..753971770733 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -336,8 +336,10 @@ typedef int __bitwise __kernel_rwf_t; #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) +#define PROCFS_IOCTL_MAGIC 'f' + /* Pagemap ioctl */ -#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) +#define PAGEMAP_SCAN _IOWR(PROCFS_IOCTL_MAGIC, 16, struct pm_scan_arg) /* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ #define PAGE_IS_WPALLOWED (1 << 0) @@ -396,4 +398,158 @@ struct pm_scan_arg { __u64 return_mask; }; +/* /proc/<pid>/maps ioctl */ +#define PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 17, struct procmap_query) + +enum procmap_query_flags { + /* + * VMA permission flags. + * + * Can be used as part of procmap_query.query_flags field to look up + * only VMAs satisfying specified subset of permissions. E.g., specifying + * PROCMAP_QUERY_VMA_READABLE only will return both readable and read/write VMAs, + * while having PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_WRITABLE will only + * return read/write VMAs, though both executable/non-executable and + * private/shared will be ignored. + * + * PROCMAP_QUERY_VMA_* flags are also returned in procmap_query.vma_flags + * field to specify actual VMA permissions. + */ + PROCMAP_QUERY_VMA_READABLE = 0x01, + PROCMAP_QUERY_VMA_WRITABLE = 0x02, + PROCMAP_QUERY_VMA_EXECUTABLE = 0x04, + PROCMAP_QUERY_VMA_SHARED = 0x08, + /* + * Query modifier flags. 
+ * + * By default, the VMA that covers the provided address is returned, or -ENOENT + * is returned. With PROCMAP_QUERY_COVERING_OR_NEXT_VMA flag set, closest + * VMA with vma_start > addr will be returned if no covering VMA is + * found. + * + * PROCMAP_QUERY_FILE_BACKED_VMA instructs query to consider only VMAs that + * have file backing. Can be combined with PROCMAP_QUERY_COVERING_OR_NEXT_VMA + * to iterate all VMAs with file backing. + */ + PROCMAP_QUERY_COVERING_OR_NEXT_VMA = 0x10, + PROCMAP_QUERY_FILE_BACKED_VMA = 0x20, +}; + +/* + * Input/output argument structure passed into ioctl() call. It can be used + * to query a set of VMAs (Virtual Memory Areas) of a process. + * + * Each field can be one of three kinds, marked in a short comment to the + * right of the field: + * - "in", input argument, user has to provide this value, kernel doesn't modify it; + * - "out", output argument, kernel sets this field with VMA data; + * - "in/out", input and output argument; user provides initial value (used + * to specify maximum allowable buffer size), and kernel sets it to actual + * amount of data written (or zero, if there is no data). + * + * If a matching VMA is found (according to criteria specified by + * query_addr/query_flags), all the out fields are filled out, and ioctl() + * returns 0. If there is no matching VMA, -ENOENT will be returned. + * In case of any other error, a negative error code other than -ENOENT is + * returned. + * + * Most of the data is similar to the one returned as text in /proc/<pid>/maps + * file, but procmap_query provides more querying flexibility. There are no + * consistency guarantees between subsequent ioctl() calls, but data returned + * for matched VMA is self-consistent. + */ +struct procmap_query { + /* Query struct size, for backwards/forward compatibility */ + __u64 size; + /* + * Query flags, a combination of enum procmap_query_flags values. + * Defines query filtering and behavior, see enum procmap_query_flags. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_flags; /* in */ + /* + * Query address. By default, VMA that covers this address will + * be looked up. PROCMAP_QUERY_* flags above modify this default + * behavior further. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_addr; /* in */ + /* VMA starting (inclusive) and ending (exclusive) address, if VMA is found. */ + __u64 vma_start; /* out */ + __u64 vma_end; /* out */ + /* VMA permissions flags. A combination of PROCMAP_QUERY_VMA_* flags. */ + __u64 vma_flags; /* out */ + /* VMA backing page size granularity. */ + __u64 vma_page_size; /* out */ + /* + * VMA file offset. If VMA has file backing, this specifies offset + * within the file that VMA's start address corresponds to. + * Is set to zero if VMA has no backing file. + */ + __u64 vma_offset; /* out */ + /* Backing file's inode number, or zero, if VMA has no backing file. */ + __u64 inode; /* out */ + /* Backing file's device major/minor number, or zero, if VMA has no backing file. */ + __u32 dev_major; /* out */ + __u32 dev_minor; /* out */ + /* + * If set to non-zero value, signals the request to return VMA name + * (i.e., VMA's backing file's absolute path, with " (deleted)" suffix + * appended, if file was unlinked from FS) for matched VMA. VMA name + * can also be some special name (e.g., "[heap]", "[stack]") or could + * be even user-supplied with prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME). + * + * Kernel will set this field to zero, if VMA has no associated name. 
+ * Otherwise kernel will return actual amount of bytes filled in + * user-supplied buffer (see vma_name_addr field below), including the + * terminating zero. + * + * If VMA name is longer than user-supplied maximum buffer size, + * -E2BIG error is returned. + * + * If this field is set to non-zero value, vma_name_addr should point + * to valid user space memory buffer of at least vma_name_size bytes. + * If set to zero, vma_name_addr should be set to zero as well. + */ + __u32 vma_name_size; /* in/out */ + /* + * If set to non-zero value, signals the request to extract and return + * VMA's backing file's build ID, if the backing file is an ELF file + * and it contains embedded build ID. + * + * Kernel will set this field to zero, if VMA has no backing file, + * backing file is not an ELF file, or ELF file has no build ID + * embedded. + * + * Build ID is a binary value (not a string). Kernel will set + * build_id_size field to exact number of bytes used for build ID. + * If build ID is requested and present, but needs more bytes than + * user-supplied maximum buffer size (see build_id_addr field below), + * -E2BIG error will be returned. + * + * If this field is set to non-zero value, build_id_addr should point + * to valid user space memory buffer of at least build_id_size bytes. + * If set to zero, build_id_addr should be set to zero as well. + */ + __u32 build_id_size; /* in/out */ + /* + * User-supplied address of a buffer of at least vma_name_size bytes + * for kernel to fill with matched VMA's name (see vma_name_size field + * description above for details). + * + * Should be set to zero if VMA name should not be returned. + */ + __u64 vma_name_addr; /* in */ + /* + * User-supplied address of a buffer of at least build_id_size bytes + * for kernel to fill with matched VMA's ELF build ID, if available + * (see build_id_size field description above for details). + * + * Should be set to zero if build ID should not be returned. + */ + __u64 build_id_addr; /* in */ +}; + #endif /* _UAPI_LINUX_FS_H */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d03842abae57..637efc055145 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -192,11 +192,24 @@ struct kvm_xen_exit { /* Flags that describe what fields in emulation_failure hold valid data. */ #define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0) +/* + * struct kvm_run can be modified by userspace at any time, so KVM must be + * careful to avoid TOCTOU bugs. In order to protect KVM, HINT_UNSAFE_IN_KVM() + * renames fields in struct kvm_run from <symbol> to <symbol>__unsafe when + * compiled into the kernel, ensuring that any use within KVM is obvious and + * gets extra scrutiny. 
+ */ +#ifdef __KERNEL__ +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol##__unsafe +#else +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol +#endif + /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ __u8 request_interrupt_window; - __u8 immediate_exit; + __u8 HINT_UNSAFE_IN_KVM(immediate_exit); __u8 padding1[6]; /* out */ @@ -917,6 +930,9 @@ struct kvm_enable_cap { #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_PRE_FAULT_MEMORY 236 +#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 +#define KVM_CAP_X86_GUEST_MODE 238 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1548,4 +1564,13 @@ struct kvm_create_guest_memfd { __u64 reserved[6]; }; +#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) + +struct kvm_pre_fault_memory { + __u64 gpa; + __u64 size; + __u64 flags; + __u64 padding[5]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 68625e728f43..2c8dbc74b955 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -12,29 +12,36 @@ #include <linux/types.h> /** - * struct landlock_ruleset_attr - Ruleset definition + * struct landlock_ruleset_attr - Ruleset definition. * - * Argument of sys_landlock_create_ruleset(). This structure can grow in - * future versions. + * Argument of sys_landlock_create_ruleset(). + * + * This structure defines a set of *handled access rights*, a set of actions on + * different object types, which should be denied by default when the ruleset is + * enacted. Vice versa, access rights that are not specifically listed here are + * not going to be denied by this ruleset when it is enacted. + * + * For historical reasons, the %LANDLOCK_ACCESS_FS_REFER right is always denied + * by default, even when its bit is not set in @handled_access_fs. In order to + * add new rules with this access right, the bit must still be set explicitly + * (cf. `Filesystem flags`_). + * + * The explicit listing of *handled access rights* is required for backwards + * compatibility reasons. In most use cases, processes that use Landlock will + * *handle* a wide range or all access rights that they know about at build time + * (and that they have tested with a kernel that supported them all). + * + * This structure can grow in future Landlock versions. */ struct landlock_ruleset_attr { /** - * @handled_access_fs: Bitmask of actions (cf. `Filesystem flags`_) - * that is handled by this ruleset and should then be forbidden if no - * rule explicitly allow them: it is a deny-by-default list that should - * contain as much Landlock access rights as possible. Indeed, all - * Landlock filesystem access rights that are not part of - * handled_access_fs are allowed. This is needed for backward - * compatibility reasons. One exception is the - * %LANDLOCK_ACCESS_FS_REFER access right, which is always implicitly - * handled, but must still be explicitly handled to add new rules with - * this access right. + * @handled_access_fs: Bitmask of handled filesystem actions + * (cf. `Filesystem flags`_). */ __u64 handled_access_fs; /** - * @handled_access_net: Bitmask of actions (cf. `Network flags`_) - * that is handled by this ruleset and should then be forbidden if no - * rule explicitly allow them. + * @handled_access_net: Bitmask of handled network actions (cf. `Network + * flags`_). 
*/ __u64 handled_access_net; }; @@ -97,20 +104,21 @@ struct landlock_path_beneath_attr { */ struct landlock_net_port_attr { /** - * @allowed_access: Bitmask of allowed access network for a port + * @allowed_access: Bitmask of allowed network actions for a port * (cf. `Network flags`_). */ __u64 allowed_access; /** * @port: Network port in host endianness. * - * It should be noted that port 0 passed to :manpage:`bind(2)` will - * bind to an available port from a specific port range. This can be - * configured thanks to the ``/proc/sys/net/ipv4/ip_local_port_range`` - * sysctl (also used for IPv6). A Landlock rule with port 0 and the - * ``LANDLOCK_ACCESS_NET_BIND_TCP`` right means that requesting to bind - * on port 0 is allowed and it will automatically translate to binding - * on the related port range. + * It should be noted that port 0 passed to :manpage:`bind(2)` will bind + * to an available port from the ephemeral port range. This can be + * configured with the ``/proc/sys/net/ipv4/ip_local_port_range`` sysctl + * (also used for IPv6). + * + * A Landlock rule with port 0 and the ``LANDLOCK_ACCESS_NET_BIND_TCP`` + * right means that requesting to bind on port 0 is allowed and it will + * automatically translate to binding on the related port range. */ __u64 port; }; @@ -131,10 +139,10 @@ struct landlock_net_port_attr { * The following access rights apply only to files: * * - %LANDLOCK_ACCESS_FS_EXECUTE: Execute a file. - * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access. Note that - * you might additionally need the %LANDLOCK_ACCESS_FS_TRUNCATE right in order - * to overwrite files with :manpage:`open(2)` using ``O_TRUNC`` or - * :manpage:`creat(2)`. + * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access. When + * opening files for writing, you will often additionally need the + * %LANDLOCK_ACCESS_FS_TRUNCATE right. In many cases, these system calls + * truncate existing files when overwriting them (e.g., :manpage:`creat(2)`). * - %LANDLOCK_ACCESS_FS_READ_FILE: Open a file with read access. * - %LANDLOCK_ACCESS_FS_TRUNCATE: Truncate a file with :manpage:`truncate(2)`, * :manpage:`ftruncate(2)`, :manpage:`creat(2)`, or :manpage:`open(2)` with @@ -256,7 +264,7 @@ struct landlock_net_port_attr { * These flags enable to restrict a sandboxed process to a set of network * actions. This is supported since the Landlock ABI version 4. * - * TCP sockets with allowed actions: + * The following access rights apply to TCP port numbers: * * - %LANDLOCK_ACCESS_NET_BIND_TCP: Bind a TCP socket to a local port. 
* - %LANDLOCK_ACCESS_NET_CONNECT_TCP: Connect an active TCP socket to diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index b7a2c2ee35b7..2289b7c76c59 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -31,6 +31,7 @@ enum { SNP_PLATFORM_STATUS, SNP_COMMIT, SNP_SET_CONFIG, + SNP_VLEK_LOAD, SEV_MAX, }; @@ -215,6 +216,32 @@ struct sev_user_data_snp_config { } __packed; /** + * struct sev_user_data_snp_vlek_load - SNP_VLEK_LOAD structure + * + * @len: length of the command buffer read by the PSP + * @vlek_wrapped_version: version of wrapped VLEK hashstick (Must be 0h) + * @rsvd: reserved + * @vlek_wrapped_address: address of a wrapped VLEK hashstick + * (struct sev_user_data_snp_wrapped_vlek_hashstick) + */ +struct sev_user_data_snp_vlek_load { + __u32 len; /* In */ + __u8 vlek_wrapped_version; /* In */ + __u8 rsvd[3]; /* In */ + __u64 vlek_wrapped_address; /* In */ +} __packed; + +/** + * struct sev_user_data_snp_wrapped_vlek_hashstick - Wrapped VLEK data + * + * @data: Opaque data provided by AMD KDS (as described in SEV-SNP Firmware ABI + * 1.54, SNP_VLEK_LOAD) + */ +struct sev_user_data_snp_wrapped_vlek_hashstick { + __u8 data[432]; /* In */ +} __packed; + +/** + * struct sev_issue_cmd - SEV ioctl parameters + * + * @cmd: SEV commands to execute diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h index 154a87a1eca9..fcdfea767fca 100644 --- a/include/uapi/linux/sev-guest.h +++ b/include/uapi/linux/sev-guest.h @@ -89,6 +89,9 @@ struct snp_ext_report_req { #define SNP_GUEST_FW_ERR_MASK GENMASK_ULL(31, 0) #define SNP_GUEST_VMM_ERR_SHIFT 32 #define SNP_GUEST_VMM_ERR(x) (((u64)x) << SNP_GUEST_VMM_ERR_SHIFT) +#define SNP_GUEST_FW_ERR(x) ((x) & SNP_GUEST_FW_ERR_MASK) +#define SNP_GUEST_ERR(vmm_err, fw_err) (SNP_GUEST_VMM_ERR(vmm_err) | \ + SNP_GUEST_FW_ERR(fw_err)) #define SNP_GUEST_VMM_ERR_INVALID_LEN 1 #define SNP_GUEST_VMM_ERR_BUSY 2
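The PROCMAP_QUERY ioctl added to include/uapi/linux/fs.h above is easiest to read from the caller's side. The following is a minimal, hypothetical userspace sketch, not part of the patch: it assumes a kernel and installed UAPI headers that already carry this change, and it only exercises the covering-VMA lookup plus the optional name buffer.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(void)
{
        char name[256];
        struct procmap_query q;
        int fd = open("/proc/self/maps", O_RDONLY);

        if (fd < 0)
                return 1;

        memset(&q, 0, sizeof(q));
        q.size = sizeof(q);                         /* for forward/backward compatibility */
        q.query_flags = PROCMAP_QUERY_VMA_READABLE; /* only readable VMAs qualify */
        q.query_addr = (__u64)(uintptr_t)&q;        /* covering VMA: the stack */
        q.vma_name_addr = (__u64)(uintptr_t)name;   /* optionally fetch the VMA name */
        q.vma_name_size = sizeof(name);

        if (ioctl(fd, PROCMAP_QUERY, &q) == 0)
                printf("%llx-%llx %s\n",
                       (unsigned long long)q.vma_start,
                       (unsigned long long)q.vma_end,
                       q.vma_name_size ? name : "<unnamed>");

        close(fd);
        return 0;
}

On a kernel without this change the ioctl should simply fail (typically with ENOTTY), so callers can fall back to parsing the text form of /proc/<pid>/maps.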
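The reworked landlock.h comments above describe, among other things, how a rule for port 0 with LANDLOCK_ACCESS_NET_BIND_TCP translates to the ephemeral port range. A minimal, hypothetical sketch of that usage follows; it is not part of the patch, omits error handling, and assumes Landlock ABI version 4 or later plus headers that define the SYS_landlock_* syscall numbers (libc does not wrap these syscalls).

#include <linux/landlock.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct landlock_ruleset_attr ruleset_attr = {
                /* Deny-by-default for these two actions once enforced. */
                .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP |
                                      LANDLOCK_ACCESS_NET_CONNECT_TCP,
        };
        struct landlock_net_port_attr bind_ephemeral = {
                .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP,
                .port = 0,      /* port 0 stands for the ephemeral port range */
        };
        int ruleset_fd;

        ruleset_fd = (int)syscall(SYS_landlock_create_ruleset, &ruleset_attr,
                                  sizeof(ruleset_attr), 0);
        syscall(SYS_landlock_add_rule, ruleset_fd, LANDLOCK_RULE_NET_PORT,
                &bind_ephemeral, 0);
        prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
        syscall(SYS_landlock_restrict_self, ruleset_fd, 0);
        return 0;
}

Once enforced, connecting a TCP socket is denied (the action is handled but never allowed), while bind() requests for port 0, and with them the related ephemeral port range described in the comment, remain allowed.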