From 52db6685932e326ed607644ab7ebdae8c194adda Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 10 Jul 2019 16:59:55 +0900 Subject: ASoC: simple_card_utils.h: care NULL dai at asoc_simple_debug_dai() props->xxx_dai might be NULL when DPCM. This patch cares it for debug. Fixes: commit 0580dde59438 ("ASoC: simple-card-utils: add asoc_simple_debug_info()") Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87o922gw4u.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/simple_card_utils.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h index 954563ee2277..985a5f583de4 100644 --- a/include/sound/simple_card_utils.h +++ b/include/sound/simple_card_utils.h @@ -141,6 +141,10 @@ inline void asoc_simple_debug_dai(struct asoc_simple_priv *priv, { struct device *dev = simple_priv_to_dev(priv); + /* dai might be NULL */ + if (!dai) + return; + if (dai->name) dev_dbg(dev, "%s dai name = %s\n", name, dai->name); -- cgit v1.2.3-59-g8ed1b From 62ec3d13601bd626ca7a0edef6d45dbb753d94e8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 21 Jul 2019 23:23:08 +0900 Subject: ASoC: SOF: use __u32 instead of uint32_t in uapi headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_UAPI_HEADER_TEST=y, exported headers are compile-tested to make sure they can be included from user-space. Currently, header.h and fw.h are excluded from the test coverage. To make them join the compile-test, we need to fix the build errors attached below. For a case like this, we decided to use __u{8,16,32,64} variable types in this discussion: https://lkml.org/lkml/2019/6/5/18 Build log: CC usr/include/sound/sof/header.h.s CC usr/include/sound/sof/fw.h.s In file included from :32:0: ./usr/include/sound/sof/header.h:19:2: error: unknown type name ‘uint32_t’ uint32_t magic; /**< 'S', 'O', 'F', '\0' */ ^~~~~~~~ ./usr/include/sound/sof/header.h:20:2: error: unknown type name ‘uint32_t’ uint32_t type; /**< component specific type */ ^~~~~~~~ ./usr/include/sound/sof/header.h:21:2: error: unknown type name ‘uint32_t’ uint32_t size; /**< size in bytes of data excl. this struct */ ^~~~~~~~ ./usr/include/sound/sof/header.h:22:2: error: unknown type name ‘uint32_t’ uint32_t abi; /**< SOF ABI version */ ^~~~~~~~ ./usr/include/sound/sof/header.h:23:2: error: unknown type name ‘uint32_t’ uint32_t reserved[4]; /**< reserved for future use */ ^~~~~~~~ ./usr/include/sound/sof/header.h:24:2: error: unknown type name ‘uint32_t’ uint32_t data[0]; /**< Component data - opaque to core */ ^~~~~~~~ In file included from :32:0: ./usr/include/sound/sof/fw.h:49:2: error: unknown type name ‘uint32_t’ uint32_t size; /* bytes minus this header */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:50:2: error: unknown type name ‘uint32_t’ uint32_t offset; /* offset from base */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:64:2: error: unknown type name ‘uint32_t’ uint32_t size; /* bytes minus this header */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:65:2: error: unknown type name ‘uint32_t’ uint32_t num_blocks; /* number of blocks */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:73:2: error: unknown type name ‘uint32_t’ uint32_t file_size; /* size of file minus this header */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:74:2: error: unknown type name ‘uint32_t’ uint32_t num_modules; /* number of modules */ ^~~~~~~~ ./usr/include/sound/sof/fw.h:75:2: error: unknown type name ‘uint32_t’ uint32_t abi; /* version of header format */ ^~~~~~~~ Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20190721142308.30306-1-yamada.masahiro@socionext.com Signed-off-by: Mark Brown --- include/uapi/sound/sof/fw.h | 16 +++++++++------- include/uapi/sound/sof/header.h | 14 ++++++++------ 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/sof/fw.h b/include/uapi/sound/sof/fw.h index 1afca973eb09..e9f697467a86 100644 --- a/include/uapi/sound/sof/fw.h +++ b/include/uapi/sound/sof/fw.h @@ -13,6 +13,8 @@ #ifndef __INCLUDE_UAPI_SOF_FW_H__ #define __INCLUDE_UAPI_SOF_FW_H__ +#include + #define SND_SOF_FW_SIG_SIZE 4 #define SND_SOF_FW_ABI 1 #define SND_SOF_FW_SIG "Reef" @@ -46,8 +48,8 @@ enum snd_sof_fw_blk_type { struct snd_sof_blk_hdr { enum snd_sof_fw_blk_type type; - uint32_t size; /* bytes minus this header */ - uint32_t offset; /* offset from base */ + __u32 size; /* bytes minus this header */ + __u32 offset; /* offset from base */ } __packed; /* @@ -61,8 +63,8 @@ enum snd_sof_fw_mod_type { struct snd_sof_mod_hdr { enum snd_sof_fw_mod_type type; - uint32_t size; /* bytes minus this header */ - uint32_t num_blocks; /* number of blocks */ + __u32 size; /* bytes minus this header */ + __u32 num_blocks; /* number of blocks */ } __packed; /* @@ -70,9 +72,9 @@ struct snd_sof_mod_hdr { */ struct snd_sof_fw_header { unsigned char sig[SND_SOF_FW_SIG_SIZE]; /* "Reef" */ - uint32_t file_size; /* size of file minus this header */ - uint32_t num_modules; /* number of modules */ - uint32_t abi; /* version of header format */ + __u32 file_size; /* size of file minus this header */ + __u32 num_modules; /* number of modules */ + __u32 abi; /* version of header format */ } __packed; #endif diff --git a/include/uapi/sound/sof/header.h b/include/uapi/sound/sof/header.h index 7868990b0d6f..5f4518e7a972 100644 --- a/include/uapi/sound/sof/header.h +++ b/include/uapi/sound/sof/header.h @@ -9,6 +9,8 @@ #ifndef __INCLUDE_UAPI_SOUND_SOF_USER_HEADER_H__ #define __INCLUDE_UAPI_SOUND_SOF_USER_HEADER_H__ +#include + /* * Header for all non IPC ABI data. * @@ -16,12 +18,12 @@ * Used by any bespoke component data structures or binary blobs. */ struct sof_abi_hdr { - uint32_t magic; /**< 'S', 'O', 'F', '\0' */ - uint32_t type; /**< component specific type */ - uint32_t size; /**< size in bytes of data excl. this struct */ - uint32_t abi; /**< SOF ABI version */ - uint32_t reserved[4]; /**< reserved for future use */ - uint32_t data[0]; /**< Component data - opaque to core */ + __u32 magic; /**< 'S', 'O', 'F', '\0' */ + __u32 type; /**< component specific type */ + __u32 size; /**< size in bytes of data excl. this struct */ + __u32 abi; /**< SOF ABI version */ + __u32 reserved[4]; /**< reserved for future use */ + __u32 data[0]; /**< Component data - opaque to core */ } __packed; #endif -- cgit v1.2.3-59-g8ed1b From bca031e2c8aa22a978a2452bf959e27e9fa73dc7 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 18 Jul 2019 08:15:10 +0000 Subject: KVM: arm/arm64: Introduce kvm_pmu_vcpu_init() to setup PMU counter index We use "pmc->idx" and the "chained" bitmap to determine if the pmc is chained, in kvm_pmu_pmc_is_chained(). But idx might be uninitialized (and random) when we doing this decision, through a KVM_ARM_VCPU_INIT ioctl -> kvm_pmu_vcpu_reset(). And the test_bit() against this random idx will potentially hit a KASAN BUG [1]. In general, idx is the static property of a PMU counter that is not expected to be modified across resets, as suggested by Julien. It looks more reasonable if we can setup the PMU counter idx for a vcpu in its creation time. Introduce a new function - kvm_pmu_vcpu_init() for this basic setup. Oh, and the KASAN BUG will get fixed this way. [1] https://www.spinics.net/lists/kvm-arm/msg36700.html Fixes: 80f393a23be6 ("KVM: arm/arm64: Support chained PMU counters") Suggested-by: Andrew Murray Suggested-by: Julien Thierry Acked-by: Julien Thierry Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier --- include/kvm/arm_pmu.h | 2 ++ virt/kvm/arm/arm.c | 2 ++ virt/kvm/arm/pmu.c | 18 +++++++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 16c769a7f979..6db030439e29 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -34,6 +34,7 @@ struct kvm_pmu { u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu); +void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val); @@ -71,6 +72,7 @@ static inline u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) { return 0; } +static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index f645c0fbf7ec..c704fa696184 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -340,6 +340,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) /* Set up the timer */ kvm_timer_vcpu_init(vcpu); + kvm_pmu_vcpu_init(vcpu); + kvm_arm_reset_debug_ptr(vcpu); return kvm_vgic_vcpu_init(vcpu); diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 3dd8238ed246..362a01886bab 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -214,6 +214,20 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) kvm_pmu_release_perf_event(pmc); } +/** + * kvm_pmu_vcpu_init - assign pmu counter idx for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + pmu->pmc[i].idx = i; +} + /** * kvm_pmu_vcpu_reset - reset pmu state for cpu * @vcpu: The vcpu pointer @@ -224,10 +238,8 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); - pmu->pmc[i].idx = i; - } bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS); } -- cgit v1.2.3-59-g8ed1b From 9f00baf74e4b6f79a3a3dfab44fb7bb2e797b551 Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Tue, 30 Jul 2019 16:05:24 +0000 Subject: crypto: ccp - Add support for valid authsize values less than 16 AES GCM encryption allows for authsize values of 4, 8, and 12-16 bytes. Validate the requested authsize, and retain it to save in the request context. Fixes: 36cf515b9bbe2 ("crypto: ccp - Enable support for AES GCM on v5 CCPs") Cc: Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-crypto-aes-galois.c | 14 ++++++++++++++ drivers/crypto/ccp/ccp-ops.c | 26 +++++++++++++++++++++----- include/linux/ccp.h | 2 ++ 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/crypto/ccp/ccp-crypto-aes-galois.c b/drivers/crypto/ccp/ccp-crypto-aes-galois.c index d22631cb2bb3..02eba84028b3 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-galois.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-galois.c @@ -58,6 +58,19 @@ static int ccp_aes_gcm_setkey(struct crypto_aead *tfm, const u8 *key, static int ccp_aes_gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { + switch (authsize) { + case 16: + case 15: + case 14: + case 13: + case 12: + case 8: + case 4: + break; + default: + return -EINVAL; + } + return 0; } @@ -104,6 +117,7 @@ static int ccp_aes_gcm_crypt(struct aead_request *req, bool encrypt) memset(&rctx->cmd, 0, sizeof(rctx->cmd)); INIT_LIST_HEAD(&rctx->cmd.entry); rctx->cmd.engine = CCP_ENGINE_AES; + rctx->cmd.u.aes.authsize = crypto_aead_authsize(tfm); rctx->cmd.u.aes.type = ctx->u.aes.type; rctx->cmd.u.aes.mode = ctx->u.aes.mode; rctx->cmd.u.aes.action = encrypt; diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index 59f9849c3662..ef723e2722a8 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -622,6 +622,7 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, unsigned long long *final; unsigned int dm_offset; + unsigned int authsize; unsigned int jobid; unsigned int ilen; bool in_place = true; /* Default value */ @@ -643,6 +644,21 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, if (!aes->key) /* Gotta have a key SGL */ return -EINVAL; + /* Zero defaults to 16 bytes, the maximum size */ + authsize = aes->authsize ? aes->authsize : AES_BLOCK_SIZE; + switch (authsize) { + case 16: + case 15: + case 14: + case 13: + case 12: + case 8: + case 4: + break; + default: + return -EINVAL; + } + /* First, decompose the source buffer into AAD & PT, * and the destination buffer into AAD, CT & tag, or * the input into CT & tag. @@ -657,7 +673,7 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, p_tag = scatterwalk_ffwd(sg_tag, p_outp, ilen); } else { /* Input length for decryption includes tag */ - ilen = aes->src_len - AES_BLOCK_SIZE; + ilen = aes->src_len - authsize; p_tag = scatterwalk_ffwd(sg_tag, p_inp, ilen); } @@ -839,19 +855,19 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, if (aes->action == CCP_AES_ACTION_ENCRYPT) { /* Put the ciphered tag after the ciphertext. */ - ccp_get_dm_area(&final_wa, 0, p_tag, 0, AES_BLOCK_SIZE); + ccp_get_dm_area(&final_wa, 0, p_tag, 0, authsize); } else { /* Does this ciphered tag match the input? */ - ret = ccp_init_dm_workarea(&tag, cmd_q, AES_BLOCK_SIZE, + ret = ccp_init_dm_workarea(&tag, cmd_q, authsize, DMA_BIDIRECTIONAL); if (ret) goto e_tag; - ret = ccp_set_dm_area(&tag, 0, p_tag, 0, AES_BLOCK_SIZE); + ret = ccp_set_dm_area(&tag, 0, p_tag, 0, authsize); if (ret) goto e_tag; ret = crypto_memneq(tag.address, final_wa.address, - AES_BLOCK_SIZE) ? -EBADMSG : 0; + authsize) ? -EBADMSG : 0; ccp_dm_free(&tag); } diff --git a/include/linux/ccp.h b/include/linux/ccp.h index 7e9c991c95e0..43ed9e77cf81 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -173,6 +173,8 @@ struct ccp_aes_engine { enum ccp_aes_mode mode; enum ccp_aes_action action; + u32 authsize; + struct scatterlist *key; u32 key_len; /* In bytes */ -- cgit v1.2.3-59-g8ed1b From 17e433b54393a6269acbcb792da97791fe1592d8 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 5 Aug 2019 10:03:19 +0800 Subject: KVM: Fix leak vCPU's VMCS value into other pCPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit d73eb57b80b (KVM: Boost vCPUs that are delivering interrupts), a five years old bug is exposed. Running ebizzy benchmark in three 80 vCPUs VMs on one 80 pCPUs Skylake server, a lot of rcu_sched stall warning splatting in the VMs after stress testing: INFO: rcu_sched detected stalls on CPUs/tasks: { 4 41 57 62 77} (detected by 15, t=60004 jiffies, g=899, c=898, q=15073) Call Trace: flush_tlb_mm_range+0x68/0x140 tlb_flush_mmu.part.75+0x37/0xe0 tlb_finish_mmu+0x55/0x60 zap_page_range+0x142/0x190 SyS_madvise+0x3cd/0x9c0 system_call_fastpath+0x1c/0x21 swait_active() sustains to be true before finish_swait() is called in kvm_vcpu_block(), voluntarily preempted vCPUs are taken into account by kvm_vcpu_on_spin() loop greatly increases the probability condition kvm_arch_vcpu_runnable(vcpu) is checked and can be true, when APICv is enabled the yield-candidate vCPU's VMCS RVI field leaks(by vmx_sync_pir_to_irr()) into spinning-on-a-taken-lock vCPU's current VMCS. This patch fixes it by checking conservatively a subset of events. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Christian Borntraeger Cc: Marc Zyngier Cc: stable@vger.kernel.org Fixes: 98f4a1467 (KVM: add kvm_arch_vcpu_runnable() test to kvm_vcpu_on_spin() loop) Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 5 +++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 16 ++++++++++++++++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 25 ++++++++++++++++++++++++- 7 files changed, 59 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0dba7eb24f92..3e34d5fa6708 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -50,6 +50,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) return !!(v->arch.pending_exceptions) || kvm_request_pending(v); } +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_runnable(vcpu); +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return false; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e74f0711eaaf..fc046ca89d32 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1175,6 +1175,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7eafc6907861..d685491fce4d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5190,6 +5190,11 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return false; +} + static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; @@ -7314,6 +7319,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, + .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 074385c86c09..42ed3faa6af8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6117,6 +6117,11 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } +static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return pi_test_on(vcpu_to_pi_desc(vcpu)); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -7726,6 +7731,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c6d951cbd76c..93b0bd45ac73 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9698,6 +9698,22 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + kvm_test_request(KVM_REQ_SMI, vcpu) || + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu)) + return true; + + return false; +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return vcpu->arch.preempted_in_kernel; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c5b5867024c..9e4c2bb90297 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -872,6 +872,7 @@ int kvm_arch_check_processor_compat(void); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ed061d8a457c..1f05aeb9da27 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2477,6 +2477,29 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #endif } +/* + * Unlike kvm_arch_vcpu_runnable, this function is called outside + * a vcpu_load/vcpu_put pair. However, for most architectures + * kvm_arch_vcpu_runnable does not require vcpu_load. + */ +bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_runnable(vcpu); +} + +static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (kvm_arch_dy_runnable(vcpu)) + return true; + +#ifdef CONFIG_KVM_ASYNC_PF + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; +#endif + + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; @@ -2506,7 +2529,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; if (vcpu == me) continue; - if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) + if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) continue; if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) -- cgit v1.2.3-59-g8ed1b From 741cbbae0768b828be2d48331eb371a4f08bbea8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 3 Aug 2019 08:14:25 +0200 Subject: KVM: remove kvm_arch_has_vcpu_debugfs() There is no need for this function as all arches have to implement kvm_arch_create_vcpu_debugfs() no matter what. A #define symbol let us actually simplify the code. Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 10 ---------- arch/powerpc/kvm/powerpc.c | 10 ---------- arch/s390/kvm/kvm-s390.c | 10 ---------- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/debugfs.c | 5 ----- include/linux/kvm_host.h | 3 ++- virt/kvm/arm/arm.c | 5 ----- virt/kvm/kvm_main.c | 5 ++--- 8 files changed, 6 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 2cfe839f0b3a..1109924560d8 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -150,16 +150,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_mips_free_vcpus(struct kvm *kvm) { unsigned int i; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3e34d5fa6708..3e566c2e6066 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -457,16 +457,6 @@ err_out: return -EINVAL; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_destroy_vm(struct kvm *kvm) { unsigned int i; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3f520cd837fb..f329dcb3f44c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2516,16 +2516,6 @@ out_err: return rc; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc046ca89d32..e92725b2a46f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,6 +35,8 @@ #include #include +#define __KVM_HAVE_ARCH_VCPU_DEBUGFS + #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 #define KVM_MAX_VCPU_ID 1023 diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 329361b69d5e..9bd93e0d5f63 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -8,11 +8,6 @@ #include #include "lapic.h" -bool kvm_arch_has_vcpu_debugfs(void) -{ - return true; -} - static int vcpu_get_timer_advance_ns(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9e4c2bb90297..8d34db3c8bc6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -861,8 +861,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); -bool kvm_arch_has_vcpu_debugfs(void); +#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); +#endif int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index acc43242a310..13f5a1aa6d79 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -144,11 +144,6 @@ out_fail_alloc: return ret; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1f05aeb9da27..4afb1a234018 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2617,12 +2617,10 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { +#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS char dir_name[ITOA_MAX_LEN * 2]; int ret; - if (!kvm_arch_has_vcpu_debugfs()) - return 0; - if (!debugfs_initialized()) return 0; @@ -2637,6 +2635,7 @@ static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) debugfs_remove_recursive(vcpu->debugfs_dentry); return ret; } +#endif return 0; } -- cgit v1.2.3-59-g8ed1b From 3e7093d045196b1016517631645e874fe903db7e Mon Sep 17 00:00:00 2001 From: Greg KH Date: Wed, 31 Jul 2019 20:56:20 +0200 Subject: KVM: no need to check return value of debugfs_create functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Also, when doing this, change kvm_arch_create_vcpu_debugfs() to return void instead of an integer, as we should not care at all about if this function actually does anything or not. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: Signed-off-by: Greg Kroah-Hartman Signed-off-by: Paolo Bonzini --- arch/x86/kvm/debugfs.c | 41 +++++++++++++---------------------------- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 21 +++++---------------- 3 files changed, 19 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 9bd93e0d5f63..018aebce33ff 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -43,37 +43,22 @@ static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { - struct dentry *ret; + debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_offset_fops); - ret = debugfs_create_file("tsc-offset", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_offset_fops); - if (!ret) - return -ENOMEM; - - if (lapic_in_kernel(vcpu)) { - ret = debugfs_create_file("lapic_timer_advance_ns", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_timer_advance_ns_fops); - if (!ret) - return -ENOMEM; - } + if (lapic_in_kernel(vcpu)) + debugfs_create_file("lapic_timer_advance_ns", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_timer_advance_ns_fops); if (kvm_has_tsc_control) { - ret = debugfs_create_file("tsc-scaling-ratio", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_fops); - if (!ret) - return -ENOMEM; - ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_frac_fops); - if (!ret) - return -ENOMEM; - + debugfs_create_file("tsc-scaling-ratio", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_scaling_fops); + debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_scaling_frac_fops); } - - return 0; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8d34db3c8bc6..fcb46b3374c6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -862,7 +862,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); #endif int kvm_arch_hardware_enable(void); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4afb1a234018..4feceaa03fb1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2615,29 +2615,20 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } -static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS char dir_name[ITOA_MAX_LEN * 2]; - int ret; if (!debugfs_initialized()) - return 0; + return; snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); vcpu->debugfs_dentry = debugfs_create_dir(dir_name, - vcpu->kvm->debugfs_dentry); - if (!vcpu->debugfs_dentry) - return -ENOMEM; + vcpu->kvm->debugfs_dentry); - ret = kvm_arch_create_vcpu_debugfs(vcpu); - if (ret < 0) { - debugfs_remove_recursive(vcpu->debugfs_dentry); - return ret; - } + kvm_arch_create_vcpu_debugfs(vcpu); #endif - - return 0; } /* @@ -2672,9 +2663,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_destroy; - r = kvm_create_vcpu_debugfs(vcpu); - if (r) - goto vcpu_destroy; + kvm_create_vcpu_debugfs(vcpu); mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { -- cgit v1.2.3-59-g8ed1b From 5eeaf10eec394b28fad2c58f1f5c3a5da0e87d1c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 2 Aug 2019 10:28:32 +0100 Subject: KVM: arm/arm64: Sync ICH_VMCR_EL2 back when about to block Since commit commit 328e56647944 ("KVM: arm/arm64: vgic: Defer touching GICH_VMCR to vcpu_load/put"), we leave ICH_VMCR_EL2 (or its GICv2 equivalent) loaded as long as we can, only syncing it back when we're scheduled out. There is a small snag with that though: kvm_vgic_vcpu_pending_irq(), which is indirectly called from kvm_vcpu_check_block(), needs to evaluate the guest's view of ICC_PMR_EL1. At the point were we call kvm_vcpu_check_block(), the vcpu is still loaded, and whatever changes to PMR is not visible in memory until we do a vcpu_put(). Things go really south if the guest does the following: mov x0, #0 // or any small value masking interrupts msr ICC_PMR_EL1, x0 [vcpu preempted, then rescheduled, VMCR sampled] mov x0, #ff // allow all interrupts msr ICC_PMR_EL1, x0 wfi // traps to EL2, so samping of VMCR [interrupt arrives just after WFI] Here, the hypervisor's view of PMR is zero, while the guest has enabled its interrupts. kvm_vgic_vcpu_pending_irq() will then say that no interrupts are pending (despite an interrupt being received) and we'll block for no reason. If the guest doesn't have a periodic interrupt firing once it has blocked, it will stay there forever. To avoid this unfortuante situation, let's resync VMCR from kvm_arch_vcpu_blocking(), ensuring that a following kvm_vcpu_check_block() will observe the latest value of PMR. This has been found by booting an arm64 Linux guest with the pseudo NMI feature, and thus using interrupt priorities to mask interrupts instead of the usual PSTATE masking. Cc: stable@vger.kernel.org # 4.12 Fixes: 328e56647944 ("KVM: arm/arm64: vgic: Defer touching GICH_VMCR to vcpu_load/put") Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 1 + virt/kvm/arm/arm.c | 11 +++++++++++ virt/kvm/arm/vgic/vgic-v2.c | 9 ++++++++- virt/kvm/arm/vgic/vgic-v3.c | 7 ++++++- virt/kvm/arm/vgic/vgic.c | 11 +++++++++++ virt/kvm/arm/vgic/vgic.h | 2 ++ 6 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 46bbc949c20a..7a30524a80ee 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -350,6 +350,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); void kvm_vgic_load(struct kvm_vcpu *vcpu); void kvm_vgic_put(struct kvm_vcpu *vcpu); +void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index c704fa696184..482b20256fa8 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -323,6 +323,17 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { + /* + * If we're about to block (most likely because we've just hit a + * WFI), we need to sync back the state of the GIC CPU interface + * so that we have the lastest PMR and group enables. This ensures + * that kvm_arch_vcpu_runnable has up-to-date data to decide + * whether we have pending interrupts. + */ + preempt_disable(); + kvm_vgic_vmcr_sync(vcpu); + preempt_enable(); + kvm_vgic_v4_enable_doorbell(vcpu); } diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 6dd5ad706c92..96aab77d0471 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -484,10 +484,17 @@ void vgic_v2_load(struct kvm_vcpu *vcpu) kvm_vgic_global_state.vctrl_base + GICH_APR); } -void vgic_v2_put(struct kvm_vcpu *vcpu) +void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); +} + +void vgic_v2_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + vgic_v2_vmcr_sync(vcpu); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index c2c9ce009f63..0c653a1e5215 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -662,12 +662,17 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) __vgic_v3_activate_traps(vcpu); } -void vgic_v3_put(struct kvm_vcpu *vcpu) +void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; if (likely(cpu_if->vgic_sre)) cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); +} + +void vgic_v3_put(struct kvm_vcpu *vcpu) +{ + vgic_v3_vmcr_sync(vcpu); kvm_call_hyp(__vgic_v3_save_aprs, vcpu); diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 04786c8ec77e..13d4b38a94ec 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -919,6 +919,17 @@ void kvm_vgic_put(struct kvm_vcpu *vcpu) vgic_v3_put(vcpu); } +void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu) +{ + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_vmcr_sync(vcpu); + else + vgic_v3_vmcr_sync(vcpu); +} + int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 57205beaa981..11adbdac1d56 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -193,6 +193,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, void vgic_v2_init_lrs(void); void vgic_v2_load(struct kvm_vcpu *vcpu); void vgic_v2_put(struct kvm_vcpu *vcpu); +void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); @@ -223,6 +224,7 @@ bool vgic_v3_check_base(struct kvm *kvm); void vgic_v3_load(struct kvm_vcpu *vcpu); void vgic_v3_put(struct kvm_vcpu *vcpu); +void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); -- cgit v1.2.3-59-g8ed1b From 4b3e30ed3ec7864e798403a63ff2e96bd0c19ab0 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 7 Aug 2019 00:23:07 -0500 Subject: Revert "drm/amdkfd: New IOCTL to allocate queue GWS" This reverts commit 1a058c3376765ee31d65e28cbbb9d4ff15120056. This interface is still in too much flux. Revert until it's sorted out. Acked-by: Oak Zeng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 28 ---------------------------- include/uapi/linux/kfd_ioctl.h | 20 +------------------- 2 files changed, 1 insertion(+), 47 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 26b15cc56c31..1d3cd5c50d5f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1567,32 +1567,6 @@ copy_from_user_failed: return err; } -static int kfd_ioctl_alloc_queue_gws(struct file *filep, - struct kfd_process *p, void *data) -{ - int retval; - struct kfd_ioctl_alloc_queue_gws_args *args = data; - struct kfd_dev *dev; - - if (!hws_gws_support) - return -ENODEV; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) { - pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); - return -ENODEV; - } - if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) - return -ENODEV; - - mutex_lock(&p->mutex); - retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL); - mutex_unlock(&p->mutex); - - args->first_gws = 0; - return retval; -} - static int kfd_ioctl_get_dmabuf_info(struct file *filep, struct kfd_process *p, void *data) { @@ -1795,8 +1769,6 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, kfd_ioctl_import_dmabuf, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, - kfd_ioctl_alloc_queue_gws, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 070d1bc7e725..20917c59f39c 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -410,21 +410,6 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { __u32 n_success; /* to/from KFD */ }; -/* Allocate GWS for specific queue - * - * @gpu_id: device identifier - * @queue_id: queue's id that GWS is allocated for - * @num_gws: how many GWS to allocate - * @first_gws: index of the first GWS allocated. - * only support contiguous GWS allocation - */ -struct kfd_ioctl_alloc_queue_gws_args { - __u32 gpu_id; /* to KFD */ - __u32 queue_id; /* to KFD */ - __u32 num_gws; /* to KFD */ - __u32 first_gws; /* from KFD */ -}; - struct kfd_ioctl_get_dmabuf_info_args { __u64 size; /* from KFD */ __u64 metadata_ptr; /* to KFD */ @@ -544,10 +529,7 @@ enum kfd_mmio_remap { #define AMDKFD_IOC_IMPORT_DMABUF \ AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args) -#define AMDKFD_IOC_ALLOC_QUEUE_GWS \ - AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) - #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x1F +#define AMDKFD_COMMAND_END 0x1E #endif -- cgit v1.2.3-59-g8ed1b From 75354284cc3aa58f7e54d479d9bee69bd2ca828f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 6 Aug 2019 16:14:44 +0900 Subject: auxdisplay: charlcd: move charlcd.h to drivers/auxdisplay This header is included in drivers/auxdisplay/. Make it a local header. Reviewed-by: Geert Uytterhoeven Signed-off-by: Masahiro Yamada Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/charlcd.c | 2 +- drivers/auxdisplay/charlcd.h | 39 +++++++++++++++++++++++++++++++++++++++ drivers/auxdisplay/hd44780.c | 3 +-- drivers/auxdisplay/panel.c | 2 +- include/misc/charlcd.h | 39 --------------------------------------- 5 files changed, 42 insertions(+), 43 deletions(-) create mode 100644 drivers/auxdisplay/charlcd.h delete mode 100644 include/misc/charlcd.h (limited to 'include') diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index 92745efefb54..bef6b85778b6 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -20,7 +20,7 @@ #include -#include +#include "charlcd.h" #define LCD_MINOR 156 diff --git a/drivers/auxdisplay/charlcd.h b/drivers/auxdisplay/charlcd.h new file mode 100644 index 000000000000..8cf6c18b0adb --- /dev/null +++ b/drivers/auxdisplay/charlcd.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Character LCD driver for Linux + * + * Copyright (C) 2000-2008, Willy Tarreau + * Copyright (C) 2016-2017 Glider bvba + */ + +struct charlcd { + const struct charlcd_ops *ops; + const unsigned char *char_conv; /* Optional */ + + int ifwidth; /* 4-bit or 8-bit (default) */ + int height; + int width; + int bwidth; /* Default set by charlcd_alloc() */ + int hwidth; /* Default set by charlcd_alloc() */ + + void *drvdata; /* Set by charlcd_alloc() */ +}; + +struct charlcd_ops { + /* Required */ + void (*write_cmd)(struct charlcd *lcd, int cmd); + void (*write_data)(struct charlcd *lcd, int data); + + /* Optional */ + void (*write_cmd_raw4)(struct charlcd *lcd, int cmd); /* 4-bit only */ + void (*clear_fast)(struct charlcd *lcd); + void (*backlight)(struct charlcd *lcd, int on); +}; + +struct charlcd *charlcd_alloc(unsigned int drvdata_size); +void charlcd_free(struct charlcd *lcd); + +int charlcd_register(struct charlcd *lcd); +int charlcd_unregister(struct charlcd *lcd); + +void charlcd_poke(struct charlcd *lcd); diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c index ab15b64707ad..bcbe13092327 100644 --- a/drivers/auxdisplay/hd44780.c +++ b/drivers/auxdisplay/hd44780.c @@ -14,8 +14,7 @@ #include #include -#include - +#include "charlcd.h" enum hd44780_pin { /* Order does matter due to writing to GPIO array subsets! */ diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index e6bd727da503..85965953683e 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -55,7 +55,7 @@ #include #include -#include +#include "charlcd.h" #define KEYPAD_MINOR 185 diff --git a/include/misc/charlcd.h b/include/misc/charlcd.h deleted file mode 100644 index 8cf6c18b0adb..000000000000 --- a/include/misc/charlcd.h +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Character LCD driver for Linux - * - * Copyright (C) 2000-2008, Willy Tarreau - * Copyright (C) 2016-2017 Glider bvba - */ - -struct charlcd { - const struct charlcd_ops *ops; - const unsigned char *char_conv; /* Optional */ - - int ifwidth; /* 4-bit or 8-bit (default) */ - int height; - int width; - int bwidth; /* Default set by charlcd_alloc() */ - int hwidth; /* Default set by charlcd_alloc() */ - - void *drvdata; /* Set by charlcd_alloc() */ -}; - -struct charlcd_ops { - /* Required */ - void (*write_cmd)(struct charlcd *lcd, int cmd); - void (*write_data)(struct charlcd *lcd, int data); - - /* Optional */ - void (*write_cmd_raw4)(struct charlcd *lcd, int cmd); /* 4-bit only */ - void (*clear_fast)(struct charlcd *lcd); - void (*backlight)(struct charlcd *lcd, int on); -}; - -struct charlcd *charlcd_alloc(unsigned int drvdata_size); -void charlcd_free(struct charlcd *lcd); - -int charlcd_register(struct charlcd *lcd); -int charlcd_unregister(struct charlcd *lcd); - -void charlcd_poke(struct charlcd *lcd); -- cgit v1.2.3-59-g8ed1b From 26149e3e1f44d27897d0af9ca4bcd723674bad44 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 21 Jul 2019 14:18:42 +0300 Subject: net/mlx5: kTLS, Fix wrong TIS opmod constants Fix the used constants for TLS TIS opmods, per the HW specification. Fixes: a12ff35e0fb7 ("net/mlx5: Introduce TLS TX offload hardware bits and structures") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ce9839c8bc1a..c2f056b5766d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -446,11 +446,11 @@ enum { }; enum { - MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS = 0x20, + MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS = 0x1, }; enum { - MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS = 0x20, + MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS = 0x1, }; enum { -- cgit v1.2.3-59-g8ed1b From a9bc3390327317345dd4683b70970c83ab400ea3 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 30 Jul 2019 11:55:25 +0300 Subject: net/mlx5e: kTLS, Fix progress params context WQE layout The TLS progress params context WQE should not include an Eth segment, drop it. In addition, align the tls_progress_params layout with the HW specification document: - fix the tisn field name. - remove the valid bit. Fixes: a12ff35e0fb7 ("net/mlx5: Introduce TLS TX offload hardware bits and structures") Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 9 +++++++-- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 4 ++-- include/linux/mlx5/mlx5_ifc.h | 5 ++--- 4 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index ce1be2a84231..f6b64a03cd06 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -184,8 +184,13 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) struct mlx5e_tx_wqe { struct mlx5_wqe_ctrl_seg ctrl; - struct mlx5_wqe_eth_seg eth; - struct mlx5_wqe_data_seg data[0]; + union { + struct { + struct mlx5_wqe_eth_seg eth; + struct mlx5_wqe_data_seg data[0]; + }; + u8 tls_progress_params_ctx[0]; + }; }; struct mlx5e_rx_wqe_ll { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h index 407da83474ef..b7298f9ee3d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h @@ -11,12 +11,14 @@ #include "accel/tls.h" #define MLX5E_KTLS_STATIC_UMR_WQE_SZ \ - (sizeof(struct mlx5e_umr_wqe) + MLX5_ST_SZ_BYTES(tls_static_params)) + (offsetof(struct mlx5e_umr_wqe, tls_static_params_ctx) + \ + MLX5_ST_SZ_BYTES(tls_static_params)) #define MLX5E_KTLS_STATIC_WQEBBS \ (DIV_ROUND_UP(MLX5E_KTLS_STATIC_UMR_WQE_SZ, MLX5_SEND_WQE_BB)) #define MLX5E_KTLS_PROGRESS_WQE_SZ \ - (sizeof(struct mlx5e_tx_wqe) + MLX5_ST_SZ_BYTES(tls_progress_params)) + (offsetof(struct mlx5e_tx_wqe, tls_progress_params_ctx) + \ + MLX5_ST_SZ_BYTES(tls_progress_params)) #define MLX5E_KTLS_PROGRESS_WQEBBS \ (DIV_ROUND_UP(MLX5E_KTLS_PROGRESS_WQE_SZ, MLX5_SEND_WQE_BB)) #define MLX5E_KTLS_MAX_DUMP_WQEBBS 2 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 3766545ce259..9f67bfb559f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -80,7 +80,7 @@ build_static_params(struct mlx5e_umr_wqe *wqe, u16 pc, u32 sqn, static void fill_progress_params_ctx(void *ctx, struct mlx5e_ktls_offload_context_tx *priv_tx) { - MLX5_SET(tls_progress_params, ctx, pd, priv_tx->tisn); + MLX5_SET(tls_progress_params, ctx, tisn, priv_tx->tisn); MLX5_SET(tls_progress_params, ctx, record_tracker_state, MLX5E_TLS_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START); MLX5_SET(tls_progress_params, ctx, auth_state, @@ -104,7 +104,7 @@ build_progress_params(struct mlx5e_tx_wqe *wqe, u16 pc, u32 sqn, PROGRESS_PARAMS_DS_CNT); cseg->fm_ce_se = fence ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; - fill_progress_params_ctx(wqe->data, priv_tx); + fill_progress_params_ctx(wqe->tls_progress_params_ctx, priv_tx); } static void tx_fill_wi(struct mlx5e_txqsq *sq, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ec571fd7fcf8..b8b570c30b5e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10054,9 +10054,8 @@ struct mlx5_ifc_tls_static_params_bits { }; struct mlx5_ifc_tls_progress_params_bits { - u8 valid[0x1]; - u8 reserved_at_1[0x7]; - u8 pd[0x18]; + u8 reserved_at_0[0x8]; + u8 tisn[0x18]; u8 next_record_tcp_sn[0x20]; -- cgit v1.2.3-59-g8ed1b From 891584f48a9084ba462f10da4c6bb28b6181b543 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 2 Aug 2019 17:15:03 +0200 Subject: inet: frags: re-introduce skb coalescing for local delivery Before commit d4289fcc9b16 ("net: IP6 defrag: use rbtrees for IPv6 defrag"), a netperf UDP_STREAM test[0] using big IPv6 datagrams (thus generating many fragments) and running over an IPsec tunnel, reported more than 6Gbps throughput. After that patch, the same test gets only 9Mbps when receiving on a be2net nic (driver can make a big difference here, for example, ixgbe doesn't seem to be affected). By reusing the IPv4 defragmentation code, IPv6 lost fragment coalescing (IPv4 fragment coalescing was dropped by commit 14fe22e33462 ("Revert "ipv4: use skb coalescing in defragmentation"")). Without fragment coalescing, be2net runs out of Rx ring entries and starts to drop frames (ethtool reports rx_drops_no_frags errors). Since the netperf traffic is only composed of UDP fragments, any lost packet prevents reassembly of the full datagram. Therefore, fragments which have no possibility to ever get reassembled pile up in the reassembly queue, until the memory accounting exeeds the threshold. At that point no fragment is accepted anymore, which effectively discards all netperf traffic. When reassembly timeout expires, some stale fragments are removed from the reassembly queue, so a few packets can be received, reassembled and delivered to the netperf receiver. But the nic still drops frames and soon the reassembly queue gets filled again with stale fragments. These long time frames where no datagram can be received explain why the performance drop is so significant. Re-introducing fragment coalescing is enough to get the initial performances again (6.6Gbps with be2net): driver doesn't drop frames anymore (no more rx_drops_no_frags errors) and the reassembly engine works at full speed. This patch is quite conservative and only coalesces skbs for local IPv4 and IPv6 delivery (in order to avoid changing skb geometry when forwarding). Coalescing could be extended in the future if need be, as more scenarios would probably benefit from it. [0]: Test configuration Sender: ip xfrm policy flush ip xfrm state flush ip xfrm state add src fc00:1::1 dst fc00:2::1 proto esp spi 0x1000 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:1::1 dst fc00:2::1 ip xfrm policy add src fc00:1::1 dst fc00:2::1 dir in tmpl src fc00:1::1 dst fc00:2::1 proto esp mode transport action allow ip xfrm state add src fc00:2::1 dst fc00:1::1 proto esp spi 0x1001 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:2::1 dst fc00:1::1 ip xfrm policy add src fc00:2::1 dst fc00:1::1 dir out tmpl src fc00:2::1 dst fc00:1::1 proto esp mode transport action allow netserver -D -L fc00:2::1 Receiver: ip xfrm policy flush ip xfrm state flush ip xfrm state add src fc00:2::1 dst fc00:1::1 proto esp spi 0x1001 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:2::1 dst fc00:1::1 ip xfrm policy add src fc00:2::1 dst fc00:1::1 dir in tmpl src fc00:2::1 dst fc00:1::1 proto esp mode transport action allow ip xfrm state add src fc00:1::1 dst fc00:2::1 proto esp spi 0x1000 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:1::1 dst fc00:2::1 ip xfrm policy add src fc00:1::1 dst fc00:2::1 dir out tmpl src fc00:1::1 dst fc00:2::1 proto esp mode transport action allow netperf -H fc00:2::1 -f k -P 0 -L fc00:1::1 -l 60 -t UDP_STREAM -I 99,5 -i 5,5 -T5,5 -6 Signed-off-by: Guillaume Nault Acked-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 +- net/ieee802154/6lowpan/reassembly.c | 2 +- net/ipv4/inet_fragment.c | 39 +++++++++++++++++++++++---------- net/ipv4/ip_fragment.c | 8 ++++++- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 2 +- 6 files changed, 39 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 010f26b31c89..bac79e817776 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -171,7 +171,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, - void *reasm_data); + void *reasm_data, bool try_coalesce); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); #endif diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index e4aba5d485be..bbe9b3b2d395 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -170,7 +170,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto out_oom; - inet_frag_reasm_finish(&fq->q, skb, reasm_data); + inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->dev = ldev; skb->tstamp = fq->q.stamp; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index a999451345f9..10d31733297d 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -475,11 +475,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, EXPORT_SYMBOL(inet_frag_reasm_prepare); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, - void *reasm_data) + void *reasm_data, bool try_coalesce) { struct sk_buff **nextp = (struct sk_buff **)reasm_data; struct rb_node *rbn; struct sk_buff *fp; + int sum_truesize; skb_push(head, head->data - skb_network_header(head)); @@ -487,25 +488,41 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, fp = FRAG_CB(head)->next_frag; rbn = rb_next(&head->rbnode); rb_erase(&head->rbnode, &q->rb_fragments); + + sum_truesize = head->truesize; while (rbn || fp) { /* fp points to the next sk_buff in the current run; * rbn points to the next run. */ /* Go through the current run. */ while (fp) { - *nextp = fp; - nextp = &fp->next; - fp->prev = NULL; - memset(&fp->rbnode, 0, sizeof(fp->rbnode)); - fp->sk = NULL; - head->data_len += fp->len; - head->len += fp->len; + struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; + bool stolen; + int delta; + + sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - fp = FRAG_CB(fp)->next_frag; + + if (try_coalesce && skb_try_coalesce(head, fp, &stolen, + &delta)) { + kfree_skb_partial(fp, stolen); + } else { + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + fp->sk = NULL; + + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + + *nextp = fp; + nextp = &fp->next; + } + + fp = next_frag; } /* Move to the next run. */ if (rbn) { @@ -516,7 +533,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, rbn = rbnext; } } - sub_frag_mem_limit(q->fqdir, head->truesize); + sub_frag_mem_limit(q->fqdir, sum_truesize); *nextp = NULL; skb_mark_not_on_list(head); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 4385eb9e781f..cfeb8890f94e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -393,6 +393,11 @@ err: return err; } +static bool ip_frag_coalesce_ok(const struct ipq *qp) +{ + return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER; +} + /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) @@ -421,7 +426,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, if (len > 65535) goto out_oversize; - inet_frag_reasm_finish(&qp->q, skb, reasm_data); + inet_frag_reasm_finish(&qp->q, skb, reasm_data, + ip_frag_coalesce_ok(qp)); skb->dev = dev; IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0f82c150543b..fed9666a2f7d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -348,7 +348,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, skb_reset_transport_header(skb); - inet_frag_reasm_finish(&fq->q, skb, reasm_data); + inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->ignore_df = 1; skb->dev = dev; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index ca05b16f1bb9..1f5d4d196dcc 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -282,7 +282,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, skb_reset_transport_header(skb); - inet_frag_reasm_finish(&fq->q, skb, reasm_data); + inet_frag_reasm_finish(&fq->q, skb, reasm_data, true); skb->dev = dev; ipv6_hdr(skb)->payload_len = htons(payload_len); -- cgit v1.2.3-59-g8ed1b From 414776621d1006e57e80e6db7fdc3837897aaa64 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Aug 2019 17:03:59 -0700 Subject: net/tls: prevent skb_orphan() from leaking TLS plain text with offload sk_validate_xmit_skb() and drivers depend on the sk member of struct sk_buff to identify segments requiring encryption. Any operation which removes or does not preserve the original TLS socket such as skb_orphan() or skb_clone() will cause clear text leaks. Make the TCP socket underlying an offloaded TLS connection mark all skbs as decrypted, if TLS TX is in offload mode. Then in sk_validate_xmit_skb() catch skbs which have no socket (or a socket with no validation) and decrypted flag set. Note that CONFIG_SOCK_VALIDATE_XMIT, CONFIG_TLS_DEVICE and sk->sk_validate_xmit_skb are slightly interchangeable right now, they all imply TLS offload. The new checks are guarded by CONFIG_TLS_DEVICE because that's the option guarding the sk_buff->decrypted member. Second, smaller issue with orphaning is that it breaks the guarantee that packets will be delivered to device queues in-order. All TLS offload drivers depend on that scheduling property. This means skb_orphan_partial()'s trick of preserving partial socket references will cause issues in the drivers. We need a full orphan, and as a result netem delay/throttling will cause all TLS offload skbs to be dropped. Reusing the sk_buff->decrypted flag also protects from leaking clear text when incoming, decrypted skb is redirected (e.g. by TC). See commit 0608c69c9a80 ("bpf: sk_msg, sock{map|hash} redirect through ULP") for justification why the internal flag is safe. The only location which could leak the flag in is tcp_bpf_sendmsg(), which is taken care of by clearing the previously unused bit. v2: - remove superfluous decrypted mark copy (Willem); - remove the stale doc entry (Boris); - rely entirely on EOR marking to prevent coalescing (Boris); - use an internal sendpages flag instead of marking the socket (Boris). v3 (Willem): - reorganize the can_skb_orphan_partial() condition; - fix the flag leak-in through tcp_bpf_sendmsg. Signed-off-by: Jakub Kicinski Acked-by: Willem de Bruijn Reviewed-by: Boris Pismenny Signed-off-by: David S. Miller --- Documentation/networking/tls-offload.rst | 18 ------------------ include/linux/skbuff.h | 8 ++++++++ include/linux/socket.h | 3 +++ include/net/sock.h | 10 +++++++++- net/core/sock.c | 19 ++++++++++++++----- net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_bpf.c | 6 +++++- net/ipv4/tcp_output.c | 3 +++ net/tls/tls_device.c | 9 +++++++-- 9 files changed, 52 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index b70b70dc4524..0dd3f748239f 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -506,21 +506,3 @@ Drivers should ignore the changes to TLS the device feature flags. These flags will be acted upon accordingly by the core ``ktls`` code. TLS device feature flags only control adding of new TLS connection offloads, old connections will remain active after flags are cleared. - -Known bugs -========== - -skb_orphan() leaks clear text ------------------------------ - -Currently drivers depend on the :c:member:`sk` member of -:c:type:`struct sk_buff ` to identify segments requiring -encryption. Any operation which removes or does not preserve the socket -association such as :c:func:`skb_orphan` or :c:func:`skb_clone` -will cause the driver to miss the packets and lead to clear text leaks. - -Redirects leak clear text -------------------------- - -In the RX direction, if segment has already been decrypted by the device -and it gets redirected or mirrored - clear text will be transmitted out. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d8af86d995d6..ba5583522d24 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1374,6 +1374,14 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; +static inline void skb_copy_decrypted(struct sk_buff *to, + const struct sk_buff *from) +{ +#ifdef CONFIG_TLS_DEVICE + to->decrypted = from->decrypted; +#endif +} + #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { diff --git a/include/linux/socket.h b/include/linux/socket.h index 97523818cb14..fc0bed59fc84 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -292,6 +292,9 @@ struct ucred { #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ +#define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry + * plain text and require encryption + */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ diff --git a/include/net/sock.h b/include/net/sock.h index 228db3998e46..2c53f1a1d905 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2482,6 +2482,7 @@ static inline bool sk_fullsock(const struct sock *sk) /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) @@ -2489,8 +2490,15 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) + if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); +#ifdef CONFIG_TLS_DEVICE + } else if (unlikely(skb->decrypted)) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; +#endif + } #endif return skb; diff --git a/net/core/sock.c b/net/core/sock.c index d57b0cc995a0..6d08553f885c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1992,6 +1992,19 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) } EXPORT_SYMBOL(skb_set_owner_w); +static bool can_skb_orphan_partial(const struct sk_buff *skb) +{ +#ifdef CONFIG_TLS_DEVICE + /* Drivers depend on in-order delivery for crypto offload, + * partial orphan breaks out-of-order-OK logic. + */ + if (skb->decrypted) + return false; +#endif + return (skb->destructor == sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); +} + /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. @@ -2003,11 +2016,7 @@ void skb_orphan_partial(struct sk_buff *skb) if (skb_is_tcp_pure_ack(skb)) return; - if (skb->destructor == sock_wfree -#ifdef CONFIG_INET - || skb->destructor == tcp_wfree -#endif - ) { + if (can_skb_orphan_partial(skb)) { struct sock *sk = skb->sk; if (refcount_inc_not_zero(&sk->sk_refcnt)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 776905899ac0..77b485d60b9d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -984,6 +984,9 @@ new_segment: if (!skb) goto wait_for_memory; +#ifdef CONFIG_TLS_DEVICE + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif skb_entail(sk, skb); copy = size_goal; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3d1e15401384..8a56e09cfb0e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -398,10 +398,14 @@ more_data: static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; int copied = 0, err = 0; struct sk_psock *psock; long timeo; + int flags; + + /* Don't let internal do_tcp_sendpages() flags through */ + flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); + flags |= MSG_NO_SHARED_FRAGS; psock = sk_psock_get(sk); if (unlikely(!psock)) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6e4afc48d7bb..979520e46e33 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1320,6 +1320,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, buff = sk_stream_alloc_skb(sk, nsize, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ + skb_copy_decrypted(buff, skb); sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); @@ -1874,6 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) return -ENOMEM; + skb_copy_decrypted(buff, skb); sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); @@ -2143,6 +2145,7 @@ static int tcp_mtu_probe(struct sock *sk) sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); + skb_copy_decrypted(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 7c0b2b778703..43922d86e510 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -373,9 +373,9 @@ static int tls_push_data(struct sock *sk, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; + int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; @@ -390,6 +390,9 @@ static int tls_push_data(struct sock *sk, if (sk->sk_err) return -sk->sk_err; + flags |= MSG_SENDPAGE_DECRYPTED; + tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (tls_is_partially_sent_record(tls_ctx)) { rc = tls_push_partial_record(sk, tls_ctx, flags); @@ -576,7 +579,9 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) gfp_t sk_allocation = sk->sk_allocation; sk->sk_allocation = GFP_ATOMIC; - tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL); + tls_push_partial_record(sk, ctx, + MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_DECRYPTED); sk->sk_allocation = sk_allocation; } } -- cgit v1.2.3-59-g8ed1b From 6a0a8d10a3661a036b55af695542a714c429ab7c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Aug 2019 11:01:27 +0200 Subject: netfilter: nf_tables: use-after-free in failing rule with bound set If a rule that has already a bound anonymous set fails to be added, the preparation phase releases the rule and the bound set. However, the transaction object from the abort path still has a reference to the set object that is stale, leading to a use-after-free when checking for the set->bound field. Add a new field to the transaction that specifies if the set is bound, so the abort path can skip releasing it since the rule command owns it and it takes care of releasing it. After this update, the set->bound field is removed. [ 24.649883] Unable to handle kernel paging request at virtual address 0000000000040434 [ 24.657858] Mem abort info: [ 24.660686] ESR = 0x96000004 [ 24.663769] Exception class = DABT (current EL), IL = 32 bits [ 24.669725] SET = 0, FnV = 0 [ 24.672804] EA = 0, S1PTW = 0 [ 24.675975] Data abort info: [ 24.678880] ISV = 0, ISS = 0x00000004 [ 24.682743] CM = 0, WnR = 0 [ 24.685723] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000428952000 [ 24.692207] [0000000000040434] pgd=0000000000000000 [ 24.697119] Internal error: Oops: 96000004 [#1] SMP [...] [ 24.889414] Call trace: [ 24.891870] __nf_tables_abort+0x3f0/0x7a0 [ 24.895984] nf_tables_abort+0x20/0x40 [ 24.899750] nfnetlink_rcv_batch+0x17c/0x588 [ 24.904037] nfnetlink_rcv+0x13c/0x190 [ 24.907803] netlink_unicast+0x18c/0x208 [ 24.911742] netlink_sendmsg+0x1b0/0x350 [ 24.915682] sock_sendmsg+0x4c/0x68 [ 24.919185] ___sys_sendmsg+0x288/0x2c8 [ 24.923037] __sys_sendmsg+0x7c/0xd0 [ 24.926628] __arm64_sys_sendmsg+0x2c/0x38 [ 24.930744] el0_svc_common.constprop.0+0x94/0x158 [ 24.935556] el0_svc_handler+0x34/0x90 [ 24.939322] el0_svc+0x8/0xc [ 24.942216] Code: 37280300 f9404023 91014262 aa1703e0 (f9401863) [ 24.948336] ---[ end trace cebbb9dcbed3b56f ]--- Fixes: f6ac85858976 ("netfilter: nf_tables: unbind set in rule from commit path") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 9 +++++++-- net/netfilter/nf_tables_api.c | 15 ++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9b624566b82d..475d6f28ca67 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -421,8 +421,7 @@ struct nft_set { unsigned char *udata; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:13, - bound:1, + u16 flags:14, genmask:2; u8 klen; u8 dlen; @@ -1348,12 +1347,15 @@ struct nft_trans_rule { struct nft_trans_set { struct nft_set *set; u32 set_id; + bool bound; }; #define nft_trans_set(trans) \ (((struct nft_trans_set *)trans->data)->set) #define nft_trans_set_id(trans) \ (((struct nft_trans_set *)trans->data)->set_id) +#define nft_trans_set_bound(trans) \ + (((struct nft_trans_set *)trans->data)->bound) struct nft_trans_chain { bool update; @@ -1384,12 +1386,15 @@ struct nft_trans_table { struct nft_trans_elem { struct nft_set *set; struct nft_set_elem elem; + bool bound; }; #define nft_trans_elem_set(trans) \ (((struct nft_trans_elem *)trans->data)->set) #define nft_trans_elem(trans) \ (((struct nft_trans_elem *)trans->data)->elem) +#define nft_trans_elem_set_bound(trans) \ + (((struct nft_trans_elem *)trans->data)->bound) struct nft_trans_obj { struct nft_object *obj; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 605a7cfe7ca7..88abbddf8967 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -138,9 +138,14 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) return; list_for_each_entry_reverse(trans, &net->nft.commit_list, list) { - if (trans->msg_type == NFT_MSG_NEWSET && - nft_trans_set(trans) == set) { - set->bound = true; + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (nft_trans_set(trans) == set) + nft_trans_set_bound(trans) = true; + break; + case NFT_MSG_NEWSETELEM: + if (nft_trans_elem_set(trans) == set) + nft_trans_elem_set_bound(trans) = true; break; } } @@ -6906,7 +6911,7 @@ static int __nf_tables_abort(struct net *net) break; case NFT_MSG_NEWSET: trans->ctx.table->use--; - if (nft_trans_set(trans)->bound) { + if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); break; } @@ -6918,7 +6923,7 @@ static int __nf_tables_abort(struct net *net) nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: - if (nft_trans_elem_set(trans)->bound) { + if (nft_trans_elem_set_bound(trans)) { nft_trans_destroy(trans); break; } -- cgit v1.2.3-59-g8ed1b From cd48bdda4fb82c2fe569d97af4217c530168c99c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 8 Aug 2019 13:57:25 +0200 Subject: sock: make cookie generation global instead of per netns Generating and retrieving socket cookies are a useful feature that is exposed to BPF for various program types through bpf_get_socket_cookie() helper. The fact that the cookie counter is per netns is quite a limitation for BPF in practice in particular for programs in host namespace that use socket cookies as part of a map lookup key since they will be causing socket cookie collisions e.g. when attached to BPF cgroup hooks or cls_bpf on tc egress in host namespace handling container traffic from veth or ipvlan devices with peer in different netns. Change the counter to be global instead. Socket cookie consumers must assume the value as opqaue in any case. Not every socket must have a cookie generated and knowledge of the counter value itself does not provide much value either way hence conversion to global is fine. Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Willem de Bruijn Cc: Martynas Pumputis Signed-off-by: David S. Miller --- include/net/net_namespace.h | 1 - include/uapi/linux/bpf.h | 4 ++-- net/core/sock_diag.c | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 4a9da951a794..cb668bc2692d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -61,7 +61,6 @@ struct net { spinlock_t rules_mod_lock; u32 hash_mix; - atomic64_t cookie_gen; struct list_head list; /* list of network namespaces */ struct list_head exit_list; /* To linked to call pernet exit diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fa1c753dcdbc..a5aa7d3ac6a1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1466,8 +1466,8 @@ union bpf_attr { * If no cookie has been set yet, generate a new cookie. Once * generated, the socket cookie remains stable for the life of the * socket. This helper can be useful for monitoring per socket - * networking traffic statistics as it provides a unique socket - * identifier per namespace. + * networking traffic statistics as it provides a global socket + * identifier that can be assumed unique. * Return * A 8-byte long non-decreasing number on success, or 0 if the * socket field is missing inside *skb*. diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 3312a5849a97..c13ffbd33d8d 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -19,6 +19,7 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); static struct workqueue_struct *broadcast_wq; +static atomic64_t cookie_gen; u64 sock_gen_cookie(struct sock *sk) { @@ -27,7 +28,7 @@ u64 sock_gen_cookie(struct sock *sk) if (res) return res; - res = atomic64_inc_return(&sock_net(sk)->cookie_gen); + res = atomic64_inc_return(&cookie_gen); atomic64_cmpxchg(&sk->sk_cookie, 0, res); } } -- cgit v1.2.3-59-g8ed1b From 33dcb37cef741294b481f4d889a465b8091f11bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Jul 2019 09:26:40 +0200 Subject: dma-mapping: fix page attributes for dma_mmap_* All the way back to introducing dma_common_mmap we've defaulted to mark the pages as uncached. But this is wrong for DMA coherent devices. Later on DMA_ATTR_WRITE_COMBINE also got incorrect treatment as that flag is only treated special on the alloc side for non-coherent devices. Introduce a new dma_pgprot helper that deals with the check for coherent devices so that only the remapping cases ever reach arch_dma_mmap_pgprot and we thus ensure no aliasing of page attributes happens, which makes the powerpc version of arch_dma_mmap_pgprot obsolete and simplifies the remaining ones. Note that this means arch_dma_mmap_pgprot is a bit misnamed now, but we'll phase it out soon. Fixes: 64ccc9c033c6 ("common: dma-mapping: add support for generic dma_mmap_* calls") Reported-by: Shawn Anastasio Reported-by: Gavin Li Signed-off-by: Christoph Hellwig Acked-by: Catalin Marinas # arm64 --- arch/arm/mm/dma-mapping.c | 4 +--- arch/arm64/mm/dma-mapping.c | 4 +--- arch/powerpc/Kconfig | 1 - arch/powerpc/kernel/Makefile | 3 +-- arch/powerpc/kernel/dma-common.c | 17 ----------------- drivers/iommu/dma-iommu.c | 6 +++--- include/linux/dma-noncoherent.h | 13 +++++++++---- kernel/dma/mapping.c | 19 ++++++++++++++++++- kernel/dma/remap.c | 2 +- 9 files changed, 34 insertions(+), 35 deletions(-) delete mode 100644 arch/powerpc/kernel/dma-common.c (limited to 'include') diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 6774b03aa405..d42557ee69c2 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2405,9 +2405,7 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { - if (!dev_is_dma_coherent(dev)) - return __get_dma_pgprot(attrs, prot); - return prot; + return __get_dma_pgprot(attrs, prot); } void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 1d3f0b5a9940..bd2b039f43a6 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -14,9 +14,7 @@ pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { - if (!dev_is_dma_coherent(dev) || (attrs & DMA_ATTR_WRITE_COMBINE)) - return pgprot_writecombine(prot); - return prot; + return pgprot_writecombine(prot); } void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 77f6ebf97113..d8dcd8820369 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -121,7 +121,6 @@ config PPC select ARCH_32BIT_OFF_T if PPC32 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_MMAP_PGPROT select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index ea0c69236789..56dfa7a2a6f2 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -49,8 +49,7 @@ obj-y := cputable.o ptrace.o syscalls.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ - of_platform.o prom_parse.o \ - dma-common.o + of_platform.o prom_parse.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o diff --git a/arch/powerpc/kernel/dma-common.c b/arch/powerpc/kernel/dma-common.c deleted file mode 100644 index dc7ef6b17b69..000000000000 --- a/arch/powerpc/kernel/dma-common.c +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Contains common dma routines for all powerpc platforms. - * - * Copyright (C) 2019 Shawn Anastasio. - */ - -#include -#include - -pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, - unsigned long attrs) -{ - if (!dev_is_dma_coherent(dev)) - return pgprot_noncached(prot); - return prot; -} diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index a7f9c3edbcb2..0015fe610b23 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -574,7 +574,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, struct iova_domain *iovad = &cookie->iovad; bool coherent = dev_is_dma_coherent(dev); int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs); - pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs); + pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap; struct page **pages; struct sg_table sgt; @@ -975,7 +975,7 @@ static void *iommu_dma_alloc_pages(struct device *dev, size_t size, return NULL; if (IS_ENABLED(CONFIG_DMA_REMAP) && (!coherent || PageHighMem(page))) { - pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs); + pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); cpu_addr = dma_common_contiguous_remap(page, alloc_size, VM_USERMAP, prot, __builtin_return_address(0)); @@ -1035,7 +1035,7 @@ static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long pfn, off = vma->vm_pgoff; int ret; - vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index 3813211a9aad..0bff3d7fac92 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -42,13 +42,18 @@ void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, dma_addr_t dma_addr); - -#ifdef CONFIG_ARCH_HAS_DMA_MMAP_PGPROT pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs); + +#ifdef CONFIG_MMU +pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs); #else -# define arch_dma_mmap_pgprot(dev, prot, attrs) pgprot_noncached(prot) -#endif +static inline pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, + unsigned long attrs) +{ + return prot; /* no protection bits supported without page tables */ +} +#endif /* CONFIG_MMU */ #ifdef CONFIG_DMA_NONCOHERENT_CACHE_SYNC void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b945239621d8..b0038ca3aa92 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -150,6 +150,23 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, } EXPORT_SYMBOL(dma_get_sgtable_attrs); +#ifdef CONFIG_MMU +/* + * Return the page attributes used for mapping dma_alloc_* memory, either in + * kernel space if remapping is needed, or to userspace through dma_mmap_*. + */ +pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) +{ + if (dev_is_dma_coherent(dev) || + (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && + (attrs & DMA_ATTR_NON_CONSISTENT))) + return prot; + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT)) + return arch_dma_mmap_pgprot(dev, prot, attrs); + return pgprot_noncached(prot); +} +#endif /* CONFIG_MMU */ + /* * Create userspace mapping for the DMA-coherent memory. */ @@ -164,7 +181,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long pfn; int ret = -ENXIO; - vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index a594aec07882..ffe78f0b2fe4 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -218,7 +218,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, - arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), + dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { __dma_direct_free_pages(dev, size, page); -- cgit v1.2.3-59-g8ed1b From accd2dd72c8f087441d725dd916688171519e4e6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 9 Aug 2019 10:23:57 +0200 Subject: PCI/ASPM: Add pcie_aspm_enabled() Add a function checking whether or not PCIe ASPM has been enabled for a given device. It will be used by the NVMe driver to decide how to handle the device during system suspend. Signed-off-by: Rafael J. Wysocki Reviewed-by: Keith Busch Acked-by: Bjorn Helgaas --- drivers/pci/pcie/aspm.c | 20 ++++++++++++++++++++ include/linux/pci.h | 2 ++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index e44af7f4d37f..464f8f92653f 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -1170,6 +1170,26 @@ static int pcie_aspm_get_policy(char *buffer, const struct kernel_param *kp) module_param_call(policy, pcie_aspm_set_policy, pcie_aspm_get_policy, NULL, 0644); +/** + * pcie_aspm_enabled - Check if PCIe ASPM has been enabled for a device. + * @pdev: Target device. + */ +bool pcie_aspm_enabled(struct pci_dev *pdev) +{ + struct pci_dev *bridge = pci_upstream_bridge(pdev); + bool ret; + + if (!bridge) + return false; + + mutex_lock(&aspm_lock); + ret = bridge->link_state ? !!bridge->link_state->aspm_enabled : false; + mutex_unlock(&aspm_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcie_aspm_enabled); + #ifdef CONFIG_PCIEASPM_DEBUG static ssize_t link_state_show(struct device *dev, struct device_attribute *attr, diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..82e4cd1b7ac3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1567,8 +1567,10 @@ extern bool pcie_ports_native; #ifdef CONFIG_PCIEASPM bool pcie_aspm_support_enabled(void); +bool pcie_aspm_enabled(struct pci_dev *pdev); #else static inline bool pcie_aspm_support_enabled(void) { return false; } +static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; } #endif #ifdef CONFIG_PCIEAER -- cgit v1.2.3-59-g8ed1b From 2c8ccb37b08fe364f02a9914daca474d43151453 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Fri, 9 Aug 2019 17:18:16 +0200 Subject: RDMA/siw: Change CQ flags from 64->32 bits This patch changes the driver/user shared (mmapped) CQ notification flags field from unsigned 64-bits size to unsigned 32-bits size. This enables building siw on 32-bit architectures. This patch changes the siw-abi, but as siw was only just merged in this merge window cycle, there are no released kernels with the prior abi. We are making no attempt to be binary compatible with siw user space libraries prior to the merge of siw into the upstream kernel, only moving forward with upstream kernels and upstream rdma-core provided siw libraries are we guaranteeing compatibility. Signed-off-by: Bernard Metzler Link: https://lore.kernel.org/r/20190809151816.13018-1-bmt@zurich.ibm.com Signed-off-by: Doug Ledford --- drivers/infiniband/sw/siw/Kconfig | 2 +- drivers/infiniband/sw/siw/siw.h | 2 +- drivers/infiniband/sw/siw/siw_qp.c | 14 ++++++++++---- drivers/infiniband/sw/siw/siw_verbs.c | 16 +++++++++++----- include/uapi/rdma/siw-abi.h | 3 ++- 5 files changed, 25 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig index dace276aea14..b622fc62f2cd 100644 --- a/drivers/infiniband/sw/siw/Kconfig +++ b/drivers/infiniband/sw/siw/Kconfig @@ -1,6 +1,6 @@ config RDMA_SIW tristate "Software RDMA over TCP/IP (iWARP) driver" - depends on INET && INFINIBAND && LIBCRC32C && 64BIT + depends on INET && INFINIBAND && LIBCRC32C select DMA_VIRT_OPS help This driver implements the iWARP RDMA transport over diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 03fd7b2f595f..77b1aabf6ff3 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -214,7 +214,7 @@ struct siw_wqe { struct siw_cq { struct ib_cq base_cq; spinlock_t lock; - u64 *notify; + struct siw_cq_ctrl *notify; struct siw_cqe *queue; u32 cq_put; u32 cq_get; diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c index e27bd5b35b96..0990307c5d2c 100644 --- a/drivers/infiniband/sw/siw/siw_qp.c +++ b/drivers/infiniband/sw/siw/siw_qp.c @@ -1013,18 +1013,24 @@ out: */ static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags) { - u64 cq_notify; + u32 cq_notify; if (!cq->base_cq.comp_handler) return false; - cq_notify = READ_ONCE(*cq->notify); + /* Read application shared notification state */ + cq_notify = READ_ONCE(cq->notify->flags); if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) || ((cq_notify & SIW_NOTIFY_SOLICITED) && (flags & SIW_WQE_SOLICITED))) { - /* dis-arm CQ */ - smp_store_mb(*cq->notify, SIW_NOTIFY_NOT); + /* + * CQ notification is one-shot: Since the + * current CQE causes user notification, + * the CQ gets dis-aremd and must be re-aremd + * by the user for a new notification. + */ + WRITE_ONCE(cq->notify->flags, SIW_NOTIFY_NOT); return true; } diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 32dc79d0e898..e7f3a2379d9d 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -1049,7 +1049,7 @@ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, spin_lock_init(&cq->lock); - cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify; + cq->notify = (struct siw_cq_ctrl *)&cq->queue[size]; if (udata) { struct siw_uresp_create_cq uresp = {}; @@ -1141,11 +1141,17 @@ int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) siw_dbg_cq(cq, "flags: 0x%02x\n", flags); if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) - /* CQ event for next solicited completion */ - smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED); + /* + * Enable CQ event for next solicited completion. + * and make it visible to all associated producers. + */ + smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED); else - /* CQ event for any signalled completion */ - smp_store_mb(*cq->notify, SIW_NOTIFY_ALL); + /* + * Enable CQ event for any signalled completion. + * and make it visible to all associated producers. + */ + smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL); if (flags & IB_CQ_REPORT_MISSED_EVENTS) return cq->cq_put - cq->cq_get; diff --git a/include/uapi/rdma/siw-abi.h b/include/uapi/rdma/siw-abi.h index 7de68f1dc707..af735f55b291 100644 --- a/include/uapi/rdma/siw-abi.h +++ b/include/uapi/rdma/siw-abi.h @@ -180,6 +180,7 @@ struct siw_cqe { * to control CQ arming. */ struct siw_cq_ctrl { - __aligned_u64 notify; + __u32 flags; + __u32 pad; }; #endif -- cgit v1.2.3-59-g8ed1b From 76470ccd62f18bfa0954bec10f2329339f793914 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Tue, 13 Aug 2019 15:37:04 -0700 Subject: mm: document zone device struct page field usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/hmm: fixes for device private page migration", v3. Testing the latest linux git tree turned up a few bugs with page migration to and from ZONE_DEVICE private and anonymous pages. Hopefully it clarifies how ZONE_DEVICE private struct page uses the same mapping and index fields from the source anonymous page mapping. This patch (of 3): Struct page for ZONE_DEVICE private pages uses the page->mapping and and page->index fields while the source anonymous pages are migrated to device private memory. This is so rmap_walk() can find the page when migrating the ZONE_DEVICE private page back to system memory. ZONE_DEVICE pmem backed fsdax pages also use the page->mapping and page->index fields when files are mapped into a process address space. Add comments to struct page and remove the unused "_zd_pad_1" field to make this more clear. Link: http://lkml.kernel.org/r/20190724232700.23327-2-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: John Hubbard Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a37a89eb7a7..6a7a1083b6fb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,7 +159,16 @@ struct page { /** @pgmap: Points to the hosting device page map. */ struct dev_pagemap *pgmap; void *zone_device_data; - unsigned long _zd_pad_1; /* uses mapping */ + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. + * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ }; /** @rcu_head: You can use this to free a page by RCU. */ -- cgit v1.2.3-59-g8ed1b From ec9f02384f6053f2a5417e82b65078edc5364a8d Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 13 Aug 2019 15:37:41 -0700 Subject: mm: workingset: fix vmstat counters for shadow nodes Memcg counters for shadow nodes are broken because the memcg pointer is obtained in a wrong way. The following approach is used: virt_to_page(xa_node)->mem_cgroup Since commit 4d96ba353075 ("mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages") page->mem_cgroup pointer isn't set for slab pages, so memcg_from_slab_page() should be used instead. Also I doubt that it ever worked correctly: virt_to_head_page() should be used instead of virt_to_page(). Otherwise objects residing on tail pages are not accounted, because only the head page contains a valid mem_cgroup pointer. That was a case since the introduction of these counters by the commit 68d48e6a2df5 ("mm: workingset: add vmstat counter for shadow nodes"). Link: http://lkml.kernel.org/r/20190801233532.138743-1-guro@fb.com Fixes: 4d96ba353075 ("mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages") Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Vladimir Davydov Cc: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++++++++++ mm/memcontrol.c | 20 ++++++++++++++++++++ mm/workingset.c | 10 ++++------ 3 files changed, 43 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 44c41462be33..2cd4359cb38c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -668,6 +668,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); +void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) @@ -1072,6 +1073,14 @@ static inline void mod_lruvec_page_state(struct page *page, mod_node_page_state(page_pgdat(page), idx, val); } +static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, + int val) +{ + struct page *page = virt_to_head_page(p); + + __mod_node_page_state(page_pgdat(page), idx, val); +} + static inline unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1159,6 +1168,16 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } +static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx) +{ + __mod_lruvec_slab_state(p, idx, 1); +} + +static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx) +{ + __mod_lruvec_slab_state(p, idx, -1); +} + /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void inc_memcg_state(struct mem_cgroup *memcg, int idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e405e058eda..6f5c0c517c49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -768,6 +768,26 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } +void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +{ + struct page *page = virt_to_head_page(p); + pg_data_t *pgdat = page_pgdat(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = memcg_from_slab_page(page); + + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!memcg || memcg == root_mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); + } else { + lruvec = mem_cgroup_lruvec(pgdat, memcg); + __mod_lruvec_state(lruvec, idx, val); + } + rcu_read_unlock(); +} + /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup diff --git a/mm/workingset.c b/mm/workingset.c index e0b4edcb88c8..c963831d354f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -380,14 +380,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_page_state(virt_to_page(node), - WORKINGSET_NODES); + __inc_lruvec_slab_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_page_state(virt_to_page(node), - WORKINGSET_NODES); + __dec_lruvec_slab_state(node, WORKINGSET_NODES); } } } @@ -480,7 +478,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + __dec_lruvec_slab_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); @@ -503,7 +501,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * shadow entries we were tracking ... */ xas_store(&xas, NULL); - __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit v1.2.3-59-g8ed1b From 0cfaee2af3a04c0be5f056cebe5f804dedc59a43 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 13 Aug 2019 15:37:47 -0700 Subject: include/asm-generic/5level-fixup.h: fix variable 'p4d' set but not used A compiler throws a warning on an arm64 system since commit 9849a5697d3d ("arch, mm: convert all architectures to use 5level-fixup.h"), mm/kasan/init.c: In function 'kasan_free_p4d': mm/kasan/init.c:344:9: warning: variable 'p4d' set but not used [-Wunused-but-set-variable] p4d_t *p4d; ^~~ because p4d_none() in "5level-fixup.h" is compiled away while it is a static inline function in "pgtable-nopud.h". However, if converted p4d_none() to a static inline there, powerpc would be unhappy as it reads those in assembler language in "arch/powerpc/include/asm/book3s/64/pgtable.h", so it needs to skip assembly include for the static inline C function. While at it, converted a few similar functions to be consistent with the ones in "pgtable-nopud.h". Link: http://lkml.kernel.org/r/20190806232917.881-1-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Arnd Bergmann Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/5level-fixup.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index bb6cb347018c..f6947da70d71 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -19,9 +19,24 @@ #define p4d_alloc(mm, pgd, address) (pgd) #define p4d_offset(pgd, start) (pgd) -#define p4d_none(p4d) 0 -#define p4d_bad(p4d) 0 -#define p4d_present(p4d) 1 + +#ifndef __ASSEMBLY__ +static inline int p4d_none(p4d_t p4d) +{ + return 0; +} + +static inline int p4d_bad(p4d_t p4d) +{ + return 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + return 1; +} +#endif + #define p4d_ERROR(p4d) do { } while (0) #define p4d_clear(p4d) pgd_clear(p4d) #define p4d_val(p4d) pgd_val(p4d) -- cgit v1.2.3-59-g8ed1b From 92717d429b38e4f9f934eed7e605cc42858f1839 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 13 Aug 2019 15:37:50 -0700 Subject: Revert "Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"" Patch series "reapply: relax __GFP_THISNODE for MADV_HUGEPAGE mappings". The fixes for what was originally reported as "pathological THP behavior" we rightfully reverted to be sure not to introduced regressions at end of a merge window after a severe regression report from the kernel bot. We can safely re-apply them now that we had time to analyze the problem. The mm process worked fine, because the good fixes were eventually committed upstream without excessive delay. The regression reported by the kernel bot however forced us to revert the good fixes to be sure not to introduce regressions and to give us the time to analyze the issue further. The silver lining is that this extra time allowed to think more at this issue and also plan for a future direction to improve things further in terms of THP NUMA locality. This patch (of 2): This reverts commit 356ff8a9a78fb35d ("Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"). So it reapplies 89c83fb539f954 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"). Consolidation of the THP allocation flags at the same place was meant to be a clean up to easier handle otherwise scattered code which is imposing a maintenance burden. There were no real problems observed with the gfp mask consolidation but the reversion was rushed through without a larger consensus regardless. This patch brings the consolidation back because this should make the long term maintainability easier as well as it should allow future changes to be less error prone. [mhocko@kernel.org: changelog additions] Link: http://lkml.kernel.org/r/20190503223146.2312-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: David Rientjes Cc: Zi Yan Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++-------- mm/huge_memory.c | 27 ++++++++++++++------------- mm/mempolicy.c | 32 +++----------------------------- mm/shmem.c | 2 +- 4 files changed, 22 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fb07b503dc45..f33881688f42 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + int node); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ - alloc_pages(gfp_mask, order) -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) + alloc_pages_vma(gfp_mask, 0, vma, addr, node) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1334ede667a8..f7e388b8662d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -644,30 +644,30 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + return GFP_TRANSHUGE | __GFP_THISNODE | + (vma_madvised ? 0 : __GFP_NORETRY); /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + return gfp_mask | __GFP_KSWAPD_RECLAIM; /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | - (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | - (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); - return GFP_TRANSHUGE_LIGHT; + return gfp_mask; } /* Caller must hold page table lock. */ @@ -739,8 +739,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); - page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1347,8 +1347,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) alloc: if (__transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); + huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, + haddr, numa_node_id()); } else new_page = NULL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 547cd403ed02..9c9877a43d58 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1180,8 +1180,8 @@ static struct page *new_page(struct page *page, unsigned long start) } else if (PageTransHuge(page)) { struct page *thp; - thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, - HPAGE_PMD_ORDER); + thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, + address, numa_node_id()); if (!thp) return NULL; prep_transhuge_page(thp); @@ -2083,7 +2083,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). - * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. @@ -2094,7 +2093,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, int node) { struct mempolicy *pol; struct page *page; @@ -2112,31 +2111,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; - - /* - * For hugepage allocation and non-interleave policy which - * allows the current node (or other explicitly preferred - * node) we only try to allocate from the current/preferred - * node and don't fall back to other nodes, as the cost of - * remote accesses would likely offset THP benefits. - * - * If the policy is interleave, or does not allow the current - * node in its nodemask, we allocate the standard way. - */ - if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) - hpage_node = pol->v.preferred_node; - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); - page = __alloc_pages_node(hpage_node, - gfp | __GFP_THISNODE, order); - goto out; - } - } - nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); diff --git a/mm/shmem.c b/mm/shmem.c index 626d8c74b973..2bed4761f279 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1466,7 +1466,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); -- cgit v1.2.3-59-g8ed1b From a8282608c88e08b1782141026eab61204c1e533f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 13 Aug 2019 15:37:53 -0700 Subject: Revert "mm, thp: restore node-local hugepage allocations" This reverts commit 2f0799a0ffc033b ("mm, thp: restore node-local hugepage allocations"). commit 2f0799a0ffc033b was rightfully applied to avoid the risk of a severe regression that was reported by the kernel test robot at the end of the merge window. Now we understood the regression was a false positive and was caused by a significant increase in fairness during a swap trashing benchmark. So it's safe to re-apply the fix and continue improving the code from there. The benchmark that reported the regression is very useful, but it provides a meaningful result only when there is no significant alteration in fairness during the workload. The removal of __GFP_THISNODE increased fairness. __GFP_THISNODE cannot be used in the generic page faults path for new memory allocations under the MPOL_DEFAULT mempolicy, or the allocation behavior significantly deviates from what the MPOL_DEFAULT semantics are supposed to be for THP and 4k allocations alike. Setting THP defrag to "always" or using MADV_HUGEPAGE (with THP defrag set to "madvise") has never meant to provide an implicit MPOL_BIND on the "current" node the task is running on, causing swap storms and providing a much more aggressive behavior than even zone_reclaim_node = 3. Any workload who could have benefited from __GFP_THISNODE has now to enable zone_reclaim_mode=1||2||3. __GFP_THISNODE implicitly provided the zone_reclaim_mode behavior, but it only did so if THP was enabled: if THP was disabled, there would have been no chance to get any 4k page from the current node if the current node was full of pagecache, which further shows how this __GFP_THISNODE was misplaced in MADV_HUGEPAGE. MADV_HUGEPAGE has never been intended to provide any zone_reclaim_mode semantics, in fact the two are orthogonal, zone_reclaim_mode = 1|2|3 must work exactly the same with MADV_HUGEPAGE set or not. The performance characteristic of memory depends on the hardware details. The numbers below are obtained on Naples/EPYC architecture and the N/A projection extends them to show what we should aim for in the future as a good THP NUMA locality default. The benchmark used exercises random memory seeks (note: the cost of the page faults is not part of the measurement). D0 THP | D0 4k | D1 THP | D1 4k | D2 THP | D2 4k | D3 THP | D3 4k | ... 0% | +43% | +45% | +106% | +131% | +224% | N/A | N/A D0 means distance zero (i.e. local memory), D1 means distance one (i.e. intra socket memory), D2 means distance two (i.e. inter socket memory), etc... For the guest physical memory allocated by qemu and for guest mode kernel the performance characteristic of RAM is more complex and an ideal default could be: D0 THP | D1 THP | D0 4k | D2 THP | D1 4k | D3 THP | D2 4k | D3 4k | ... 0% | +58% | +101% | N/A | +222% | N/A | N/A | N/A NOTE: the N/A are projections and haven't been measured yet, the measurement in this case is done on a 1950x with only two NUMA nodes. The THP case here means THP was used both in the host and in the guest. After applying this commit the THP NUMA locality order that we'll get out of MADV_HUGEPAGE is this: D0 THP | D1 THP | D2 THP | D3 THP | ... | D0 4k | D1 4k | D2 4k | D3 4k | ... Before this commit it was: D0 THP | D0 4k | D1 4k | D2 4k | D3 4k | ... Even if we ignore the breakage of large workloads that can't fit in a single node that the __GFP_THISNODE implicit "current node" mbind caused, the THP NUMA locality order provided by __GFP_THISNODE was still not the one we shall aim for in the long term (i.e. the first one at the top). After this commit is applied, we can introduce a new allocator multi order API and to replace those two alloc_pages_vmas calls in the page fault path, with a single multi order call: unsigned int order = (1 << HPAGE_PMD_ORDER) | (1 << 0); page = alloc_pages_multi_order(..., &order); if (!page) goto out; if (!(order & (1 << 0))) { VM_WARN_ON(order != 1 << HPAGE_PMD_ORDER); /* THP fault */ } else { VM_WARN_ON(order != 1 << 0); /* 4k fallback */ } The page allocator logic has to be altered so that when it fails on any zone with order 9, it has to try again with a order 0 before falling back to the next zone in the zonelist. After that we need to do more measurements and evaluate if adding an opt-in feature for guest mode is worth it, to swap "DN 4k | DN+1 THP" with "DN+1 THP | DN 4k" at every NUMA distance crossing. Link: http://lkml.kernel.org/r/20190503223146.2312-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Michal Hocko Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Rientjes Cc: Zi Yan Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 ++ mm/huge_memory.c | 42 ++++++++++++++++++++++++++---------------- mm/mempolicy.c | 2 +- 3 files changed, 29 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5228c62af416..bac395f1d00a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7e388b8662d..738065f765ab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -647,27 +647,37 @@ release: static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; + gfp_t this_node = 0; - /* Always do synchronous compaction */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | __GFP_THISNODE | - (vma_madvised ? 0 : __GFP_NORETRY); +#ifdef CONFIG_NUMA + struct mempolicy *pol; + /* + * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not + * specified, to express a general desire to stay on the current + * node for optimistic allocation attempts. If the defrag mode + * and/or madvise hint requires the direct reclaim then we prefer + * to fallback to other node rather than node reclaim because that + * can lead to excessive reclaim even though there is free memory + * on other nodes. We expect that NUMA preferences are specified + * by memory policies. + */ + pol = get_vma_policy(vma, addr); + if (pol->mode != MPOL_BIND) + this_node = __GFP_THISNODE; + mpol_cond_put(pol); +#endif - /* Kick kcompactd and fail quickly */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return gfp_mask | __GFP_KSWAPD_RECLAIM; - - /* Synchronous compaction if madvised, otherwise kick kcompactd */ + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); - - /* Only do synchronous compaction if madvised */ + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM | this_node); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); - - return gfp_mask; + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + this_node); + return GFP_TRANSHUGE_LIGHT | this_node; } /* Caller must hold page table lock. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9c9877a43d58..65e0874fce17 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1734,7 +1734,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); -- cgit v1.2.3-59-g8ed1b From d00ee64e1dcf09b3afefd1340f3e9eb637272714 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 12 Aug 2019 13:07:07 -0700 Subject: netlink: Fix nlmsg_parse as a wrapper for strict message parsing Eric reported a syzbot warning: BUG: KMSAN: uninit-value in nh_valid_get_del_req+0x6f1/0x8c0 net/ipv4/nexthop.c:1510 CPU: 0 PID: 11812 Comm: syz-executor444 Not tainted 5.3.0-rc3+ #17 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x191/0x1f0 lib/dump_stack.c:113 kmsan_report+0x162/0x2d0 mm/kmsan/kmsan_report.c:109 __msan_warning+0x75/0xe0 mm/kmsan/kmsan_instr.c:294 nh_valid_get_del_req+0x6f1/0x8c0 net/ipv4/nexthop.c:1510 rtm_del_nexthop+0x1b1/0x610 net/ipv4/nexthop.c:1543 rtnetlink_rcv_msg+0x115a/0x1580 net/core/rtnetlink.c:5223 netlink_rcv_skb+0x431/0x620 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:5241 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0xf6c/0x1050 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x110f/0x1330 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg net/socket.c:657 [inline] ___sys_sendmsg+0x14ff/0x1590 net/socket.c:2311 __sys_sendmmsg+0x53a/0xae0 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg+0xbd/0xe0 net/socket.c:2439 __x64_sys_sendmmsg+0x56/0x70 net/socket.c:2439 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:297 entry_SYSCALL_64_after_hwframe+0x63/0xe7 The root cause is nlmsg_parse calling __nla_parse which means the header struct size is not checked. nlmsg_parse should be a wrapper around __nlmsg_parse with NL_VALIDATE_STRICT for the validate argument very much like nlmsg_parse_deprecated is for NL_VALIDATE_LIBERAL. Fixes: 3de6440354465 ("netlink: re-add parse/validate functions in strict mode") Reported-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index e4650e5b64a1..b140c8f1be22 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -684,9 +684,8 @@ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, const struct nla_policy *policy, struct netlink_ext_ack *extack) { - return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), policy, - NL_VALIDATE_STRICT, extack); + return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, + NL_VALIDATE_STRICT, extack); } /** -- cgit v1.2.3-59-g8ed1b From 06d9532fa6b34f12a6d75711162d47c17c1add72 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Aug 2019 22:26:36 +0100 Subject: rxrpc: Fix read-after-free in rxrpc_queue_local() rxrpc_queue_local() attempts to queue the local endpoint it is given and then, if successful, prints a trace line. The trace line includes the current usage count - but we're not allowed to look at the local endpoint at this point as we passed our ref on it to the workqueue. Fix this by reading the usage count before queuing the work item. Also fix the reading of local->debug_id for trace lines, which must be done with the same consideration as reading the usage count. Fixes: 09d2bf595db4 ("rxrpc: Add a tracepoint to track rxrpc_local refcounting") Reported-by: syzbot+78e71c5bab4f76a6a719@syzkaller.appspotmail.com Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 6 +++--- net/rxrpc/local_object.c | 19 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cc1d060cbf13..fa06b528c73c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -498,10 +498,10 @@ rxrpc_tx_points; #define E_(a, b) { a, b } TRACE_EVENT(rxrpc_local, - TP_PROTO(struct rxrpc_local *local, enum rxrpc_local_trace op, + TP_PROTO(unsigned int local_debug_id, enum rxrpc_local_trace op, int usage, const void *where), - TP_ARGS(local, op, usage, where), + TP_ARGS(local_debug_id, op, usage, where), TP_STRUCT__entry( __field(unsigned int, local ) @@ -511,7 +511,7 @@ TRACE_EVENT(rxrpc_local, ), TP_fast_assign( - __entry->local = local->debug_id; + __entry->local = local_debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index c45765b7263e..72a6e12a9304 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -93,7 +93,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; - trace_rxrpc_local(local, rxrpc_local_new, 1, NULL); + trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, NULL); } _leave(" = %p", local); @@ -321,7 +321,7 @@ struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local) int n; n = atomic_inc_return(&local->usage); - trace_rxrpc_local(local, rxrpc_local_got, n, here); + trace_rxrpc_local(local->debug_id, rxrpc_local_got, n, here); return local; } @@ -335,7 +335,8 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) if (local) { int n = atomic_fetch_add_unless(&local->usage, 1, 0); if (n > 0) - trace_rxrpc_local(local, rxrpc_local_got, n + 1, here); + trace_rxrpc_local(local->debug_id, rxrpc_local_got, + n + 1, here); else local = NULL; } @@ -343,16 +344,16 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) } /* - * Queue a local endpoint unless it has become unreferenced and pass the - * caller's reference to the work item. + * Queue a local endpoint and pass the caller's reference to the work item. */ void rxrpc_queue_local(struct rxrpc_local *local) { const void *here = __builtin_return_address(0); + unsigned int debug_id = local->debug_id; + int n = atomic_read(&local->usage); if (rxrpc_queue_work(&local->processor)) - trace_rxrpc_local(local, rxrpc_local_queued, - atomic_read(&local->usage), here); + trace_rxrpc_local(debug_id, rxrpc_local_queued, n, here); else rxrpc_put_local(local); } @@ -367,7 +368,7 @@ void rxrpc_put_local(struct rxrpc_local *local) if (local) { n = atomic_dec_return(&local->usage); - trace_rxrpc_local(local, rxrpc_local_put, n, here); + trace_rxrpc_local(local->debug_id, rxrpc_local_put, n, here); if (n == 0) call_rcu(&local->rcu, rxrpc_local_rcu); @@ -456,7 +457,7 @@ static void rxrpc_local_processor(struct work_struct *work) container_of(work, struct rxrpc_local, processor); bool again; - trace_rxrpc_local(local, rxrpc_local_processing, + trace_rxrpc_local(local->debug_id, rxrpc_local_processing, atomic_read(&local->usage), NULL); do { -- cgit v1.2.3-59-g8ed1b From edfbcb321faf07ca970e4191abe061deeb7d3788 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 Aug 2019 10:05:16 +0200 Subject: usb: add a hcd_uses_dma helper The USB buffer allocation code is the only place in the usb core (and in fact the whole kernel) that uses is_device_dma_capable, while the URB mapping code uses the uses_dma flag in struct usb_bus. Switch the buffer allocation to use the uses_dma flag used by the rest of the USB code, and create a helper in hcd.h that checks this flag as well as the CONFIG_HAS_DMA to simplify the caller a bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20190811080520.21712-3-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/buffer.c | 10 +++------- drivers/usb/core/hcd.c | 4 ++-- drivers/usb/dwc2/hcd.c | 2 +- include/linux/usb.h | 2 +- include/linux/usb/hcd.h | 3 +++ 5 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c index 1a5b3dcae930..6cf22c27f2d2 100644 --- a/drivers/usb/core/buffer.c +++ b/drivers/usb/core/buffer.c @@ -66,9 +66,7 @@ int hcd_buffer_create(struct usb_hcd *hcd) char name[16]; int i, size; - if (hcd->localmem_pool || - !IS_ENABLED(CONFIG_HAS_DMA) || - !is_device_dma_capable(hcd->self.sysdev)) + if (hcd->localmem_pool || !hcd_uses_dma(hcd)) return 0; for (i = 0; i < HCD_BUFFER_POOLS; i++) { @@ -129,8 +127,7 @@ void *hcd_buffer_alloc( return gen_pool_dma_alloc(hcd->localmem_pool, size, dma); /* some USB hosts just use PIO */ - if (!IS_ENABLED(CONFIG_HAS_DMA) || - !is_device_dma_capable(bus->sysdev)) { + if (!hcd_uses_dma(hcd)) { *dma = ~(dma_addr_t) 0; return kmalloc(size, mem_flags); } @@ -160,8 +157,7 @@ void hcd_buffer_free( return; } - if (!IS_ENABLED(CONFIG_HAS_DMA) || - !is_device_dma_capable(bus->sysdev)) { + if (!hcd_uses_dma(hcd)) { kfree(addr); return; } diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 2ccbc2f83570..8592c0344fe8 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1412,7 +1412,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, if (usb_endpoint_xfer_control(&urb->ep->desc)) { if (hcd->self.uses_pio_for_control) return ret; - if (IS_ENABLED(CONFIG_HAS_DMA) && hcd->self.uses_dma) { + if (hcd_uses_dma(hcd)) { if (is_vmalloc_addr(urb->setup_packet)) { WARN_ONCE(1, "setup packet is not dma capable\n"); return -EAGAIN; @@ -1446,7 +1446,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, dir = usb_urb_dir_in(urb) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; if (urb->transfer_buffer_length != 0 && !(urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP)) { - if (IS_ENABLED(CONFIG_HAS_DMA) && hcd->self.uses_dma) { + if (hcd_uses_dma(hcd)) { if (urb->num_sgs) { int n; diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c index ee144ff8af5b..111787a137ee 100644 --- a/drivers/usb/dwc2/hcd.c +++ b/drivers/usb/dwc2/hcd.c @@ -4608,7 +4608,7 @@ static int _dwc2_hcd_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, buf = urb->transfer_buffer; - if (hcd->self.uses_dma) { + if (hcd_uses_dma(hcd)) { if (!buf && (urb->transfer_dma & 3)) { dev_err(hsotg->dev, "%s: unaligned transfer with no transfer_buffer", diff --git a/include/linux/usb.h b/include/linux/usb.h index 83d35d993e8c..e87826e23d59 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1457,7 +1457,7 @@ typedef void (*usb_complete_t)(struct urb *); * field rather than determining a dma address themselves. * * Note that transfer_buffer must still be set if the controller - * does not support DMA (as indicated by bus.uses_dma) and when talking + * does not support DMA (as indicated by hcd_uses_dma()) and when talking * to root hub. If you have to trasfer between highmem zone and the device * on such controller, create a bounce buffer or bail out with an error. * If transfer_buffer cannot be set (is in highmem) and the controller is DMA diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index bab27ccc8ff5..a20e7815d814 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -422,6 +422,9 @@ static inline bool hcd_periodic_completion_in_progress(struct usb_hcd *hcd, return hcd->high_prio_bh.completing_ep == ep; } +#define hcd_uses_dma(hcd) \ + (IS_ENABLED(CONFIG_HAS_DMA) && (hcd)->self.uses_dma) + extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb); extern int usb_hcd_check_unlink_urb(struct usb_hcd *hcd, struct urb *urb, int status); -- cgit v1.2.3-59-g8ed1b From 7b6620d7db566a46f49b4b9deab9fa061fd4b59b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Aug 2019 11:09:16 -0600 Subject: block: remove REQ_NOWAIT_INLINE We had a few issues with this code, and there's still a problem around how we deal with error handling for chained/split bios. For now, just revert the code and we'll try again with a thoroug solution. This reverts commits: e15c2ffa1091 ("block: fix O_DIRECT error handling for bio fragments") 0eb6ddfb865c ("block: Fix __blkdev_direct_IO() for bio fragments") 6a43074e2f46 ("block: properly handle IOCB_NOWAIT for async O_DIRECT IO") 893a1c97205a ("blk-mq: allow REQ_NOWAIT to return an error inline") Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++------ fs/block_dev.c | 49 +++++------------------------------------------ include/linux/blk_types.h | 5 +---- 3 files changed, 8 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index a8e6a58f5f28..0835f4d8d42e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1958,13 +1958,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) rq = blk_mq_get_request(q, bio, &data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); - - cookie = BLK_QC_T_NONE; - if (bio->bi_opf & REQ_NOWAIT_INLINE) - cookie = BLK_QC_T_EAGAIN; - else if (bio->bi_opf & REQ_NOWAIT) + if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - return cookie; + return BLK_QC_T_NONE; } trace_block_getrq(q, bio, bio->bi_opf); diff --git a/fs/block_dev.c b/fs/block_dev.c index eb657ab94060..677cb364d33f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -345,24 +345,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) struct bio *bio; bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; - bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - gfp_t gfp; - int ret; + int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - if (nowait) - gfp = GFP_NOWAIT; - else - gfp = GFP_KERNEL; - - bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool); - if (!bio) - return -EAGAIN; + bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); @@ -384,7 +375,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) if (!is_poll) blk_start_plug(&plug); - ret = 0; for (;;) { bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; @@ -409,14 +399,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) task_io_account_write(bio->bi_iter.bi_size); } - /* - * Tell underlying layer to not block for resource shortage. - * And if we would have blocked, return error inline instead - * of through the bio->bi_end_io() callback. - */ - if (nowait) - bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE); - + dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); @@ -428,13 +411,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) polled = true; } - dio->size += bio->bi_iter.bi_size; qc = submit_bio(bio); - if (qc == BLK_QC_T_EAGAIN) { - dio->size -= bio->bi_iter.bi_size; - ret = -EAGAIN; - goto error; - } if (polled) WRITE_ONCE(iocb->ki_cookie, qc); @@ -455,19 +432,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) atomic_inc(&dio->ref); } - dio->size += bio->bi_iter.bi_size; - qc = submit_bio(bio); - if (qc == BLK_QC_T_EAGAIN) { - dio->size -= bio->bi_iter.bi_size; - ret = -EAGAIN; - goto error; - } - - bio = bio_alloc(gfp, nr_pages); - if (!bio) { - ret = -EAGAIN; - goto error; - } + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); } if (!is_poll) @@ -487,7 +453,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); -out: if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) @@ -495,10 +460,6 @@ out: bio_put(&dio->bio); return ret; -error: - if (!is_poll) - blk_finish_plug(&plug); - goto out; } static ssize_t diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b1fa1557e68..feff3fe4467e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -311,7 +311,6 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ - __REQ_NOWAIT_INLINE, /* Return would-block error inline */ /* * When a shared kthread needs to issue a bio for a cgroup, doing * so synchronously can lead to priority inversions as the kthread @@ -346,7 +345,6 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) -#define REQ_NOWAIT_INLINE (1ULL << __REQ_NOWAIT_INLINE) #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) @@ -420,13 +418,12 @@ static inline int op_stat_group(unsigned int op) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U -#define BLK_QC_T_EAGAIN -2U #define BLK_QC_T_SHIFT 16 #define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { - return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN; + return cookie != BLK_QC_T_NONE; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) -- cgit v1.2.3-59-g8ed1b From 58a96fc35375ab87db7c5b69336f5befde1b548f Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 16 Jul 2019 20:34:41 +0200 Subject: Bluetooth: Add debug setting for changing minimum encryption key size For testing and qualification purposes it is useful to allow changing the minimum encryption key size value that the host stack is going to enforce. This adds a new debugfs setting min_encrypt_key_size to achieve this functionality. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 1 + net/bluetooth/hci_debugfs.c | 31 +++++++++++++++++++++++++++++++ net/bluetooth/l2cap_core.c | 2 +- 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ded574b32c20..ffc95b382eb5 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -278,6 +278,7 @@ struct hci_dev { __u16 conn_info_min_age; __u16 conn_info_max_age; __u16 auth_payload_timeout; + __u8 min_enc_key_size; __u8 ssp_debug_mode; __u8 hw_error_code; __u32 clock; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b9585e7d9d2e..04bc79359a17 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3202,6 +3202,7 @@ struct hci_dev *hci_alloc_dev(void) hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; + hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE; mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index bb67f4a5479a..402e2cc54044 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -433,6 +433,35 @@ static int auto_accept_delay_set(void *data, u64 val) return 0; } +static int min_encrypt_key_size_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val < 1 || val > 16) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->min_enc_key_size = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int min_encrypt_key_size_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->min_enc_key_size; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(min_encrypt_key_size_fops, + min_encrypt_key_size_get, + min_encrypt_key_size_set, "%llu\n"); + static int auto_accept_delay_get(void *data, u64 *val) { struct hci_dev *hdev = data; @@ -545,6 +574,8 @@ void hci_debugfs_create_bredr(struct hci_dev *hdev) if (lmp_ssp_capable(hdev)) { debugfs_create_file("ssp_debug_mode", 0444, hdev->debugfs, hdev, &ssp_debug_mode_fops); + debugfs_create_file("min_encrypt_key_size", 0644, hdev->debugfs, + hdev, &min_encrypt_key_size_fops); debugfs_create_file("auto_accept_delay", 0644, hdev->debugfs, hdev, &auto_accept_delay_fops); } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index cc506fe99b4d..dfc1edb168b7 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1361,7 +1361,7 @@ static bool l2cap_check_enc_key_size(struct hci_conn *hcon) * actually encrypted before enforcing a key size. */ return (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags) || - hcon->enc_key_size >= HCI_MIN_ENC_KEY_SIZE); + hcon->enc_key_size >= hcon->hdev->min_enc_key_size); } static void l2cap_do_start(struct l2cap_chan *chan) -- cgit v1.2.3-59-g8ed1b From ef01adae0e43cfb2468d0ea07137cc63cf31495c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Aug 2019 03:24:09 +0200 Subject: net: sched: use major priority number as hardware priority tc transparently maps the software priority number to hardware. Update it to pass the major priority which is what most drivers expect. Update drivers too so they do not need to lshift the priority field of the flow_cls_common_offload object. The stmmac driver is an exception, since this code assumes the tc software priority is fine, therefore, lshift it just to be conservative. Signed-off-by: Pablo Neira Ayuso Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 2 +- drivers/net/ethernet/mscc/ocelot_flower.c | 12 +++--------- drivers/net/ethernet/netronome/nfp/flower/qos_conf.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 2 +- include/net/pkt_cls.h | 2 +- 6 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index deeb65da99f3..00b2d4a86159 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3167,7 +3167,7 @@ mlx5e_flow_esw_attr_init(struct mlx5_esw_flow_attr *esw_attr, esw_attr->parse_attr = parse_attr; esw_attr->chain = f->common.chain_index; - esw_attr->prio = TC_H_MAJ(f->common.prio) >> 16; + esw_attr->prio = f->common.prio; esw_attr->in_rep = in_rep; esw_attr->in_mdev = in_mdev; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index e8ac90564dbe..84a87d059333 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -471,7 +471,7 @@ int mlxsw_sp_acl_rulei_commit(struct mlxsw_sp_acl_rule_info *rulei) void mlxsw_sp_acl_rulei_priority(struct mlxsw_sp_acl_rule_info *rulei, unsigned int priority) { - rulei->priority = priority >> 16; + rulei->priority = priority; } void mlxsw_sp_acl_rulei_keymask_u32(struct mlxsw_sp_acl_rule_info *rulei, diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 59487d446a09..b894bc0c9c16 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -13,12 +13,6 @@ struct ocelot_port_block { struct ocelot_port *port; }; -static u16 get_prio(u32 prio) -{ - /* prio starts from 0x1000 while the ids starts from 0 */ - return prio >> 16; -} - static int ocelot_flower_parse_action(struct flow_cls_offload *f, struct ocelot_ace_rule *rule) { @@ -168,7 +162,7 @@ static int ocelot_flower_parse(struct flow_cls_offload *f, } finished_key_parsing: - ocelot_rule->prio = get_prio(f->common.prio); + ocelot_rule->prio = f->common.prio; ocelot_rule->id = f->cookie; return ocelot_flower_parse_action(f, ocelot_rule); } @@ -218,7 +212,7 @@ static int ocelot_flower_destroy(struct flow_cls_offload *f, struct ocelot_ace_rule rule; int ret; - rule.prio = get_prio(f->common.prio); + rule.prio = f->common.prio; rule.port = port_block->port; rule.id = f->cookie; @@ -236,7 +230,7 @@ static int ocelot_flower_stats_update(struct flow_cls_offload *f, struct ocelot_ace_rule rule; int ret; - rule.prio = get_prio(f->common.prio); + rule.prio = f->common.prio; rule.port = port_block->port; rule.id = f->cookie; ret = ocelot_ace_rule_stats_update(&rule); diff --git a/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c b/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c index 86e968cd5ffd..124a43dc136a 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c @@ -93,7 +93,7 @@ nfp_flower_install_rate_limiter(struct nfp_app *app, struct net_device *netdev, return -EOPNOTSUPP; } - if (flow->common.prio != (1 << 16)) { + if (flow->common.prio != 1) { NL_SET_ERR_MSG_MOD(extack, "unsupported offload: qos rate limit offload requires highest priority"); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 37c0bc699cd9..6c305b6ecad0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -94,7 +94,7 @@ static int tc_fill_entry(struct stmmac_priv *priv, struct stmmac_tc_entry *entry, *frag = NULL; struct tc_u32_sel *sel = cls->knode.sel; u32 off, data, mask, real_off, rem; - u32 prio = cls->common.prio; + u32 prio = cls->common.prio << 16; int ret; /* Only 1 match per entry */ diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e429809ca90d..98be18ef1ed3 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -646,7 +646,7 @@ tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; - cls_common->prio = tp->prio; + cls_common->prio = tp->prio >> 16; if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } -- cgit v1.2.3-59-g8ed1b From 3bc158f8d0330f0ac58597c023acca2234c14616 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Aug 2019 03:24:10 +0200 Subject: netfilter: nf_tables: map basechain priority to hardware priority This patch adds initial support for offloading basechains using the priority range from 1 to 65535. This is restricting the netfilter priority range to 16-bit integer since this is what most drivers assume so far from tc. It should be possible to extend this range of supported priorities later on once drivers are updated to support for 32-bit integer priorities. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_tables_offload.h | 2 ++ net/netfilter/nf_tables_api.c | 4 ++++ net/netfilter/nf_tables_offload.c | 17 ++++++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 3196663a10e3..c8b9dec376f5 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -73,4 +73,6 @@ int nft_flow_rule_offload_commit(struct net *net); (__reg)->key = __key; \ memset(&(__reg)->mask, 0xff, (__reg)->len); +int nft_chain_offload_priority(struct nft_base_chain *basechain); + #endif diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 88abbddf8967..d47469f824a1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1667,6 +1667,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, chain->flags |= NFT_BASE_CHAIN | flags; basechain->policy = NF_ACCEPT; + if (chain->flags & NFT_CHAIN_HW_OFFLOAD && + nft_chain_offload_priority(basechain) < 0) + return -EOPNOTSUPP; + flow_block_init(&basechain->flow_block); } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 64f5fd5f240e..c0d18c1d77ac 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -103,10 +103,11 @@ void nft_offload_update_dependency(struct nft_offload_ctx *ctx, } static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, - __be16 proto, - struct netlink_ext_ack *extack) + __be16 proto, int priority, + struct netlink_ext_ack *extack) { common->protocol = proto; + common->prio = priority; common->extack = extack; } @@ -124,6 +125,15 @@ static int nft_setup_cb_call(struct nft_base_chain *basechain, return 0; } +int nft_chain_offload_priority(struct nft_base_chain *basechain) +{ + if (basechain->ops.priority <= 0 || + basechain->ops.priority > USHRT_MAX) + return -1; + + return 0; +} + static int nft_flow_offload_rule(struct nft_trans *trans, enum flow_cls_command command) { @@ -142,7 +152,8 @@ static int nft_flow_offload_rule(struct nft_trans *trans, if (flow) proto = flow->proto; - nft_flow_offload_common_init(&cls_flow.common, proto, &extack); + nft_flow_offload_common_init(&cls_flow.common, proto, + basechain->ops.priority, &extack); cls_flow.command = command; cls_flow.cookie = (unsigned long) rule; if (flow) -- cgit v1.2.3-59-g8ed1b From 555df336c754ac9de1af9a5c72508918b3796b18 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 16:02:01 +0100 Subject: keys: Fix description size The maximum key description size is 4095. Commit f771fde82051 ("keys: Simplify key description management") inadvertantly reduced that to 255 and made sizes between 256 and 4095 work weirdly, and any size whereby size & 255 == 0 would cause an assertion in __key_link_begin() at the following line: BUG_ON(index_key->desc_len == 0); This can be fixed by simply increasing the size of desc_len in struct keyring_index_key to a u16. Note the argument length test in keyutils only checked empty descriptions and descriptions with a size around the limit (ie. 4095) and not for all the values in between, so it missed this. This has been addressed and https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/keyutils.git/commit/?id=066bf56807c26cd3045a25f355b34c1d8a20a5aa now exhaustively tests all possible lengths of type, description and payload and then some. The assertion failure looks something like: kernel BUG at security/keys/keyring.c:1245! ... RIP: 0010:__key_link_begin+0x88/0xa0 ... Call Trace: key_create_or_update+0x211/0x4b0 __x64_sys_add_key+0x101/0x200 do_syscall_64+0x5b/0x1e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 It can be triggered by: keyctl add user "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" a @s Fixes: f771fde82051 ("keys: Simplify key description management") Reported-by: kernel test robot Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/linux/key.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/key.h b/include/linux/key.h index 91f391cd272e..50028338a4cc 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -94,11 +94,11 @@ struct keyring_index_key { union { struct { #ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */ - u8 desc_len; - char desc[sizeof(long) - 1]; /* First few chars of description */ + u16 desc_len; + char desc[sizeof(long) - 2]; /* First few chars of description */ #else - char desc[sizeof(long) - 1]; /* First few chars of description */ - u8 desc_len; + char desc[sizeof(long) - 2]; /* First few chars of description */ + u16 desc_len; #endif }; unsigned long x; -- cgit v1.2.3-59-g8ed1b