From 1755c4b0372a2cf1e7124956b8cfebcb51083208 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 9 Jan 2024 11:49:06 +0000 Subject: [PATCH 001/327] dt-bindings: clock: gs101: rename cmu_misc clock-names 'bus' and 'ip' are sufficient because naming is local to the module. As the bindings have not made a release yet, rename the cmu_misc clock-names. Fixes: 0a910f160638 ("dt-bindings: clock: Add Google gs101 clock management unit bindings") Suggested-by: Rob Herring Signed-off-by: Tudor Ambarus Reviewed-by: Sam Protsenko Acked-by: Rob Herring Link: https://lore.kernel.org/r/20240109114908.3623645-2-tudor.ambarus@linaro.org Signed-off-by: Krzysztof Kozlowski --- .../devicetree/bindings/clock/google,gs101-clock.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml b/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml index 3eebc03a309b..ca7fdada3ff2 100644 --- a/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml +++ b/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml @@ -85,8 +85,8 @@ allOf: clock-names: items: - - const: dout_cmu_misc_bus - - const: dout_cmu_misc_sss + - const: bus + - const: sss additionalProperties: false From d76c762e7ee04af79e1c127422e0bbcb5f123018 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 9 Jan 2024 11:49:08 +0000 Subject: [PATCH 002/327] clk: samsung: clk-gs101: comply with the new dt cmu_misc clock names The cmu_misc clock-names were renamed to just "bus" and "sss" because naming is local to the module, so cmu_misc is implied. As the bindings and the device tree have not made a release yet, comply with the renamed clocks. Suggested-by: Rob Herring Signed-off-by: Tudor Ambarus Link: https://lore.kernel.org/r/20240109114908.3623645-4-tudor.ambarus@linaro.org Signed-off-by: Krzysztof Kozlowski --- drivers/clk/samsung/clk-gs101.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/samsung/clk-gs101.c b/drivers/clk/samsung/clk-gs101.c index 0964bb11657f..782993951fff 100644 --- a/drivers/clk/samsung/clk-gs101.c +++ b/drivers/clk/samsung/clk-gs101.c @@ -2475,7 +2475,7 @@ static const struct samsung_cmu_info misc_cmu_info __initconst = { .nr_clk_ids = CLKS_NR_MISC, .clk_regs = misc_clk_regs, .nr_clk_regs = ARRAY_SIZE(misc_clk_regs), - .clk_name = "dout_cmu_misc_bus", + .clk_name = "bus", }; /* ---- platform_driver ----------------------------------------------------- */ From 6dd9a236042e305d7b69ee92db7347bf5943e7d3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Jan 2024 16:04:14 +0100 Subject: [PATCH 003/327] soc: microchip: Fix POLARFIRE_SOC_SYS_CTRL input prompt The symbol's prompt should be a one-line description, instead of just duplicating the symbol name. Signed-off-by: Geert Uytterhoeven Signed-off-by: Conor Dooley --- drivers/soc/microchip/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/microchip/Kconfig b/drivers/soc/microchip/Kconfig index 9b0fdd95276e..19f4b576f822 100644 --- a/drivers/soc/microchip/Kconfig +++ b/drivers/soc/microchip/Kconfig @@ -1,5 +1,5 @@ config POLARFIRE_SOC_SYS_CTRL - tristate "POLARFIRE_SOC_SYS_CTRL" + tristate "Microchip PolarFire SoC (MPFS) system controller support" depends on POLARFIRE_SOC_MAILBOX depends on MTD help From 809aa64ebff51eb170ee31a95f83b2d21efa32e2 Mon Sep 17 00:00:00 2001 From: Zhipeng Lu Date: Fri, 12 Jan 2024 16:55:23 +0800 Subject: [PATCH 004/327] IB/hfi1: Fix a memleak in init_credit_return When dma_alloc_coherent fails to allocate dd->cr_base[i].va, init_credit_return should deallocate dd->cr_base and dd->cr_base[i] that allocated before. Or those resources would be never freed and a memleak is triggered. Fixes: 7724105686e7 ("IB/hfi1: add driver files") Signed-off-by: Zhipeng Lu Link: https://lore.kernel.org/r/20240112085523.3731720-1-alexious@zju.edu.cn Acked-by: Dennis Dalessandro Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/pio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 68c621ff59d0..5a91cbda4aee 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -2086,7 +2086,7 @@ int init_credit_return(struct hfi1_devdata *dd) "Unable to allocate credit return DMA range for NUMA %d\n", i); ret = -ENOMEM; - goto done; + goto free_cr_base; } } set_dev_node(&dd->pcidev->dev, dd->node); @@ -2094,6 +2094,10 @@ int init_credit_return(struct hfi1_devdata *dd) ret = 0; done: return ret; + +free_cr_base: + free_credit_return(dd); + goto done; } void free_credit_return(struct hfi1_devdata *dd) From 282fd66e2ef6e5d72b8fcd77efb2b282d2569464 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 22 Jan 2024 20:54:33 -0800 Subject: [PATCH 005/327] RDMA/bnxt_re: Avoid creating fence MR for newer adapters Limit the usage of fence MR to adapters older than Gen P5 products. Fixes: 1801d87b3598 ("RDMA/bnxt_re: Support new 5760X P7 devices") Signed-off-by: Kashyap Desai Signed-off-by: Bhargava Chenna Marreddy Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1705985677-15551-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 824349659d69..e1ea49263145 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -401,6 +401,10 @@ static void bnxt_re_create_fence_wqe(struct bnxt_re_pd *pd) struct bnxt_re_fence_data *fence = &pd->fence; struct ib_mr *ib_mr = &fence->mr->ib_mr; struct bnxt_qplib_swqe *wqe = &fence->bind_wqe; + struct bnxt_re_dev *rdev = pd->rdev; + + if (bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) + return; memset(wqe, 0, sizeof(*wqe)); wqe->type = BNXT_QPLIB_SWQE_TYPE_BIND_MW; @@ -455,6 +459,9 @@ static void bnxt_re_destroy_fence_mr(struct bnxt_re_pd *pd) struct device *dev = &rdev->en_dev->pdev->dev; struct bnxt_re_mr *mr = fence->mr; + if (bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) + return; + if (fence->mw) { bnxt_re_dealloc_mw(fence->mw); fence->mw = NULL; @@ -486,6 +493,9 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) struct ib_mw *mw; int rc; + if (bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) + return 0; + dma_addr = dma_map_single(dev, fence->va, BNXT_RE_FENCE_BYTES, DMA_BIDIRECTIONAL); rc = dma_mapping_error(dev, dma_addr); From 8fcbf0a55f71be7e562f10222abc9a34148189d0 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 22 Jan 2024 20:54:34 -0800 Subject: [PATCH 006/327] RDMA/bnxt_re: Remove a redundant check inside bnxt_re_vf_res_config After the cited commit, there is no possibility that this check can return true. Remove it. Fixes: a43c26fa2e6c ("RDMA/bnxt_re: Remove the sriov config callback") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1705985677-15551-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index f022c922fae5..54b4d2f3a5d8 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -280,9 +280,6 @@ static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) { - - if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) - return; rdev->num_vfs = pci_sriov_get_totalvfs(rdev->en_dev->pdev); if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { bnxt_re_set_resource_limits(rdev); From 8eaca6b5997bd8fd7039f2693e4ecf112823c816 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 22 Jan 2024 20:54:35 -0800 Subject: [PATCH 007/327] RDMA/bnxt_re: Fix unconditional fence for newer adapters Older adapters required an unconditional fence for non-wire memory operations. Newer adapters doesn't require this and therefore, disabling the unconditional fence. Fixes: 1801d87b3598 ("RDMA/bnxt_re: Support new 5760X P7 devices") Signed-off-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1705985677-15551-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 28 ++++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index e1ea49263145..fdb4fde1acbe 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2566,11 +2566,6 @@ static int bnxt_re_build_inv_wqe(const struct ib_send_wr *wr, wqe->type = BNXT_QPLIB_SWQE_TYPE_LOCAL_INV; wqe->local_inv.inv_l_key = wr->ex.invalidate_rkey; - /* Need unconditional fence for local invalidate - * opcode to work as expected. - */ - wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; - if (wr->send_flags & IB_SEND_SIGNALED) wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; if (wr->send_flags & IB_SEND_SOLICITED) @@ -2593,12 +2588,6 @@ static int bnxt_re_build_reg_wqe(const struct ib_reg_wr *wr, wqe->frmr.levels = qplib_frpl->hwq.level; wqe->type = BNXT_QPLIB_SWQE_TYPE_REG_MR; - /* Need unconditional fence for reg_mr - * opcode to function as expected. - */ - - wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; - if (wr->wr.send_flags & IB_SEND_SIGNALED) wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; @@ -2729,6 +2718,18 @@ bad: return rc; } +static void bnxt_re_legacy_set_uc_fence(struct bnxt_qplib_swqe *wqe) +{ + /* Need unconditional fence for non-wire memory opcode + * to work as expected. + */ + if (wqe->type == BNXT_QPLIB_SWQE_TYPE_LOCAL_INV || + wqe->type == BNXT_QPLIB_SWQE_TYPE_FAST_REG_MR || + wqe->type == BNXT_QPLIB_SWQE_TYPE_REG_MR || + wqe->type == BNXT_QPLIB_SWQE_TYPE_BIND_MW) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; +} + int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { @@ -2808,8 +2809,11 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr, rc = -EINVAL; goto bad; } - if (!rc) + if (!rc) { + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_re_legacy_set_uc_fence(&wqe); rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe); + } bad: if (rc) { ibdev_err(&qp->rdev->ibdev, From 3687b450c5f32e80f179ce4b09e0454da1449eac Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 22 Jan 2024 20:54:36 -0800 Subject: [PATCH 008/327] RDMA/bnxt_re: Return error for SRQ resize SRQ resize is not supported in the driver. But driver is not returning error from bnxt_re_modify_srq() for SRQ resize. Fixes: 37cb11acf1f7 ("RDMA/bnxt_re: Add SRQ support for Broadcom adapters") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1705985677-15551-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index fdb4fde1acbe..ce9c5bae83bf 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1827,7 +1827,7 @@ int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr, switch (srq_attr_mask) { case IB_SRQ_MAX_WR: /* SRQ resize is not supported */ - break; + return -EINVAL; case IB_SRQ_LIMIT: /* Change the SRQ threshold */ if (srq_attr->srq_limit > srq->qplib_srq.max_wqe) @@ -1842,13 +1842,12 @@ int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr, /* On success, update the shadow */ srq->srq_limit = srq_attr->srq_limit; /* No need to Build and send response back to udata */ - break; + return 0; default: ibdev_err(&rdev->ibdev, "Unsupported srq_attr_mask 0x%x", srq_attr_mask); return -EINVAL; } - return 0; } int bnxt_re_query_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr) From 80dde187f734cf9ccf988d5c2ef1a46b990660fd Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 22 Jan 2024 20:54:37 -0800 Subject: [PATCH 009/327] RDMA/bnxt_re: Add a missing check in bnxt_qplib_query_srq Before populating the response, driver has to check the status of HWRM command. Fixes: 37cb11acf1f7 ("RDMA/bnxt_re: Add SRQ support for Broadcom adapters") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1705985677-15551-6-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index c98e04fe2ddd..439d0c7c5d0c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -744,7 +744,8 @@ int bnxt_qplib_query_srq(struct bnxt_qplib_res *res, bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); - srq->threshold = le16_to_cpu(sb->srq_limit); + if (!rc) + srq->threshold = le16_to_cpu(sb->srq_limit); dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); From fc4657971be31ae679e2bbeee2fb8e93a7a063eb Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 17 Jan 2024 20:14:48 +0100 Subject: [PATCH 010/327] arm64: dts: rockchip: mark system power controller on rk3588-evb1 Mark the primary PMIC as system-power-controller, so that the system properly shuts down on poweroff. Signed-off-by: Sebastian Reichel Link: https://lore.kernel.org/r/20240117191555.86138-1-sebastian.reichel@collabora.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-evb1-v10.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-evb1-v10.dts b/arch/arm64/boot/dts/rockchip/rk3588-evb1-v10.dts index ac7c677b0fb9..de30c2632b8e 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-evb1-v10.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-evb1-v10.dts @@ -448,6 +448,7 @@ <&rk806_dvs2_null>, <&rk806_dvs3_null>; pinctrl-names = "default"; spi-max-frequency = <1000000>; + system-power-controller; vcc1-supply = <&vcc5v0_sys>; vcc2-supply = <&vcc5v0_sys>; From 2147caaac7349698f2a392c5e2911a6861a09650 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Mon, 29 Jan 2024 09:49:18 +0000 Subject: [PATCH 011/327] netfs: Fix i_dio_count leak on DIO read past i_size If netfs_begin_read gets a NETFS_DIO_READ request that begins past i_size, it won't perform any i/o and just return 0. This will leak an increment to i_dio_count that is done at the top of the function. This can cause subsequent buffered read requests to block indefinitely, waiting for a non existing dio operation to complete. Add a inode_dio_end() for the NETFS_DIO_READ case, before returning. Signed-off-by: Marc Dionne Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240129094924.1221977-2-dhowells@redhat.com Reviewed-by: Jeff Layton cc: Jeff Layton cc: cc: cc: Signed-off-by: Christian Brauner --- fs/netfs/io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index e8ff1e61ce79..4261ad6c55b6 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -748,6 +748,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) if (!rreq->submitted) { netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + if (rreq->origin == NETFS_DIO_READ) + inode_dio_end(rreq->inode); ret = 0; goto out; } From ca9ca1a5d5a980550db1001ea825f9fdfa550b83 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 29 Jan 2024 09:49:19 +0000 Subject: [PATCH 012/327] netfs: Fix missing zero-length check in unbuffered write Fix netfs_unbuffered_write_iter() to return immediately if generic_write_checks() returns 0, indicating there's nothing to write. Note that netfs_file_write_iter() already does this. Also, whilst we're at it, put in checks for the size being zero before we even take the locks. Note that generic_write_checks() can still reduce the size to zero, so we still need that check. Without this, a warning similar to the following is logged to dmesg: netfs: Zero-sized write [R=1b6da] and the syscall fails with EIO, e.g.: /sbin/ldconfig.real: Writing of cache extension data failed: Input/output error This can be reproduced on 9p by: xfs_io -f -c 'pwrite 0 0' /xfstest.test/foo Fixes: 153a9961b551 ("netfs: Implement unbuffered/DIO write support") Reported-by: Eric Van Hensbergen Link: https://lore.kernel.org/r/ZbQUU6QKmIftKsmo@FV7GG9FTHL/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240129094924.1221977-3-dhowells@redhat.com Tested-by: Dominique Martinet Reviewed-by: Jeff Layton cc: Dominique Martinet cc: Jeff Layton cc: cc: cc: cc: Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 3 +++ fs/netfs/direct_write.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index a3059b3168fd..9a0d32e4b422 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -477,6 +477,9 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + if (!iov_iter_count(from)) + return 0; + if ((iocb->ki_flags & IOCB_DIRECT) || test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) return netfs_unbuffered_write_iter(iocb, from); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 60a40d293c87..bee047e20f5d 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -139,6 +139,9 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + if (!iov_iter_count(from)) + return 0; + trace_netfs_write_iter(iocb, from); netfs_stat(&netfs_n_rh_dio_write); @@ -146,7 +149,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret < 0) return ret; ret = generic_write_checks(iocb, from); - if (ret < 0) + if (ret <= 0) goto out; ret = file_remove_privs(file); if (ret < 0) From 0abcac4fe3ca3aeaef40ad53c0b295b2d5f8cbf1 Mon Sep 17 00:00:00 2001 From: Samasth Norway Ananda Date: Thu, 25 Jan 2024 13:19:33 -0800 Subject: [PATCH 013/327] firmware: microchip: fix wrong sizeof argument response_msg is a pointer to an unsigned int (u32). So passing just response_msg to sizeof would not print the size of the variable. To get the size of response_msg we need to pass it as a pointer variable. Fixes: ec5b0f1193ad ("firmware: microchip: add PolarFire SoC Auto Update support") Signed-off-by: Samasth Norway Ananda Signed-off-by: Conor Dooley --- drivers/firmware/microchip/mpfs-auto-update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/microchip/mpfs-auto-update.c b/drivers/firmware/microchip/mpfs-auto-update.c index 81f5f62e34fc..682e417be5a3 100644 --- a/drivers/firmware/microchip/mpfs-auto-update.c +++ b/drivers/firmware/microchip/mpfs-auto-update.c @@ -167,7 +167,7 @@ static int mpfs_auto_update_verify_image(struct fw_upload *fw_uploader) u32 *response_msg; int ret; - response_msg = devm_kzalloc(priv->dev, AUTO_UPDATE_FEATURE_RESP_SIZE * sizeof(response_msg), + response_msg = devm_kzalloc(priv->dev, AUTO_UPDATE_FEATURE_RESP_SIZE * sizeof(*response_msg), GFP_KERNEL); if (!response_msg) return -ENOMEM; From eef5c7b28dbecd6b141987a96db6c54e49828102 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 29 Jan 2024 13:18:56 +0000 Subject: [PATCH 014/327] cxl/pci: Skip to handle RAS errors if CXL.mem device is detached The PCI AER model is an awkward fit for CXL error handling. While the expectation is that a PCI device can escalate to link reset to recover from an AER event, the same reset on CXL amounts to a surprise memory hotplug of massive amounts of memory. At present, the CXL error handler attempts some optimistic error handling to unbind the device from the cxl_mem driver after reaping some RAS register values. This results in a "hopeful" attempt to unplug the memory, but there is no guarantee that will succeed. A subsequent AER notification after the memdev unbind event can no longer assume the registers are mapped. Check for memdev bind before reaping status register values to avoid crashes of the form: BUG: unable to handle page fault for address: ffa00000195e9100 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page [...] RIP: 0010:__cxl_handle_ras+0x30/0x110 [cxl_core] [...] Call Trace: ? __die+0x24/0x70 ? page_fault_oops+0x82/0x160 ? kernelmode_fixup_or_oops+0x84/0x110 ? exc_page_fault+0x113/0x170 ? asm_exc_page_fault+0x26/0x30 ? __pfx_dpc_reset_link+0x10/0x10 ? __cxl_handle_ras+0x30/0x110 [cxl_core] ? find_cxl_port+0x59/0x80 [cxl_core] cxl_handle_rp_ras+0xbc/0xd0 [cxl_core] cxl_error_detected+0x6c/0xf0 [cxl_core] report_error_detected+0xc7/0x1c0 pci_walk_bus+0x73/0x90 pcie_do_recovery+0x23f/0x330 Longer term, the unbind and PCI_ERS_RESULT_DISCONNECT behavior might need to be replaced with a new PCI_ERS_RESULT_PANIC. Fixes: 6ac07883dbb5 ("cxl/pci: Add RCH downstream port error logging") Cc: stable@vger.kernel.org Suggested-by: Dan Williams Signed-off-by: Li Ming Link: https://lore.kernel.org/r/20240129131856.2458980-1-ming4.li@intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/pci.c | 43 ++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 6c9c8d92f8f7..480489f5644e 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -932,11 +932,21 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } void cxl_cor_error_detected(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct device *dev = &cxlds->cxlmd->dev; - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return; + } - cxl_handle_endpoint_cor_ras(cxlds); + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + + cxl_handle_endpoint_cor_ras(cxlds); + } } EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); @@ -948,16 +958,25 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, struct device *dev = &cxlmd->dev; bool ue; - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + /* + * A frozen channel indicates an impending reset which is fatal to + * CXL.mem operation, and will likely crash the system. On the off + * chance the situation is recoverable dump the status of the RAS + * capability registers and bounce the active state of the memdev. + */ + ue = cxl_handle_endpoint_ras(cxlds); + } - /* - * A frozen channel indicates an impending reset which is fatal to - * CXL.mem operation, and will likely crash the system. On the off - * chance the situation is recoverable dump the status of the RAS - * capability registers and bounce the active state of the memdev. - */ - ue = cxl_handle_endpoint_ras(cxlds); switch (state) { case pci_channel_io_normal: From 4d5e86a56615cc387d21c629f9af8fb0e958d350 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2024 11:29:11 +0200 Subject: [PATCH 015/327] RDMA/mlx5: Fix fortify source warning while accessing Eth segment ------------[ cut here ]------------ memcpy: detected field-spanning write (size 56) of single field "eseg->inline_hdr.start" at /var/lib/dkms/mlnx-ofed-kernel/5.8/build/drivers/infiniband/hw/mlx5/wr.c:131 (size 2) WARNING: CPU: 0 PID: 293779 at /var/lib/dkms/mlnx-ofed-kernel/5.8/build/drivers/infiniband/hw/mlx5/wr.c:131 mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] Modules linked in: 8021q garp mrp stp llc rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) ib_uverbs(OE) ib_core(OE) mlx5_core(OE) pci_hyperv_intf mlxdevm(OE) mlx_compat(OE) tls mlxfw(OE) psample nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables libcrc32c nfnetlink mst_pciconf(OE) knem(OE) vfio_pci vfio_pci_core vfio_iommu_type1 vfio iommufd irqbypass cuse nfsv3 nfs fscache netfs xfrm_user xfrm_algo ipmi_devintf ipmi_msghandler binfmt_misc crct10dif_pclmul crc32_pclmul polyval_clmulni polyval_generic ghash_clmulni_intel sha512_ssse3 snd_pcsp aesni_intel crypto_simd cryptd snd_pcm snd_timer joydev snd soundcore input_leds serio_raw evbug nfsd auth_rpcgss nfs_acl lockd grace sch_fq_codel sunrpc drm efi_pstore ip_tables x_tables autofs4 psmouse virtio_net net_failover failover floppy [last unloaded: mlx_compat(OE)] CPU: 0 PID: 293779 Comm: ssh Tainted: G OE 6.2.0-32-generic #32~22.04.1-Ubuntu Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] Code: 0c 01 00 a8 01 75 25 48 8b 75 a0 b9 02 00 00 00 48 c7 c2 10 5b fd c0 48 c7 c7 80 5b fd c0 c6 05 57 0c 03 00 01 e8 95 4d 93 da <0f> 0b 44 8b 4d b0 4c 8b 45 c8 48 8b 4d c0 e9 49 fb ff ff 41 0f b7 RSP: 0018:ffffb5b48478b570 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffb5b48478b628 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffb5b48478b5e8 R13: ffff963a3c609b5e R14: ffff9639c3fbd800 R15: ffffb5b480475a80 FS: 00007fc03b444c80(0000) GS:ffff963a3dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000556f46bdf000 CR3: 0000000006ac6003 CR4: 00000000003706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_regs+0x72/0x90 ? mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] ? __warn+0x8d/0x160 ? mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] ? report_bug+0x1bb/0x1d0 ? handle_bug+0x46/0x90 ? exc_invalid_op+0x19/0x80 ? asm_exc_invalid_op+0x1b/0x20 ? mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] mlx5_ib_post_send_nodrain+0xb/0x20 [mlx5_ib] ipoib_send+0x2ec/0x770 [ib_ipoib] ipoib_start_xmit+0x5a0/0x770 [ib_ipoib] dev_hard_start_xmit+0x8e/0x1e0 ? validate_xmit_skb_list+0x4d/0x80 sch_direct_xmit+0x116/0x3a0 __dev_xmit_skb+0x1fd/0x580 __dev_queue_xmit+0x284/0x6b0 ? _raw_spin_unlock_irq+0xe/0x50 ? __flush_work.isra.0+0x20d/0x370 ? push_pseudo_header+0x17/0x40 [ib_ipoib] neigh_connected_output+0xcd/0x110 ip_finish_output2+0x179/0x480 ? __smp_call_single_queue+0x61/0xa0 __ip_finish_output+0xc3/0x190 ip_finish_output+0x2e/0xf0 ip_output+0x78/0x110 ? __pfx_ip_finish_output+0x10/0x10 ip_local_out+0x64/0x70 __ip_queue_xmit+0x18a/0x460 ip_queue_xmit+0x15/0x30 __tcp_transmit_skb+0x914/0x9c0 tcp_write_xmit+0x334/0x8d0 tcp_push_one+0x3c/0x60 tcp_sendmsg_locked+0x2e1/0xac0 tcp_sendmsg+0x2d/0x50 inet_sendmsg+0x43/0x90 sock_sendmsg+0x68/0x80 sock_write_iter+0x93/0x100 vfs_write+0x326/0x3c0 ksys_write+0xbd/0xf0 ? do_syscall_64+0x69/0x90 __x64_sys_write+0x19/0x30 do_syscall_64+0x59/0x90 ? do_user_addr_fault+0x1d0/0x640 ? exit_to_user_mode_prepare+0x3b/0xd0 ? irqentry_exit_to_user_mode+0x9/0x20 ? irqentry_exit+0x43/0x50 ? exc_page_fault+0x92/0x1b0 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fc03ad14a37 Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007ffdf8697fe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000008024 RCX: 00007fc03ad14a37 RDX: 0000000000008024 RSI: 0000556f46bd8270 RDI: 0000000000000003 RBP: 0000556f46bb1800 R08: 0000000000007fe3 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 R13: 0000556f46bc66b0 R14: 000000000000000a R15: 0000556f46bb2f50 ---[ end trace 0000000000000000 ]--- Link: https://lore.kernel.org/r/8228ad34bd1a25047586270f7b1fb4ddcd046282.1706433934.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/wr.c | 2 +- include/linux/mlx5/qp.h | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c index df1d1b0a3ef7..9947feb7fb8a 100644 --- a/drivers/infiniband/hw/mlx5/wr.c +++ b/drivers/infiniband/hw/mlx5/wr.c @@ -78,7 +78,7 @@ static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp, */ copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start, left); - memcpy(eseg->inline_hdr.start, pdata, copysz); + memcpy(eseg->inline_hdr.data, pdata, copysz); stride = ALIGN(sizeof(struct mlx5_wqe_eth_seg) - sizeof(eseg->inline_hdr.start) + copysz, 16); *size += stride / 16; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index bd53cf4be7bd..f0e55bf3ec8b 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -269,7 +269,10 @@ struct mlx5_wqe_eth_seg { union { struct { __be16 sz; - u8 start[2]; + union { + u8 start[2]; + DECLARE_FLEX_ARRAY(u8, data); + }; } inline_hdr; struct { __be16 type; From 43fdbd140238d44e7e847232719fef7d20f9d326 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 28 Jan 2024 11:29:12 +0200 Subject: [PATCH 016/327] IB/mlx5: Don't expose debugfs entries for RRoCE general parameters if not supported debugfs entries for RRoCE general CC parameters must be exposed only when they are supported, otherwise when accessing them there may be a syndrome error in kernel log, for example: $ cat /sys/kernel/debug/mlx5/0000:08:00.1/cc_params/rtt_resp_dscp cat: '/sys/kernel/debug/mlx5/0000:08:00.1/cc_params/rtt_resp_dscp': Invalid argument $ dmesg mlx5_core 0000:08:00.1: mlx5_cmd_out_err:805:(pid 1253): QUERY_CONG_PARAMS(0x824) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x325a82), err(-22) Fixes: 66fb1d5df6ac ("IB/mlx5: Extend debug control for CC parameters") Reviewed-by: Edward Srouji Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/e7ade70bad52b7468bdb1de4d41d5fad70c8b71c.1706433934.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/cong.c | 6 ++++++ include/linux/mlx5/mlx5_ifc.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c index f87531318feb..a78a067e3ce7 100644 --- a/drivers/infiniband/hw/mlx5/cong.c +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -458,6 +458,12 @@ void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num) dbg_cc_params->root = debugfs_create_dir("cc_params", mlx5_debugfs_get_dev_root(mdev)); for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { + if ((i == MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP_VALID || + i == MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP)) + if (!MLX5_CAP_GEN(mdev, roce) || + !MLX5_CAP_ROCE(mdev, roce_cc_general)) + continue; + dbg_cc_params->params[i].offset = i; dbg_cc_params->params[i].dev = dev; dbg_cc_params->params[i].port_num = port_num; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bf5320b28b8b..2c10350bd422 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1103,7 +1103,7 @@ struct mlx5_ifc_roce_cap_bits { u8 sw_r_roce_src_udp_port[0x1]; u8 fl_rc_qp_when_roce_disabled[0x1]; u8 fl_rc_qp_when_roce_enabled[0x1]; - u8 reserved_at_7[0x1]; + u8 roce_cc_general[0x1]; u8 qp_ooo_transmit_default[0x1]; u8 reserved_at_9[0x15]; u8 qp_ts_format[0x2]; From be551ee1574280ef8afbf7c271212ac3e38933ef Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 28 Jan 2024 11:29:13 +0200 Subject: [PATCH 017/327] RDMA/mlx5: Relax DEVX access upon modify commands Relax DEVX access upon modify commands to be UVERBS_ACCESS_READ. The kernel doesn't need to protect what firmware protects, or what causes no damage to anyone but the user. As firmware needs to protect itself from parallel access to the same object, don't block parallel modify/query commands on the same object in the kernel side. This change will allow user space application to run parallel updates to different entries in the same bulk object. Tested-by: Tamar Mashiah Signed-off-by: Yishai Hadas Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/7407d5ed35dc427c1097699e12b49c01e1073406.1706433934.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/devx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 869369cb5b5f..253fea374a72 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2949,7 +2949,7 @@ DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_DEVX_OBJ_MODIFY, UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, UVERBS_IDR_ANY_OBJECT, - UVERBS_ACCESS_WRITE, + UVERBS_ACCESS_READ, UA_MANDATORY), UVERBS_ATTR_PTR_IN( MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, From f03869698bc3bd6d9d2d9f216b20da08a8c2508a Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Wed, 20 Dec 2023 01:02:42 +0100 Subject: [PATCH 018/327] arm64: dts: imx8mp: Disable UART4 by default on Data Modul i.MX8M Plus eDM SBC UART4 is used as CM7 coprocessor debug UART and may not be accessible from Linux in case it is protected by RDC. The RDC protection is set up by the platform firmware. UART4 is not used on this platform by Linux. Disable UART4 by default to prevent boot hangs, which occur when the RDC protection is in place. Fixes: 562d222f23f0 ("arm64: dts: imx8mp: Add support for Data Modul i.MX8M Plus eDM SBC") Signed-off-by: Marek Vasut Reviewed-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts b/arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts index d98a040860a4..5828c9d7821d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts @@ -486,7 +486,7 @@ &uart4 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_uart4>; - status = "okay"; + status = "disabled"; }; &usb3_phy0 { From bd97cea7b18a0a553773af806dfbfac27a7c4acb Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 31 Jan 2024 17:38:46 -0600 Subject: [PATCH 019/327] RDMA/irdma: Fix KASAN issue with tasklet KASAN testing revealed the following issue assocated with freeing an IRQ. [50006.466686] Call Trace: [50006.466691] [50006.489538] dump_stack+0x5c/0x80 [50006.493475] print_address_description.constprop.6+0x1a/0x150 [50006.499872] ? irdma_sc_process_ceq+0x483/0x790 [irdma] [50006.505742] ? irdma_sc_process_ceq+0x483/0x790 [irdma] [50006.511644] kasan_report.cold.11+0x7f/0x118 [50006.516572] ? irdma_sc_process_ceq+0x483/0x790 [irdma] [50006.522473] irdma_sc_process_ceq+0x483/0x790 [irdma] [50006.528232] irdma_process_ceq+0xb2/0x400 [irdma] [50006.533601] ? irdma_hw_flush_wqes_callback+0x370/0x370 [irdma] [50006.540298] irdma_ceq_dpc+0x44/0x100 [irdma] [50006.545306] tasklet_action_common.isra.14+0x148/0x2c0 [50006.551096] __do_softirq+0x1d0/0xaf8 [50006.555396] irq_exit_rcu+0x219/0x260 [50006.559670] irq_exit+0xa/0x20 [50006.563320] smp_apic_timer_interrupt+0x1bf/0x690 [50006.568645] apic_timer_interrupt+0xf/0x20 [50006.573341] The issue is that a tasklet could be pending on another core racing the delete of the irq. Fix by insuring any scheduled tasklet is killed after deleting the irq. Fixes: 44d9e52977a1 ("RDMA/irdma: Implement device initialization definitions") Signed-off-by: Mike Marciniszyn Signed-off-by: Shiraz Saleem Signed-off-by: Sindhu Devale Link: https://lore.kernel.org/r/20240131233849.400285-2-sindhu.devale@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/hw.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index bd4b2b896444..2f8d18d8be3b 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -570,6 +570,13 @@ static void irdma_destroy_irq(struct irdma_pci_f *rf, dev->irq_ops->irdma_dis_irq(dev, msix_vec->idx); irq_update_affinity_hint(msix_vec->irq, NULL); free_irq(msix_vec->irq, dev_id); + if (rf == dev_id) { + tasklet_kill(&rf->dpc_tasklet); + } else { + struct irdma_ceq *iwceq = (struct irdma_ceq *)dev_id; + + tasklet_kill(&iwceq->dpc_tasklet); + } } /** From ee107186bcfd25d7873258f3f75440e20f5e6416 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Wed, 31 Jan 2024 17:38:47 -0600 Subject: [PATCH 020/327] RDMA/irdma: Validate max_send_wr and max_recv_wr Validate that max_send_wr and max_recv_wr is within the supported range. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Change-Id: I2fc8b10292b641fddd20b36986a9dae90a93f4be Signed-off-by: Shiraz Saleem Signed-off-by: Sindhu Devale Link: https://lore.kernel.org/r/20240131233849.400285-3-sindhu.devale@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index b5eb8d421988..cb828e3da478 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -839,7 +839,9 @@ static int irdma_validate_qp_attrs(struct ib_qp_init_attr *init_attr, if (init_attr->cap.max_inline_data > uk_attrs->max_hw_inline || init_attr->cap.max_send_sge > uk_attrs->max_hw_wq_frags || - init_attr->cap.max_recv_sge > uk_attrs->max_hw_wq_frags) + init_attr->cap.max_recv_sge > uk_attrs->max_hw_wq_frags || + init_attr->cap.max_send_wr > uk_attrs->max_hw_wq_quanta || + init_attr->cap.max_recv_wr > uk_attrs->max_hw_rq_quanta) return -EINVAL; if (rdma_protocol_roce(&iwdev->ibdev, 1)) { From 666047f3ece9f991774c1fe9b223139a9ef8908d Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Wed, 31 Jan 2024 17:38:48 -0600 Subject: [PATCH 021/327] RDMA/irdma: Set the CQ read threshold for GEN 1 The CQ shadow read threshold is currently not set for GEN 2. This could cause an invalid CQ overflow condition, so remove the GEN check that exclused GEN 1. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Sindhu Devale Link: https://lore.kernel.org/r/20240131233849.400285-4-sindhu.devale@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index cb828e3da478..0b046c061742 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2186,9 +2186,8 @@ static int irdma_create_cq(struct ib_cq *ibcq, info.cq_base_pa = iwcq->kmem.pa; } - if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) - info.shadow_read_threshold = min(info.cq_uk_init_info.cq_size / 2, - (u32)IRDMA_MAX_CQ_READ_THRESH); + info.shadow_read_threshold = min(info.cq_uk_init_info.cq_size / 2, + (u32)IRDMA_MAX_CQ_READ_THRESH); if (irdma_sc_cq_init(cq, &info)) { ibdev_dbg(&iwdev->ibdev, "VERBS: init cq fail\n"); From 630bdb6f28ca9e5ff79e244030170ac788478332 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Wed, 31 Jan 2024 17:38:49 -0600 Subject: [PATCH 022/327] RDMA/irdma: Add AE for too many RNRS Add IRDMA_AE_LLP_TOO_MANY_RNRS to the list of AE's processed as an abnormal asyncronous event. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Sindhu Devale Link: https://lore.kernel.org/r/20240131233849.400285-5-sindhu.devale@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/defs.h | 1 + drivers/infiniband/hw/irdma/hw.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/irdma/defs.h b/drivers/infiniband/hw/irdma/defs.h index 8fb752f2eda2..2cb4b96db721 100644 --- a/drivers/infiniband/hw/irdma/defs.h +++ b/drivers/infiniband/hw/irdma/defs.h @@ -346,6 +346,7 @@ enum irdma_cqp_op_type { #define IRDMA_AE_LLP_TOO_MANY_KEEPALIVE_RETRIES 0x050b #define IRDMA_AE_LLP_DOUBT_REACHABILITY 0x050c #define IRDMA_AE_LLP_CONNECTION_ESTABLISHED 0x050e +#define IRDMA_AE_LLP_TOO_MANY_RNRS 0x050f #define IRDMA_AE_RESOURCE_EXHAUSTION 0x0520 #define IRDMA_AE_RESET_SENT 0x0601 #define IRDMA_AE_TERMINATE_SENT 0x0602 diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 2f8d18d8be3b..ad50b77282f8 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -387,6 +387,7 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) case IRDMA_AE_LLP_TOO_MANY_RETRIES: case IRDMA_AE_LCE_QP_CATASTROPHIC: case IRDMA_AE_LCE_FUNCTION_CATASTROPHIC: + case IRDMA_AE_LLP_TOO_MANY_RNRS: case IRDMA_AE_LCE_CQ_CATASTROPHIC: case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: default: From e6f57c6881916df39db7d95981a8ad2b9c3458d6 Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Thu, 1 Feb 2024 09:10:08 +0100 Subject: [PATCH 023/327] IB/hfi1: Fix sdma.h tx->num_descs off-by-one error Unfortunately the commit `fd8958efe877` introduced another error causing the `descs` array to overflow. This reults in further crashes easily reproducible by `sendmsg` system call. [ 1080.836473] general protection fault, probably for non-canonical address 0x400300015528b00a: 0000 [#1] PREEMPT SMP PTI [ 1080.869326] RIP: 0010:hfi1_ipoib_build_ib_tx_headers.constprop.0+0xe1/0x2b0 [hfi1] -- [ 1080.974535] Call Trace: [ 1080.976990] [ 1081.021929] hfi1_ipoib_send_dma_common+0x7a/0x2e0 [hfi1] [ 1081.027364] hfi1_ipoib_send_dma_list+0x62/0x270 [hfi1] [ 1081.032633] hfi1_ipoib_send+0x112/0x300 [hfi1] [ 1081.042001] ipoib_start_xmit+0x2a9/0x2d0 [ib_ipoib] [ 1081.046978] dev_hard_start_xmit+0xc4/0x210 -- [ 1081.148347] __sys_sendmsg+0x59/0xa0 crash> ipoib_txreq 0xffff9cfeba229f00 struct ipoib_txreq { txreq = { list = { next = 0xffff9cfeba229f00, prev = 0xffff9cfeba229f00 }, descp = 0xffff9cfeba229f40, coalesce_buf = 0x0, wait = 0xffff9cfea4e69a48, complete = 0xffffffffc0fe0760 , packet_len = 0x46d, tlen = 0x0, num_desc = 0x0, desc_limit = 0x6, next_descq_idx = 0x45c, coalesce_idx = 0x0, flags = 0x0, descs = {{ qw = {0x8024000120dffb00, 0x4} # SDMA_DESC0_FIRST_DESC_FLAG (bit 63) }, { qw = { 0x3800014231b108, 0x4} }, { qw = { 0x310000e4ee0fcf0, 0x8} }, { qw = { 0x3000012e9f8000, 0x8} }, { qw = { 0x59000dfb9d0000, 0x8} }, { qw = { 0x78000e02e40000, 0x8} }} }, sdma_hdr = 0x400300015528b000, <<< invalid pointer in the tx request structure sdma_status = 0x0, SDMA_DESC0_LAST_DESC_FLAG (bit 62) complete = 0x0, priv = 0x0, txq = 0xffff9cfea4e69880, skb = 0xffff9d099809f400 } If an SDMA send consists of exactly 6 descriptors and requires dword padding (in the 7th descriptor), the sdma_txreq descriptor array is not properly expanded and the packet will overflow into the container structure. This results in a panic when the send completion runs. The exact panic varies depending on what elements of the container structure get corrupted. The fix is to use the correct expression in _pad_sdma_tx_descs() to test the need to expand the descriptor array. With this patch the crashes are no longer reproducible and the machine is stable. Fixes: fd8958efe877 ("IB/hfi1: Fix sdma.h tx->num_descs off-by-one errors") Cc: stable@vger.kernel.org Reported-by: Mats Kronberg Tested-by: Mats Kronberg Signed-off-by: Daniel Vacek Link: https://lore.kernel.org/r/20240201081009.1109442-1-neelx@redhat.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 6e5ac2023328..b67d23b1f286 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -3158,7 +3158,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) { int rval = 0; - if ((unlikely(tx->num_desc + 1 == tx->desc_limit))) { + if ((unlikely(tx->num_desc == tx->desc_limit))) { rval = _extend_sdma_tx_descs(dd, tx); if (rval) { __sdma_txclean(dd, tx); From a41f91b4da1490b90ae5859f3464e94d418fea2c Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Thu, 1 Feb 2024 20:11:03 +0800 Subject: [PATCH 024/327] arm64: dts: rockchip: aliase sdmmc as mmc1 for Cool Pi 4B Follow others rk3588 based boards, and u-boot only use mmc0/1 as mmc boot targets, so aliase sdmmc as mmc1. Fixes: 3f5d336d64d6 ("arm64: dts: rockchip: Add support for rk3588s based board Cool Pi 4B") Signed-off-by: Andy Yan Reviewed-by: Dragan Simic Link: https://lore.kernel.org/r/20240201121106.1471301-1-andyshrk@163.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts index ef4f058c20ff..e037bf9db75a 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts @@ -19,8 +19,8 @@ aliases { mmc0 = &sdhci; - mmc1 = &sdio; - mmc2 = &sdmmc; + mmc1 = &sdmmc; + mmc2 = &sdio; }; analog-sound { From cebda3dd36bebb11d3e6df4d1944a1cdcf3cd4cf Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Thu, 1 Feb 2024 20:11:04 +0800 Subject: [PATCH 025/327] arm64: dts: rockchip: aliase sdmmc as mmc1 for Cool Pi CM5 EVB Follow others rk3588 based boards, and u-boot only use mmc0/1 as mmc boot targets, so aliase sdmmc as mmc1. Fixes: 791c154c3982 ("arm64: dts: rockchip: Add support for rk3588 based board Cool Pi CM5 EVB") Signed-off-by: Andy Yan Reviewed-by: Dragan Simic Link: https://lore.kernel.org/r/20240201121106.1471301-2-andyshrk@163.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi index 0b02f4d6e003..cce1c8e83587 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi @@ -16,8 +16,8 @@ aliases { mmc0 = &sdhci; - mmc1 = &sdio; - mmc2 = &sdmmc; + mmc1 = &sdmmc; + mmc2 = &sdio; serial2 = &uart2; }; From c7e8dbb3bc12f389d617e6c0e9537f11a1fa6eb1 Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Thu, 1 Feb 2024 20:11:05 +0800 Subject: [PATCH 026/327] arm64: dts: rockchip: rename vcc5v0_usb30_host regulator for Cool Pi CM5 EVB According to the schematic, USB20 HOST0 and HOST1 each have their own independent power supply, but these two regulators controlled by a same GPIO, so give it a more appropriate name. Fixes: 791c154c3982 ("arm64: dts: rockchip: Add support for rk3588 based board Cool Pi CM5 EVB") Signed-off-by: Andy Yan Reviewed-by: Dragan Simic Link: https://lore.kernel.org/r/20240201121106.1471301-3-andyshrk@163.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts index d4c70835e0fe..609f35ee4b0e 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts @@ -72,7 +72,7 @@ vin-supply = <&vcc3v3_sys>; }; - vcc5v0_usb30_host: vcc5v0-usb30-host-regulator { + vcc5v0_usb_host1: vcc5v0_usb_host2: vcc5v0-usb-host-regulator { compatible = "regulator-fixed"; regulator-name = "vcc5v0_host"; regulator-boot-on; @@ -188,12 +188,12 @@ }; &u2phy2_host { - phy-supply = <&vcc5v0_usb30_host>; + phy-supply = <&vcc5v0_usb_host1>; status = "okay"; }; &u2phy3_host { - phy-supply = <&vcc5v0_usb30_host>; + phy-supply = <&vcc5v0_usb_host2>; status = "okay"; }; From 5556a8c3af8b4ee648b0fd4cdb4a1356d92de07b Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Thu, 1 Feb 2024 20:11:06 +0800 Subject: [PATCH 027/327] arm64: dts: rockchip: Fix the num-lanes of pcie3x4 on Cool Pi CM5 EVB The 4 lane pcie30 phy is shared by pcie3x4 and pcie3x2, so the num-lanes of pcie3x4 should be 2. Fixes: 791c154c3982 ("arm64: dts: rockchip: Add support for rk3588 based board Cool Pi CM5 EVB") Signed-off-by: Andy Yan Link: https://lore.kernel.org/r/20240201121106.1471301-4-andyshrk@163.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts index 609f35ee4b0e..a4946cdc3bb3 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-evb.dts @@ -114,6 +114,7 @@ status = "okay"; }; +/* Standard pcie */ &pcie3x2 { reset-gpios = <&gpio3 RK_PB0 GPIO_ACTIVE_HIGH>; vpcie3v3-supply = <&vcc3v3_sys>; @@ -122,6 +123,7 @@ /* M.2 M-Key ssd */ &pcie3x4 { + num-lanes = <2>; reset-gpios = <&gpio4 RK_PB6 GPIO_ACTIVE_HIGH>; vpcie3v3-supply = <&vcc3v3_sys>; status = "okay"; From f98643d8daf3443e3b414a82d0cb3d745f8c8bbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 29 Jan 2024 12:32:02 +0100 Subject: [PATCH 028/327] ARM: dts: rockchip: Drop interrupts property from pwm-rockchip nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The binding doesn't define interrupts and adding such a definition was refused because it's unclear how they should ever be used and the relevant registers are outside the PWM range. So drop them fixing several dtbs_check warnings like: arch/arm/boot/dts/rockchip/rv1108-elgin-r1.dtb: pwm@10280030: 'interrupts' does not match any of the regexes: 'pinctrl-[0-9]+' from schema $id: http://devicetree.org/schemas/pwm/pwm-rockchip.yaml# Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240129113205.2453029-2-u.kleine-koenig@pengutronix.de Signed-off-by: Heiko Stuebner --- arch/arm/boot/dts/rockchip/rv1108.dtsi | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm/boot/dts/rockchip/rv1108.dtsi b/arch/arm/boot/dts/rockchip/rv1108.dtsi index abf3006f0a84..f3291f3bbc6f 100644 --- a/arch/arm/boot/dts/rockchip/rv1108.dtsi +++ b/arch/arm/boot/dts/rockchip/rv1108.dtsi @@ -196,7 +196,6 @@ pwm4: pwm@10280000 { compatible = "rockchip,rv1108-pwm", "rockchip,rk3288-pwm"; reg = <0x10280000 0x10>; - interrupts = ; clocks = <&cru SCLK_PWM>, <&cru PCLK_PWM>; clock-names = "pwm", "pclk"; pinctrl-names = "default"; @@ -208,7 +207,6 @@ pwm5: pwm@10280010 { compatible = "rockchip,rv1108-pwm", "rockchip,rk3288-pwm"; reg = <0x10280010 0x10>; - interrupts = ; clocks = <&cru SCLK_PWM>, <&cru PCLK_PWM>; clock-names = "pwm", "pclk"; pinctrl-names = "default"; @@ -220,7 +218,6 @@ pwm6: pwm@10280020 { compatible = "rockchip,rv1108-pwm", "rockchip,rk3288-pwm"; reg = <0x10280020 0x10>; - interrupts = ; clocks = <&cru SCLK_PWM>, <&cru PCLK_PWM>; clock-names = "pwm", "pclk"; pinctrl-names = "default"; @@ -232,7 +229,6 @@ pwm7: pwm@10280030 { compatible = "rockchip,rv1108-pwm", "rockchip,rk3288-pwm"; reg = <0x10280030 0x10>; - interrupts = ; clocks = <&cru SCLK_PWM>, <&cru PCLK_PWM>; clock-names = "pwm", "pclk"; pinctrl-names = "default"; @@ -386,7 +382,6 @@ pwm0: pwm@20040000 { compatible = "rockchip,rv1108-pwm", "rockchip,rk3288-pwm"; reg = <0x20040000 0x10>; - interrupts = ; clocks = <&cru SCLK_PWM0_PMU>, <&cru PCLK_PWM0_PMU>; clock-names = "pwm", "pclk"; pinctrl-names = "default"; @@ -398,7 +393,6 @@ pwm1: pwm@20040010 { compatible = "rockchip,rv1108-pwm", "rockchip,rk3288-pwm"; reg = <0x20040010 0x10>; - interrupts = ; clocks = <&cru SCLK_PWM0_PMU>, <&cru PCLK_PWM0_PMU>; clock-names = "pwm", "pclk"; pinctrl-names = "default"; @@ -410,7 +404,6 @@ pwm2: pwm@20040020 { compatible = "rockchip,rv1108-pwm", "rockchip,rk3288-pwm"; reg = <0x20040020 0x10>; - interrupts = ; clocks = <&cru SCLK_PWM0_PMU>, <&cru PCLK_PWM0_PMU>; clock-names = "pwm", "pclk"; pinctrl-names = "default"; @@ -422,7 +415,6 @@ pwm3: pwm@20040030 { compatible = "rockchip,rv1108-pwm", "rockchip,rk3288-pwm"; reg = <0x20040030 0x10>; - interrupts = ; clocks = <&cru SCLK_PWM0_PMU>, <&cru PCLK_PWM0_PMU>; clock-names = "pwm", "pclk"; pinctrl-names = "default"; From bab7ec1d80fe5998ce5ab4480f15e7cce43e2cdf Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 29 Jan 2024 12:48:51 +0100 Subject: [PATCH 029/327] arm64: dts: rockchip: drop unneeded status from rk3588-jaguar gpio-leds The default status is okay, so it is definitly not necessary to set it on a newly added node. Fixes: d1b8b36a2cc5 ("arm64: dts: rockchip: add Theobroma Jaguar SBC") Signed-off-by: Heiko Stuebner Reviewed-by: Quentin Schulz Link: https://lore.kernel.org/r/20240129114851.2019861-1-heiko@sntech.de --- arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts b/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts index 4ce70fb75a30..39d65002add1 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts @@ -62,7 +62,6 @@ compatible = "gpio-leds"; pinctrl-names = "default"; pinctrl-0 = <&led1_pin>; - status = "okay"; /* LED1 on PCB */ led-1 { From fdfa083549de5d50ebf7f6811f33757781e838c0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 4 Feb 2024 16:42:07 -0800 Subject: [PATCH 030/327] RDMA/srpt: Support specifying the srpt_service_guid parameter Make loading ib_srpt with this parameter set work. The current behavior is that setting that parameter while loading the ib_srpt kernel module triggers the following kernel crash: BUG: kernel NULL pointer dereference, address: 0000000000000000 Call Trace: parse_one+0x18c/0x1d0 parse_args+0xe1/0x230 load_module+0x8de/0xa60 init_module_from_file+0x8b/0xd0 idempotent_init_module+0x181/0x240 __x64_sys_finit_module+0x5a/0xb0 do_syscall_64+0x5f/0xe0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Cc: LiHonggang Reported-by: LiHonggang Fixes: a42d985bd5b2 ("ib_srpt: Initial SRP Target merge for v3.3-rc1") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240205004207.17031-1-bvanassche@acm.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/srpt/ib_srpt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 58f70cfec45a..d2dce6ce30a9 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -79,12 +79,16 @@ module_param(srpt_srq_size, int, 0444); MODULE_PARM_DESC(srpt_srq_size, "Shared receive queue (SRQ) size."); +static int srpt_set_u64_x(const char *buffer, const struct kernel_param *kp) +{ + return kstrtou64(buffer, 16, (u64 *)kp->arg); +} static int srpt_get_u64_x(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "0x%016llx\n", *(u64 *)kp->arg); } -module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid, - 0444); +module_param_call(srpt_service_guid, srpt_set_u64_x, srpt_get_u64_x, + &srpt_service_guid, 0444); MODULE_PARM_DESC(srpt_service_guid, "Using this value for ioc_guid, id_ext, and cm_listen_id instead of using the node_guid of the first HCA."); From a620a7f2ae8b08c5beea6369f61e87064ee222dc Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Wed, 10 Jan 2024 10:08:49 +0100 Subject: [PATCH 031/327] arm64: dts: tqma8mpql: fix audio codec iov-supply IOVDD is supplied by 1.8V, fix the referenced regulator. Fixes: d8f9d8126582d ("arm64: dts: imx8mp: Add analog audio output on i.MX8MP TQMa8MPxL/MBa8MPxL") Signed-off-by: Alexander Stein Signed-off-by: Shawn Guo --- .../boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts index a2d5d19b2de0..86d3da36e4f3 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts @@ -184,6 +184,13 @@ enable-active-high; }; + reg_vcc_1v8: regulator-1v8 { + compatible = "regulator-fixed"; + regulator-name = "VCC_1V8"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + reg_vcc_3v3: regulator-3v3 { compatible = "regulator-fixed"; regulator-name = "VCC_3V3"; @@ -480,7 +487,7 @@ clock-names = "mclk"; clocks = <&audio_blk_ctrl IMX8MP_CLK_AUDIOMIX_SAI3_MCLK1>; reset-gpios = <&gpio4 29 GPIO_ACTIVE_LOW>; - iov-supply = <®_vcc_3v3>; + iov-supply = <®_vcc_1v8>; ldoin-supply = <®_vcc_3v3>; }; From f954785a124e77d4e6bb52cab689a8de447999aa Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 13 Jan 2024 19:43:32 -0300 Subject: [PATCH 032/327] Revert "arm64: dts: imx8mp-dhcom-pdk3: Describe the USB-C connector" This reverts commit a4dca89fe8a1585af73e362f5f4e3189a00abf8e. Marek reported: "This patch breaks USB-C port on this board. If I plug in USB-C storage device, it is not detected. If I revert this patch, it is detected. Please drop this patch for now." Revert it to avoid the regression. Reported-by: Marek Vasut Closes: https://lore.kernel.org/linux-arm-kernel/623c294a-5c53-4e01-acbf-104acc180e14@denx.de/T/#u Fixes: a4dca89fe8a1 ("arm64: dts: imx8mp-dhcom-pdk3: Describe the USB-C connector") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-dhcom-pdk3.dts | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-pdk3.dts b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-pdk3.dts index fea67a9282f0..b749e28e5ede 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-pdk3.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-pdk3.dts @@ -175,14 +175,10 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_ptn5150>; - connector { - compatible = "usb-c-connector"; - label = "USB-C"; + port { - port { - ptn5150_out_ep: endpoint { - remote-endpoint = <&dwc3_0_ep>; - }; + ptn5150_out_ep: endpoint { + remote-endpoint = <&dwc3_0_ep>; }; }; }; From 690085d866f08cc72ae4d601821564a0b63e32f3 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 13 Jan 2024 19:43:33 -0300 Subject: [PATCH 033/327] Revert "arm64: dts: imx8mn-var-som-symphony: Describe the USB-C connector" This reverts commit 095b96b2b8c6dcabf40bbfe4bb99a95fe55c7463. Marek reported the similar change in imx8mp-dhcom-pdk3.dts caused a regression: "This patch breaks USB-C port on this board. If I plug in USB-C storage device, it is not detected. If I revert this patch, it is detected. Please drop this patch for now." Revert it here as well. Fixes: 095b96b2b8c6 ("arm64: dts: imx8mn-var-som-symphony: Describe the USB-C connector") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- .../boot/dts/freescale/imx8mn-var-som-symphony.dts | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mn-var-som-symphony.dts b/arch/arm64/boot/dts/freescale/imx8mn-var-som-symphony.dts index f38ee2266b25..a6b94d1957c9 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-var-som-symphony.dts +++ b/arch/arm64/boot/dts/freescale/imx8mn-var-som-symphony.dts @@ -128,14 +128,9 @@ pinctrl-0 = <&pinctrl_ptn5150>; status = "okay"; - connector { - compatible = "usb-c-connector"; - label = "USB-C"; - - port { - typec1_dr_sw: endpoint { - remote-endpoint = <&usb1_drd_sw>; - }; + port { + typec1_dr_sw: endpoint { + remote-endpoint = <&usb1_drd_sw>; }; }; }; From 7bca405c986075c99b9f729d3587b5c45db39d01 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 19 Jan 2024 19:50:26 +0100 Subject: [PATCH 034/327] bus: imx-weim: fix valid range check When the range parsing was open-coded the number of u32 entries to parse had to be a multiple of 4 and the driver checks this. With the range parsing converted to the range parser the counting changes from individual u32 entries to a complete range, so the check must not reject counts not divisible by 4. Fixes: 2a88e4792c6d ("bus: imx-weim: Remove open coded "ranges" parsing") Signed-off-by: Lucas Stach Signed-off-by: Shawn Guo --- drivers/bus/imx-weim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bus/imx-weim.c b/drivers/bus/imx-weim.c index 6b5da73c8541..837bf9d51c6e 100644 --- a/drivers/bus/imx-weim.c +++ b/drivers/bus/imx-weim.c @@ -120,7 +120,7 @@ static int imx_weim_gpr_setup(struct platform_device *pdev) i++; } - if (i == 0 || i % 4) + if (i == 0) goto err; for (i = 0; i < ARRAY_SIZE(gprvals); i++) { From a4ab7dedaee0e39b15653c5fd0367e420739f7ef Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:07 +0000 Subject: [PATCH 035/327] iommufd/iova_bitmap: Bounds check mapped::pages access Dirty IOMMU hugepages reported on a base page page-size granularity can lead to an attempt to set dirty pages in the bitmap beyond the limits that are pinned. Bounds check the page index of the array we are trying to access is within the limits before we kmap() and return otherwise. While it is also a defensive check, this is also in preparation to defer setting bits (outside the mapped range) to the next iteration(s) when the pages become available. Fixes: b058ea3ab5af ("vfio/iova_bitmap: refactor iova_bitmap_set() to better handle page boundaries") Link: https://lore.kernel.org/r/20240202133415.23819-2-joao.m.martins@oracle.com Signed-off-by: Joao Martins Tested-by: Avihai Horon Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iova_bitmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 0a92c9eeaf7f..a3606b4c2229 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -409,6 +409,7 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + unsigned long last_page_idx = mapped->npages - 1; do { unsigned int page_idx = cur_bit / BITS_PER_PAGE; @@ -417,6 +418,9 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, last_bit - cur_bit + 1); void *kaddr; + if (unlikely(page_idx > last_page_idx)) + break; + kaddr = kmap_local_page(mapped->pages[page_idx]); bitmap_set(kaddr, offset, nbits); kunmap_local(kaddr); From d18411ec305728c6371806c4fb09be07016aad0b Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:08 +0000 Subject: [PATCH 036/327] iommufd/iova_bitmap: Switch iova_bitmap::bitmap to an u8 array iova_bitmap_mapped_length() don't deal correctly with the small bitmaps (< 2M bitmaps) when the starting address isn't u64 aligned, leading to skipping a tiny part of the IOVA range. This is materialized as not marking data dirty that should otherwise have been. Fix that by using a u8 * in the internal state of IOVA bitmap. Most of the data structures use the type of the bitmap to adjust its indexes, thus changing the type of the bitmap decreases the granularity of the bitmap indexes. Fixes: b058ea3ab5af ("vfio/iova_bitmap: refactor iova_bitmap_set() to better handle page boundaries") Link: https://lore.kernel.org/r/20240202133415.23819-3-joao.m.martins@oracle.com Signed-off-by: Joao Martins Tested-by: Avihai Horon Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iova_bitmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index a3606b4c2229..9d42ab51a6bb 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -100,7 +100,7 @@ struct iova_bitmap { struct iova_bitmap_map mapped; /* userspace address of the bitmap */ - u64 __user *bitmap; + u8 __user *bitmap; /* u64 index that @mapped points to */ unsigned long mapped_base_index; @@ -162,7 +162,7 @@ static int iova_bitmap_get(struct iova_bitmap *bitmap) { struct iova_bitmap_map *mapped = &bitmap->mapped; unsigned long npages; - u64 __user *addr; + u8 __user *addr; long ret; /* @@ -247,7 +247,7 @@ struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, mapped = &bitmap->mapped; mapped->pgshift = __ffs(page_size); - bitmap->bitmap = data; + bitmap->bitmap = (u8 __user *)data; bitmap->mapped_total_index = iova_bitmap_offset_to_index(bitmap, length - 1) + 1; bitmap->iova = iova; @@ -304,7 +304,7 @@ static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; remaining = min_t(unsigned long, remaining, - bytes / sizeof(*bitmap->bitmap)); + DIV_ROUND_UP(bytes, sizeof(*bitmap->bitmap))); return remaining; } From 42af95114535dd94c39714b97ad720602d406b9a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:09 +0000 Subject: [PATCH 037/327] iommufd/selftest: Test u64 unaligned bitmaps Exercise the dirty tracking bitmaps with byte unaligned addresses in addition to the PAGE_SIZE unaligned bitmaps, using a address towards the end of the page boundary. In doing so, increase the tailroom we allocate for the bitmap from MOCK_PAGE_SIZE(2K) into PAGE_SIZE(4K), such that we can test end of bitmap boundary. Link: https://lore.kernel.org/r/20240202133415.23819-4-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 1a881e7a21d1..49774a720314 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1741,9 +1741,9 @@ FIXTURE_SETUP(iommufd_dirty_tracking) self->bitmap_size = variant->buffer_size / self->page_size / BITS_PER_BYTE; - /* Provision with an extra (MOCK_PAGE_SIZE) for the unaligned case */ + /* Provision with an extra (PAGE_SIZE) for the unaligned case */ rc = posix_memalign(&self->bitmap, PAGE_SIZE, - self->bitmap_size + MOCK_PAGE_SIZE); + self->bitmap_size + PAGE_SIZE); assert(!rc); assert(self->bitmap); assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); @@ -1873,6 +1873,13 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) self->bitmap + MOCK_PAGE_SIZE, self->bitmap_size, 0, _metadata); + /* u64 unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + 0xff1, + self->bitmap_size, 0, _metadata); + + test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } @@ -1907,6 +1914,14 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, _metadata); + /* u64 unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + 0xff1, + self->bitmap_size, + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, + _metadata); + test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } From 2780025e01e2e1c92f83ee7da91d9727c2e58a3e Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:10 +0000 Subject: [PATCH 038/327] iommufd/iova_bitmap: Handle recording beyond the mapped pages IOVA bitmap is a zero-copy scheme of recording dirty bits that iterate the different bitmap user pages at chunks of a maximum of PAGE_SIZE/sizeof(struct page*) pages. When the iterations are split up into 64G, the end of the range may be broken up in a way that's aligned with a non base page PTE size. This leads to only part of the huge page being recorded in the bitmap. Note that in pratice this is only a problem for IOMMU dirty tracking i.e. when the backing PTEs are in IOMMU hugepages and the bitmap is in base page granularity. So far this not something that affects VF dirty trackers (which reports and records at the same granularity). To fix that, if there is a remainder of bits left to set in which the current IOVA bitmap doesn't cover, make a copy of the bitmap structure and iterate-and-set the rest of the bits remaining. Finally, when advancing the iterator, skip all the bits that were set ahead. Link: https://lore.kernel.org/r/20240202133415.23819-5-joao.m.martins@oracle.com Reported-by: Avihai Horon Fixes: f35f22cc760e ("iommu/vt-d: Access/Dirty bit support for SS domains") Fixes: 421a511a293f ("iommu/amd: Access/Dirty bit support in IOPTEs") Signed-off-by: Joao Martins Tested-by: Avihai Horon Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iova_bitmap.c | 43 +++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 9d42ab51a6bb..b370e8ee8866 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -113,6 +113,9 @@ struct iova_bitmap { /* length of the IOVA range for the whole bitmap */ size_t length; + + /* length of the IOVA range set ahead the pinned pages */ + unsigned long set_ahead_length; }; /* @@ -341,6 +344,32 @@ static bool iova_bitmap_done(struct iova_bitmap *bitmap) return bitmap->mapped_base_index >= bitmap->mapped_total_index; } +static int iova_bitmap_set_ahead(struct iova_bitmap *bitmap, + size_t set_ahead_length) +{ + int ret = 0; + + while (set_ahead_length > 0 && !iova_bitmap_done(bitmap)) { + unsigned long length = iova_bitmap_mapped_length(bitmap); + unsigned long iova = iova_bitmap_mapped_iova(bitmap); + + ret = iova_bitmap_get(bitmap); + if (ret) + break; + + length = min(length, set_ahead_length); + iova_bitmap_set(bitmap, iova, length); + + set_ahead_length -= length; + bitmap->mapped_base_index += + iova_bitmap_offset_to_index(bitmap, length - 1) + 1; + iova_bitmap_put(bitmap); + } + + bitmap->set_ahead_length = 0; + return ret; +} + /* * Advances to the next range, releases the current pinned * pages and pins the next set of bitmap pages. @@ -357,6 +386,15 @@ static int iova_bitmap_advance(struct iova_bitmap *bitmap) if (iova_bitmap_done(bitmap)) return 0; + /* Iterate, set and skip any bits requested for next iteration */ + if (bitmap->set_ahead_length) { + int ret; + + ret = iova_bitmap_set_ahead(bitmap, bitmap->set_ahead_length); + if (ret) + return ret; + } + /* When advancing the index we pin the next set of bitmap pages */ return iova_bitmap_get(bitmap); } @@ -426,5 +464,10 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, kunmap_local(kaddr); cur_bit += nbits; } while (cur_bit <= last_bit); + + if (unlikely(cur_bit <= last_bit)) { + bitmap->set_ahead_length = + ((last_bit - cur_bit + 1) << bitmap->mapped.pgshift); + } } EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD); From 407fc184f0e0bfde61026d6ce3fb1e70a15159a3 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:11 +0000 Subject: [PATCH 039/327] iommufd/selftest: Refactor dirty bitmap tests Rework the functions that test and set the bitmaps to receive a new parameter (the pte_page_size) that reflects the expected PTE size in the page tables. The same scheme is still used i.e. even bits are dirty and odd page indexes aren't dirty. Here it just refactors to consider the size of the PTE rather than hardcoded to IOMMU mock base page assumptions. While at it, refactor dirty bitmap tests to use the idev_id created by the fixture instead of creating a new one. This is in preparation for doing tests with IOMMU hugepages where multiple bits set as part of recording a whole hugepage as dirty and thus the pte_page_size will vary depending on io hugepages or io base pages. Link: https://lore.kernel.org/r/20240202133415.23819-6-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 28 ++++++------- tools/testing/selftests/iommu/iommufd_utils.h | 39 ++++++++++++------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 49774a720314..56c3e511a0ab 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1849,7 +1849,7 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) { - uint32_t stddev_id; + uint32_t page_size = MOCK_PAGE_SIZE; uint32_t hwpt_id; uint32_t ioas_id; @@ -1859,34 +1859,31 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) test_cmd_hwpt_alloc(self->idev_id, ioas_id, IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); - test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); test_cmd_set_dirty_tracking(hwpt_id, true); test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap, self->bitmap_size, 0, _metadata); /* PAGE_SIZE unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap + MOCK_PAGE_SIZE, self->bitmap_size, 0, _metadata); /* u64 unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, - self->bitmap + 0xff1, - self->bitmap_size, 0, _metadata); + MOCK_APERTURE_START, self->page_size, page_size, + self->bitmap + 0xff1, self->bitmap_size, 0, + _metadata); - - test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) { - uint32_t stddev_id; + uint32_t page_size = MOCK_PAGE_SIZE; uint32_t hwpt_id; uint32_t ioas_id; @@ -1896,19 +1893,18 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) test_cmd_hwpt_alloc(self->idev_id, ioas_id, IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); - test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); test_cmd_set_dirty_tracking(hwpt_id, true); test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap, self->bitmap_size, IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, _metadata); /* Unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, + MOCK_APERTURE_START, self->page_size, page_size, self->bitmap + MOCK_PAGE_SIZE, self->bitmap_size, IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, @@ -1916,13 +1912,11 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) /* u64 unaligned bitmap */ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, - MOCK_APERTURE_START, self->page_size, - self->bitmap + 0xff1, - self->bitmap_size, + MOCK_APERTURE_START, self->page_size, page_size, + self->bitmap + 0xff1, self->bitmap_size, IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, _metadata); - test_ioctl_destroy(stddev_id); test_ioctl_destroy(hwpt_id); } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index c646264aa41f..8d2b46b2114d 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -344,16 +344,19 @@ static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length, page_size, bitmap, nr)) static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, - __u64 iova, size_t page_size, __u64 *bitmap, + __u64 iova, size_t page_size, + size_t pte_page_size, __u64 *bitmap, __u64 bitmap_size, __u32 flags, struct __test_metadata *_metadata) { - unsigned long i, nbits = bitmap_size * BITS_PER_BYTE; - unsigned long nr = nbits / 2; + unsigned long npte = pte_page_size / page_size, pteset = 2 * npte; + unsigned long nbits = bitmap_size * BITS_PER_BYTE; + unsigned long j, i, nr = nbits / pteset ?: 1; __u64 out_dirty = 0; /* Mark all even bits as dirty in the mock domain */ - for (i = 0; i < nbits; i += 2) + memset(bitmap, 0, bitmap_size); + for (i = 0; i < nbits; i += pteset) set_bit(i, (unsigned long *)bitmap); test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, @@ -365,8 +368,12 @@ static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap, flags); /* Beware ASSERT_EQ() is two statements -- braces are not redundant! */ - for (i = 0; i < nbits; i++) { - ASSERT_EQ(!(i % 2), test_bit(i, (unsigned long *)bitmap)); + for (i = 0; i < nbits; i += pteset) { + for (j = 0; j < pteset; j++) { + ASSERT_EQ(j < npte, + test_bit(i + j, (unsigned long *)bitmap)); + } + ASSERT_EQ(!(i % pteset), test_bit(i, (unsigned long *)bitmap)); } memset(bitmap, 0, bitmap_size); @@ -374,19 +381,23 @@ static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, flags); /* It as read already -- expect all zeroes */ - for (i = 0; i < nbits; i++) { - ASSERT_EQ(!(i % 2) && (flags & - IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR), - test_bit(i, (unsigned long *)bitmap)); + for (i = 0; i < nbits; i += pteset) { + for (j = 0; j < pteset; j++) { + ASSERT_EQ( + (j < npte) && + (flags & + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR), + test_bit(i + j, (unsigned long *)bitmap)); + } } return 0; } -#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, bitmap, \ - bitmap_size, flags, _metadata) \ +#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, pte_size,\ + bitmap, bitmap_size, flags, _metadata) \ ASSERT_EQ(0, _test_mock_dirty_bitmaps(self->fd, hwpt_id, length, iova, \ - page_size, bitmap, bitmap_size, \ - flags, _metadata)) + page_size, pte_size, bitmap, \ + bitmap_size, flags, _metadata)) static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) From 02a8c61a8b06a4a082b58c3e643b28036c6be60f Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:12 +0000 Subject: [PATCH 040/327] iommufd/selftest: Refactor mock_domain_read_and_clear_dirty() Move the clearing of the dirty bit of the mock domain into mock_domain_test_and_clear_dirty() helper, simplifying the caller function. Additionally, rework the mock_domain_read_and_clear_dirty() loop to iterate over a potentially variable IO page size. No functional change intended with the loop refactor. This is in preparation for dirty tracking support for IOMMU hugepage mock domains. Link: https://lore.kernel.org/r/20240202133415.23819-7-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 64 ++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d9e9920c7eba..796e7e3ec0cf 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -191,6 +191,34 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, return 0; } +static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock, + unsigned long iova, size_t page_size, + unsigned long flags) +{ + unsigned long cur, end = iova + page_size - 1; + bool dirty = false; + void *ent, *old; + + for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) { + ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); + if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) + continue; + + dirty = true; + /* Clear dirty */ + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { + unsigned long val; + + val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + } + } + + return dirty; +} + static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, unsigned long iova, size_t size, unsigned long flags, @@ -198,31 +226,29 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, { struct mock_iommu_domain *mock = container_of(domain, struct mock_iommu_domain, domain); - unsigned long i, max = size / MOCK_IO_PAGE_SIZE; - void *ent, *old; + unsigned long end = iova + size; + void *ent; if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) return -EINVAL; - for (i = 0; i < max; i++) { - unsigned long cur = iova + i * MOCK_IO_PAGE_SIZE; + do { + unsigned long pgsize = MOCK_IO_PAGE_SIZE; + unsigned long head; - ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); - if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) { - /* Clear dirty */ - if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { - unsigned long val; - - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, - cur / MOCK_IO_PAGE_SIZE, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - } - iommu_dirty_bitmap_record(dirty, cur, - MOCK_IO_PAGE_SIZE); + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + if (!ent) { + iova += pgsize; + continue; } - } + + head = iova & ~(pgsize - 1); + + /* Clear dirty */ + if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) + iommu_dirty_bitmap_record(dirty, head, pgsize); + iova = head + pgsize; + } while (iova < end); return 0; } From 7db521e23fe9e36855b61b01a67291281118570e Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:13 +0000 Subject: [PATCH 041/327] iommufd/selftest: Hugepage mock domain support Add support to mock iommu hugepages of 1M (for a 2K mock io page size). To avoid breaking test suite defaults, the way this is done is by explicitly creating a iommu mock device which has hugepage support (i.e. through MOCK_FLAGS_DEVICE_HUGE_IOVA). The same scheme is maintained of mock base page index tracking in the XArray, except that an extra bit is added to mark it as a hugepage. One subpage containing the dirty bit, means that the whole hugepage is dirty (similar to AMD IOMMU non-standard page sizes). For clearing, same thing applies, and it must clear all dirty subpages. This is in preparation for dirty tracking to mark mock hugepages as dirty to exercise all the iova-bitmap fixes. Link: https://lore.kernel.org/r/20240202133415.23819-8-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 1 + drivers/iommu/iommufd/selftest.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 482d4059f5db..e854d3f67205 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -45,6 +45,7 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, + MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, }; enum { diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 796e7e3ec0cf..8abf9747773e 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -41,6 +41,7 @@ static atomic_t mock_dev_num; enum { MOCK_DIRTY_TRACK = 1, MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, + MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, /* * Like a real page table alignment requires the low bits of the address @@ -53,6 +54,7 @@ enum { MOCK_PFN_START_IOVA = _MOCK_PFN_START, MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, + MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, }; /* @@ -242,6 +244,8 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, continue; } + if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA) + pgsize = MOCK_HUGE_PAGE_SIZE; head = iova & ~(pgsize - 1); /* Clear dirty */ @@ -260,6 +264,7 @@ const struct iommu_dirty_ops dirty_ops = { static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); struct mock_iommu_domain *mock; mock = kzalloc(sizeof(*mock), GFP_KERNEL); @@ -268,6 +273,8 @@ static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) mock->domain.geometry.aperture_start = MOCK_APERTURE_START; mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) + mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; mock->domain.ops = mock_ops.default_domain_ops; mock->domain.type = IOMMU_DOMAIN_UNMANAGED; xa_init(&mock->pfns); @@ -313,7 +320,7 @@ mock_domain_alloc_user(struct device *dev, u32 flags, return ERR_PTR(-EOPNOTSUPP); if (user_data || (has_dirty_flag && no_dirty_ops)) return ERR_PTR(-EOPNOTSUPP); - domain = mock_domain_alloc_paging(NULL); + domain = mock_domain_alloc_paging(dev); if (!domain) return ERR_PTR(-ENOMEM); if (has_dirty_flag) @@ -376,6 +383,9 @@ static int mock_domain_map_pages(struct iommu_domain *domain, if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) flags = MOCK_PFN_LAST_IOVA; + if (pgsize != MOCK_IO_PAGE_SIZE) { + flags |= MOCK_PFN_HUGE_IOVA; + } old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | flags), @@ -630,7 +640,8 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) struct mock_dev *mdev; int rc; - if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY)) + if (dev_flags & + ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) return ERR_PTR(-EINVAL); mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); From fe13166f0562117c42fc60a109e664a914523e63 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:14 +0000 Subject: [PATCH 042/327] iommufd/selftest: Add mock IO hugepages tests Leverage previously added MOCK_FLAGS_DEVICE_HUGE_IOVA flag to create an IOMMU domain with more than MOCK_IO_PAGE_SIZE supported. Plumb the hugetlb backing memory for buffer allocation and change the expected page size to MOCK_HUGE_PAGE_SIZE (1M) when hugepage variant test cases are used. These so far are limited to 128M and 256M IOVA range tests cases which is when 1M hugepages can be used. Link: https://lore.kernel.org/r/20240202133415.23819-9-joao.m.martins@oracle.com Signed-off-by: Joao Martins Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 45 +++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 56c3e511a0ab..edf1c99c9936 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -12,6 +12,7 @@ static unsigned long HUGEPAGE_SIZE; #define MOCK_PAGE_SIZE (PAGE_SIZE / 2) +#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE) static unsigned long get_huge_page_size(void) { @@ -1716,10 +1717,12 @@ FIXTURE(iommufd_dirty_tracking) FIXTURE_VARIANT(iommufd_dirty_tracking) { unsigned long buffer_size; + bool hugepages; }; FIXTURE_SETUP(iommufd_dirty_tracking) { + int mmap_flags; void *vrc; int rc; @@ -1732,9 +1735,17 @@ FIXTURE_SETUP(iommufd_dirty_tracking) variant->buffer_size, rc); } + mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED; + if (variant->hugepages) { + /* + * MAP_POPULATE will cause the kernel to fail mmap if THPs are + * not available. + */ + mmap_flags |= MAP_HUGETLB | MAP_POPULATE; + } assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0); vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + mmap_flags, -1, 0); assert(vrc == self->buffer); self->page_size = MOCK_PAGE_SIZE; @@ -1749,8 +1760,16 @@ FIXTURE_SETUP(iommufd_dirty_tracking) assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); test_ioctl_ioas_alloc(&self->ioas_id); - test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, - &self->idev_id); + /* Enable 1M mock IOMMU hugepages */ + if (variant->hugepages) { + test_cmd_mock_domain_flags(self->ioas_id, + MOCK_FLAGS_DEVICE_HUGE_IOVA, + &self->stdev_id, &self->hwpt_id, + &self->idev_id); + } else { + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, + &self->hwpt_id, &self->idev_id); + } } FIXTURE_TEARDOWN(iommufd_dirty_tracking) @@ -1784,12 +1803,26 @@ FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M) .buffer_size = 128UL * 1024UL * 1024UL, }; +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M_huge) +{ + /* 4K bitmap (128M IOVA range) */ + .buffer_size = 128UL * 1024UL * 1024UL, + .hugepages = true, +}; + FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M) { /* 8K bitmap (256M IOVA range) */ .buffer_size = 256UL * 1024UL * 1024UL, }; +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M_huge) +{ + /* 8K bitmap (256M IOVA range) */ + .buffer_size = 256UL * 1024UL * 1024UL, + .hugepages = true, +}; + TEST_F(iommufd_dirty_tracking, enforce_dirty) { uint32_t ioas_id, stddev_id, idev_id; @@ -1853,6 +1886,9 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) uint32_t hwpt_id; uint32_t ioas_id; + if (variant->hugepages) + page_size = MOCK_HUGE_PAGE_SIZE; + test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); @@ -1887,6 +1923,9 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) uint32_t hwpt_id; uint32_t ioas_id; + if (variant->hugepages) + page_size = MOCK_HUGE_PAGE_SIZE; + test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); From 4bbcbc6ea2fa379632a24c14cfb47aa603816ac6 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2024 13:34:15 +0000 Subject: [PATCH 043/327] iommufd/iova_bitmap: Consider page offset for the pages to be pinned For small bitmaps that aren't PAGE_SIZE aligned *and* that are less than 512 pages in bitmap length, use an extra page to be able to cover the entire range e.g. [1M..3G] which would be iterated more efficiently in a single iteration, rather than two. Fixes: b058ea3ab5af ("vfio/iova_bitmap: refactor iova_bitmap_set() to better handle page boundaries") Link: https://lore.kernel.org/r/20240202133415.23819-10-joao.m.martins@oracle.com Signed-off-by: Joao Martins Tested-by: Avihai Horon Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iova_bitmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index b370e8ee8866..db8c46bee155 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -178,18 +178,19 @@ static int iova_bitmap_get(struct iova_bitmap *bitmap) bitmap->mapped_base_index) * sizeof(*bitmap->bitmap), PAGE_SIZE); - /* - * We always cap at max number of 'struct page' a base page can fit. - * This is, for example, on x86 means 2M of bitmap data max. - */ - npages = min(npages, PAGE_SIZE / sizeof(struct page *)); - /* * Bitmap address to be pinned is calculated via pointer arithmetic * with bitmap u64 word index. */ addr = bitmap->bitmap + bitmap->mapped_base_index; + /* + * We always cap at max number of 'struct page' a base page can fit. + * This is, for example, on x86 means 2M of bitmap data max. + */ + npages = min(npages + !!offset_in_page(addr), + PAGE_SIZE / sizeof(struct page *)); + ret = pin_user_pages_fast((unsigned long)addr, npages, FOLL_WRITE, mapped->pages); if (ret <= 0) From 82b143aeb169b8b55798d7d2063032e1a6ceeeb0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 5 Feb 2024 10:39:20 +0100 Subject: [PATCH 044/327] Revert "parisc: Only list existing CPUs in cpu_possible_mask" This reverts commit 0921244f6f4f0d05698b953fe632a99b38907226. It broke CPU hotplugging because it modifies the __cpu_possible_mask after bootup, so that it will be different than nr_cpu_ids, which then effictively breaks the workqueue setup code and triggers crashes when shutting down CPUs at runtime. Guenter was the first who noticed the wrong values in __cpu_possible_mask, since the cpumask Kunit tests were failig. Reverting this commit fixes both issues, but sadly brings back this uncritical runtime warning: register_cpu_capacity_sysctl: too early to get CPU4 device! Signed-off-by: Helge Deller Reported-by: Guenter Roeck Link: https://lkml.org/lkml/2024/2/4/146 Link: https://lore.kernel.org/lkml/Zb0mbHlIud_bqftx@slm.duckdns.org/t/ Cc: stable@vger.kernel.org # 6.0+ --- arch/parisc/kernel/processor.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index e95a977ba5f3..bf73562706b2 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -172,7 +172,6 @@ static int __init processor_probe(struct parisc_device *dev) p->cpu_num = cpu_info.cpu_num; p->cpu_loc = cpu_info.cpu_loc; - set_cpu_possible(cpuid, true); store_cpu_topology(cpuid); #ifdef CONFIG_SMP @@ -474,13 +473,6 @@ static struct parisc_driver cpu_driver __refdata = { */ void __init processor_init(void) { - unsigned int cpu; - reset_cpu_topology(); - - /* reset possible mask. We will mark those which are possible. */ - for_each_possible_cpu(cpu) - set_cpu_possible(cpu, false); - register_parisc_driver(&cpu_driver); } From 855678ed8534518e2b428bcbcec695de9ba248e8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 1 Feb 2024 17:25:51 +0800 Subject: [PATCH 045/327] md: Fix missing release of 'active_io' for flush submit_flushes atomic_set(&mddev->flush_pending, 1); rdev_for_each_rcu(rdev, mddev) atomic_inc(&mddev->flush_pending); bi->bi_end_io = md_end_flush submit_bio(bi); /* flush io is done first */ md_end_flush if (atomic_dec_and_test(&mddev->flush_pending)) percpu_ref_put(&mddev->active_io) -> active_io is not released if (atomic_dec_and_test(&mddev->flush_pending)) -> missing release of active_io For consequence, mddev_suspend() will wait for 'active_io' to be zero forever. Fix this problem by releasing 'active_io' in submit_flushes() if 'flush_pending' is decreased to zero. Fixes: fa2bbff7b0b4 ("md: synchronize flush io with array reconfiguration") Cc: stable@vger.kernel.org # v6.1+ Reported-by: Blazej Kucman Closes: https://lore.kernel.org/lkml/20240130172524.0000417b@linux.intel.com/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240201092559.910982-7-yukuai1@huaweicloud.com --- drivers/md/md.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2266358d8074..e52896c140bd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -579,8 +579,12 @@ static void submit_flushes(struct work_struct *ws) rcu_read_lock(); } rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) + if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pair is percpu_ref_get() from md_flush_request() */ + percpu_ref_put(&mddev->active_io); + queue_work(md_wq, &mddev->flush_work); + } } static void md_submit_flush_data(struct work_struct *ws) From c0ec2a712daf133d9996a8a1b7ee2d4996080363 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Tue, 30 Jan 2024 19:27:40 +0800 Subject: [PATCH 046/327] crypto: virtio/akcipher - Fix stack overflow on memcpy sizeof(struct virtio_crypto_akcipher_session_para) is less than sizeof(struct virtio_crypto_op_ctrl_req::u), copying more bytes from stack variable leads stack overflow. Clang reports this issue by commands: make -j CC=clang-14 mrproper >/dev/null 2>&1 make -j O=/tmp/crypto-build CC=clang-14 allmodconfig >/dev/null 2>&1 make -j O=/tmp/crypto-build W=1 CC=clang-14 drivers/crypto/virtio/ virtio_crypto_akcipher_algs.o Fixes: 59ca6c93387d ("virtio-crypto: implement RSA algorithm") Link: https://lore.kernel.org/all/0a194a79-e3a3-45e7-be98-83abd3e1cb7e@roeck-us.net/ Cc: Signed-off-by: zhenwei pi Tested-by: Nathan Chancellor # build Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Herbert Xu --- drivers/crypto/virtio/virtio_crypto_akcipher_algs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c index 2621ff8a9376..de53eddf6796 100644 --- a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c +++ b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c @@ -104,7 +104,8 @@ static void virtio_crypto_dataq_akcipher_callback(struct virtio_crypto_request * } static int virtio_crypto_alg_akcipher_init_session(struct virtio_crypto_akcipher_ctx *ctx, - struct virtio_crypto_ctrl_header *header, void *para, + struct virtio_crypto_ctrl_header *header, + struct virtio_crypto_akcipher_session_para *para, const uint8_t *key, unsigned int keylen) { struct scatterlist outhdr_sg, key_sg, inhdr_sg, *sgs[3]; @@ -128,7 +129,7 @@ static int virtio_crypto_alg_akcipher_init_session(struct virtio_crypto_akcipher ctrl = &vc_ctrl_req->ctrl; memcpy(&ctrl->header, header, sizeof(ctrl->header)); - memcpy(&ctrl->u, para, sizeof(ctrl->u)); + memcpy(&ctrl->u.akcipher_create_session.para, para, sizeof(*para)); input = &vc_ctrl_req->input; input->status = cpu_to_le32(VIRTIO_CRYPTO_ERR); From 7f8b81ca35f909747842b649025af541ec172b8f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 6 Feb 2024 14:58:49 +0100 Subject: [PATCH 047/327] s390/configs: provide compat topic configuration target CONFIG_COMPAT is disabled by default, however compat code still needs to be compile tested. Add a compat topic configuration target which allows to enable it easily. Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/configs/compat.config | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 arch/s390/configs/compat.config diff --git a/arch/s390/configs/compat.config b/arch/s390/configs/compat.config new file mode 100644 index 000000000000..6fd051453ae8 --- /dev/null +++ b/arch/s390/configs/compat.config @@ -0,0 +1,3 @@ +# Help: Enable compat support +CONFIG_COMPAT=y +CONFIG_COMPAT_32BIT_TIME=y From 027790f611321acabbd3c938c1c8ebc08dddd71e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 6 Feb 2024 14:58:50 +0100 Subject: [PATCH 048/327] s390/configs: enable INIT_STACK_ALL_ZERO in all configurations It looks like all distributions will enable INIT_STACK_ALL_ZERO. Reflect that in the default configurations. Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - arch/s390/configs/zfcpdump_defconfig | 1 - 3 files changed, 3 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index cae2dd34fbb4..0e8ba18a4a08 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -709,7 +709,6 @@ CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" -CONFIG_INIT_STACK_NONE=y CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 42b988873e54..c7a39b9e5f01 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -693,7 +693,6 @@ CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" -CONFIG_INIT_STACK_NONE=y CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_USER=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 30d2a1687665..217a92e025c0 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -64,7 +64,6 @@ CONFIG_ZFCP=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_LSM="yama,loadpin,safesetid,integrity" -CONFIG_INIT_STACK_NONE=y # CONFIG_ZLIB_DFLTCC is not set CONFIG_XZ_DEC_MICROLZMA=y CONFIG_PRINTK_TIME=y From 124468af7e769a52d27c3290007ac6e2ba346ccd Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 6 Feb 2024 14:58:51 +0100 Subject: [PATCH 049/327] s390/configs: update default configurations Signed-off-by: Heiko Carstens --- arch/s390/configs/debug_defconfig | 7 +------ arch/s390/configs/defconfig | 8 +------- arch/s390/configs/zfcpdump_defconfig | 1 + 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 0e8ba18a4a08..c924be0d7ed8 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -118,7 +118,6 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m -CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y @@ -374,6 +373,7 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m @@ -436,9 +436,6 @@ CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_BITMAP_FILE is not set -CONFIG_MD_LINEAR=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -637,7 +634,6 @@ CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m -CONFIG_NETFS_SUPPORT=m CONFIG_NETFS_STATS=y CONFIG_FSCACHE=y CONFIG_CACHEFILES=m @@ -738,7 +734,6 @@ CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_CFB=m CONFIG_CRYPTO_HCTR2=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_LRW=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index c7a39b9e5f01..c8f0c9fe40d7 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -109,7 +109,6 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m -CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y @@ -364,6 +363,7 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m @@ -426,9 +426,6 @@ CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_BITMAP_FILE is not set -CONFIG_MD_LINEAR=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -622,7 +619,6 @@ CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m -CONFIG_NETFS_SUPPORT=m CONFIG_NETFS_STATS=y CONFIG_FSCACHE=y CONFIG_CACHEFILES=m @@ -723,11 +719,9 @@ CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_CFB=m CONFIG_CRYPTO_HCTR2=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_CHACHA20POLY1305=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 217a92e025c0..c51f3ec4eb28 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -8,6 +8,7 @@ CONFIG_BPF_SYSCALL=y # CONFIG_NET_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=2 From 7e4a205fe56b9092f0143dad6aa5fee081139b09 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Feb 2024 23:53:05 -0500 Subject: [PATCH 050/327] Revert "get rid of DCACHE_GENOCIDE" This reverts commit 57851607326a2beef21e67f83f4f53a90df8445a. Unfortunately, while we only call that thing once, the callback *can* be called more than once for the same dentry - all it takes is rename_lock being touched while we are in d_walk(). For now let's revert it. Signed-off-by: Al Viro --- fs/dcache.c | 5 ++++- include/linux/dcache.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index b813528fb147..6ebccba33336 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3061,7 +3061,10 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; - dentry->d_lockref.count--; + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + dentry->d_lockref.count--; + } } return D_WALK_CONTINUE; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1666c387861f..d07cf2f1bb7d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -173,6 +173,7 @@ struct dentry_operations { #define DCACHE_DONTCACHE BIT(7) /* Purge from memory on final dput() */ #define DCACHE_CANT_MOUNT BIT(8) +#define DCACHE_GENOCIDE BIT(9) #define DCACHE_SHRINK_LIST BIT(10) #define DCACHE_OP_WEAK_REVALIDATE BIT(11) From 5ba4e6d5863c53e937f49932dee0ecb004c65928 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Thu, 8 Feb 2024 17:36:28 -0500 Subject: [PATCH 051/327] RDMA/qedr: Fix qedr_create_user_qp error flow Avoid the following warning by making sure to free the allocated resources in case that qedr_init_user_queue() fail. -----------[ cut here ]----------- WARNING: CPU: 0 PID: 143192 at drivers/infiniband/core/rdma_core.c:874 uverbs_destroy_ufile_hw+0xcf/0xf0 [ib_uverbs] Modules linked in: tls target_core_user uio target_core_pscsi target_core_file target_core_iblock ib_srpt ib_srp scsi_transport_srp nfsd nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs 8021q garp mrp stp llc ext4 mbcache jbd2 opa_vnic ib_umad ib_ipoib sunrpc rdma_ucm ib_isert iscsi_target_mod target_core_mod ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm hfi1 intel_rapl_msr intel_rapl_common mgag200 qedr sb_edac drm_shmem_helper rdmavt x86_pkg_temp_thermal drm_kms_helper intel_powerclamp ib_uverbs coretemp i2c_algo_bit kvm_intel dell_wmi_descriptor ipmi_ssif sparse_keymap kvm ib_core rfkill syscopyarea sysfillrect video sysimgblt irqbypass ipmi_si ipmi_devintf fb_sys_fops rapl iTCO_wdt mxm_wmi iTCO_vendor_support intel_cstate pcspkr dcdbas intel_uncore ipmi_msghandler lpc_ich acpi_power_meter mei_me mei fuse drm xfs libcrc32c qede sd_mod ahci libahci t10_pi sg crct10dif_pclmul crc32_pclmul crc32c_intel qed libata tg3 ghash_clmulni_intel megaraid_sas crc8 wmi [last unloaded: ib_srpt] CPU: 0 PID: 143192 Comm: fi_rdm_tagged_p Kdump: loaded Not tainted 5.14.0-408.el9.x86_64 #1 Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 2.14.0 01/25/2022 RIP: 0010:uverbs_destroy_ufile_hw+0xcf/0xf0 [ib_uverbs] Code: 5d 41 5c 41 5d 41 5e e9 0f 26 1b dd 48 89 df e8 67 6a ff ff 49 8b 86 10 01 00 00 48 85 c0 74 9c 4c 89 e7 e8 83 c0 cb dd eb 92 <0f> 0b eb be 0f 0b be 04 00 00 00 48 89 df e8 8e f5 ff ff e9 6d ff RSP: 0018:ffffb7c6cadfbc60 EFLAGS: 00010286 RAX: ffff8f0889ee3f60 RBX: ffff8f088c1a5200 RCX: 00000000802a0016 RDX: 00000000802a0017 RSI: 0000000000000001 RDI: ffff8f0880042600 RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8f11fffd5000 R11: 0000000000039000 R12: ffff8f0d5b36cd80 R13: ffff8f088c1a5250 R14: ffff8f1206d91000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8f11d7c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000147069200e20 CR3: 00000001c7210002 CR4: 00000000001706f0 Call Trace: ? show_trace_log_lvl+0x1c4/0x2df ? show_trace_log_lvl+0x1c4/0x2df ? ib_uverbs_close+0x1f/0xb0 [ib_uverbs] ? uverbs_destroy_ufile_hw+0xcf/0xf0 [ib_uverbs] ? __warn+0x81/0x110 ? uverbs_destroy_ufile_hw+0xcf/0xf0 [ib_uverbs] ? report_bug+0x10a/0x140 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x14/0x70 ? asm_exc_invalid_op+0x16/0x20 ? uverbs_destroy_ufile_hw+0xcf/0xf0 [ib_uverbs] ib_uverbs_close+0x1f/0xb0 [ib_uverbs] __fput+0x94/0x250 task_work_run+0x5c/0x90 do_exit+0x270/0x4a0 do_group_exit+0x2d/0x90 get_signal+0x87c/0x8c0 arch_do_signal_or_restart+0x25/0x100 ? ib_uverbs_ioctl+0xc2/0x110 [ib_uverbs] exit_to_user_mode_loop+0x9c/0x130 exit_to_user_mode_prepare+0xb6/0x100 syscall_exit_to_user_mode+0x12/0x40 do_syscall_64+0x69/0x90 ? syscall_exit_work+0x103/0x130 ? syscall_exit_to_user_mode+0x22/0x40 ? do_syscall_64+0x69/0x90 ? syscall_exit_work+0x103/0x130 ? syscall_exit_to_user_mode+0x22/0x40 ? do_syscall_64+0x69/0x90 ? do_syscall_64+0x69/0x90 ? common_interrupt+0x43/0xa0 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x1470abe3ec6b Code: Unable to access opcode bytes at RIP 0x1470abe3ec41. RSP: 002b:00007fff13ce9108 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: fffffffffffffffc RBX: 00007fff13ce9218 RCX: 00001470abe3ec6b RDX: 00007fff13ce9200 RSI: 00000000c0181b01 RDI: 0000000000000004 RBP: 00007fff13ce91e0 R08: 0000558d9655da10 R09: 0000558d9655dd00 R10: 00007fff13ce95c0 R11: 0000000000000246 R12: 00007fff13ce9358 R13: 0000000000000013 R14: 0000558d9655db50 R15: 00007fff13ce9470 --[ end trace 888a9b92e04c5c97 ]-- Fixes: df15856132bc ("RDMA/qedr: restructure functions that create/destroy QPs") Signed-off-by: Kamal Heib Link: https://lore.kernel.org/r/20240208223628.2040841-1-kheib@redhat.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qedr/verbs.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 7887a6786ed4..f118ce0a9a61 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1879,8 +1879,17 @@ static int qedr_create_user_qp(struct qedr_dev *dev, /* RQ - read access only (0) */ rc = qedr_init_user_queue(udata, dev, &qp->urq, ureq.rq_addr, ureq.rq_len, true, 0, alloc_and_init); - if (rc) + if (rc) { + ib_umem_release(qp->usq.umem); + qp->usq.umem = NULL; + if (rdma_protocol_roce(&dev->ibdev, 1)) { + qedr_free_pbl(dev, &qp->usq.pbl_info, + qp->usq.pbl_tbl); + } else { + kfree(qp->usq.pbl_tbl); + } return rc; + } } memset(&in_params, 0, sizeof(in_params)); From b419c5e2d9787a98c1410dbe1d4a25906b96f000 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 13 Feb 2024 13:31:56 +0200 Subject: [PATCH 052/327] Revert "iommu/arm-smmu: Convert to domain_alloc_paging()" This reverts commit 9b3febc3a3da ("iommu/arm-smmu: Convert to domain_alloc_paging()"). It breaks Qualcomm MSM8996 platform. Calling arm_smmu_write_context_bank() from new codepath results in the platform being reset because of the unclocked hardware access. Fixes: 9b3febc3a3da ("iommu/arm-smmu: Convert to domain_alloc_paging()") Signed-off-by: Dmitry Baryshkov Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20240213-iommu-revert-domain-alloc-v1-1-325ff55dece4@linaro.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 68b6bc5e7c71..6317aaf7b3ab 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -859,10 +859,14 @@ static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain) arm_smmu_rpm_put(smmu); } -static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) +static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; + if (type != IOMMU_DOMAIN_UNMANAGED) { + if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) + return NULL; + } /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -875,15 +879,6 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) mutex_init(&smmu_domain->init_mutex); spin_lock_init(&smmu_domain->cb_lock); - if (dev) { - struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); - - if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) { - kfree(smmu_domain); - return NULL; - } - } - return &smmu_domain->domain; } @@ -1600,7 +1595,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, - .domain_alloc_paging = arm_smmu_domain_alloc_paging, + .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .probe_finalize = arm_smmu_probe_finalize, From f03606470886e781e68b5d7acf2afb23b86cd7fa Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 13 Feb 2024 15:46:38 +0100 Subject: [PATCH 053/327] riscv: dts: starfive: replace underscores in node names Underscores should not be used in node names (dtc with W=2 warns about them), so replace them with hyphens. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/starfive/jh7100.dtsi | 12 ++++++------ arch/riscv/boot/dts/starfive/jh7110.dtsi | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/riscv/boot/dts/starfive/jh7100.dtsi b/arch/riscv/boot/dts/starfive/jh7100.dtsi index c216aaecac53..8bcf36d07f3f 100644 --- a/arch/riscv/boot/dts/starfive/jh7100.dtsi +++ b/arch/riscv/boot/dts/starfive/jh7100.dtsi @@ -96,14 +96,14 @@ thermal-sensors = <&sfctemp>; trips { - cpu_alert0 { + cpu-alert0 { /* milliCelsius */ temperature = <75000>; hysteresis = <2000>; type = "passive"; }; - cpu_crit { + cpu-crit { /* milliCelsius */ temperature = <90000>; hysteresis = <2000>; @@ -113,28 +113,28 @@ }; }; - osc_sys: osc_sys { + osc_sys: osc-sys { compatible = "fixed-clock"; #clock-cells = <0>; /* This value must be overridden by the board */ clock-frequency = <0>; }; - osc_aud: osc_aud { + osc_aud: osc-aud { compatible = "fixed-clock"; #clock-cells = <0>; /* This value must be overridden by the board */ clock-frequency = <0>; }; - gmac_rmii_ref: gmac_rmii_ref { + gmac_rmii_ref: gmac-rmii-ref { compatible = "fixed-clock"; #clock-cells = <0>; /* Should be overridden by the board when needed */ clock-frequency = <0>; }; - gmac_gr_mii_rxclk: gmac_gr_mii_rxclk { + gmac_gr_mii_rxclk: gmac-gr-mii-rxclk { compatible = "fixed-clock"; #clock-cells = <0>; /* Should be overridden by the board when needed */ diff --git a/arch/riscv/boot/dts/starfive/jh7110.dtsi b/arch/riscv/boot/dts/starfive/jh7110.dtsi index 45213cdf50dc..74ed3b9264d8 100644 --- a/arch/riscv/boot/dts/starfive/jh7110.dtsi +++ b/arch/riscv/boot/dts/starfive/jh7110.dtsi @@ -237,14 +237,14 @@ }; trips { - cpu_alert0: cpu_alert0 { + cpu_alert0: cpu-alert0 { /* milliCelsius */ temperature = <85000>; hysteresis = <2000>; type = "passive"; }; - cpu_crit { + cpu-crit { /* milliCelsius */ temperature = <100000>; hysteresis = <2000>; From 00890f5d15f50bd386bf222ddee355cec6c6d53a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 11:51:29 +0100 Subject: [PATCH 054/327] arm64: dts: rockchip: minor rk3588 whitespace cleanup The DTS code coding style expects exactly one space before '{' character. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240208105129.128561-1-krzysztof.kozlowski@linaro.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dts b/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dts index d7722772ecd8..997b516c2533 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dts @@ -189,19 +189,19 @@ cpu-supply = <&vdd_cpu_lit_s0>; }; -&cpu_b0{ +&cpu_b0 { cpu-supply = <&vdd_cpu_big0_s0>; }; -&cpu_b1{ +&cpu_b1 { cpu-supply = <&vdd_cpu_big0_s0>; }; -&cpu_b2{ +&cpu_b2 { cpu-supply = <&vdd_cpu_big1_s0>; }; -&cpu_b3{ +&cpu_b3 { cpu-supply = <&vdd_cpu_big1_s0>; }; From 334bf0710c98d391f4067b72f535d6c4c84dfb6f Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Fri, 19 Jan 2024 11:16:56 +0100 Subject: [PATCH 055/327] arm64: dts: rockchip: set num-cs property for spi on px30 The px30 has two spi controllers with two chip-selects each. The num-cs property is specified as the total number of chip selects a controllers has and is used since 2020 to find uses of chipselects outside that range in the Rockchip spi driver. Without the property set, the default is 1, so spi devices using the second chipselect will not be created. Fixes: eb1262e3cc8b ("spi: spi-rockchip: use num-cs property and ctlr->enable_gpiods") Signed-off-by: Heiko Stuebner Reviewed-by: Quentin Schulz Link: https://lore.kernel.org/r/20240119101656.965744-1-heiko@sntech.de Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/px30.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/px30.dtsi b/arch/arm64/boot/dts/rockchip/px30.dtsi index d0905515399b..9137dd76e72c 100644 --- a/arch/arm64/boot/dts/rockchip/px30.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30.dtsi @@ -631,6 +631,7 @@ clock-names = "spiclk", "apb_pclk"; dmas = <&dmac 12>, <&dmac 13>; dma-names = "tx", "rx"; + num-cs = <2>; pinctrl-names = "default"; pinctrl-0 = <&spi0_clk &spi0_csn &spi0_miso &spi0_mosi>; #address-cells = <1>; @@ -646,6 +647,7 @@ clock-names = "spiclk", "apb_pclk"; dmas = <&dmac 14>, <&dmac 15>; dma-names = "tx", "rx"; + num-cs = <2>; pinctrl-names = "default"; pinctrl-0 = <&spi1_clk &spi1_csn0 &spi1_csn1 &spi1_miso &spi1_mosi>; #address-cells = <1>; From 1bbd894e2ae67faf52632bc9290ff926d9b741ea Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Mon, 5 Feb 2024 00:16:43 +0100 Subject: [PATCH 056/327] arm64: dts: rockchip: Drop interrupts property from rk3328 pwm-rockchip node The binding doesn't define interrupts and adding such a definition was refused because it's unclear how they should ever be used and the relevant registers are outside the PWM range. So drop them fixing several dtbs_check warnings. Signed-off-by: Johan Jonker Link: https://lore.kernel.org/r/5551846d-62cd-4b72-94f4-07541e726c37@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3328.dtsi | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index fb5dcf6e9327..7b4c15c4a9c3 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -488,7 +488,6 @@ pwm3: pwm@ff1b0030 { compatible = "rockchip,rk3328-pwm"; reg = <0x0 0xff1b0030 0x0 0x10>; - interrupts = ; clocks = <&cru SCLK_PWM>, <&cru PCLK_PWM>; clock-names = "pwm", "pclk"; pinctrl-names = "default"; From 11f522256e9043b0fcd2f994278645d3e201d20c Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 8 Feb 2024 15:31:15 +0530 Subject: [PATCH 057/327] bpf: Fix warning for bpf_cpumask in verifier Compiling with CONFIG_BPF_SYSCALL & !CONFIG_BPF_JIT throws the below warning: "WARN: resolve_btfids: unresolved symbol bpf_cpumask" Fix it by adding the appropriate #ifdef. Signed-off-by: Hari Bathini Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Acked-by: Stanislav Fomichev Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20240208100115.602172-1-hbathini@linux.ibm.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 65f598694d55..b263f093ee76 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5227,7 +5227,9 @@ BTF_ID(struct, prog_test_ref_kfunc) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) #endif +#ifdef CONFIG_BPF_JIT BTF_ID(struct, bpf_cpumask) +#endif BTF_ID(struct, task_struct) BTF_SET_END(rcu_protected_types) From c22d03a95b0d815cd186302fdd93f74d99f1c914 Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Thu, 25 Jan 2024 14:19:42 -0600 Subject: [PATCH 058/327] arm64: dts: rockchip: Correct Indiedroid Nova GPIO Names Correct the names given to a few of the GPIO pins. The original names were unknowingly based on the header from a pre-production board. The production board has a slightly different pin assignment for the 40-pin GPIO header. Fixes: 3900160e164b ("arm64: dts: rockchip: Add Indiedroid Nova board") Signed-off-by: Chris Morgan Link: https://lore.kernel.org/r/20240125201943.90476-2-macroalpha82@gmail.com Signed-off-by: Heiko Stuebner --- .../boot/dts/rockchip/rk3588s-indiedroid-nova.dts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-indiedroid-nova.dts b/arch/arm64/boot/dts/rockchip/rk3588s-indiedroid-nova.dts index dc677f29a9c7..3c2278886851 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-indiedroid-nova.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588s-indiedroid-nova.dts @@ -195,13 +195,13 @@ &gpio1 { gpio-line-names = /* GPIO1 A0-A7 */ - "HEADER_27_3v3", "HEADER_28_3v3", "", "", + "HEADER_27_3v3", "", "", "", "HEADER_29_1v8", "", "HEADER_7_1v8", "", /* GPIO1 B0-B7 */ "", "HEADER_31_1v8", "HEADER_33_1v8", "", "HEADER_11_1v8", "HEADER_13_1v8", "", "", /* GPIO1 C0-C7 */ - "", "", "", "", + "", "HEADER_28_3v3", "", "", "", "", "", "", /* GPIO1 D0-D7 */ "", "", "", "", @@ -225,11 +225,11 @@ &gpio4 { gpio-line-names = /* GPIO4 A0-A7 */ - "", "", "HEADER_37_3v3", "HEADER_32_3v3", - "HEADER_36_3v3", "", "HEADER_35_3v3", "HEADER_38_3v3", + "", "", "HEADER_37_3v3", "HEADER_8_3v3", + "HEADER_10_3v3", "", "HEADER_32_3v3", "HEADER_35_3v3", /* GPIO4 B0-B7 */ "", "", "", "HEADER_40_3v3", - "HEADER_8_3v3", "HEADER_10_3v3", "", "", + "HEADER_38_3v3", "HEADER_36_3v3", "", "", /* GPIO4 C0-C7 */ "", "", "", "", "", "", "", "", From 2127c604383666675789fd4a5fc2aead46c73aad Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 2 Feb 2024 17:32:20 +0100 Subject: [PATCH 059/327] xsk: Add truesize to skb_add_rx_frag(). xsk_build_skb() allocates a page and adds it to the skb via skb_add_rx_frag() and specifies 0 for truesize. This leads to a warning in skb_add_rx_frag() with CONFIG_DEBUG_NET enabled because size is larger than truesize. Increasing truesize requires to add the same amount to socket's sk_wmem_alloc counter in order not to underflow the counter during release in the destructor (sock_wfree()). Pass the size of the allocated page as truesize to skb_add_rx_frag(). Add this mount to socket's sk_wmem_alloc counter. Fixes: cf24f5a5feea ("xsk: add support for AF_XDP multi-buffer on Tx path") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20240202163221.2488589-1-bigeasy@linutronix.de --- net/xdp/xsk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1eadfac03cc4..b78c0e095e22 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -722,7 +722,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, memcpy(vaddr, buffer, len); kunmap_local(vaddr); - skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); + skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); + refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); } if (first_frag && desc->options & XDP_TX_METADATA) { From ce6b6d1513965f500a05f3facf223fa01fd74920 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 13 Feb 2024 19:45:40 +0000 Subject: [PATCH 060/327] riscv: dts: sifive: add missing #interrupt-cells to pmic At W=2 dtc complains: hifive-unmatched-a00.dts:120.10-238.4: Warning (interrupt_provider): /soc/i2c@10030000/pmic@58: Missing '#interrupt-cells' in interrupt provider Add the missing property. Reviewed-by: Samuel Holland Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts index 07387f9c135c..72b87b08ab44 100644 --- a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts +++ b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts @@ -123,6 +123,7 @@ interrupt-parent = <&gpio>; interrupts = <1 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; onkey { compatible = "dlg,da9063-onkey"; From eb5c7465c3240151cd42a55c7ace9da0026308a1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Feb 2024 11:07:13 +0100 Subject: [PATCH 061/327] RDMA/srpt: fix function pointer cast warnings clang-16 notices that srpt_qp_event() gets called through an incompatible pointer here: drivers/infiniband/ulp/srpt/ib_srpt.c:1815:5: error: cast from 'void (*)(struct ib_event *, struct srpt_rdma_ch *)' to 'void (*)(struct ib_event *, void *)' converts to incompatible function type [-Werror,-Wcast-function-type-strict] 1815 | = (void(*)(struct ib_event *, void*))srpt_qp_event; Change srpt_qp_event() to use the correct prototype and adjust the argument inside of it. Fixes: a42d985bd5b2 ("ib_srpt: Initial SRP Target merge for v3.3-rc1") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240213100728.458348-1-arnd@kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/srpt/ib_srpt.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index d2dce6ce30a9..040234c01be4 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -214,10 +214,12 @@ static const char *get_ch_state_name(enum rdma_ch_state s) /** * srpt_qp_event - QP event callback function * @event: Description of the event that occurred. - * @ch: SRPT RDMA channel. + * @ptr: SRPT RDMA channel. */ -static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) +static void srpt_qp_event(struct ib_event *event, void *ptr) { + struct srpt_rdma_ch *ch = ptr; + pr_debug("QP event %d on ch=%p sess_name=%s-%d state=%s\n", event->event, ch, ch->sess_name, ch->qp->qp_num, get_ch_state_name(ch->state)); @@ -1811,8 +1813,7 @@ retry: ch->cq_size = ch->rq_size + sq_size; qp_init->qp_context = (void *)ch; - qp_init->event_handler - = (void(*)(struct ib_event *, void*))srpt_qp_event; + qp_init->event_handler = srpt_qp_event; qp_init->send_cq = ch->cq; qp_init->recv_cq = ch->cq; qp_init->sq_sig_type = IB_SIGNAL_REQ_WR; From 4b085736e44dbbe69b5eea1a8a294f404678a1f4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 11 Jan 2024 20:51:22 +0900 Subject: [PATCH 062/327] ata: libata-core: Do not try to set sleeping devices to standby In ata ata_dev_power_set_standby(), check that the target device is not sleeping. If it is, there is no need to do anything. Fixes: aa3998dbeb3a ("ata: libata-scsi: Disable scsi device manage_system_start_stop") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 09ed67772fae..d9f80f4f70f5 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2017,6 +2017,10 @@ void ata_dev_power_set_standby(struct ata_device *dev) struct ata_taskfile tf; unsigned int err_mask; + /* If the device is already sleeping, do nothing. */ + if (dev->flags & ATA_DFLAG_SLEEPING) + return; + /* * Some odd clown BIOSes issue spindown on power off (ACPI S4 or S5) * causing some drives to spin up and down again. For these, do nothing From e37243b65d528a8a9f8b9a57a43885f8e8dfc15c Mon Sep 17 00:00:00 2001 From: Gianmarco Lusvardi Date: Tue, 13 Feb 2024 23:05:46 +0000 Subject: [PATCH 063/327] bpf, scripts: Correct GPL license name The bpf_doc script refers to the GPL as the "GNU Privacy License". I strongly suspect that the author wanted to refer to the GNU General Public License, under which the Linux kernel is released, as, to the best of my knowledge, there is no license named "GNU Privacy License". This patch corrects the license name in the script accordingly. Fixes: 56a092c89505 ("bpf: add script and prepare bpf.h for new helpers documentation") Signed-off-by: Gianmarco Lusvardi Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240213230544.930018-3-glusvardi@posteo.net --- scripts/bpf_doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 61b7dddedc46..0669bac5e900 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -513,7 +513,7 @@ eBPF programs can have an associated license, passed along with the bytecode instructions to the kernel when the programs are loaded. The format for that string is identical to the one in use for kernel modules (Dual licenses, such as "Dual BSD/GPL", may be used). Some helper functions are only accessible to -programs that are compatible with the GNU Privacy License (GPL). +programs that are compatible with the GNU General Public License (GNU GPL). In order to use such helpers, the eBPF program must be loaded with the correct license string passed (via **attr**) to the **bpf**\\ () system call, and this From 321da3dc1f3c92a12e3c5da934090d2992a8814c Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 13 Feb 2024 09:33:06 -0500 Subject: [PATCH 064/327] scsi: sd: usb_storage: uas: Access media prior to querying device properties It has been observed that some USB/UAS devices return generic properties hardcoded in firmware for mode pages for a period of time after a device has been discovered. The reported properties are either garbage or they do not accurately reflect the characteristics of the physical storage device attached in the case of a bridge. Prior to commit 1e029397d12f ("scsi: sd: Reorganize DIF/DIX code to avoid calling revalidate twice") we would call revalidate several times during device discovery. As a result, incorrect values would eventually get replaced with ones accurately describing the attached storage. When we did away with the redundant revalidate pass, several cases were reported where devices reported nonsensical values or would end up in write-protected state. An initial attempt at addressing this issue involved introducing a delayed second revalidate invocation. However, this approach still left some devices reporting incorrect characteristics. Tasos Sahanidis debugged the problem further and identified that introducing a READ operation prior to MODE SENSE fixed the problem and that it wasn't a timing issue. Issuing a READ appears to cause the devices to update their state to reflect the actual properties of the storage media. Device properties like vendor, model, and storage capacity appear to be correctly reported from the get-go. It is unclear why these devices defer populating the remaining characteristics. Match the behavior of a well known commercial operating system and trigger a READ operation prior to querying device characteristics to force the device to populate the mode pages. The additional READ is triggered by a flag set in the USB storage and UAS drivers. We avoid issuing the READ for other transport classes since some storage devices identify Linux through our particular discovery command sequence. Link: https://lore.kernel.org/r/20240213143306.2194237-1-martin.petersen@oracle.com Fixes: 1e029397d12f ("scsi: sd: Reorganize DIF/DIX code to avoid calling revalidate twice") Cc: stable@vger.kernel.org Reported-by: Tasos Sahanidis Reviewed-by: Ewan D. Milne Reviewed-by: Bart Van Assche Tested-by: Tasos Sahanidis Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 26 +++++++++++++++++++++++++- drivers/usb/storage/scsiglue.c | 7 +++++++ drivers/usb/storage/uas.c | 7 +++++++ include/scsi/scsi_device.h | 1 + 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 0833b3e6aa6e..bdd0acf7fa3c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3407,6 +3407,24 @@ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, return true; } +static void sd_read_block_zero(struct scsi_disk *sdkp) +{ + unsigned int buf_len = sdkp->device->sector_size; + char *buffer, cmd[10] = { }; + + buffer = kmalloc(buf_len, GFP_KERNEL); + if (!buffer) + return; + + cmd[0] = READ_10; + put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */ + put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */ + + scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len, + SD_TIMEOUT, sdkp->max_retries, NULL); + kfree(buffer); +} + /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. @@ -3446,7 +3464,13 @@ static int sd_revalidate_disk(struct gendisk *disk) */ if (sdkp->media_present) { sd_read_capacity(sdkp, buffer); - + /* + * Some USB/UAS devices return generic values for mode pages + * until the media has been accessed. Trigger a READ operation + * to force the device to populate mode pages. + */ + if (sdp->read_before_ms) + sd_read_block_zero(sdkp); /* * set the default to rotational. All non-rotational devices * support the block characteristics VPD page, which will diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index c54e9805da53..12cf9940e5b6 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -179,6 +179,13 @@ static int slave_configure(struct scsi_device *sdev) */ sdev->use_192_bytes_for_3f = 1; + /* + * Some devices report generic values until the media has been + * accessed. Force a READ(10) prior to querying device + * characteristics. + */ + sdev->read_before_ms = 1; + /* * Some devices don't like MODE SENSE with page=0x3f, * which is the command used for checking if a device diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index 9707f53cfda9..71ace274761f 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -878,6 +878,13 @@ static int uas_slave_configure(struct scsi_device *sdev) if (devinfo->flags & US_FL_CAPACITY_HEURISTICS) sdev->guess_capacity = 1; + /* + * Some devices report generic values until the media has been + * accessed. Force a READ(10) prior to querying device + * characteristics. + */ + sdev->read_before_ms = 1; + /* * Some devices don't like MODE SENSE with page=0x3f, * which is the command used for checking if a device diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 5ec1e71a09de..01c02cb76ea6 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -208,6 +208,7 @@ struct scsi_device { unsigned use_10_for_rw:1; /* first try 10-byte read / write */ unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */ unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */ + unsigned read_before_ms:1; /* perform a READ before MODE SENSE */ unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */ unsigned no_write_same:1; /* no WRITE SAME command */ unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */ From b5fc07a5fb56216a49e6c1d0b172d5464d99a89b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 14 Feb 2024 17:14:11 -0500 Subject: [PATCH 065/327] scsi: core: Consult supported VPD page list prior to fetching page Commit c92a6b5d6335 ("scsi: core: Query VPD size before getting full page") removed the logic which checks whether a VPD page is present on the supported pages list before asking for the page itself. That was done because SPC helpfully states "The Supported VPD Pages VPD page list may or may not include all the VPD pages that are able to be returned by the device server". Testing had revealed a few devices that supported some of the 0xBn pages but didn't actually list them in page 0. Julian Sikorski bisected a problem with his drive resetting during discovery to the commit above. As it turns out, this particular drive firmware will crash if we attempt to fetch page 0xB9. Various approaches were attempted to work around this. In the end, reinstating the logic that consults VPD page 0 before fetching any other page was the path of least resistance. A firmware update for the devices which originally compelled us to remove the check has since been released. Link: https://lore.kernel.org/r/20240214221411.2888112-1-martin.petersen@oracle.com Fixes: c92a6b5d6335 ("scsi: core: Query VPD size before getting full page") Cc: stable@vger.kernel.org Cc: Bart Van Assche Reported-by: Julian Sikorski Tested-by: Julian Sikorski Reviewed-by: Lee Duncan Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi.c | 22 ++++++++++++++++++++-- include/scsi/scsi_device.h | 4 ---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 76d369343c7a..8cad9792a562 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -328,21 +328,39 @@ static int scsi_vpd_inquiry(struct scsi_device *sdev, unsigned char *buffer, return result + 4; } +enum scsi_vpd_parameters { + SCSI_VPD_HEADER_SIZE = 4, + SCSI_VPD_LIST_SIZE = 36, +}; + static int scsi_get_vpd_size(struct scsi_device *sdev, u8 page) { - unsigned char vpd_header[SCSI_VPD_HEADER_SIZE] __aligned(4); + unsigned char vpd[SCSI_VPD_LIST_SIZE] __aligned(4); int result; if (sdev->no_vpd_size) return SCSI_DEFAULT_VPD_LEN; + /* + * Fetch the supported pages VPD and validate that the requested page + * number is present. + */ + if (page != 0) { + result = scsi_vpd_inquiry(sdev, vpd, 0, sizeof(vpd)); + if (result < SCSI_VPD_HEADER_SIZE) + return 0; + + result -= SCSI_VPD_HEADER_SIZE; + if (!memchr(&vpd[SCSI_VPD_HEADER_SIZE], page, result)) + return 0; + } /* * Fetch the VPD page header to find out how big the page * is. This is done to prevent problems on legacy devices * which can not handle allocation lengths as large as * potentially requested by the caller. */ - result = scsi_vpd_inquiry(sdev, vpd_header, page, sizeof(vpd_header)); + result = scsi_vpd_inquiry(sdev, vpd, page, SCSI_VPD_HEADER_SIZE); if (result < 0) return 0; diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 01c02cb76ea6..c38f4fe5e64c 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -100,10 +100,6 @@ struct scsi_vpd { unsigned char data[]; }; -enum scsi_vpd_parameters { - SCSI_VPD_HEADER_SIZE = 4, -}; - struct scsi_device { struct Scsi_Host *host; struct request_queue *request_queue; From de959094eb2197636f7c803af0943cb9d3b35804 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 14 Feb 2024 23:43:56 +0900 Subject: [PATCH 066/327] scsi: target: pscsi: Fix bio_put() for error case As of commit 066ff571011d ("block: turn bio_kmalloc into a simple kmalloc wrapper"), a bio allocated by bio_kmalloc() must be freed by bio_uninit() and kfree(). That is not done properly for the error case, hitting WARN and NULL pointer dereference in bio_free(). Fixes: 066ff571011d ("block: turn bio_kmalloc into a simple kmalloc wrapper") CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Naohiro Aota Link: https://lore.kernel.org/r/20240214144356.101814-1-naohiro.aota@wdc.com Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/target/target_core_pscsi.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 41b7489d37ce..ed4fd22eac6e 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -907,12 +907,15 @@ new_bio: return 0; fail: - if (bio) - bio_put(bio); + if (bio) { + bio_uninit(bio); + kfree(bio); + } while (req->bio) { bio = req->bio; req->bio = bio->bi_next; - bio_put(bio); + bio_uninit(bio); + kfree(bio); } req->biotail = NULL; return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; From f2dced9d1992824d677593072bc20eccf66ac5d5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 13 Feb 2024 21:08:09 +0300 Subject: [PATCH 067/327] scsi: ufs: Uninitialized variable in ufshcd_devfreq_target() There is one goto where "sched_clk_scaling_suspend_work" is true but "scale_up" is uninitialized. It leads to a Smatch uninitialized variable warning: drivers/ufs/core/ufshcd.c:1589 ufshcd_devfreq_target() error: uninitialized symbol 'scale_up'. Fixes: 1d969731b87f ("scsi: ufs: core: Only suspend clock scaling if scaling down") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/c787d37f-1107-4512-8991-bccf80e74a35@moroto.mountain Reviewed-by: Peter Wang Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index d77b25b79ae3..3b89c9d4aa40 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -1469,7 +1469,7 @@ static int ufshcd_devfreq_target(struct device *dev, int ret = 0; struct ufs_hba *hba = dev_get_drvdata(dev); ktime_t start; - bool scale_up, sched_clk_scaling_suspend_work = false; + bool scale_up = false, sched_clk_scaling_suspend_work = false; struct list_head *clk_list = &hba->clk_list_head; struct ufs_clk_info *clki; unsigned long irq_flags; From 5761eb9761d2d5fe8248a9b719efc4d8baf1f24a Mon Sep 17 00:00:00 2001 From: Don Brace Date: Tue, 13 Feb 2024 10:22:00 -0600 Subject: [PATCH 068/327] scsi: smartpqi: Fix disable_managed_interrupts Correct blk-mq registration issue with module parameter disable_managed_interrupts enabled. When we turn off the default PCI_IRQ_AFFINITY flag, the driver needs to register with blk-mq using blk_mq_map_queues(). The driver is currently calling blk_mq_pci_map_queues() which results in a stack trace and possibly undefined behavior. Stack Trace: [ 7.860089] scsi host2: smartpqi [ 7.871934] WARNING: CPU: 0 PID: 238 at block/blk-mq-pci.c:52 blk_mq_pci_map_queues+0xca/0xd0 [ 7.889231] Modules linked in: sd_mod t10_pi sg uas smartpqi(+) crc32c_intel scsi_transport_sas usb_storage dm_mirror dm_region_hash dm_log dm_mod ipmi_devintf ipmi_msghandler fuse [ 7.924755] CPU: 0 PID: 238 Comm: kworker/0:3 Not tainted 4.18.0-372.88.1.el8_6_smartpqi_test.x86_64 #1 [ 7.944336] Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 03/08/2022 [ 7.963026] Workqueue: events work_for_cpu_fn [ 7.978275] RIP: 0010:blk_mq_pci_map_queues+0xca/0xd0 [ 7.978278] Code: 48 89 de 89 c7 e8 f6 0f 4f 00 3b 05 c4 b7 8e 01 72 e1 5b 31 c0 5d 41 5c 41 5d 41 5e 41 5f e9 7d df 73 00 31 c0 e9 76 df 73 00 <0f> 0b eb bc 90 90 0f 1f 44 00 00 41 57 49 89 ff 41 56 41 55 41 54 [ 7.978280] RSP: 0018:ffffa95fc3707d50 EFLAGS: 00010216 [ 7.978283] RAX: 00000000ffffffff RBX: 0000000000000000 RCX: 0000000000000010 [ 7.978284] RDX: 0000000000000004 RSI: 0000000000000000 RDI: ffff9190c32d4310 [ 7.978286] RBP: 0000000000000000 R08: ffffa95fc3707d38 R09: ffff91929b81ac00 [ 7.978287] R10: 0000000000000001 R11: ffffa95fc3707ac0 R12: 0000000000000000 [ 7.978288] R13: ffff9190c32d4000 R14: 00000000ffffffff R15: ffff9190c4c950a8 [ 7.978290] FS: 0000000000000000(0000) GS:ffff9193efc00000(0000) knlGS:0000000000000000 [ 7.978292] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8.172814] CR2: 000055d11166c000 CR3: 00000002dae10002 CR4: 00000000007706f0 [ 8.172816] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 8.172817] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 8.172818] PKRU: 55555554 [ 8.172819] Call Trace: [ 8.172823] blk_mq_alloc_tag_set+0x12e/0x310 [ 8.264339] scsi_add_host_with_dma.cold.9+0x30/0x245 [ 8.279302] pqi_ctrl_init+0xacf/0xc8e [smartpqi] [ 8.294085] ? pqi_pci_probe+0x480/0x4c8 [smartpqi] [ 8.309015] pqi_pci_probe+0x480/0x4c8 [smartpqi] [ 8.323286] local_pci_probe+0x42/0x80 [ 8.337855] work_for_cpu_fn+0x16/0x20 [ 8.351193] process_one_work+0x1a7/0x360 [ 8.364462] ? create_worker+0x1a0/0x1a0 [ 8.379252] worker_thread+0x1ce/0x390 [ 8.392623] ? create_worker+0x1a0/0x1a0 [ 8.406295] kthread+0x10a/0x120 [ 8.418428] ? set_kthread_struct+0x50/0x50 [ 8.431532] ret_from_fork+0x1f/0x40 [ 8.444137] ---[ end trace 1bf0173d39354506 ]--- Fixes: cf15c3e734e8 ("scsi: smartpqi: Add module param to disable managed ints") Tested-by: Yogesh Chandra Pandey Reviewed-by: Scott Benesh Reviewed-by: Scott Teel Reviewed-by: Mahesh Rajashekhara Reviewed-by: Mike McGowen Reviewed-by: Kevin Barnett Signed-off-by: Don Brace Link: https://lore.kernel.org/r/20240213162200.1875970-2-don.brace@microchip.com Reviewed-by: Tomas Henzl Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/smartpqi/smartpqi_init.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index ceff1ec13f9e..385180c98be4 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -6533,8 +6533,11 @@ static void pqi_map_queues(struct Scsi_Host *shost) { struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); - blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + if (!ctrl_info->disable_managed_interrupts) + return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], ctrl_info->pci_dev, 0); + else + return blk_mq_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT]); } static inline bool pqi_is_tape_changer_device(struct pqi_scsi_dev *device) From 9ddf190a7df77b77817f955fdb9c2ae9d1c9c9a3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 13 Feb 2024 21:59:53 -0800 Subject: [PATCH 069/327] scsi: jazz_esp: Only build if SCSI core is builtin JAZZ_ESP is a bool kconfig symbol that selects SCSI_SPI_ATTRS. When CONFIG_SCSI=m, this results in SCSI_SPI_ATTRS=m while JAZZ_ESP=y, which causes many undefined symbol linker errors. Fix this by only offering to build this driver when CONFIG_SCSI=y. [mkp: JAZZ_ESP is unique in that it does not support being compiled as a module unlike the remaining SPI SCSI HBA drivers] Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20240214055953.9612-1-rdunlap@infradead.org Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Cc: Arnd Bergmann Cc: Masahiro Yamada Cc: Nicolas Schier Cc: James E.J. Bottomley Cc: Martin K. Petersen Cc: linux-scsi@vger.kernel.org Cc: Geert Uytterhoeven Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402112222.Gl0udKyU-lkp@intel.com/ Signed-off-by: Martin K. Petersen --- drivers/scsi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index addac7fbe37b..9ce27092729c 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -1270,7 +1270,7 @@ source "drivers/scsi/arm/Kconfig" config JAZZ_ESP bool "MIPS JAZZ FAS216 SCSI support" - depends on MACH_JAZZ && SCSI + depends on MACH_JAZZ && SCSI=y select SCSI_SPI_ATTRS help This is the driver for the onboard SCSI host adapter of MIPS Magnum From 1baae052cccd08daf9a9d64c3f959d8cdb689757 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 1 Feb 2024 17:25:46 +0800 Subject: [PATCH 070/327] md: Don't ignore suspended array in md_check_recovery() mddev_suspend() never stop sync_thread, hence it doesn't make sense to ignore suspended array in md_check_recovery(), which might cause sync_thread can't be unregistered. After commit f52f5c71f3d4 ("md: fix stopping sync thread"), following hang can be triggered by test shell/integrity-caching.sh: 1) suspend the array: raid_postsuspend mddev_suspend 2) stop the array: raid_dtr md_stop __md_stop_writes stop_sync_thread set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_wakeup_thread_directly(mddev->sync_thread); wait_event(..., !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3) sync thread done: md_do_sync set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); 4) daemon thread can't unregister sync thread: md_check_recovery if (mddev->suspended) return; -> return directly md_read_sync_thread clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); -> MD_RECOVERY_RUNNING can't be cleared, hence step 2 hang; This problem is not just related to dm-raid, fix it by ignoring suspended array in md_check_recovery(). And follow up patches will improve dm-raid better to frozen sync thread during suspend. Reported-by: Mikulas Patocka Closes: https://lore.kernel.org/all/8fb335e-6d2c-dbb5-d7-ded8db5145a@redhat.com/ Fixes: 68866e425be2 ("MD: no sync IO while suspended") Fixes: f52f5c71f3d4 ("md: fix stopping sync thread") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240201092559.910982-2-yukuai1@huaweicloud.com --- drivers/md/md.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e52896c140bd..eea8dd529b5e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9473,9 +9473,6 @@ not_running: */ void md_check_recovery(struct mddev *mddev) { - if (READ_ONCE(mddev->suspended)) - return; - if (mddev->bitmap) md_bitmap_daemon_work(mddev); From 55a48ad2db64737f7ffc0407634218cc6e4c513b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 1 Feb 2024 17:25:47 +0800 Subject: [PATCH 071/327] md: Don't ignore read-only array in md_check_recovery() Usually if the array is not read-write, md_check_recovery() won't register new sync_thread in the first place. And if the array is read-write and sync_thread is registered, md_set_readonly() will unregister sync_thread before setting the array read-only. md/raid follow this behavior hence there is no problem. After commit f52f5c71f3d4 ("md: fix stopping sync thread"), following hang can be triggered by test shell/integrity-caching.sh: 1) array is read-only. dm-raid update super block: rs_update_sbs ro = mddev->ro mddev->ro = 0 -> set array read-write md_update_sb 2) register new sync thread concurrently. 3) dm-raid set array back to read-only: rs_update_sbs mddev->ro = ro 4) stop the array: raid_dtr md_stop stop_sync_thread set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_wakeup_thread_directly(mddev->sync_thread); wait_event(..., !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5) sync thread done: md_do_sync set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); 6) daemon thread can't unregister sync thread: md_check_recovery if (!md_is_rdwr(mddev) && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; -> -> MD_RECOVERY_RUNNING can't be cleared, hence step 4 hang; The root cause is that dm-raid manipulate 'mddev->ro' by itself, however, dm-raid really should stop sync thread before setting the array read-only. Unfortunately, I need to read more code before I can refacter the handler of 'mddev->ro' in dm-raid, hence let's fix the problem the easy way for now to prevent dm-raid regression. Reported-by: Mikulas Patocka Closes: https://lore.kernel.org/all/9801e40-8ac7-e225-6a71-309dcf9dc9aa@redhat.com/ Fixes: ecbfb9f118bc ("dm raid: add raid level takeover support") Fixes: f52f5c71f3d4 ("md: fix stopping sync thread") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240201092559.910982-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index eea8dd529b5e..2a1c19f87d01 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9449,6 +9449,20 @@ not_running: sysfs_notify_dirent_safe(mddev->sysfs_action); } +static void unregister_sync_thread(struct mddev *mddev) +{ + if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + return; + } + + if (WARN_ON_ONCE(!mddev->sync_thread)) + return; + + md_reap_sync_thread(mddev); +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -9486,7 +9500,8 @@ void md_check_recovery(struct mddev *mddev) } if (!md_is_rdwr(mddev) && - !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; if ( ! ( (mddev->sb_flags & ~ (1<recovery)) { - /* sync_work already queued. */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + unregister_sync_thread(mddev); goto unlock; } @@ -9572,16 +9586,7 @@ void md_check_recovery(struct mddev *mddev) * still set. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { - /* resync/recovery still happening */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - - if (WARN_ON_ONCE(!mddev->sync_thread)) - goto unlock; - - md_reap_sync_thread(mddev); + unregister_sync_thread(mddev); goto unlock; } From 82ec0ae59d02e89164b24c0cc8e4e50de78b5fd6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 1 Feb 2024 17:25:48 +0800 Subject: [PATCH 072/327] md: Make sure md_do_sync() will set MD_RECOVERY_DONE stop_sync_thread() will interrupt md_do_sync(), and md_do_sync() must set MD_RECOVERY_DONE, so that follow up md_check_recovery() will unregister sync_thread, clear MD_RECOVERY_RUNNING and wake up stop_sync_thread(). If MD_RECOVERY_WAIT is set or the array is read-only, md_do_sync() will return without setting MD_RECOVERY_DONE, and after commit f52f5c71f3d4 ("md: fix stopping sync thread"), dm-raid switch from md_reap_sync_thread() to stop_sync_thread() to unregister sync_thread from md_stop() and md_stop_writes(), causing the test shell/lvconvert-raid-reshape.sh hang. We shouldn't switch back to md_reap_sync_thread() because it's problematic in the first place. Fix the problem by making sure md_do_sync() will set MD_RECOVERY_DONE. Reported-by: Mikulas Patocka Closes: https://lore.kernel.org/all/ece2b06f-d647-6613-a534-ff4c9bec1142@redhat.com/ Fixes: d5d885fd514f ("md: introduce new personality funciton start()") Fixes: 5fd6c1dce06e ("[PATCH] md: allow checkpoint of recovery with version-1 superblock") Fixes: f52f5c71f3d4 ("md: fix stopping sync thread") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240201092559.910982-4-yukuai1@huaweicloud.com --- drivers/md/md.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2a1c19f87d01..fbf04160d009 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8792,12 +8792,16 @@ void md_do_sync(struct md_thread *thread) int ret; /* just incase thread restarts... */ - if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto skip; + + if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || + !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - return; + goto skip; } if (mddev_is_clustered(mddev)) { From ad39c08186f8a0f221337985036ba86731d6aafe Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 1 Feb 2024 17:25:49 +0800 Subject: [PATCH 073/327] md: Don't register sync_thread for reshape directly Currently, if reshape is interrupted, then reassemble the array will register sync_thread directly from pers->run(), in this case 'MD_RECOVERY_RUNNING' is set directly, however, there is no guarantee that md_do_sync() will be executed, hence stop_sync_thread() will hang because 'MD_RECOVERY_RUNNING' can't be cleared. Last patch make sure that md_do_sync() will set MD_RECOVERY_DONE, however, following hang can still be triggered by dm-raid test shell/lvconvert-raid-reshape.sh occasionally: [root@fedora ~]# cat /proc/1982/stack [<0>] stop_sync_thread+0x1ab/0x270 [md_mod] [<0>] md_frozen_sync_thread+0x5c/0xa0 [md_mod] [<0>] raid_presuspend+0x1e/0x70 [dm_raid] [<0>] dm_table_presuspend_targets+0x40/0xb0 [dm_mod] [<0>] __dm_destroy+0x2a5/0x310 [dm_mod] [<0>] dm_destroy+0x16/0x30 [dm_mod] [<0>] dev_remove+0x165/0x290 [dm_mod] [<0>] ctl_ioctl+0x4bb/0x7b0 [dm_mod] [<0>] dm_ctl_ioctl+0x11/0x20 [dm_mod] [<0>] vfs_ioctl+0x21/0x60 [<0>] __x64_sys_ioctl+0xb9/0xe0 [<0>] do_syscall_64+0xc6/0x230 [<0>] entry_SYSCALL_64_after_hwframe+0x6c/0x74 Meanwhile mddev->recovery is: MD_RECOVERY_RUNNING | MD_RECOVERY_INTR | MD_RECOVERY_RESHAPE | MD_RECOVERY_FROZEN Fix this problem by remove the code to register sync_thread directly from raid10 and raid5. And let md_check_recovery() to register sync_thread. Fixes: f67055780caa ("[PATCH] md: Checkpoint and allow restart of raid5 reshape") Fixes: f52f5c71f3d4 ("md: fix stopping sync thread") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240201092559.910982-5-yukuai1@huaweicloud.com --- drivers/md/md.c | 5 ++++- drivers/md/raid10.c | 16 ++-------------- drivers/md/raid5.c | 29 ++--------------------------- 3 files changed, 8 insertions(+), 42 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index fbf04160d009..dcde67a81647 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9376,6 +9376,7 @@ static void md_start_sync(struct work_struct *ws) struct mddev *mddev = container_of(ws, struct mddev, sync_work); int spares = 0; bool suspend = false; + char *name; if (md_spares_need_change(mddev)) suspend = true; @@ -9408,8 +9409,10 @@ static void md_start_sync(struct work_struct *ws) if (spares) md_bitmap_write_all(mddev->bitmap); + name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? + "reshape" : "resync"; rcu_assign_pointer(mddev->sync_thread, - md_register_thread(md_do_sync, mddev, "resync")); + md_register_thread(md_do_sync, mddev, name)); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7412066ea22c..a5f8419e2df1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4175,11 +4175,7 @@ static int raid10_run(struct mddev *mddev) clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - rcu_assign_pointer(mddev->sync_thread, - md_register_thread(md_do_sync, mddev, "reshape")); - if (!mddev->sync_thread) - goto out_free_conf; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } return 0; @@ -4573,16 +4569,8 @@ out: clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - - rcu_assign_pointer(mddev->sync_thread, - md_register_thread(md_do_sync, mddev, "reshape")); - if (!mddev->sync_thread) { - ret = -EAGAIN; - goto abort; - } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); conf->reshape_checkpoint = jiffies; - md_wakeup_thread(mddev->sync_thread); md_new_event(); return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8497880135ee..6a7a32f7fb91 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7936,11 +7936,7 @@ static int raid5_run(struct mddev *mddev) clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - rcu_assign_pointer(mddev->sync_thread, - md_register_thread(md_do_sync, mddev, "reshape")); - if (!mddev->sync_thread) - goto abort; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } /* Ok, everything is just fine now */ @@ -8506,29 +8502,8 @@ static int raid5_start_reshape(struct mddev *mddev) clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - rcu_assign_pointer(mddev->sync_thread, - md_register_thread(md_do_sync, mddev, "reshape")); - if (!mddev->sync_thread) { - mddev->recovery = 0; - spin_lock_irq(&conf->device_lock); - write_seqcount_begin(&conf->gen_lock); - mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; - mddev->new_chunk_sectors = - conf->chunk_sectors = conf->prev_chunk_sectors; - mddev->new_layout = conf->algorithm = conf->prev_algo; - rdev_for_each(rdev, mddev) - rdev->new_data_offset = rdev->data_offset; - smp_wmb(); - conf->generation --; - conf->reshape_progress = MaxSector; - mddev->reshape_position = MaxSector; - write_seqcount_end(&conf->gen_lock); - spin_unlock_irq(&conf->device_lock); - return -EAGAIN; - } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); conf->reshape_checkpoint = jiffies; - md_wakeup_thread(mddev->sync_thread); md_new_event(); return 0; } From 9e46c70e829bddc24e04f963471e9983a11598b7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 1 Feb 2024 17:25:50 +0800 Subject: [PATCH 074/327] md: Don't suspend the array for interrupted reshape md_start_sync() will suspend the array if there are spares that can be added or removed from conf, however, if reshape is still in progress, this won't happen at all or data will be corrupted(remove_and_add_spares won't be called from md_choose_sync_action for reshape), hence there is no need to suspend the array if reshape is not done yet. Meanwhile, there is a potential deadlock for raid456: 1) reshape is interrupted; 2) set one of the disk WantReplacement, and add a new disk to the array, however, recovery won't start until the reshape is finished; 3) then issue an IO across reshpae position, this IO will wait for reshape to make progress; 4) continue to reshape, then md_start_sync() found there is a spare disk that can be added to conf, mddev_suspend() is called; Step 4 and step 3 is waiting for each other, deadlock triggered. Noted this problem is found by code review, and it's not reporduced yet. Fix this porblem by don't suspend the array for interrupted reshape, this is safe because conf won't be changed until reshape is done. Fixes: bc08041b32ab ("md: suspend array in md_start_sync() if array need reconfiguration") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240201092559.910982-6-yukuai1@huaweicloud.com --- drivers/md/md.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index dcde67a81647..9e41a9aaba8b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9378,12 +9378,17 @@ static void md_start_sync(struct work_struct *ws) bool suspend = false; char *name; - if (md_spares_need_change(mddev)) + /* + * If reshape is still in progress, spares won't be added or removed + * from conf until reshape is done. + */ + if (mddev->reshape_position == MaxSector && + md_spares_need_change(mddev)) { suspend = true; + mddev_suspend(mddev, false); + } - suspend ? mddev_suspend_and_lock_nointr(mddev) : - mddev_lock_nointr(mddev); - + mddev_lock_nointr(mddev); if (!md_is_rdwr(mddev)) { /* * On a read-only array we can: From ee0e39a63b78849f8abbef268b13e4838569f646 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 2 Feb 2024 18:39:33 +0800 Subject: [PATCH 075/327] x86/mm: Move is_vsyscall_vaddr() into asm/vsyscall.h Move is_vsyscall_vaddr() into asm/vsyscall.h to make it available for copy_from_kernel_nofault_allowed() in arch/x86/mm/maccess.c. Reviewed-by: Sohil Mehta Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20240202103935.3154011-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/x86/include/asm/vsyscall.h | 10 ++++++++++ arch/x86/mm/fault.c | 9 --------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index ab60a71a8dcb..472f0263dbc6 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -4,6 +4,7 @@ #include #include +#include #ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); @@ -24,4 +25,13 @@ static inline bool emulate_vsyscall(unsigned long error_code, } #endif +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. + */ +static inline bool is_vsyscall_vaddr(unsigned long vaddr) +{ + return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); +} + #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 679b09cfe241..d6375b3c633b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -798,15 +798,6 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, show_opcodes(regs, loglvl); } -/* - * The (legacy) vsyscall page is the long page in the kernel portion - * of the address space that has user-accessible permissions. - */ -static bool is_vsyscall_vaddr(unsigned long vaddr) -{ - return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); -} - static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) From 32019c659ecfe1d92e3bf9fcdfbb11a7c70acd58 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 2 Feb 2024 18:39:34 +0800 Subject: [PATCH 076/327] x86/mm: Disallow vsyscall page read for copy_from_kernel_nofault() When trying to use copy_from_kernel_nofault() to read vsyscall page through a bpf program, the following oops was reported: BUG: unable to handle page fault for address: ffffffffff600000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 3231067 P4D 3231067 PUD 3233067 PMD 3235067 PTE 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 20390 Comm: test_progs ...... 6.7.0+ #58 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... RIP: 0010:copy_from_kernel_nofault+0x6f/0x110 ...... Call Trace: ? copy_from_kernel_nofault+0x6f/0x110 bpf_probe_read_kernel+0x1d/0x50 bpf_prog_2061065e56845f08_do_probe_read+0x51/0x8d trace_call_bpf+0xc5/0x1c0 perf_call_bpf_enter.isra.0+0x69/0xb0 perf_syscall_enter+0x13e/0x200 syscall_trace_enter+0x188/0x1c0 do_syscall_64+0xb5/0xe0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 ...... ---[ end trace 0000000000000000 ]--- The oops is triggered when: 1) A bpf program uses bpf_probe_read_kernel() to read from the vsyscall page and invokes copy_from_kernel_nofault() which in turn calls __get_user_asm(). 2) Because the vsyscall page address is not readable from kernel space, a page fault exception is triggered accordingly. 3) handle_page_fault() considers the vsyscall page address as a user space address instead of a kernel space address. This results in the fix-up setup by bpf not being applied and a page_fault_oops() is invoked due to SMAP. Considering handle_page_fault() has already considered the vsyscall page address as a userspace address, fix the problem by disallowing vsyscall page read for copy_from_kernel_nofault(). Originally-by: Thomas Gleixner Reported-by: syzbot+72aa0161922eba61b50e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/CAG48ez06TZft=ATH1qh2c5mpS5BT8UakwNkzi6nvK5_djC-4Nw@mail.gmail.com Reported-by: xingwei lee Closes: https://lore.kernel.org/bpf/CABOYnLynjBoFZOf3Z4BhaZkc5hx_kHfsjiW+UWLoB=w33LvScw@mail.gmail.com Signed-off-by: Hou Tao Reviewed-by: Sohil Mehta Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240202103935.3154011-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/x86/mm/maccess.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c index 6993f026adec..42115ac079cf 100644 --- a/arch/x86/mm/maccess.c +++ b/arch/x86/mm/maccess.c @@ -3,6 +3,8 @@ #include #include +#include + #ifdef CONFIG_X86_64 bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { @@ -15,6 +17,14 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) if (vaddr < TASK_SIZE_MAX + PAGE_SIZE) return false; + /* + * Reading from the vsyscall page may cause an unhandled fault in + * certain cases. Though it is at an address above TASK_SIZE_MAX, it is + * usually considered as a user space address. + */ + if (is_vsyscall_vaddr(vaddr)) + return false; + /* * Allow everything during early boot before 'x86_virt_bits' * is initialized. Needed for instruction decoding in early From be66d79189ec8a1006ec6ec302bb27b160b3e6ce Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 2 Feb 2024 18:39:35 +0800 Subject: [PATCH 077/327] selftest/bpf: Test the read of vsyscall page under x86-64 Under x86-64, when using bpf_probe_read_kernel{_str}() or bpf_probe_read{_str}() to read vsyscall page, the read may trigger oops, so add one test case to ensure that the problem is fixed. Beside those four bpf helpers mentioned above, testing the read of vsyscall page by using bpf_probe_read_user{_str} and bpf_copy_from_user{_task}() as well. The test case passes the address of vsyscall page to these six helpers and checks whether the returned values are expected: 1) For bpf_probe_read_kernel{_str}()/bpf_probe_read{_str}(), the expected return value is -ERANGE as shown below: bpf_probe_read_kernel_common copy_from_kernel_nofault // false, return -ERANGE copy_from_kernel_nofault_allowed 2) For bpf_probe_read_user{_str}(), the expected return value is -EFAULT as show below: bpf_probe_read_user_common copy_from_user_nofault // false, return -EFAULT __access_ok 3) For bpf_copy_from_user(), the expected return value is -EFAULT: // return -EFAULT bpf_copy_from_user copy_from_user _copy_from_user // return false access_ok 4) For bpf_copy_from_user_task(), the expected return value is -EFAULT: // return -EFAULT bpf_copy_from_user_task access_process_vm // return 0 vma_lookup() // return 0 expand_stack() The occurrence of oops depends on the availability of CPU SMAP [1] feature and there are three possible configurations of vsyscall page in the boot cmd-line: vsyscall={xonly|none|emulate}, so there are a total of six possible combinations. Under all these combinations, the test case runs successfully. [1]: https://en.wikipedia.org/wiki/Supervisor_Mode_Access_Prevention Acked-by: Yonghong Song Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20240202103935.3154011-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/read_vsyscall.c | 57 +++++++++++++++++++ .../selftests/bpf/progs/read_vsyscall.c | 45 +++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/read_vsyscall.c create mode 100644 tools/testing/selftests/bpf/progs/read_vsyscall.c diff --git a/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c new file mode 100644 index 000000000000..3405923fe4e6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2024. Huawei Technologies Co., Ltd */ +#include "test_progs.h" +#include "read_vsyscall.skel.h" + +#if defined(__x86_64__) +/* For VSYSCALL_ADDR */ +#include +#else +/* To prevent build failure on non-x86 arch */ +#define VSYSCALL_ADDR 0UL +#endif + +struct read_ret_desc { + const char *name; + int ret; +} all_read[] = { + { .name = "probe_read_kernel", .ret = -ERANGE }, + { .name = "probe_read_kernel_str", .ret = -ERANGE }, + { .name = "probe_read", .ret = -ERANGE }, + { .name = "probe_read_str", .ret = -ERANGE }, + { .name = "probe_read_user", .ret = -EFAULT }, + { .name = "probe_read_user_str", .ret = -EFAULT }, + { .name = "copy_from_user", .ret = -EFAULT }, + { .name = "copy_from_user_task", .ret = -EFAULT }, +}; + +void test_read_vsyscall(void) +{ + struct read_vsyscall *skel; + unsigned int i; + int err; + +#if !defined(__x86_64__) + test__skip(); + return; +#endif + skel = read_vsyscall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "read_vsyscall open_load")) + return; + + skel->bss->target_pid = getpid(); + err = read_vsyscall__attach(skel); + if (!ASSERT_EQ(err, 0, "read_vsyscall attach")) + goto out; + + /* userspace may don't have vsyscall page due to LEGACY_VSYSCALL_NONE, + * but it doesn't affect the returned error codes. + */ + skel->bss->user_ptr = (void *)VSYSCALL_ADDR; + usleep(1); + + for (i = 0; i < ARRAY_SIZE(all_read); i++) + ASSERT_EQ(skel->bss->read_ret[i], all_read[i].ret, all_read[i].name); +out: + read_vsyscall__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/read_vsyscall.c b/tools/testing/selftests/bpf/progs/read_vsyscall.c new file mode 100644 index 000000000000..986f96687ae1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/read_vsyscall.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2024. Huawei Technologies Co., Ltd */ +#include +#include + +#include "bpf_misc.h" + +int target_pid = 0; +void *user_ptr = 0; +int read_ret[8]; + +char _license[] SEC("license") = "GPL"; + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int do_probe_read(void *ctx) +{ + char buf[8]; + + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + read_ret[0] = bpf_probe_read_kernel(buf, sizeof(buf), user_ptr); + read_ret[1] = bpf_probe_read_kernel_str(buf, sizeof(buf), user_ptr); + read_ret[2] = bpf_probe_read(buf, sizeof(buf), user_ptr); + read_ret[3] = bpf_probe_read_str(buf, sizeof(buf), user_ptr); + read_ret[4] = bpf_probe_read_user(buf, sizeof(buf), user_ptr); + read_ret[5] = bpf_probe_read_user_str(buf, sizeof(buf), user_ptr); + + return 0; +} + +SEC("fentry.s/" SYS_PREFIX "sys_nanosleep") +int do_copy_from_user(void *ctx) +{ + char buf[8]; + + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + read_ret[6] = bpf_copy_from_user(buf, sizeof(buf), user_ptr); + read_ret[7] = bpf_copy_from_user_task(buf, sizeof(buf), user_ptr, + bpf_get_current_task_btf(), 0); + + return 0; +} From b4ea9b6a18ebf7f9f3a7a60f82e925186978cfcf Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 14 Feb 2024 17:32:40 +0100 Subject: [PATCH 078/327] net/iucv: fix the allocation size of iucv_path_table array iucv_path_table is a dynamically allocated array of pointers to struct iucv_path items. Yet, its size is calculated as if it was an array of struct iucv_path items. Signed-off-by: Alexander Gordeev Reviewed-by: Alexandra Winter Signed-off-by: David S. Miller --- net/iucv/iucv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 6334f64f04d5..b0b3e9c5af44 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -156,7 +156,7 @@ static char iucv_error_pathid[16] = "INVALID PATHID"; static LIST_HEAD(iucv_handler_list); /* - * iucv_path_table: an array of iucv_path structures. + * iucv_path_table: array of pointers to iucv_path structures. */ static struct iucv_path **iucv_path_table; static unsigned long iucv_max_pathid; @@ -544,7 +544,7 @@ static int iucv_enable(void) cpus_read_lock(); rc = -ENOMEM; - alloc_size = iucv_max_pathid * sizeof(struct iucv_path); + alloc_size = iucv_max_pathid * sizeof(*iucv_path_table); iucv_path_table = kzalloc(alloc_size, GFP_KERNEL); if (!iucv_path_table) goto out; From dc489f86257cab5056e747344f17a164f63bff4b Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 14 Feb 2024 22:40:03 +0100 Subject: [PATCH 079/327] net: bridge: switchdev: Skip MDB replays of deferred events on offload Before this change, generation of the list of MDB events to replay would race against the creation of new group memberships, either from the IGMP/MLD snooping logic or from user configuration. While new memberships are immediately visible to walkers of br->mdb_list, the notification of their existence to switchdev event subscribers is deferred until a later point in time. So if a replay list was generated during a time that overlapped with such a window, it would also contain a replay of the not-yet-delivered event. The driver would thus receive two copies of what the bridge internally considered to be one single event. On destruction of the bridge, only a single membership deletion event was therefore sent. As a consequence of this, drivers which reference count memberships (at least DSA), would be left with orphan groups in their hardware database when the bridge was destroyed. This is only an issue when replaying additions. While deletion events may still be pending on the deferred queue, they will already have been removed from br->mdb_list, so no duplicates can be generated in that scenario. To a user this meant that old group memberships, from a bridge in which a port was previously attached, could be reanimated (in hardware) when the port joined a new bridge, without the new bridge's knowledge. For example, on an mv88e6xxx system, create a snooping bridge and immediately add a port to it: root@infix-06-0b-00:~$ ip link add dev br0 up type bridge mcast_snooping 1 && \ > ip link set dev x3 up master br0 And then destroy the bridge: root@infix-06-0b-00:~$ ip link del dev br0 root@infix-06-0b-00:~$ mvls atu ADDRESS FID STATE Q F 0 1 2 3 4 5 6 7 8 9 a DEV:0 Marvell 88E6393X 33:33:00:00:00:6a 1 static - - 0 . . . . . . . . . . 33:33:ff:87:e4:3f 1 static - - 0 . . . . . . . . . . ff:ff:ff:ff:ff:ff 1 static - - 0 1 2 3 4 5 6 7 8 9 a root@infix-06-0b-00:~$ The two IPv6 groups remain in the hardware database because the port (x3) is notified of the host's membership twice: once via the original event and once via a replay. Since only a single delete notification is sent, the count remains at 1 when the bridge is destroyed. Then add the same port (or another port belonging to the same hardware domain) to a new bridge, this time with snooping disabled: root@infix-06-0b-00:~$ ip link add dev br1 up type bridge mcast_snooping 0 && \ > ip link set dev x3 up master br1 All multicast, including the two IPv6 groups from br0, should now be flooded, according to the policy of br1. But instead the old memberships are still active in the hardware database, causing the switch to only forward traffic to those groups towards the CPU (port 0). Eliminate the race in two steps: 1. Grab the write-side lock of the MDB while generating the replay list. This prevents new memberships from showing up while we are generating the replay list. But it leaves the scenario in which a deferred event was already generated, but not delivered, before we grabbed the lock. Therefore: 2. Make sure that no deferred version of a replay event is already enqueued to the switchdev deferred queue, before adding it to the replay list, when replaying additions. Fixes: 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries") Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/switchdev.h | 3 ++ net/bridge/br_switchdev.c | 76 ++++++++++++++++++++++++--------------- net/switchdev/switchdev.c | 73 +++++++++++++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 29 deletions(-) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index a43062d4c734..8346b0d29542 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -308,6 +308,9 @@ void switchdev_deferred_process(void); int switchdev_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack); +bool switchdev_port_obj_act_is_deferred(struct net_device *dev, + enum switchdev_notifier_type nt, + const struct switchdev_obj *obj); int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index ee84e783e1df..6a7cb01f121c 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -595,21 +595,40 @@ br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, } static int br_switchdev_mdb_queue_one(struct list_head *mdb_list, + struct net_device *dev, + unsigned long action, enum switchdev_obj_id id, const struct net_bridge_mdb_entry *mp, struct net_device *orig_dev) { - struct switchdev_obj_port_mdb *mdb; + struct switchdev_obj_port_mdb mdb = { + .obj = { + .id = id, + .orig_dev = orig_dev, + }, + }; + struct switchdev_obj_port_mdb *pmdb; - mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC); - if (!mdb) + br_switchdev_mdb_populate(&mdb, mp); + + if (action == SWITCHDEV_PORT_OBJ_ADD && + switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) { + /* This event is already in the deferred queue of + * events, so this replay must be elided, lest the + * driver receives duplicate events for it. This can + * only happen when replaying additions, since + * modifications are always immediately visible in + * br->mdb_list, whereas actual event delivery may be + * delayed. + */ + return 0; + } + + pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC); + if (!pmdb) return -ENOMEM; - mdb->obj.id = id; - mdb->obj.orig_dev = orig_dev; - br_switchdev_mdb_populate(mdb, mp); - list_add_tail(&mdb->obj.list, mdb_list); - + list_add_tail(&pmdb->obj.list, mdb_list); return 0; } @@ -677,51 +696,50 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev, if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; - /* We cannot walk over br->mdb_list protected just by the rtnl_mutex, - * because the write-side protection is br->multicast_lock. But we - * need to emulate the [ blocking ] calling context of a regular - * switchdev event, so since both br->multicast_lock and RCU read side - * critical sections are atomic, we have no choice but to pick the RCU - * read side lock, queue up all our events, leave the critical section - * and notify switchdev from blocking context. - */ - rcu_read_lock(); + if (adding) + action = SWITCHDEV_PORT_OBJ_ADD; + else + action = SWITCHDEV_PORT_OBJ_DEL; - hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { + /* br_switchdev_mdb_queue_one() will take care to not queue a + * replay of an event that is already pending in the switchdev + * deferred queue. In order to safely determine that, there + * must be no new deferred MDB notifications enqueued for the + * duration of the MDB scan. Therefore, grab the write-side + * lock to avoid racing with any concurrent IGMP/MLD snooping. + */ + spin_lock_bh(&br->multicast_lock); + + hlist_for_each_entry(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group __rcu * const *pp; const struct net_bridge_port_group *p; if (mp->host_joined) { - err = br_switchdev_mdb_queue_one(&mdb_list, + err = br_switchdev_mdb_queue_one(&mdb_list, dev, action, SWITCHDEV_OBJ_ID_HOST_MDB, mp, br_dev); if (err) { - rcu_read_unlock(); + spin_unlock_bh(&br->multicast_lock); goto out_free_mdb; } } - for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; + for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p->key.port->dev != dev) continue; - err = br_switchdev_mdb_queue_one(&mdb_list, + err = br_switchdev_mdb_queue_one(&mdb_list, dev, action, SWITCHDEV_OBJ_ID_PORT_MDB, mp, dev); if (err) { - rcu_read_unlock(); + spin_unlock_bh(&br->multicast_lock); goto out_free_mdb; } } } - rcu_read_unlock(); - - if (adding) - action = SWITCHDEV_PORT_OBJ_ADD; - else - action = SWITCHDEV_PORT_OBJ_DEL; + spin_unlock_bh(&br->multicast_lock); list_for_each_entry(obj, &mdb_list, list) { err = br_switchdev_mdb_replay_one(nb, dev, diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 5b045284849e..c9189a970eec 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -19,6 +19,35 @@ #include #include +static bool switchdev_obj_eq(const struct switchdev_obj *a, + const struct switchdev_obj *b) +{ + const struct switchdev_obj_port_vlan *va, *vb; + const struct switchdev_obj_port_mdb *ma, *mb; + + if (a->id != b->id || a->orig_dev != b->orig_dev) + return false; + + switch (a->id) { + case SWITCHDEV_OBJ_ID_PORT_VLAN: + va = SWITCHDEV_OBJ_PORT_VLAN(a); + vb = SWITCHDEV_OBJ_PORT_VLAN(b); + return va->flags == vb->flags && + va->vid == vb->vid && + va->changed == vb->changed; + case SWITCHDEV_OBJ_ID_PORT_MDB: + case SWITCHDEV_OBJ_ID_HOST_MDB: + ma = SWITCHDEV_OBJ_PORT_MDB(a); + mb = SWITCHDEV_OBJ_PORT_MDB(b); + return ma->vid == mb->vid && + ether_addr_equal(ma->addr, mb->addr); + default: + break; + } + + BUG(); +} + static LIST_HEAD(deferred); static DEFINE_SPINLOCK(deferred_lock); @@ -307,6 +336,50 @@ int switchdev_port_obj_del(struct net_device *dev, } EXPORT_SYMBOL_GPL(switchdev_port_obj_del); +/** + * switchdev_port_obj_act_is_deferred - Is object action pending? + * + * @dev: port device + * @nt: type of action; add or delete + * @obj: object to test + * + * Returns true if a deferred item is pending, which is + * equivalent to the action @nt on an object @obj. + * + * rtnl_lock must be held. + */ +bool switchdev_port_obj_act_is_deferred(struct net_device *dev, + enum switchdev_notifier_type nt, + const struct switchdev_obj *obj) +{ + struct switchdev_deferred_item *dfitem; + bool found = false; + + ASSERT_RTNL(); + + spin_lock_bh(&deferred_lock); + + list_for_each_entry(dfitem, &deferred, list) { + if (dfitem->dev != dev) + continue; + + if ((dfitem->func == switchdev_port_obj_add_deferred && + nt == SWITCHDEV_PORT_OBJ_ADD) || + (dfitem->func == switchdev_port_obj_del_deferred && + nt == SWITCHDEV_PORT_OBJ_DEL)) { + if (switchdev_obj_eq((const void *)dfitem->data, obj)) { + found = true; + break; + } + } + } + + spin_unlock_bh(&deferred_lock); + + return found; +} +EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred); + static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain); From f7a70d650b0b6b0134ccba763d672c8439d9f09b Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 14 Feb 2024 22:40:04 +0100 Subject: [PATCH 080/327] net: bridge: switchdev: Ensure deferred event delivery on unoffload When unoffloading a device, it is important to ensure that all relevant deferred events are delivered to it before it disassociates itself from the bridge. Before this change, this was true for the normal case when a device maps 1:1 to a net_bridge_port, i.e. br0 / swp0 When swp0 leaves br0, the call to switchdev_deferred_process() in del_nbp() makes sure to process any outstanding events while the device is still associated with the bridge. In the case when the association is indirect though, i.e. when the device is attached to the bridge via an intermediate device, like a LAG... br0 / lag0 / swp0 ...then detaching swp0 from lag0 does not cause any net_bridge_port to be deleted, so there was no guarantee that all events had been processed before the device disassociated itself from the bridge. Fix this by always synchronously processing all deferred events before signaling completion of unoffloading back to the driver. Fixes: 4e51bf44a03a ("net: bridge: move the switchdev object replay helpers to "push" mode") Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/bridge/br_switchdev.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 6a7cb01f121c..7b41ee8740cb 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -804,6 +804,16 @@ static void nbp_switchdev_unsync_objs(struct net_bridge_port *p, br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL); br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL); + + /* Make sure that the device leaving this bridge has seen all + * relevant events before it is disassociated. In the normal + * case, when the device is directly attached to the bridge, + * this is covered by del_nbp(). If the association was indirect + * however, e.g. via a team or bond, and the device is leaving + * that intermediate device, then the bridge port remains in + * place. + */ + switchdev_deferred_process(); } /* Let the bridge know that this port is offloaded, so that it can assign a From 66b60b0c8c4a163b022a9f0ad6769b0fd3dc662f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 14 Feb 2024 11:13:08 -0800 Subject: [PATCH 081/327] dccp/tcp: Unhash sk from ehash for tb2 alloc failure after check_estalblished(). syzkaller reported a warning [0] in inet_csk_destroy_sock() with no repro. WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); However, the syzkaller's log hinted that connect() failed just before the warning due to FAULT_INJECTION. [1] When connect() is called for an unbound socket, we search for an available ephemeral port. If a bhash bucket exists for the port, we call __inet_check_established() or __inet6_check_established() to check if the bucket is reusable. If reusable, we add the socket into ehash and set inet_sk(sk)->inet_num. Later, we look up the corresponding bhash2 bucket and try to allocate it if it does not exist. Although it rarely occurs in real use, if the allocation fails, we must revert the changes by check_established(). Otherwise, an unconnected socket could illegally occupy an ehash entry. Note that we do not put tw back into ehash because sk might have already responded to a packet for tw and it would be better to free tw earlier under such memory presure. [0]: WARNING: CPU: 0 PID: 350830 at net/ipv4/inet_connection_sock.c:1193 inet_csk_destroy_sock (net/ipv4/inet_connection_sock.c:1193) Modules linked in: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:inet_csk_destroy_sock (net/ipv4/inet_connection_sock.c:1193) Code: 41 5c 41 5d 41 5e e9 2d 4a 3d fd e8 28 4a 3d fd 48 89 ef e8 f0 cd 7d ff 5b 5d 41 5c 41 5d 41 5e e9 13 4a 3d fd e8 0e 4a 3d fd <0f> 0b e9 61 fe ff ff e8 02 4a 3d fd 4c 89 e7 be 03 00 00 00 e8 05 RSP: 0018:ffffc9000b21fd38 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000009e78 RCX: ffffffff840bae40 RDX: ffff88806e46c600 RSI: ffffffff840bb012 RDI: ffff88811755cca8 RBP: ffff88811755c880 R08: 0000000000000003 R09: 0000000000000000 R10: 0000000000009e78 R11: 0000000000000000 R12: ffff88811755c8e0 R13: ffff88811755c892 R14: ffff88811755c918 R15: 0000000000000000 FS: 00007f03e5243800(0000) GS:ffff88811ae00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b32f21000 CR3: 0000000112ffe001 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: ? inet_csk_destroy_sock (net/ipv4/inet_connection_sock.c:1193) dccp_close (net/dccp/proto.c:1078) inet_release (net/ipv4/af_inet.c:434) __sock_release (net/socket.c:660) sock_close (net/socket.c:1423) __fput (fs/file_table.c:377) __fput_sync (fs/file_table.c:462) __x64_sys_close (fs/open.c:1557 fs/open.c:1539 fs/open.c:1539) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) RIP: 0033:0x7f03e53852bb Code: 03 00 00 00 0f 05 48 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 43 c9 f5 ff 8b 7c 24 0c 41 89 c0 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 35 44 89 c7 89 44 24 0c e8 a1 c9 f5 ff 8b 44 RSP: 002b:00000000005dfba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000003 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f03e53852bb RDX: 0000000000000002 RSI: 0000000000000002 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000000 R09: 000000000000167c R10: 0000000008a79680 R11: 0000000000000293 R12: 00007f03e4e43000 R13: 00007f03e4e43170 R14: 00007f03e4e43178 R15: 00007f03e4e43170 [1]: FAULT_INJECTION: forcing a failure. name failslab, interval 1, probability 0, space 0, times 0 CPU: 0 PID: 350833 Comm: syz-executor.1 Not tainted 6.7.0-12272-g2121c43f88f5 #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1)) should_fail_ex (lib/fault-inject.c:52 lib/fault-inject.c:153) should_failslab (mm/slub.c:3748) kmem_cache_alloc (mm/slub.c:3763 mm/slub.c:3842 mm/slub.c:3867) inet_bind2_bucket_create (net/ipv4/inet_hashtables.c:135) __inet_hash_connect (net/ipv4/inet_hashtables.c:1100) dccp_v4_connect (net/dccp/ipv4.c:116) __inet_stream_connect (net/ipv4/af_inet.c:676) inet_stream_connect (net/ipv4/af_inet.c:747) __sys_connect_file (net/socket.c:2048 (discriminator 2)) __sys_connect (net/socket.c:2065) __x64_sys_connect (net/socket.c:2072) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) RIP: 0033:0x7f03e5284e5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 73 9f 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007f03e4641cc8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 00000000004bbf80 RCX: 00007f03e5284e5d RDX: 0000000000000010 RSI: 0000000020000000 RDI: 0000000000000003 RBP: 00000000004bbf80 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 R13: 000000000000000b R14: 00007f03e52e5530 R15: 0000000000000000 Reported-by: syzkaller Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 93e9193df544..308ff34002ea 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1130,10 +1130,33 @@ ok: return 0; error: + if (sk_hashed(sk)) { + spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash); + + sock_prot_inuse_add(net, sk->sk_prot, -1); + + spin_lock(lock); + sk_nulls_del_node_init_rcu(sk); + spin_unlock(lock); + + sk->sk_hash = 0; + inet_sk(sk)->inet_sport = 0; + inet_sk(sk)->inet_num = 0; + + if (tw) + inet_twsk_bind_unhash(tw, hinfo); + } + spin_unlock(&head2->lock); if (tb_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); - spin_unlock_bh(&head->lock); + spin_unlock(&head->lock); + + if (tw) + inet_twsk_deschedule_put(tw); + + local_bh_enable(); + return -ENOMEM; } From a9f80df4f51440303d063b55bb98720857693821 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 14 Feb 2024 23:00:50 -0800 Subject: [PATCH 082/327] net: ethernet: adi: requires PHYLIB support This driver uses functions that are supplied by the Kconfig symbol PHYLIB, so select it to ensure that they are built as needed. When CONFIG_ADIN1110=y and CONFIG_PHYLIB=m, there are multiple build (linker) errors that are resolved by this Kconfig change: ld: drivers/net/ethernet/adi/adin1110.o: in function `adin1110_net_open': drivers/net/ethernet/adi/adin1110.c:933: undefined reference to `phy_start' ld: drivers/net/ethernet/adi/adin1110.o: in function `adin1110_probe_netdevs': drivers/net/ethernet/adi/adin1110.c:1603: undefined reference to `get_phy_device' ld: drivers/net/ethernet/adi/adin1110.c:1609: undefined reference to `phy_connect' ld: drivers/net/ethernet/adi/adin1110.o: in function `adin1110_disconnect_phy': drivers/net/ethernet/adi/adin1110.c:1226: undefined reference to `phy_disconnect' ld: drivers/net/ethernet/adi/adin1110.o: in function `devm_mdiobus_alloc': include/linux/phy.h:455: undefined reference to `devm_mdiobus_alloc_size' ld: drivers/net/ethernet/adi/adin1110.o: in function `adin1110_register_mdiobus': drivers/net/ethernet/adi/adin1110.c:529: undefined reference to `__devm_mdiobus_register' ld: drivers/net/ethernet/adi/adin1110.o: in function `adin1110_net_stop': drivers/net/ethernet/adi/adin1110.c:958: undefined reference to `phy_stop' ld: drivers/net/ethernet/adi/adin1110.o: in function `adin1110_disconnect_phy': drivers/net/ethernet/adi/adin1110.c:1226: undefined reference to `phy_disconnect' ld: drivers/net/ethernet/adi/adin1110.o: in function `adin1110_adjust_link': drivers/net/ethernet/adi/adin1110.c:1077: undefined reference to `phy_print_status' ld: drivers/net/ethernet/adi/adin1110.o: in function `adin1110_ioctl': drivers/net/ethernet/adi/adin1110.c:790: undefined reference to `phy_do_ioctl' ld: drivers/net/ethernet/adi/adin1110.o:(.rodata+0xf60): undefined reference to `phy_ethtool_get_link_ksettings' ld: drivers/net/ethernet/adi/adin1110.o:(.rodata+0xf68): undefined reference to `phy_ethtool_set_link_ksettings' Fixes: bc93e19d088b ("net: ethernet: adi: Add ADIN1110 support") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402070626.eZsfVHG5-lkp@intel.com/ Cc: Lennart Franzen Cc: Alexandru Tachici Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org Reviewed-by: Nuno Sa Signed-off-by: David S. Miller --- drivers/net/ethernet/adi/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/adi/Kconfig b/drivers/net/ethernet/adi/Kconfig index da3bdd302502..c91b4dcef4ec 100644 --- a/drivers/net/ethernet/adi/Kconfig +++ b/drivers/net/ethernet/adi/Kconfig @@ -7,6 +7,7 @@ config NET_VENDOR_ADI bool "Analog Devices devices" default y depends on SPI + select PHYLIB help If you have a network (Ethernet) card belonging to this class, say Y. From 52f671db18823089a02f07efc04efdb2272ddc17 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Feb 2024 06:33:45 -0800 Subject: [PATCH 083/327] net/sched: act_mirred: use the backlog for mirred ingress The test Davide added in commit ca22da2fbd69 ("act_mirred: use the backlog for nested calls to mirred ingress") hangs our testing VMs every 10 or so runs, with the familiar tcp_v4_rcv -> tcp_v4_rcv deadlock reported by lockdep. The problem as previously described by Davide (see Link) is that if we reverse flow of traffic with the redirect (egress -> ingress) we may reach the same socket which generated the packet. And we may still be holding its socket lock. The common solution to such deadlocks is to put the packet in the Rx backlog, rather than run the Rx path inline. Do that for all egress -> ingress reversals, not just once we started to nest mirred calls. In the past there was a concern that the backlog indirection will lead to loss of error reporting / less accurate stats. But the current workaround does not seem to address the issue. Fixes: 53592b364001 ("net/sched: act_mirred: Implement ingress actions") Cc: Marcelo Ricardo Leitner Suggested-by: Davide Caratti Link: https://lore.kernel.org/netdev/33dc43f587ec1388ba456b4915c75f02a8aae226.1663945716.git.dcaratti@redhat.com/ Signed-off-by: Jakub Kicinski Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 14 +++++--------- .../testing/selftests/net/forwarding/tc_actions.sh | 3 --- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 0a1a9e40f237..291d47c9eb69 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -232,18 +232,14 @@ release_idr: return err; } -static bool is_mirred_nested(void) -{ - return unlikely(__this_cpu_read(mirred_nest_level) > 1); -} - -static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb) +static int +tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb) { int err; if (!want_ingress) err = tcf_dev_queue_xmit(skb, dev_queue_xmit); - else if (is_mirred_nested()) + else if (!at_ingress) err = netif_rx(skb); else err = netif_receive_skb(skb); @@ -319,9 +315,9 @@ static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m, skb_set_redirected(skb_to_send, skb_to_send->tc_at_ingress); - err = tcf_mirred_forward(want_ingress, skb_to_send); + err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send); } else { - err = tcf_mirred_forward(want_ingress, skb_to_send); + err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send); } if (err) { diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index b0f5e55d2d0b..589629636502 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -235,9 +235,6 @@ mirred_egress_to_ingress_tcp_test() check_err $? "didn't mirred redirect ICMP" tc_check_packets "dev $h1 ingress" 102 10 check_err $? "didn't drop mirred ICMP" - local overlimits=$(tc_rule_stats_get ${h1} 101 egress .overlimits) - test ${overlimits} = 10 - check_err $? "wrong overlimits, expected 10 got ${overlimits}" tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower tc filter del dev $h1 egress protocol ip pref 101 handle 101 flower From 166c2c8a6a4dc2e4ceba9e10cfe81c3e469e3210 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Feb 2024 06:33:46 -0800 Subject: [PATCH 084/327] net/sched: act_mirred: don't override retval if we already lost the skb If we're redirecting the skb, and haven't called tcf_mirred_forward(), yet, we need to tell the core to drop the skb by setting the retcode to SHOT. If we have called tcf_mirred_forward(), however, the skb is out of our hands and returning SHOT will lead to UaF. Move the retval override to the error path which actually need it. Reviewed-by: Michal Swiatkowski Fixes: e5cf1baf92cb ("act_mirred: use TC_ACT_REINSERT when possible") Signed-off-by: Jakub Kicinski Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 291d47c9eb69..6faa7d00da09 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -266,8 +266,7 @@ static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m, if (unlikely(!(dev->flags & IFF_UP)) || !netif_carrier_ok(dev)) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); - err = -ENODEV; - goto out; + goto err_cant_do; } /* we could easily avoid the clone only if called by ingress and clsact; @@ -279,10 +278,8 @@ static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m, tcf_mirred_can_reinsert(retval); if (!dont_clone) { skb_to_send = skb_clone(skb, GFP_ATOMIC); - if (!skb_to_send) { - err = -ENOMEM; - goto out; - } + if (!skb_to_send) + goto err_cant_do; } want_ingress = tcf_mirred_act_wants_ingress(m_eaction); @@ -319,15 +316,16 @@ static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m, } else { err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send); } - - if (err) { -out: + if (err) tcf_action_inc_overlimit_qstats(&m->common); - if (is_redirect) - retval = TC_ACT_SHOT; - } return retval; + +err_cant_do: + if (is_redirect) + retval = TC_ACT_SHOT; + tcf_action_inc_overlimit_qstats(&m->common); + return retval; } static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m, From 02f76a9cd4494719600baf1ab278930df39431ab Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Fri, 16 Feb 2024 15:30:48 +0530 Subject: [PATCH 085/327] drm/buddy: Modify duplicate list_splice_tail call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the duplicate list_splice_tail call when the total_allocated < size condition is true. Cc: # 6.7+ Fixes: 8746c6c9dfa3 ("drm/buddy: Fix alloc_range() error handling code") Reported-by: Bert Karwatzki Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20240216100048.4101-1-Arunpravin.PaneerSelvam@amd.com Signed-off-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index c1a99bf4dffd..c4222b886db7 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -538,13 +538,13 @@ static int __alloc_range(struct drm_buddy *mm, list_add(&block->left->tmp_link, dfs); } while (1); - list_splice_tail(&allocated, blocks); - if (total_allocated < size) { err = -ENOSPC; goto err_free; } + list_splice_tail(&allocated, blocks); + return 0; err_undo: From 0affdba22aca5573f9d989bcb1d71d32a6a03efe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Feb 2024 10:57:37 +0100 Subject: [PATCH 086/327] nouveau: fix function cast warnings clang-16 warns about casting between incompatible function types: drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c:161:10: error: cast from 'void (*)(const struct firmware *)' to 'void (*)(void *)' converts to incompatible function type [-Werror,-Wcast-function-type-strict] 161 | .fini = (void(*)(void *))release_firmware, This one was done to use the generic shadow_fw_release() function as a callback for struct nvbios_source. Change it to use the same prototype as the other five instances, with a trivial helper function that actually calls release_firmware. Fixes: 70c0f263cc2e ("drm/nouveau/bios: pull in basic vbios subdev, more to come later") Signed-off-by: Arnd Bergmann Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240213095753.455062-1-arnd@kernel.org --- drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c index 19188683c8fc..8c2bf1c16f2a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c @@ -154,11 +154,17 @@ shadow_fw_init(struct nvkm_bios *bios, const char *name) return (void *)fw; } +static void +shadow_fw_release(void *fw) +{ + release_firmware(fw); +} + static const struct nvbios_source shadow_fw = { .name = "firmware", .init = shadow_fw_init, - .fini = (void(*)(void *))release_firmware, + .fini = shadow_fw_release, .read = shadow_fw_read, .rw = false, }; From 65323796debe49a1922ba507020f7530a4b3f9af Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 13 Feb 2024 21:09:57 +0300 Subject: [PATCH 087/327] drm/nouveau/mmu/r535: uninitialized variable in r535_bar_new_() If gf100_bar_new_() fails then "bar" is not initialized. Fixes: 5bf0257136a2 ("drm/nouveau/mmu/r535: initial support") Signed-off-by: Dan Carpenter Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/dab21df7-4d90-4479-97d8-97e5d228c714@moroto.mountain --- drivers/gpu/drm/nouveau/nvkm/subdev/bar/r535.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/r535.c index 4135690326f4..3a30bea30e36 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/r535.c @@ -168,12 +168,11 @@ r535_bar_new_(const struct nvkm_bar_func *hw, struct nvkm_device *device, rm->flush = r535_bar_flush; ret = gf100_bar_new_(rm, device, type, inst, &bar); - *pbar = bar; if (ret) { - if (!bar) - kfree(rm); + kfree(rm); return ret; } + *pbar = bar; bar->flushBAR2PhysMode = ioremap(device->func->resource_addr(device, 3), PAGE_SIZE); if (!bar->flushBAR2PhysMode) From 250f5402e636a5cec9e0e95df252c3d54307210f Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Sun, 11 Feb 2024 23:43:14 +0100 Subject: [PATCH 088/327] parisc/ftrace: add missing CONFIG_DYNAMIC_FTRACE check Fixes a bug revealed by -Wmissing-prototypes when CONFIG_FUNCTION_GRAPH_TRACER is enabled but not CONFIG_DYNAMIC_FTRACE: arch/parisc/kernel/ftrace.c:82:5: error: no previous prototype for 'ftrace_enable_ftrace_graph_caller' [-Werror=missing-prototypes] 82 | int ftrace_enable_ftrace_graph_caller(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/parisc/kernel/ftrace.c:88:5: error: no previous prototype for 'ftrace_disable_ftrace_graph_caller' [-Werror=missing-prototypes] 88 | int ftrace_disable_ftrace_graph_caller(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Max Kellermann Signed-off-by: Helge Deller --- arch/parisc/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index d1defb9ede70..621a4b386ae4 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -78,7 +78,7 @@ asmlinkage void notrace __hot ftrace_function_trampoline(unsigned long parent, #endif } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_FUNCTION_GRAPH_TRACER) int ftrace_enable_ftrace_graph_caller(void) { static_key_enable(&ftrace_graph_enable.key); From f945a404ed8a6eeb9907ef122a8382df56c2a714 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 12 Feb 2024 00:09:45 +0100 Subject: [PATCH 089/327] parisc/kprobes: always include asm-generic/kprobes.h The NOKPROBE_SYMBOL macro (and others) were moved to asm-generic/kprobes.h in 2017 by commit 7d134b2ce639 ("kprobes: move kprobe declarations to asm-generic/kprobes.h"), and this new header was included by asm/kprobes.h unconditionally on all architectures. When kprobe support was added to parisc in 2017 by commit 8858ac8e9e9b1 ("parisc: Implement kprobes"), that header was only included when CONFIG_KPROBES was enabled. This can lead to build failures when NOKPROBE_SYMBOL is used, but CONFIG_KPROBES is disabled. This mistake however was never actually noticed because linux/kprobes.h also includes asm-generic/kprobes.h (though I do not understand why that is, because it also includes asm/kprobes.h). To prevent eventual build failures, I suggest to always include asm-generic/kprobes.h on parisc, just like all the other architectures do. This way, including asm/kprobes.h suffices, and nobody (outside of arch/) ever needs to explicitly include asm-generic/kprobes.h. Signed-off-by: Max Kellermann Signed-off-by: Helge Deller --- arch/parisc/include/asm/kprobes.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/parisc/include/asm/kprobes.h b/arch/parisc/include/asm/kprobes.h index 0a175ac87698..0f42f5c8e3b6 100644 --- a/arch/parisc/include/asm/kprobes.h +++ b/arch/parisc/include/asm/kprobes.h @@ -10,9 +10,10 @@ #ifndef _PARISC_KPROBES_H #define _PARISC_KPROBES_H +#include + #ifdef CONFIG_KPROBES -#include #include #include #include From 5429c8de56f6b2bd8f537df3a1e04e67b9c04282 Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 16 Feb 2024 15:04:17 -0600 Subject: [PATCH 090/327] block: sed-opal: handle empty atoms when parsing response The SED Opal response parsing function response_parse() does not handle the case of an empty atom in the response. This causes the entry count to be too high and the response fails to be parsed. Recognizing, but ignoring, empty atoms allows response handling to succeed. Signed-off-by: Greg Joyce Link: https://lore.kernel.org/r/20240216210417.3526064-2-gjoyce@linux.ibm.com Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/block/opal_proto.h b/block/opal_proto.h index dec7ce3a3edb..d247a457bf6e 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -71,6 +71,7 @@ enum opal_response_token { #define SHORT_ATOM_BYTE 0xBF #define MEDIUM_ATOM_BYTE 0xDF #define LONG_ATOM_BYTE 0xE3 +#define EMPTY_ATOM_BYTE 0xFF #define OPAL_INVAL_PARAM 12 #define OPAL_MANUFACTURED_INACTIVE 0x08 diff --git a/block/sed-opal.c b/block/sed-opal.c index 3d9e9cd250bd..fa4dba5d8531 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1056,16 +1056,20 @@ static int response_parse(const u8 *buf, size_t length, token_length = response_parse_medium(iter, pos); else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */ token_length = response_parse_long(iter, pos); + else if (pos[0] == EMPTY_ATOM_BYTE) /* empty atom */ + token_length = 1; else /* TOKEN */ token_length = response_parse_token(iter, pos); if (token_length < 0) return token_length; + if (pos[0] != EMPTY_ATOM_BYTE) + num_entries++; + pos += token_length; total -= token_length; iter++; - num_entries++; } resp->num = num_entries; From 9b99c17f7510bed2adbe17751fb8abddba5620bc Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 12 Jan 2024 12:09:50 -0800 Subject: [PATCH 091/327] x86/numa: Fix the address overlap check in numa_fill_memblks() numa_fill_memblks() fills in the gaps in numa_meminfo memblks over a physical address range. To do so, it first creates a list of existing memblks that overlap that address range. The issue is that it is off by one when comparing to the end of the address range, so memblks that do not overlap are selected. The impact of selecting a memblk that does not actually overlap is that an existing memblk may be filled when the expected action is to do nothing and return NUMA_NO_MEMBLK to the caller. The caller can then add a new NUMA node and memblk. Replace the broken open-coded search for address overlap with the memblock helper memblock_addrs_overlap(). Update the kernel doc and in code comments. Suggested by: "Huang, Ying" Fixes: 8f012db27c95 ("x86/numa: Introduce numa_fill_memblks()") Signed-off-by: Alison Schofield Acked-by: Mike Rapoport (IBM) Acked-by: Dave Hansen Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/10a3e6109c34c21a8dd4c513cf63df63481a2b07.1705085543.git.alison.schofield@intel.com Signed-off-by: Dan Williams --- arch/x86/mm/numa.c | 19 +++++++------------ include/linux/memblock.h | 2 ++ mm/memblock.c | 5 +++-- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index adc497b93f03..8ada9bbfad58 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -944,14 +944,12 @@ static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; * @start: address to begin fill * @end: address to end fill * - * Find and extend numa_meminfo memblks to cover the @start-@end - * physical address range, such that the first memblk includes - * @start, the last memblk includes @end, and any gaps in between - * are filled. + * Find and extend numa_meminfo memblks to cover the physical + * address range @start-@end * * RETURNS: * 0 : Success - * NUMA_NO_MEMBLK : No memblk exists in @start-@end range + * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end */ int __init numa_fill_memblks(u64 start, u64 end) @@ -963,17 +961,14 @@ int __init numa_fill_memblks(u64 start, u64 end) /* * Create a list of pointers to numa_meminfo memblks that - * overlap start, end. Exclude (start == bi->end) since - * end addresses in both a CFMWS range and a memblk range - * are exclusive. - * - * This list of pointers is used to make in-place changes - * that fill out the numa_meminfo memblks. + * overlap start, end. The list is used to make in-place + * changes that fill out the numa_meminfo memblks. */ for (int i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; - if (start < bi->end && end >= bi->start) { + if (memblock_addrs_overlap(start, end - start, bi->start, + bi->end - bi->start)) { blk[count] = &mi->blk[i]; count++; } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b695f9e946da..e2082240586d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -121,6 +121,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); int memblock_physmem_add(phys_addr_t base, phys_addr_t size); #endif void memblock_trim_memory(phys_addr_t align); +unsigned long memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2); bool memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); bool memblock_validate_numa_coverage(unsigned long threshold_bytes); diff --git a/mm/memblock.c b/mm/memblock.c index 4dcb2ee35eca..964eb72db539 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -180,8 +180,9 @@ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) /* * Address comparison utilities */ -static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, - phys_addr_t base2, phys_addr_t size2) +unsigned long __init_memblock +memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, + phys_addr_t size2) { return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } From b626070ffc14acca5b87a2aa5f581db98617584c Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 12 Jan 2024 12:09:51 -0800 Subject: [PATCH 092/327] x86/numa: Fix the sort compare func used in numa_fill_memblks() The compare function used to sort memblks into starting address order fails when the result of its u64 address subtraction gets truncated to an int upon return. The impact of the bad sort is that memblks will be filled out incorrectly. Depending on the set of memblks, a user may see no errors at all but still have a bad fill, or see messages reporting a node overlap that leads to numa init failure: [] node 0 [mem: ] overlaps with node 1 [mem: ] [] No NUMA configuration found Replace with a comparison that can only result in: 1, 0, -1. Fixes: 8f012db27c95 ("x86/numa: Introduce numa_fill_memblks()") Signed-off-by: Alison Schofield Acked-by: Dave Hansen Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/99dcb3ae87e04995e9f293f6158dc8fa0749a487.1705085543.git.alison.schofield@intel.com Signed-off-by: Dan Williams --- arch/x86/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8ada9bbfad58..65e9a6e391c0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -934,7 +934,7 @@ static int __init cmp_memblk(const void *a, const void *b) const struct numa_memblk *ma = *(const struct numa_memblk **)a; const struct numa_memblk *mb = *(const struct numa_memblk **)b; - return ma->start - mb->start; + return (ma->start > mb->start) - (ma->start < mb->start); } static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; From 453a7fde8031a5192ed2f9646ad048c1a5e930dc Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Wed, 31 Jan 2024 13:59:30 -0800 Subject: [PATCH 093/327] cxl/region: Handle endpoint decoders in cxl_region_find_decoder() In preparation for adding a new caller of cxl_region_find_decoders() teach it to find a decoder from a cxl_endpoint_decoder structure. Combining switch and endpoint decoder lookup in one function prevents code duplication in call sites. Update the existing caller. Signed-off-by: Alison Schofield Tested-by: Wonjae Lee Link: https://lore.kernel.org/r/79ae6d72978ef9f3ceec9722e1cb793820553c8e.1706736863.git.alison.schofield@intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/region.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index ce0e2d82bb2b..f1a9d1798d21 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -730,12 +730,17 @@ static int match_auto_decoder(struct device *dev, void *data) return 0; } -static struct cxl_decoder *cxl_region_find_decoder(struct cxl_port *port, - struct cxl_region *cxlr) +static struct cxl_decoder * +cxl_region_find_decoder(struct cxl_port *port, + struct cxl_endpoint_decoder *cxled, + struct cxl_region *cxlr) { struct device *dev; int id = 0; + if (port == cxled_to_port(cxled)) + return &cxled->cxld; + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) dev = device_find_child(&port->dev, &cxlr->params, match_auto_decoder); @@ -853,10 +858,7 @@ static int cxl_rr_alloc_decoder(struct cxl_port *port, struct cxl_region *cxlr, { struct cxl_decoder *cxld; - if (port == cxled_to_port(cxled)) - cxld = &cxled->cxld; - else - cxld = cxl_region_find_decoder(port, cxlr); + cxld = cxl_region_find_decoder(port, cxled, cxlr); if (!cxld) { dev_dbg(&cxlr->dev, "%s: no decoder available\n", dev_name(&port->dev)); From cb66b1d60c283bb340a2fc19deff7de8acea74b1 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Wed, 31 Jan 2024 13:59:31 -0800 Subject: [PATCH 094/327] cxl/region: Allow out of order assembly of autodiscovered regions Autodiscovered regions can fail to assemble if they are not discovered in HPA decode order. The user will see failure messages like: [] cxl region0: endpoint5: HPA order violation region1 [] cxl region0: endpoint5: failed to allocate region reference The check that is causing the failure helps the CXL driver enforce a CXL spec mandate that decoders be committed in HPA order. The check is needless for autodiscovered regions since their decoders are already programmed. Trying to enforce order in the assembly of these regions is useless because they are assembled once all their member endpoints arrive, and there is no guarantee on the order in which endpoints are discovered during probe. Keep the existing check, but for autodiscovered regions, allow the out of order assembly after a sanity check that the lesser numbered decoder has the lesser HPA starting address. Signed-off-by: Alison Schofield Tested-by: Wonjae Lee Link: https://lore.kernel.org/r/3dec69ee97524ab229a20c6739272c3000b18408.1706736863.git.alison.schofield@intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/region.c | 48 +++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index f1a9d1798d21..4c7fd2d5cccb 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -758,8 +758,31 @@ cxl_region_find_decoder(struct cxl_port *port, return to_cxl_decoder(dev); } -static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port, - struct cxl_region *cxlr) +static bool auto_order_ok(struct cxl_port *port, struct cxl_region *cxlr_iter, + struct cxl_decoder *cxld) +{ + struct cxl_region_ref *rr = cxl_rr_load(port, cxlr_iter); + struct cxl_decoder *cxld_iter = rr->decoder; + + /* + * Allow the out of order assembly of auto-discovered regions. + * Per CXL Spec 3.1 8.2.4.20.12 software must commit decoders + * in HPA order. Confirm that the decoder with the lesser HPA + * starting address has the lesser id. + */ + dev_dbg(&cxld->dev, "check for HPA violation %s:%d < %s:%d\n", + dev_name(&cxld->dev), cxld->id, + dev_name(&cxld_iter->dev), cxld_iter->id); + + if (cxld_iter->id > cxld->id) + return true; + + return false; +} + +static struct cxl_region_ref * +alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled) { struct cxl_region_params *p = &cxlr->params; struct cxl_region_ref *cxl_rr, *iter; @@ -769,16 +792,21 @@ static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port, xa_for_each(&port->regions, index, iter) { struct cxl_region_params *ip = &iter->region->params; - if (!ip->res) + if (!ip->res || ip->res->start < p->res->start) continue; - if (ip->res->start > p->res->start) { - dev_dbg(&cxlr->dev, - "%s: HPA order violation %s:%pr vs %pr\n", - dev_name(&port->dev), - dev_name(&iter->region->dev), ip->res, p->res); - return ERR_PTR(-EBUSY); + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { + struct cxl_decoder *cxld; + + cxld = cxl_region_find_decoder(port, cxled, cxlr); + if (auto_order_ok(port, iter->region, cxld)) + continue; } + dev_dbg(&cxlr->dev, "%s: HPA order violation %s:%pr vs %pr\n", + dev_name(&port->dev), + dev_name(&iter->region->dev), ip->res, p->res); + + return ERR_PTR(-EBUSY); } cxl_rr = kzalloc(sizeof(*cxl_rr), GFP_KERNEL); @@ -955,7 +983,7 @@ static int cxl_port_attach_region(struct cxl_port *port, nr_targets_inc = true; } } else { - cxl_rr = alloc_region_ref(port, cxlr); + cxl_rr = alloc_region_ref(port, cxlr, cxled); if (IS_ERR(cxl_rr)) { dev_dbg(&cxlr->dev, "%s: failed to allocate region reference\n", From 00413c15068276f6e5a50215849a8d4bd812b443 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 6 Feb 2024 12:03:37 -0700 Subject: [PATCH 095/327] cxl: Change 'struct cxl_memdev_state' *_perf_list to single 'struct cxl_dpa_perf' In order to address the issue with being able to expose qos_class sysfs attributes under 'ram' and 'pmem' sub-directories, the attributes must be defined as static attributes rather than under driver->dev_groups. To avoid implementing locking for accessing the 'struct cxl_dpa_perf` lists, convert the list to a single 'struct cxl_dpa_perf' entry in preparation to move the attributes to statically defined. While theoretically a partition may have multiple qos_class via CDAT, this has not been encountered with testing on available hardware. The code is simplified for now to not support the complex case until a use case is needed to support that. Link: https://lore.kernel.org/linux-cxl/65b200ba228f_2d43c29468@dwillia2-mobl3.amr.corp.intel.com.notmuch/ Suggested-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20240206190431.1810289-2-dave.jiang@intel.com Reviewed-by: Jonathan Cameron Signed-off-by: Dan Williams --- drivers/cxl/core/cdat.c | 82 ++++++++++++----------------------------- drivers/cxl/core/mbox.c | 4 +- drivers/cxl/cxlmem.h | 10 ++--- drivers/cxl/mem.c | 28 ++------------ 4 files changed, 34 insertions(+), 90 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 6fe11546889f..d66acc917dac 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -210,19 +210,12 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, return 0; } -static void add_perf_entry(struct device *dev, struct dsmas_entry *dent, - struct list_head *list) +static void update_perf_entry(struct device *dev, struct dsmas_entry *dent, + struct cxl_dpa_perf *dpa_perf) { - struct cxl_dpa_perf *dpa_perf; - - dpa_perf = kzalloc(sizeof(*dpa_perf), GFP_KERNEL); - if (!dpa_perf) - return; - dpa_perf->dpa_range = dent->dpa_range; dpa_perf->coord = dent->coord; dpa_perf->qos_class = dent->qos_class; - list_add_tail(&dpa_perf->list, list); dev_dbg(dev, "DSMAS: dpa: %#llx qos: %d read_bw: %d write_bw %d read_lat: %d write_lat: %d\n", dent->dpa_range.start, dpa_perf->qos_class, @@ -230,20 +223,6 @@ static void add_perf_entry(struct device *dev, struct dsmas_entry *dent, dent->coord.read_latency, dent->coord.write_latency); } -static void free_perf_ents(void *data) -{ - struct cxl_memdev_state *mds = data; - struct cxl_dpa_perf *dpa_perf, *n; - LIST_HEAD(discard); - - list_splice_tail_init(&mds->ram_perf_list, &discard); - list_splice_tail_init(&mds->pmem_perf_list, &discard); - list_for_each_entry_safe(dpa_perf, n, &discard, list) { - list_del(&dpa_perf->list); - kfree(dpa_perf); - } -} - static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, struct xarray *dsmas_xa) { @@ -263,16 +242,14 @@ static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, xa_for_each(dsmas_xa, index, dent) { if (resource_size(&cxlds->ram_res) && range_contains(&ram_range, &dent->dpa_range)) - add_perf_entry(dev, dent, &mds->ram_perf_list); + update_perf_entry(dev, dent, &mds->ram_perf); else if (resource_size(&cxlds->pmem_res) && range_contains(&pmem_range, &dent->dpa_range)) - add_perf_entry(dev, dent, &mds->pmem_perf_list); + update_perf_entry(dev, dent, &mds->pmem_perf); else dev_dbg(dev, "no partition for dsmas dpa: %#llx\n", dent->dpa_range.start); } - - devm_add_action_or_reset(&cxlds->cxlmd->dev, free_perf_ents, mds); } static int match_cxlrd_qos_class(struct device *dev, void *data) @@ -293,24 +270,24 @@ static int match_cxlrd_qos_class(struct device *dev, void *data) return 0; } -static void cxl_qos_match(struct cxl_port *root_port, - struct list_head *work_list, - struct list_head *discard_list) +static void reset_dpa_perf(struct cxl_dpa_perf *dpa_perf) { - struct cxl_dpa_perf *dpa_perf, *n; + *dpa_perf = (struct cxl_dpa_perf) { + .qos_class = CXL_QOS_CLASS_INVALID, + }; +} - list_for_each_entry_safe(dpa_perf, n, work_list, list) { - int rc; +static bool cxl_qos_match(struct cxl_port *root_port, + struct cxl_dpa_perf *dpa_perf) +{ + if (dpa_perf->qos_class == CXL_QOS_CLASS_INVALID) + return false; - if (dpa_perf->qos_class == CXL_QOS_CLASS_INVALID) - return; + if (!device_for_each_child(&root_port->dev, &dpa_perf->qos_class, + match_cxlrd_qos_class)) + return false; - rc = device_for_each_child(&root_port->dev, - (void *)&dpa_perf->qos_class, - match_cxlrd_qos_class); - if (!rc) - list_move_tail(&dpa_perf->list, discard_list); - } + return true; } static int match_cxlrd_hb(struct device *dev, void *data) @@ -334,23 +311,10 @@ static int match_cxlrd_hb(struct device *dev, void *data) return 0; } -static void discard_dpa_perf(struct list_head *list) -{ - struct cxl_dpa_perf *dpa_perf, *n; - - list_for_each_entry_safe(dpa_perf, n, list, list) { - list_del(&dpa_perf->list); - kfree(dpa_perf); - } -} -DEFINE_FREE(dpa_perf, struct list_head *, if (!list_empty(_T)) discard_dpa_perf(_T)) - static int cxl_qos_class_verify(struct cxl_memdev *cxlmd) { struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - LIST_HEAD(__discard); - struct list_head *discard __free(dpa_perf) = &__discard; struct cxl_port *root_port; int rc; @@ -363,16 +327,18 @@ static int cxl_qos_class_verify(struct cxl_memdev *cxlmd) root_port = &cxl_root->port; /* Check that the QTG IDs are all sane between end device and root decoders */ - cxl_qos_match(root_port, &mds->ram_perf_list, discard); - cxl_qos_match(root_port, &mds->pmem_perf_list, discard); + if (!cxl_qos_match(root_port, &mds->ram_perf)) + reset_dpa_perf(&mds->ram_perf); + if (!cxl_qos_match(root_port, &mds->pmem_perf)) + reset_dpa_perf(&mds->pmem_perf); /* Check to make sure that the device's host bridge is under a root decoder */ rc = device_for_each_child(&root_port->dev, (void *)cxlmd->endpoint->host_bridge, match_cxlrd_hb); if (!rc) { - list_splice_tail_init(&mds->ram_perf_list, discard); - list_splice_tail_init(&mds->pmem_perf_list, discard); + reset_dpa_perf(&mds->ram_perf); + reset_dpa_perf(&mds->pmem_perf); } return rc; diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 27166a411705..9adda4795eb7 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1391,8 +1391,8 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) mds->cxlds.reg_map.host = dev; mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; - INIT_LIST_HEAD(&mds->ram_perf_list); - INIT_LIST_HEAD(&mds->pmem_perf_list); + mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID; + mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID; return mds; } diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 5303d6942b88..20fb3b35e89e 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -395,13 +395,11 @@ enum cxl_devtype { /** * struct cxl_dpa_perf - DPA performance property entry - * @list - list entry * @dpa_range - range for DPA address * @coord - QoS performance data (i.e. latency, bandwidth) * @qos_class - QoS Class cookies */ struct cxl_dpa_perf { - struct list_head list; struct range dpa_range; struct access_coordinate coord; int qos_class; @@ -471,8 +469,8 @@ struct cxl_dev_state { * @security: security driver state info * @fw: firmware upload / activation state * @mbox_send: @dev specific transport for transmitting mailbox commands - * @ram_perf_list: performance data entries matched to RAM - * @pmem_perf_list: performance data entries matched to PMEM + * @ram_perf: performance data entry matched to RAM partition + * @pmem_perf: performance data entry matched to PMEM partition * * See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for * details on capacity parameters. @@ -494,8 +492,8 @@ struct cxl_memdev_state { u64 next_volatile_bytes; u64 next_persistent_bytes; - struct list_head ram_perf_list; - struct list_head pmem_perf_list; + struct cxl_dpa_perf ram_perf; + struct cxl_dpa_perf pmem_perf; struct cxl_event_state event; struct cxl_poison_state poison; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index c5c9d8e0d88d..547f5a145fc5 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -221,18 +221,8 @@ static ssize_t ram_qos_class_show(struct device *dev, struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - struct cxl_dpa_perf *dpa_perf; - if (!dev->driver) - return -ENOENT; - - if (list_empty(&mds->ram_perf_list)) - return -ENOENT; - - dpa_perf = list_first_entry(&mds->ram_perf_list, struct cxl_dpa_perf, - list); - - return sysfs_emit(buf, "%d\n", dpa_perf->qos_class); + return sysfs_emit(buf, "%d\n", mds->ram_perf.qos_class); } static struct device_attribute dev_attr_ram_qos_class = @@ -244,18 +234,8 @@ static ssize_t pmem_qos_class_show(struct device *dev, struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - struct cxl_dpa_perf *dpa_perf; - if (!dev->driver) - return -ENOENT; - - if (list_empty(&mds->pmem_perf_list)) - return -ENOENT; - - dpa_perf = list_first_entry(&mds->pmem_perf_list, struct cxl_dpa_perf, - list); - - return sysfs_emit(buf, "%d\n", dpa_perf->qos_class); + return sysfs_emit(buf, "%d\n", mds->pmem_perf.qos_class); } static struct device_attribute dev_attr_pmem_qos_class = @@ -273,11 +253,11 @@ static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) return 0; if (a == &dev_attr_pmem_qos_class.attr) - if (list_empty(&mds->pmem_perf_list)) + if (mds->pmem_perf.qos_class == CXL_QOS_CLASS_INVALID) return 0; if (a == &dev_attr_ram_qos_class.attr) - if (list_empty(&mds->ram_perf_list)) + if (mds->ram_perf.qos_class == CXL_QOS_CLASS_INVALID) return 0; return a->mode; From 10cb393d769bed9473e1fa8d34e4d6d389db9155 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 6 Feb 2024 12:03:38 -0700 Subject: [PATCH 096/327] cxl: Remove unnecessary type cast in cxl_qos_class_verify() The passed in host bridge parameter for device_for_each_child() has unnecessary void * type cast. Remove the type cast. Suggested-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20240206190431.1810289-3-dave.jiang@intel.com Reviewed-by: Jonathan Cameron Signed-off-by: Dan Williams --- drivers/cxl/core/cdat.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index d66acc917dac..ecbd209ca70a 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -334,8 +334,7 @@ static int cxl_qos_class_verify(struct cxl_memdev *cxlmd) /* Check to make sure that the device's host bridge is under a root decoder */ rc = device_for_each_child(&root_port->dev, - (void *)cxlmd->endpoint->host_bridge, - match_cxlrd_hb); + cxlmd->endpoint->host_bridge, match_cxlrd_hb); if (!rc) { reset_dpa_perf(&mds->ram_perf); reset_dpa_perf(&mds->pmem_perf); From cc214417f06f6b0a8f6da09220c8bcdb742210b9 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 6 Feb 2024 12:03:39 -0700 Subject: [PATCH 097/327] cxl: Fix sysfs export of qos_class for memdev Current implementation exports only to /sys/bus/cxl/devices/.../memN/qos_class. With both ram and pmem exposed, the second registered sysfs attribute is rejected as duplicate. It's not possible to create qos_class under the dev_groups via the driver due to the ram and pmem sysfs sub-directories already created by the device sysfs groups. Move the ram and pmem qos_class to the device sysfs groups and add a call to sysfs_update() after the perf data are validated so the qos_class can be visible. The end results should be /sys/bus/cxl/devices/.../memN/ram/qos_class and /sys/bus/cxl/devices/.../memN/pmem/qos_class. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20240206190431.1810289-4-dave.jiang@intel.com Reviewed-by: Jonathan Cameron Signed-off-by: Dan Williams --- drivers/cxl/core/cdat.c | 1 + drivers/cxl/core/memdev.c | 63 +++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 2 ++ drivers/cxl/mem.c | 36 ---------------------- 4 files changed, 66 insertions(+), 36 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index ecbd209ca70a..08fd0baea7a0 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -382,6 +382,7 @@ void cxl_endpoint_parse_cdat(struct cxl_port *port) cxl_memdev_set_qos_class(cxlds, dsmas_xa); cxl_qos_class_verify(cxlmd); + cxl_memdev_update_perf(cxlmd); } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, CXL); diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index dae8802ecdb0..d4e259f3a7e9 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -447,13 +447,41 @@ static struct attribute *cxl_memdev_attributes[] = { NULL, }; +static ssize_t pmem_qos_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + + return sysfs_emit(buf, "%d\n", mds->pmem_perf.qos_class); +} + +static struct device_attribute dev_attr_pmem_qos_class = + __ATTR(qos_class, 0444, pmem_qos_class_show, NULL); + static struct attribute *cxl_memdev_pmem_attributes[] = { &dev_attr_pmem_size.attr, + &dev_attr_pmem_qos_class.attr, NULL, }; +static ssize_t ram_qos_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + + return sysfs_emit(buf, "%d\n", mds->ram_perf.qos_class); +} + +static struct device_attribute dev_attr_ram_qos_class = + __ATTR(qos_class, 0444, ram_qos_class_show, NULL); + static struct attribute *cxl_memdev_ram_attributes[] = { &dev_attr_ram_size.attr, + &dev_attr_ram_qos_class.attr, NULL, }; @@ -477,14 +505,42 @@ static struct attribute_group cxl_memdev_attribute_group = { .is_visible = cxl_memdev_visible, }; +static umode_t cxl_ram_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + + if (a == &dev_attr_ram_qos_class.attr) + if (mds->ram_perf.qos_class == CXL_QOS_CLASS_INVALID) + return 0; + + return a->mode; +} + static struct attribute_group cxl_memdev_ram_attribute_group = { .name = "ram", .attrs = cxl_memdev_ram_attributes, + .is_visible = cxl_ram_visible, }; +static umode_t cxl_pmem_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + + if (a == &dev_attr_pmem_qos_class.attr) + if (mds->pmem_perf.qos_class == CXL_QOS_CLASS_INVALID) + return 0; + + return a->mode; +} + static struct attribute_group cxl_memdev_pmem_attribute_group = { .name = "pmem", .attrs = cxl_memdev_pmem_attributes, + .is_visible = cxl_pmem_visible, }; static umode_t cxl_memdev_security_visible(struct kobject *kobj, @@ -519,6 +575,13 @@ static const struct attribute_group *cxl_memdev_attribute_groups[] = { NULL, }; +void cxl_memdev_update_perf(struct cxl_memdev *cxlmd) +{ + sysfs_update_group(&cxlmd->dev.kobj, &cxl_memdev_ram_attribute_group); + sysfs_update_group(&cxlmd->dev.kobj, &cxl_memdev_pmem_attribute_group); +} +EXPORT_SYMBOL_NS_GPL(cxl_memdev_update_perf, CXL); + static const struct device_type cxl_memdev_type = { .name = "cxl_memdev", .release = cxl_memdev_release, diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index b6017c0c57b4..003feebab79b 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -880,6 +880,8 @@ void cxl_switch_parse_cdat(struct cxl_port *port); int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord); +void cxl_memdev_update_perf(struct cxl_memdev *cxlmd); + /* * Unit test builds overrides this to __weak, find the 'strong' version * of these symbols in tools/testing/cxl/. diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 547f5a145fc5..0c79d9ce877c 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -215,32 +215,6 @@ static ssize_t trigger_poison_list_store(struct device *dev, } static DEVICE_ATTR_WO(trigger_poison_list); -static ssize_t ram_qos_class_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cxl_memdev *cxlmd = to_cxl_memdev(dev); - struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - - return sysfs_emit(buf, "%d\n", mds->ram_perf.qos_class); -} - -static struct device_attribute dev_attr_ram_qos_class = - __ATTR(qos_class, 0444, ram_qos_class_show, NULL); - -static ssize_t pmem_qos_class_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cxl_memdev *cxlmd = to_cxl_memdev(dev); - struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - - return sysfs_emit(buf, "%d\n", mds->pmem_perf.qos_class); -} - -static struct device_attribute dev_attr_pmem_qos_class = - __ATTR(qos_class, 0444, pmem_qos_class_show, NULL); - static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); @@ -252,21 +226,11 @@ static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) mds->poison.enabled_cmds)) return 0; - if (a == &dev_attr_pmem_qos_class.attr) - if (mds->pmem_perf.qos_class == CXL_QOS_CLASS_INVALID) - return 0; - - if (a == &dev_attr_ram_qos_class.attr) - if (mds->ram_perf.qos_class == CXL_QOS_CLASS_INVALID) - return 0; - return a->mode; } static struct attribute *cxl_mem_attrs[] = { &dev_attr_trigger_poison_list.attr, - &dev_attr_ram_qos_class.attr, - &dev_attr_pmem_qos_class.attr, NULL }; From 117132edc6900621337b77d29e953d0d01f32a5f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 6 Feb 2024 12:03:40 -0700 Subject: [PATCH 098/327] cxl/test: Add support for qos_class checking Set a fake qos_class to a unique value in order to do simple testing of qos_class for root decoders and mem devs via user cxl_test. A mock function is added to set the fake qos_class values for memory device and overrides cxl_endpoint_parse_cdat() in cxl driver code. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20240206190431.1810289-5-dave.jiang@intel.com Reviewed-by: Jonathan Cameron Signed-off-by: Dan Williams --- tools/testing/cxl/Kbuild | 1 + tools/testing/cxl/test/cxl.c | 63 ++++++++++++++++++++++++++++++----- tools/testing/cxl/test/mock.c | 14 ++++++++ tools/testing/cxl/test/mock.h | 1 + 4 files changed, 70 insertions(+), 9 deletions(-) diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index caff3834671f..030b388800f0 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -13,6 +13,7 @@ ldflags-y += --wrap=cxl_hdm_decode_init ldflags-y += --wrap=cxl_dvsec_rr_decode ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys +ldflags-y += --wrap=cxl_endpoint_parse_cdat DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index a3cdbb2be038..908e0d083936 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -15,6 +15,8 @@ static int interleave_arithmetic; +#define FAKE_QTG_ID 42 + #define NR_CXL_HOST_BRIDGES 2 #define NR_CXL_SINGLE_HOST 1 #define NR_CXL_RCH 1 @@ -209,7 +211,7 @@ static struct { .granularity = 4, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, - .qtg_id = 0, + .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 4UL, }, .target = { 0 }, @@ -224,7 +226,7 @@ static struct { .granularity = 4, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, - .qtg_id = 1, + .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, }, .target = { 0, 1, }, @@ -239,7 +241,7 @@ static struct { .granularity = 4, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, - .qtg_id = 2, + .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 4UL, }, .target = { 0 }, @@ -254,7 +256,7 @@ static struct { .granularity = 4, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, - .qtg_id = 3, + .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, }, .target = { 0, 1, }, @@ -269,7 +271,7 @@ static struct { .granularity = 4, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, - .qtg_id = 4, + .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 4UL, }, .target = { 2 }, @@ -284,7 +286,7 @@ static struct { .granularity = 4, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, - .qtg_id = 5, + .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M, }, .target = { 3 }, @@ -301,7 +303,7 @@ static struct { .granularity = 4, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, - .qtg_id = 0, + .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, }, .target = { 0, }, @@ -317,7 +319,7 @@ static struct { .granularity = 0, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, - .qtg_id = 1, + .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, }, .target = { 0, 1, }, @@ -333,7 +335,7 @@ static struct { .granularity = 0, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, - .qtg_id = 0, + .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 16UL, }, .target = { 0, 1, 0, 1, }, @@ -976,6 +978,48 @@ static int mock_cxl_port_enumerate_dports(struct cxl_port *port) return 0; } +/* + * Faking the cxl_dpa_perf for the memdev when appropriate. + */ +static void dpa_perf_setup(struct cxl_port *endpoint, struct range *range, + struct cxl_dpa_perf *dpa_perf) +{ + dpa_perf->qos_class = FAKE_QTG_ID; + dpa_perf->dpa_range = *range; + dpa_perf->coord.read_latency = 500; + dpa_perf->coord.write_latency = 500; + dpa_perf->coord.read_bandwidth = 1000; + dpa_perf->coord.write_bandwidth = 1000; +} + +static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) +{ + struct cxl_root *cxl_root __free(put_cxl_root) = + find_cxl_root(port); + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct range pmem_range = { + .start = cxlds->pmem_res.start, + .end = cxlds->pmem_res.end, + }; + struct range ram_range = { + .start = cxlds->ram_res.start, + .end = cxlds->ram_res.end, + }; + + if (!cxl_root) + return; + + if (range_len(&ram_range)) + dpa_perf_setup(port, &ram_range, &mds->ram_perf); + + if (range_len(&pmem_range)) + dpa_perf_setup(port, &pmem_range, &mds->pmem_perf); + + cxl_memdev_update_perf(cxlmd); +} + static struct cxl_mock_ops cxl_mock_ops = { .is_mock_adev = is_mock_adev, .is_mock_bridge = is_mock_bridge, @@ -989,6 +1033,7 @@ static struct cxl_mock_ops cxl_mock_ops = { .devm_cxl_setup_hdm = mock_cxl_setup_hdm, .devm_cxl_add_passthrough_decoder = mock_cxl_add_passthrough_decoder, .devm_cxl_enumerate_decoders = mock_cxl_enumerate_decoders, + .cxl_endpoint_parse_cdat = mock_cxl_endpoint_parse_cdat, .list = LIST_HEAD_INIT(cxl_mock_ops.list), }; diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 1a61e68e3095..6f737941dc0e 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -285,6 +285,20 @@ resource_size_t __wrap_cxl_rcd_component_reg_phys(struct device *dev, } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_rcd_component_reg_phys, CXL); +void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) +{ + int index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); + + if (ops && ops->is_mock_dev(cxlmd->dev.parent)) + ops->cxl_endpoint_parse_cdat(port); + else + cxl_endpoint_parse_cdat(port); + put_cxl_mock_ops(index); +} +EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, CXL); + MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(ACPI); MODULE_IMPORT_NS(CXL); diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index a94223750346..d1b0271d2822 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -25,6 +25,7 @@ struct cxl_mock_ops { int (*devm_cxl_add_passthrough_decoder)(struct cxl_port *port); int (*devm_cxl_enumerate_decoders)( struct cxl_hdm *hdm, struct cxl_endpoint_dvsec_info *info); + void (*cxl_endpoint_parse_cdat)(struct cxl_port *port); }; void register_cxl_mock_ops(struct cxl_mock_ops *ops); From 0cab687205986491302cd2e440ef1d253031c221 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 16 Feb 2024 17:01:13 +0100 Subject: [PATCH 099/327] cxl/pci: Fix disabling memory if DVSEC CXL Range does not match a CFMWS window The Linux CXL subsystem is built on the assumption that HPA == SPA. That is, the host physical address (HPA) the HDM decoder registers are programmed with are system physical addresses (SPA). During HDM decoder setup, the DVSEC CXL range registers (cxl-3.1, 8.1.3.8) are checked if the memory is enabled and the CXL range is in a HPA window that is described in a CFMWS structure of the CXL host bridge (cxl-3.1, 9.18.1.3). Now, if the HPA is not an SPA, the CXL range does not match a CFMWS window and the CXL memory range will be disabled then. The HDM decoder stops working which causes system memory being disabled and further a system hang during HDM decoder initialization, typically when a CXL enabled kernel boots. Prevent a system hang and do not disable the HDM decoder if the decoder's CXL range is not found in a CFMWS window. Note the change only fixes a hardware hang, but does not implement HPA/SPA translation. Support for this can be added in a follow on patch series. Signed-off-by: Robert Richter Fixes: 34e37b4c432c ("cxl/port: Enable HDM Capability after validating DVSEC Ranges") Cc: Link: https://lore.kernel.org/r/20240216160113.407141-1-rrichter@amd.com Signed-off-by: Dan Williams --- drivers/cxl/core/pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 480489f5644e..e9e6c81ce034 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -477,9 +477,9 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, allowed++; } - if (!allowed) { - cxl_set_mem_enable(cxlds, 0); - info->mem_enabled = 0; + if (!allowed && info->mem_enabled) { + dev_err(dev, "Range register decodes outside platform defined CXL ranges.\n"); + return -ENXIO; } /* From 97dde84026339e4b4af9a6301f825d1828d7874b Mon Sep 17 00:00:00 2001 From: Pavel Sakharov Date: Wed, 14 Feb 2024 12:27:17 +0300 Subject: [PATCH 100/327] net: stmmac: Fix incorrect dereference in interrupt handlers If 'dev' or 'data' is NULL, the 'priv' variable has an incorrect address when dereferencing calling netdev_err(). Since we get as 'dev_id' or 'data' what was passed as the 'dev' argument to request_irq() during interrupt initialization (that is, the net_device and rx/tx queue pointers initialized at the time of the call) and since there are usually no checks for the 'dev_id' argument in such handlers in other drivers, remove these checks from the handlers in stmmac driver. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 8532f613bc78 ("net: stmmac: introduce MSI Interrupt routines for mac, safety, RX & TX") Signed-off-by: Pavel Sakharov Reviewed-by: Serge Semin Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 75d029704503..e80d77bd9f1f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -6059,11 +6059,6 @@ static irqreturn_t stmmac_mac_interrupt(int irq, void *dev_id) struct net_device *dev = (struct net_device *)dev_id; struct stmmac_priv *priv = netdev_priv(dev); - if (unlikely(!dev)) { - netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); - return IRQ_NONE; - } - /* Check if adapter is up */ if (test_bit(STMMAC_DOWN, &priv->state)) return IRQ_HANDLED; @@ -6079,11 +6074,6 @@ static irqreturn_t stmmac_safety_interrupt(int irq, void *dev_id) struct net_device *dev = (struct net_device *)dev_id; struct stmmac_priv *priv = netdev_priv(dev); - if (unlikely(!dev)) { - netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); - return IRQ_NONE; - } - /* Check if adapter is up */ if (test_bit(STMMAC_DOWN, &priv->state)) return IRQ_HANDLED; @@ -6105,11 +6095,6 @@ static irqreturn_t stmmac_msi_intr_tx(int irq, void *data) dma_conf = container_of(tx_q, struct stmmac_dma_conf, tx_queue[chan]); priv = container_of(dma_conf, struct stmmac_priv, dma_conf); - if (unlikely(!data)) { - netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); - return IRQ_NONE; - } - /* Check if adapter is up */ if (test_bit(STMMAC_DOWN, &priv->state)) return IRQ_HANDLED; @@ -6136,11 +6121,6 @@ static irqreturn_t stmmac_msi_intr_rx(int irq, void *data) dma_conf = container_of(rx_q, struct stmmac_dma_conf, rx_queue[chan]); priv = container_of(dma_conf, struct stmmac_priv, dma_conf); - if (unlikely(!data)) { - netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); - return IRQ_NONE; - } - /* Check if adapter is up */ if (test_bit(STMMAC_DOWN, &priv->state)) return IRQ_HANDLED; From 081a0e3b0d4c061419d3f4679dec9f68725b17e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Feb 2024 17:21:06 +0000 Subject: [PATCH 101/327] ipv4: properly combine dev_base_seq and ipv4.dev_addr_genid net->dev_base_seq and ipv4.dev_addr_genid are monotonically increasing. If we XOR their values, we could miss to detect if both values were changed with the same amount. Fixes: 0465277f6b3f ("ipv4: provide addr and netconf dump consistency info") Signed-off-by: Eric Dumazet Cc: Nicolas Dichtel Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ca0ff15dc8fa..bc74f131fe4d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1825,6 +1825,21 @@ done: return err; } +/* Combine dev_addr_genid and dev_base_seq to detect changes. + */ +static u32 inet_base_seq(const struct net *net) +{ + u32 res = atomic_read(&net->ipv4.dev_addr_genid) + + net->dev_base_seq; + + /* Must not return 0 (see nl_dump_check_consistent()). + * Chose a value far away from 0. + */ + if (!res) + res = 0x80000000; + return res; +} + static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; @@ -1876,8 +1891,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; head = &tgt_net->dev_index_head[h]; rcu_read_lock(); - cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^ - tgt_net->dev_base_seq; + cb->seq = inet_base_seq(tgt_net); hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; @@ -2278,8 +2292,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, idx = 0; head = &net->dev_index_head[h]; rcu_read_lock(); - cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ - net->dev_base_seq; + cb->seq = inet_base_seq(net); hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; From e898e4cd1aab271ca414f9ac6e08e4c761f6913c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Feb 2024 17:21:07 +0000 Subject: [PATCH 102/327] ipv6: properly combine dev_base_seq and ipv6.dev_addr_genid net->dev_base_seq and ipv6.dev_addr_genid are monotonically increasing. If we XOR their values, we could miss to detect if both values were changed with the same amount. Fixes: 63998ac24f83 ("ipv6: provide addr and netconf dump consistency info") Signed-off-by: Eric Dumazet Cc: Nicolas Dichtel Signed-off-by: Eric Dumazet Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 733ace18806c..5a839c5fb1a5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -708,6 +708,22 @@ errout: return err; } +/* Combine dev_addr_genid and dev_base_seq to detect changes. + */ +static u32 inet6_base_seq(const struct net *net) +{ + u32 res = atomic_read(&net->ipv6.dev_addr_genid) + + net->dev_base_seq; + + /* Must not return 0 (see nl_dump_check_consistent()). + * Chose a value far away from 0. + */ + if (!res) + res = 0x80000000; + return res; +} + + static int inet6_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { @@ -741,8 +757,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, idx = 0; head = &net->dev_index_head[h]; rcu_read_lock(); - cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ - net->dev_base_seq; + cb->seq = inet6_base_seq(net); hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; @@ -5362,7 +5377,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, } rcu_read_lock(); - cb->seq = atomic_read(&tgt_net->ipv6.dev_addr_genid) ^ tgt_net->dev_base_seq; + cb->seq = inet6_base_seq(tgt_net); for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &tgt_net->dev_index_head[h]; From 6c347be62ae963b301ead8e7fa7b9973e6e0d6e1 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 15 Feb 2024 19:25:28 +0100 Subject: [PATCH 103/327] mptcp: add needs_id for userspace appending addr When userspace PM requires to create an ID 0 subflow in "userspace pm create id 0 subflow" test like this: userspace_pm_add_sf $ns2 10.0.3.2 0 An ID 1 subflow, in fact, is created. Since in mptcp_pm_nl_append_new_local_addr(), 'id 0' will be treated as no ID is set by userspace, and will allocate a new ID immediately: if (!e->addr.id) e->addr.id = find_next_zero_bit(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 1); To solve this issue, a new parameter needs_id is added for mptcp_userspace_pm_append_new_local_addr() to distinguish between whether userspace PM has set an ID 0 or whether userspace PM has not set any address. needs_id is true in mptcp_userspace_pm_get_local_id(), but false in mptcp_pm_nl_announce_doit() and mptcp_pm_nl_subflow_create_doit(). Fixes: e5ed101a6028 ("mptcp: userspace pm allow creating id 0 subflow") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- net/mptcp/pm_userspace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 4f3901d5b8ef..e582b3b2d174 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -26,7 +26,8 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk) } static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, - struct mptcp_pm_addr_entry *entry) + struct mptcp_pm_addr_entry *entry, + bool needs_id) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_pm_addr_entry *match = NULL; @@ -41,7 +42,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, spin_lock_bh(&msk->pm.lock); list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); - if (addr_match && entry->addr.id == 0) + if (addr_match && entry->addr.id == 0 && needs_id) entry->addr.id = e->addr.id; id_match = (e->addr.id == entry->addr.id); if (addr_match && id_match) { @@ -64,7 +65,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, } *e = *entry; - if (!e->addr.id) + if (!e->addr.id && needs_id) e->addr.id = find_next_zero_bit(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 1); @@ -153,7 +154,7 @@ int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, if (new_entry.addr.port == msk_sport) new_entry.addr.port = 0; - return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry); + return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true); } int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) @@ -198,7 +199,7 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) goto announce_err; } - err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val); + err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val, false); if (err < 0) { GENL_SET_ERR_MSG(info, "did not match address and id"); goto announce_err; @@ -378,7 +379,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) } local.addr = addr_l; - err = mptcp_userspace_pm_append_new_local_addr(msk, &local); + err = mptcp_userspace_pm_append_new_local_addr(msk, &local, false); if (err < 0) { GENL_SET_ERR_MSG(info, "did not match address and id"); goto create_err; From 584f3894262634596532cf43a5e782e34a0ce374 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 15 Feb 2024 19:25:29 +0100 Subject: [PATCH 104/327] mptcp: add needs_id for netlink appending addr Just the same as userspace PM, a new parameter needs_id is added for in-kernel PM mptcp_pm_nl_append_new_local_addr() too. Add a new helper mptcp_pm_has_addr_attr_id() to check whether an address ID is set from PM or not. In mptcp_pm_nl_get_local_id(), needs_id is always true, but in mptcp_pm_nl_add_addr_doit(), pass mptcp_pm_has_addr_attr_id() to needs_it. Fixes: efd5a4c04e18 ("mptcp: add the address ID assignment bitmap") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 287a60381eae..a24c9128dee9 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -901,7 +901,8 @@ static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) } static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, - struct mptcp_pm_addr_entry *entry) + struct mptcp_pm_addr_entry *entry, + bool needs_id) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; unsigned int addr_max; @@ -949,7 +950,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, } } - if (!entry->addr.id) { + if (!entry->addr.id && needs_id) { find_next: entry->addr.id = find_next_zero_bit(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, @@ -960,7 +961,7 @@ find_next: } } - if (!entry->addr.id) + if (!entry->addr.id && needs_id) goto out; __set_bit(entry->addr.id, pernet->id_bitmap); @@ -1092,7 +1093,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc entry->ifindex = 0; entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; entry->lsk = NULL; - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true); if (ret < 0) kfree(entry); @@ -1285,6 +1286,18 @@ next: return 0; } +static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, + struct genl_info *info) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + + if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, + mptcp_pm_address_nl_policy, info->extack) && + tb[MPTCP_PM_ADDR_ATTR_ID]) + return true; + return false; +} + int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; @@ -1326,7 +1339,8 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) goto out_free; } } - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, + !mptcp_pm_has_addr_attr_id(attr, info)); if (ret < 0) { GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); goto out_free; From b8adb69a7d29c2d33eb327bca66476fb6066516b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 Feb 2024 19:25:30 +0100 Subject: [PATCH 105/327] mptcp: fix lockless access in subflow ULP diag Since the introduction of the subflow ULP diag interface, the dump callback accessed all the subflow data with lockless. We need either to annotate all the read and write operation accordingly, or acquire the subflow socket lock. Let's do latter, even if slower, to avoid a diffstat havoc. Fixes: 5147dfb50832 ("mptcp: allow dumping subflow context to userspace") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/mptcp/diag.c | 6 +++++- net/tls/tls_main.c | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index dd78a1181031..f6eba9652d01 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2506,7 +2506,7 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ - int (*get_info)(const struct sock *sk, struct sk_buff *skb); + int (*get_info)(struct sock *sk, struct sk_buff *skb); size_t (*get_info_size)(const struct sock *sk); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index a536586742f2..e57c5f47f035 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -13,17 +13,19 @@ #include #include "protocol.h" -static int subflow_get_info(const struct sock *sk, struct sk_buff *skb) +static int subflow_get_info(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *sf; struct nlattr *start; u32 flags = 0; + bool slow; int err; start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP); if (!start) return -EMSGSIZE; + slow = lock_sock_fast(sk); rcu_read_lock(); sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data); if (!sf) { @@ -69,11 +71,13 @@ static int subflow_get_info(const struct sock *sk, struct sk_buff *skb) } rcu_read_unlock(); + unlock_sock_fast(sk, slow); nla_nest_end(skb, start); return 0; nla_failure: rcu_read_unlock(); + unlock_sock_fast(sk, slow); nla_nest_cancel(skb, start); return err; } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 1c2c6800949d..b4674f03d71a 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -1003,7 +1003,7 @@ static u16 tls_user_config(struct tls_context *ctx, bool tx) return 0; } -static int tls_get_info(const struct sock *sk, struct sk_buff *skb) +static int tls_get_info(struct sock *sk, struct sk_buff *skb) { u16 version, cipher_type; struct tls_context *ctx; From a7cfe776637004a4c938fde78be4bd608c32c3ef Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 Feb 2024 19:25:31 +0100 Subject: [PATCH 106/327] mptcp: fix data races on local_id The local address id is accessed lockless by the NL PM, add all the required ONCE annotation. There is a caveat: the local id can be initialized late in the subflow life-cycle, and its validity is controlled by the local_id_valid flag. Remove such flag and encode the validity in the local_id field itself with negative value before initialization. That allows accessing the field consistently with a single read operation. Fixes: 0ee4261a3681 ("mptcp: implement mptcp_pm_remove_subflow") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- net/mptcp/diag.c | 2 +- net/mptcp/pm_netlink.c | 6 +++--- net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 15 ++++++++++++--- net/mptcp/subflow.c | 9 +++++---- 6 files changed, 23 insertions(+), 13 deletions(-) diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index e57c5f47f035..6ff6f14674aa 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -65,7 +65,7 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) sf->map_data_len) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || - nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) { + nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, subflow_get_local_id(sf))) { err = -EMSGSIZE; goto nla_failure; } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a24c9128dee9..912e25077437 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -800,7 +800,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - u8 id = subflow->local_id; + u8 id = subflow_get_local_id(subflow); if (rm_type == MPTCP_MIB_RMADDR && subflow->remote_id != rm_id) continue; @@ -809,7 +809,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", - i, rm_id, subflow->local_id, subflow->remote_id, + i, rm_id, id, subflow->remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); @@ -1994,7 +1994,7 @@ static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) if (WARN_ON_ONCE(!sf)) return -EINVAL; - if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, sf->local_id)) + if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, subflow_get_local_id(sf))) return -EMSGSIZE; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id)) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index e582b3b2d174..d396a5973429 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -234,7 +234,7 @@ static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { - if (subflow->local_id == 0) { + if (READ_ONCE(subflow->local_id) == 0) { has_id_0 = true; break; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8ef2927ebca2..948606a537da 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -85,7 +85,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) subflow->subflow_id = msk->subflow_id++; /* This is the first subflow, always with id 0 */ - subflow->local_id_valid = 1; + WRITE_ONCE(subflow->local_id, 0); mptcp_sock_graft(msk->first, sk->sk_socket); iput(SOCK_INODE(ssock)); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ed50f2015dc3..631a7f445f34 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -491,10 +491,9 @@ struct mptcp_subflow_context { remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ - local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ - __unused : 9; + __unused : 10; bool data_avail; bool scheduled; u32 remote_nonce; @@ -505,7 +504,7 @@ struct mptcp_subflow_context { u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */ u64 iasn; /* initial ack sequence number, MPC subflows only */ }; - u8 local_id; + s16 local_id; /* if negative not initialized yet */ u8 remote_id; u8 reset_seen:1; u8 reset_transient:1; @@ -556,6 +555,7 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) { memset(&subflow->reset, 0, sizeof(subflow->reset)); subflow->request_mptcp = 1; + WRITE_ONCE(subflow->local_id, -1); } static inline u64 @@ -1022,6 +1022,15 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow) +{ + int local_id = READ_ONCE(subflow->local_id); + + if (local_id < 0) + return 0; + return local_id; +} + void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index c34ecadee120..015184bbf06c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -577,8 +577,8 @@ do_reset: static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) { - subflow->local_id = local_id; - subflow->local_id_valid = 1; + WARN_ON_ONCE(local_id < 0 || local_id > 255); + WRITE_ONCE(subflow->local_id, local_id); } static int subflow_chk_local_id(struct sock *sk) @@ -587,7 +587,7 @@ static int subflow_chk_local_id(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(subflow->conn); int err; - if (likely(subflow->local_id_valid)) + if (likely(subflow->local_id >= 0)) return 0; err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); @@ -1731,6 +1731,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, pr_debug("subflow=%p", ctx); ctx->tcp_sock = sk; + WRITE_ONCE(ctx->local_id, -1); return ctx; } @@ -1966,7 +1967,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->idsn = subflow_req->idsn; /* this is the first subflow, id is always 0 */ - new_ctx->local_id_valid = 1; + subflow_set_local_id(new_ctx, 0); } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; From 967d3c27127e71a10ff5c083583a038606431b61 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 Feb 2024 19:25:32 +0100 Subject: [PATCH 107/327] mptcp: fix data races on remote_id Similar to the previous patch, address the data race on remote_id, adding the suitable ONCE annotations. Fixes: bedee0b56113 ("mptcp: address lookup improvements") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 8 ++++---- net/mptcp/subflow.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 912e25077437..ed6983af1ab2 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -443,7 +443,7 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); remote_address((struct sock_common *)ssk, &addrs[i]); - addrs[i].id = subflow->remote_id; + addrs[i].id = READ_ONCE(subflow->remote_id); if (deny_id0 && !addrs[i].id) continue; @@ -799,18 +799,18 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u8 remote_id = READ_ONCE(subflow->remote_id); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow_get_local_id(subflow); - if (rm_type == MPTCP_MIB_RMADDR && subflow->remote_id != rm_id) + if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) continue; if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id)) continue; pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", - i, rm_id, id, subflow->remote_id, - msk->mpc_endpoint_id); + i, rm_id, id, remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 015184bbf06c..71ba86246ff8 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -535,7 +535,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; - subflow->remote_id = mp_opt.join_id; + WRITE_ONCE(subflow->remote_id, mp_opt.join_id); pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); @@ -1567,7 +1567,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; - subflow->remote_id = remote_id; + WRITE_ONCE(subflow->remote_id, remote_id); subflow->request_join = 1; subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); subflow->subflow_id = msk->subflow_id++; @@ -1974,7 +1974,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->fully_established = 1; new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; - new_ctx->remote_id = subflow_req->remote_id; + WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; From 045e9d812868a2d80b7a57b224ce8009444b7bbc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 Feb 2024 19:25:33 +0100 Subject: [PATCH 108/327] mptcp: fix duplicate subflow creation Fullmesh endpoints could end-up unexpectedly generating duplicate subflows - same local and remote addresses - when multiple incoming ADD_ADDR are processed before the PM creates the subflow for the local endpoints. Address the issue explicitly checking for duplicates at subflow creation time. To avoid a quadratic computational complexity, track the unavailable remote address ids in a temporary bitmap and initialize such bitmap with the remote ids of all the existing subflows matching the local address currently processed. The above allows additionally replacing the existing code checking for duplicate entry in the current set with a simple bit test operation. Fixes: 2843ff6f36db ("mptcp: remote addresses fullmesh") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/435 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index ed6983af1ab2..58d17d9604e7 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -396,19 +396,6 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) } } -static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int nr, - const struct mptcp_addr_info *addr) -{ - int i; - - for (i = 0; i < nr; i++) { - if (addrs[i].id == addr->id) - return true; - } - - return false; -} - /* Fill all the remote addresses into the array addrs[], * and return the array size. */ @@ -440,6 +427,16 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, msk->pm.subflows++; addrs[i++] = remote; } else { + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + + /* Forbid creation of new subflows matching existing + * ones, possibly already created by incoming ADD_ADDR + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->local_id) == local->id) + __set_bit(subflow->remote_id, unavail_id); + mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); remote_address((struct sock_common *)ssk, &addrs[i]); @@ -447,11 +444,17 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, if (deny_id0 && !addrs[i].id) continue; + if (test_bit(addrs[i].id, unavail_id)) + continue; + if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) continue; - if (!lookup_address_in_vec(addrs, i, &addrs[i]) && - msk->pm.subflows < subflows_max) { + if (msk->pm.subflows < subflows_max) { + /* forbid creating multiple address towards + * this id + */ + __set_bit(addrs[i].id, unavail_id); msk->pm.subflows++; i++; } From d2a2547565a9f1ad7989f7e21f97cbf065a9390d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 15 Feb 2024 19:25:34 +0100 Subject: [PATCH 109/327] selftests: mptcp: pm nl: also list skipped tests If the feature is not supported by older kernels, and instead of just ignoring some tests, we should mark them as skipped, so we can still track them. Fixes: d85555ac11f9 ("selftests: mptcp: pm_netlink: format subtests results in TAP") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 8f4ff123a7eb..79e83a2c95de 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -194,6 +194,12 @@ subflow 10.0.1.1" " (nofullmesh)" ip netns exec $ns1 ./pm_nl_ctl set id 1 flags backup,fullmesh check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ subflow,backup,fullmesh 10.0.1.1" " (backup,fullmesh)" +else + for st in fullmesh nofullmesh backup,fullmesh; do + st=" (${st})" + printf "%-50s%s\n" "${st}" "[SKIP]" + mptcp_lib_result_skip "${st}" + done fi mptcp_lib_result_print_all_tap From 662f084f3396d8a804d56cb53ac05c9e39902a7b Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 15 Feb 2024 19:25:35 +0100 Subject: [PATCH 110/327] selftests: mptcp: pm nl: avoid error msg on older kernels Since the 'Fixes' commit mentioned below, and if the kernel being tested doesn't support the 'fullmesh' flag, this error will be printed: netlink error -22 (Invalid argument) ./pm_nl_ctl: bailing out due to netlink error[s] But that can be normal if the kernel doesn't support the feature, no need to print this worrying error message while everything else looks OK. So we can mute stderr. Failures will still be detected if any. Fixes: 1dc88d241f92 ("selftests: mptcp: pm_nl_ctl: always look for errors") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 79e83a2c95de..71899a3ffa7a 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -183,7 +183,7 @@ check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ subflow 10.0.1.1" " (nobackup)" # fullmesh support has been added later -ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh +ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh 2>/dev/null if ip netns exec $ns1 ./pm_nl_ctl dump | grep -q "fullmesh" || mptcp_lib_expect_all_features; then check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ From 694bd45980a61045eb5ec07799e3b94c76db830e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 15 Feb 2024 19:25:36 +0100 Subject: [PATCH 111/327] selftests: mptcp: diag: fix bash warnings on older kernels Since the 'Fixes' commit mentioned below, the command that is executed in __chk_nr() helper can return nothing if the feature is not supported. This is the case when the MPTCP CURRESTAB counter is not supported. To avoid this warning ... ./diag.sh: line 65: [: !=: unary operator expected ... we just need to surround '$nr' with double quotes, to support an empty string when the feature is not supported. Fixes: 81ab772819da ("selftests: mptcp: diag: check CURRESTAB counters") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/diag.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 04fcb8a077c9..e0615c6ffb8d 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -62,8 +62,8 @@ __chk_nr() nr=$(eval $command) printf "%-50s" "$msg" - if [ $nr != $expected ]; then - if [ $nr = "$skip" ] && ! mptcp_lib_expect_all_features; then + if [ "$nr" != "$expected" ]; then + if [ "$nr" = "$skip" ] && ! mptcp_lib_expect_all_features; then echo "[ skip ] Feature probably not supported" mptcp_lib_result_skip "${msg}" else From 4d8e0dde0403b5a86aa83e243f020711a9c3e31f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 15 Feb 2024 19:25:37 +0100 Subject: [PATCH 112/327] selftests: mptcp: simult flows: fix some subtest names The selftest was correctly recording all the results, but the 'reverse direction' part was missing in the name when needed. It is important to have a unique (sub)test name in TAP, because some CI environments drop tests with duplicated name. Fixes: 675d99338e7a ("selftests: mptcp: simult flows: format subtests results in TAP") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/simult_flows.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 0cc964e6f2c1..8f9ddb3ad4fe 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -250,7 +250,8 @@ run_test() [ $bail -eq 0 ] || exit $ret fi - printf "%-60s" "$msg - reverse direction" + msg+=" - reverse direction" + printf "%-60s" "${msg}" do_transfer $large $small $time lret=$? mptcp_lib_result_code "${lret}" "${msg}" From 2ef0d804c090658960c008446523863fd7e3541e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 15 Feb 2024 19:25:38 +0100 Subject: [PATCH 113/327] selftests: mptcp: userspace_pm: unique subtest names It is important to have a unique (sub)test name in TAP, because some CI environments drop tests with duplicated names. Some subtests from the userspace_pm selftest had the same names. That's because different subflows are created (and deleted) between the same pair of IP addresses. Simply adding the destination port in the name is then enough to have different names, because the destination port is always different. Note that adding such info takes a bit more space, so we need to increase a bit the width to print the name, simply to keep all the '[ OK ]' aligned as before. Fixes: f589234e1af0 ("selftests: mptcp: userspace_pm: format subtests results in TAP") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/userspace_pm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 6167837f48e1..1b94a75604fe 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -75,7 +75,7 @@ print_test() { test_name="${1}" - _printf "%-63s" "${test_name}" + _printf "%-68s" "${test_name}" } print_results() @@ -542,7 +542,7 @@ verify_subflow_events() local remid local info - info="${e_saddr} (${e_from}) => ${e_daddr} (${e_to})" + info="${e_saddr} (${e_from}) => ${e_daddr}:${e_dport} (${e_to})" if [ "$e_type" = "$SUB_ESTABLISHED" ] then From 645c1dc965ef6b5554e5e69737bb179c7a0f872f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 15 Feb 2024 19:25:39 +0100 Subject: [PATCH 114/327] selftests: mptcp: diag: unique 'in use' subtest names It is important to have a unique (sub)test name in TAP, because some CI environments drop tests with duplicated name. Some 'in use' subtests from the diag selftest had the same names, e.g.: chk 0 msk in use after flush Now the previous value is taken, to have different names, e.g.: chk 2->0 msk in use after flush While at it, avoid repeating the full message, declare it once in the helper. Fixes: ce9902573652 ("selftests: mptcp: diag: format subtests results in TAP") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/diag.sh | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index e0615c6ffb8d..266656a16229 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -166,9 +166,13 @@ chk_msk_listen() chk_msk_inuse() { local expected=$1 - local msg="$2" + local msg="....chk ${2:-${expected}} msk in use" local listen_nr + if [ "${expected}" -eq 0 ]; then + msg+=" after flush" + fi + listen_nr=$(ss -N "${ns}" -Ml | grep -c LISTEN) expected=$((expected + listen_nr)) @@ -179,7 +183,7 @@ chk_msk_inuse() sleep 0.1 done - __chk_nr get_msk_inuse $expected "$msg" 0 + __chk_nr get_msk_inuse $expected "${msg}" 0 } # $1: cestab nr @@ -227,11 +231,11 @@ wait_connected $ns 10000 chk_msk_nr 2 "after MPC handshake " chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" -chk_msk_inuse 2 "....chk 2 msk in use" +chk_msk_inuse 2 chk_msk_cestab 2 flush_pids -chk_msk_inuse 0 "....chk 0 msk in use after flush" +chk_msk_inuse 0 "2->0" chk_msk_cestab 0 echo "a" | \ @@ -247,11 +251,11 @@ echo "b" | \ 127.0.0.1 >/dev/null & wait_connected $ns 10001 chk_msk_fallback_nr 1 "check fallback" -chk_msk_inuse 1 "....chk 1 msk in use" +chk_msk_inuse 1 chk_msk_cestab 1 flush_pids -chk_msk_inuse 0 "....chk 0 msk in use after flush" +chk_msk_inuse 0 "1->0" chk_msk_cestab 0 NR_CLIENTS=100 @@ -273,11 +277,11 @@ for I in `seq 1 $NR_CLIENTS`; do done wait_msk_nr $((NR_CLIENTS*2)) "many msk socket present" -chk_msk_inuse $((NR_CLIENTS*2)) "....chk many msk in use" +chk_msk_inuse $((NR_CLIENTS*2)) "many" chk_msk_cestab $((NR_CLIENTS*2)) flush_pids -chk_msk_inuse 0 "....chk 0 msk in use after flush" +chk_msk_inuse 0 "many->0" chk_msk_cestab 0 mptcp_lib_result_print_all_tap From 4103d8480866fe5abb71ef0ed8af3a3b7b9625bf Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 15 Feb 2024 19:25:40 +0100 Subject: [PATCH 115/327] selftests: mptcp: diag: unique 'cestab' subtest names It is important to have a unique (sub)test name in TAP, because some CI environments drop tests with duplicated name. Some 'cestab' subtests from the diag selftest had the same names, e.g.: ....chk 0 cestab Now the previous value is taken, to have different names, e.g.: ....chk 2->0 cestab after flush While at it, the 'after flush' info is added, similar to what is done with the 'in use' subtests. Also inspired by these 'in use' subtests, 'many' is displayed instead of a large number: many msk socket present [ ok ] ....chk many msk in use [ ok ] ....chk many cestab [ ok ] ....chk many->0 msk in use after flush [ ok ] ....chk many->0 cestab after flush [ ok ] Fixes: 81ab772819da ("selftests: mptcp: diag: check CURRESTAB counters") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/diag.sh | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 266656a16229..0a58ebb8b04c 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -189,10 +189,15 @@ chk_msk_inuse() # $1: cestab nr chk_msk_cestab() { - local cestab=$1 + local expected=$1 + local msg="....chk ${2:-${expected}} cestab" + + if [ "${expected}" -eq 0 ]; then + msg+=" after flush" + fi __chk_nr "mptcp_lib_get_counter ${ns} MPTcpExtMPCurrEstab" \ - "${cestab}" "....chk ${cestab} cestab" "" + "${expected}" "${msg}" "" } wait_connected() @@ -236,7 +241,7 @@ chk_msk_cestab 2 flush_pids chk_msk_inuse 0 "2->0" -chk_msk_cestab 0 +chk_msk_cestab 0 "2->0" echo "a" | \ timeout ${timeout_test} \ @@ -256,7 +261,7 @@ chk_msk_cestab 1 flush_pids chk_msk_inuse 0 "1->0" -chk_msk_cestab 0 +chk_msk_cestab 0 "1->0" NR_CLIENTS=100 for I in `seq 1 $NR_CLIENTS`; do @@ -278,11 +283,11 @@ done wait_msk_nr $((NR_CLIENTS*2)) "many msk socket present" chk_msk_inuse $((NR_CLIENTS*2)) "many" -chk_msk_cestab $((NR_CLIENTS*2)) +chk_msk_cestab $((NR_CLIENTS*2)) "many" flush_pids chk_msk_inuse 0 "many->0" -chk_msk_cestab 0 +chk_msk_cestab 0 "many->0" mptcp_lib_result_print_all_tap exit $ret From 5b76d928f8b779a1b19c5842e7cabee4cbb610c3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 15 Feb 2024 10:27:31 -0800 Subject: [PATCH 116/327] net: bcmasp: Indicate MAC is in charge of PHY PM Avoid the PHY library call unnecessarily into the suspend/resume functions by setting phydev->mac_managed_pm to true. The ASP driver essentially does exactly what mdio_bus_phy_resume() does. Fixes: 490cb412007d ("net: bcmasp: Add support for ASP2.0 Ethernet controller") Signed-off-by: Florian Fainelli Signed-off-by: Justin Chen Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c index f59557b0cd51..6ad1366270f7 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c @@ -1050,6 +1050,9 @@ static int bcmasp_netif_init(struct net_device *dev, bool phy_connect) netdev_err(dev, "could not attach to PHY\n"); goto err_phy_disable; } + + /* Indicate that the MAC is responsible for PHY PM */ + phydev->mac_managed_pm = true; } else if (!intf->wolopts) { ret = phy_resume(dev->phydev); if (ret) From f120e62e37f0af4c4cbe08e5a88ea60a6a17c858 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Thu, 15 Feb 2024 10:27:32 -0800 Subject: [PATCH 117/327] net: bcmasp: Sanity check is off by one A sanity check for OOB write is off by one leading to a false positive when the array is full. Fixes: 9b90aca97f6d ("net: ethernet: bcmasp: fix possible OOB write in bcmasp_netfilt_get_all_active()") Signed-off-by: Justin Chen Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/asp2/bcmasp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.c b/drivers/net/ethernet/broadcom/asp2/bcmasp.c index 29b04a274d07..80245c65cc90 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.c @@ -535,9 +535,6 @@ int bcmasp_netfilt_get_all_active(struct bcmasp_intf *intf, u32 *rule_locs, int j = 0, i; for (i = 0; i < NUM_NET_FILTERS; i++) { - if (j == *rule_cnt) - return -EMSGSIZE; - if (!priv->net_filters[i].claimed || priv->net_filters[i].port != intf->port) continue; @@ -547,6 +544,9 @@ int bcmasp_netfilt_get_all_active(struct bcmasp_intf *intf, u32 *rule_locs, priv->net_filters[i - 1].wake_filter) continue; + if (j == *rule_cnt) + return -EMSGSIZE; + rule_locs[j++] = priv->net_filters[i].fs.location; } From a5c57fd2e9bd1c8ea8613a8f94fd0be5eccbf321 Mon Sep 17 00:00:00 2001 From: Gaurav Batra Date: Thu, 15 Feb 2024 16:18:33 -0600 Subject: [PATCH 118/327] powerpc/pseries/iommu: DLPAR add doesn't completely initialize pci_controller When a PCI device is dynamically added, the kernel oopses with a NULL pointer dereference: BUG: Kernel NULL pointer dereference on read at 0x00000030 Faulting instruction address: 0xc0000000006bbe5c Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs xsk_diag bonding nft_compat nf_tables nfnetlink rfkill binfmt_misc dm_multipath rpcrdma sunrpc rdma_ucm ib_srpt ib_isert iscsi_target_mod target_core_mod ib_umad ib_iser libiscsi scsi_transport_iscsi ib_ipoib rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core pseries_rng drm drm_panel_orientation_quirks xfs libcrc32c mlx5_core mlxfw sd_mod t10_pi sg tls ibmvscsi ibmveth scsi_transport_srp vmx_crypto pseries_wdt psample dm_mirror dm_region_hash dm_log dm_mod fuse CPU: 17 PID: 2685 Comm: drmgr Not tainted 6.7.0-203405+ #66 Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_008) hv:phyp pSeries NIP: c0000000006bbe5c LR: c000000000a13e68 CTR: c0000000000579f8 REGS: c00000009924f240 TRAP: 0300 Not tainted (6.7.0-203405+) MSR: 8000000000009033 CR: 24002220 XER: 20040006 CFAR: c000000000a13e64 DAR: 0000000000000030 DSISR: 40000000 IRQMASK: 0 ... NIP sysfs_add_link_to_group+0x34/0x94 LR iommu_device_link+0x5c/0x118 Call Trace: iommu_init_device+0x26c/0x318 (unreliable) iommu_device_link+0x5c/0x118 iommu_init_device+0xa8/0x318 iommu_probe_device+0xc0/0x134 iommu_bus_notifier+0x44/0x104 notifier_call_chain+0xb8/0x19c blocking_notifier_call_chain+0x64/0x98 bus_notify+0x50/0x7c device_add+0x640/0x918 pci_device_add+0x23c/0x298 of_create_pci_dev+0x400/0x884 of_scan_pci_dev+0x124/0x1b0 __of_scan_bus+0x78/0x18c pcibios_scan_phb+0x2a4/0x3b0 init_phb_dynamic+0xb8/0x110 dlpar_add_slot+0x170/0x3b8 [rpadlpar_io] add_slot_store.part.0+0xb4/0x130 [rpadlpar_io] kobj_attr_store+0x2c/0x48 sysfs_kf_write+0x64/0x78 kernfs_fop_write_iter+0x1b0/0x290 vfs_write+0x350/0x4a0 ksys_write+0x84/0x140 system_call_exception+0x124/0x330 system_call_vectored_common+0x15c/0x2ec Commit a940904443e4 ("powerpc/iommu: Add iommu_ops to report capabilities and allow blocking domains") broke DLPAR add of PCI devices. The above added iommu_device structure to pci_controller. During system boot, PCI devices are discovered and this newly added iommu_device structure is initialized by a call to iommu_device_register(). During DLPAR add of a PCI device, a new pci_controller structure is allocated but there are no calls made to iommu_device_register() interface. Fix is to register the iommu device during DLPAR add as well. Fixes: a940904443e4 ("powerpc/iommu: Add iommu_ops to report capabilities and allow blocking domains") Signed-off-by: Gaurav Batra Reviewed-by: Brian King Signed-off-by: Michael Ellerman Link: https://msgid.link/20240215221833.4817-1-gbatra@linux.ibm.com --- arch/powerpc/include/asm/ppc-pci.h | 10 ++++++++++ arch/powerpc/kernel/iommu.c | 23 ++++++++++++++++------ arch/powerpc/platforms/pseries/pci_dlpar.c | 4 ++++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index ce2b1b5eebdd..a8b7e8682f5b 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -30,6 +30,16 @@ void *pci_traverse_device_nodes(struct device_node *start, void *data); extern void pci_devs_phb_init_dynamic(struct pci_controller *phb); +#if defined(CONFIG_IOMMU_API) && (defined(CONFIG_PPC_PSERIES) || \ + defined(CONFIG_PPC_POWERNV)) +extern void ppc_iommu_register_device(struct pci_controller *phb); +extern void ppc_iommu_unregister_device(struct pci_controller *phb); +#else +static inline void ppc_iommu_register_device(struct pci_controller *phb) { } +static inline void ppc_iommu_unregister_device(struct pci_controller *phb) { } +#endif + + /* From rtas_pci.h */ extern void init_pci_config_tokens (void); extern unsigned long get_phb_buid (struct device_node *); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index df17b33b89d1..2c0173e7094d 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1341,7 +1341,7 @@ static struct iommu_device *spapr_tce_iommu_probe_device(struct device *dev) struct pci_controller *hose; if (!dev_is_pci(dev)) - return ERR_PTR(-EPERM); + return ERR_PTR(-ENODEV); pdev = to_pci_dev(dev); hose = pdev->bus->sysdata; @@ -1390,6 +1390,21 @@ static const struct attribute_group *spapr_tce_iommu_groups[] = { NULL, }; +void ppc_iommu_register_device(struct pci_controller *phb) +{ + iommu_device_sysfs_add(&phb->iommu, phb->parent, + spapr_tce_iommu_groups, "iommu-phb%04x", + phb->global_number); + iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops, + phb->parent); +} + +void ppc_iommu_unregister_device(struct pci_controller *phb) +{ + iommu_device_unregister(&phb->iommu); + iommu_device_sysfs_remove(&phb->iommu); +} + /* * This registers IOMMU devices of PHBs. This needs to happen * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and @@ -1400,11 +1415,7 @@ static int __init spapr_tce_setup_phb_iommus_initcall(void) struct pci_controller *hose; list_for_each_entry(hose, &hose_list, list_node) { - iommu_device_sysfs_add(&hose->iommu, hose->parent, - spapr_tce_iommu_groups, "iommu-phb%04x", - hose->global_number); - iommu_device_register(&hose->iommu, &spapr_tce_iommu_ops, - hose->parent); + ppc_iommu_register_device(hose); } return 0; } diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 4ba824568119..4448386268d9 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -35,6 +35,8 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) pseries_msi_allocate_domains(phb); + ppc_iommu_register_device(phb); + /* Create EEH devices for the PHB */ eeh_phb_pe_create(phb); @@ -76,6 +78,8 @@ int remove_phb_dynamic(struct pci_controller *phb) } } + ppc_iommu_unregister_device(phb); + pseries_msi_free_domains(phb); /* Keep a reference so phb isn't freed yet */ From d70c7a6614ac2e1340a9fed044aec04d695c6645 Mon Sep 17 00:00:00 2001 From: Andreas Larsson Date: Mon, 29 Jan 2024 08:50:56 +0100 Subject: [PATCH 119/327] usb: uhci-grlib: Explicitly include linux/platform_device.h This fixes relying upon linux/of_platform.h to include linux/platform_device.h, which it no longer does, thereby fixing compilation problems like: In file included from drivers/usb/host/uhci-hcd.c:850: drivers/usb/host/uhci-grlib.c: In function 'uhci_hcd_grlib_probe': drivers/usb/host/uhci-grlib.c:92:29: error: invalid use of undefined type 'struct platform_device' 92 | struct device_node *dn = op->dev.of_node; | ^~ Fixes: ef175b29a242 ("of: Stop circularly including of_device.h and of_platform.h") Signed-off-by: Andreas Larsson Link: https://lore.kernel.org/r/20240129075056.1511630-1-andreas@gaisler.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/uhci-grlib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/host/uhci-grlib.c b/drivers/usb/host/uhci-grlib.c index ac3fc5970315..cfebb833668e 100644 --- a/drivers/usb/host/uhci-grlib.c +++ b/drivers/usb/host/uhci-grlib.c @@ -22,6 +22,7 @@ #include #include #include +#include static int uhci_grlib_init(struct usb_hcd *hcd) { From 18a6be674306c9acb05c08e5c3fd376ef50a917c Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Tue, 6 Feb 2024 11:40:18 +0100 Subject: [PATCH 120/327] usb: cdnsp: blocked some cdns3 specific code host.c file has some parts of code that were introduced for CDNS3 driver and should not be used with CDNSP driver. This patch blocks using these parts of codes by CDNSP driver. These elements include: - xhci_plat_cdns3_xhci object - cdns3 specific XECP_PORT_CAP_REG register - cdns3 specific XECP_AUX_CTRL_REG1 register cc: stable@vger.kernel.org Fixes: 3d82904559f4 ("usb: cdnsp: cdns3 Add main part of Cadence USBSSP DRD Driver") Signed-off-by: Pawel Laszczak Link: https://lore.kernel.org/r/20240206104018.48272-1-pawell@cadence.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/host.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/usb/cdns3/host.c b/drivers/usb/cdns3/host.c index 6164fc4c96a4..ceca4d839dfd 100644 --- a/drivers/usb/cdns3/host.c +++ b/drivers/usb/cdns3/host.c @@ -18,6 +18,11 @@ #include "../host/xhci.h" #include "../host/xhci-plat.h" +/* + * The XECP_PORT_CAP_REG and XECP_AUX_CTRL_REG1 exist only + * in Cadence USB3 dual-role controller, so it can't be used + * with Cadence CDNSP dual-role controller. + */ #define XECP_PORT_CAP_REG 0x8000 #define XECP_AUX_CTRL_REG1 0x8120 @@ -57,6 +62,8 @@ static const struct xhci_plat_priv xhci_plat_cdns3_xhci = { .resume_quirk = xhci_cdns3_resume_quirk, }; +static const struct xhci_plat_priv xhci_plat_cdnsp_xhci; + static int __cdns_host_init(struct cdns *cdns) { struct platform_device *xhci; @@ -81,8 +88,13 @@ static int __cdns_host_init(struct cdns *cdns) goto err1; } - cdns->xhci_plat_data = kmemdup(&xhci_plat_cdns3_xhci, - sizeof(struct xhci_plat_priv), GFP_KERNEL); + if (cdns->version < CDNSP_CONTROLLER_V2) + cdns->xhci_plat_data = kmemdup(&xhci_plat_cdns3_xhci, + sizeof(struct xhci_plat_priv), GFP_KERNEL); + else + cdns->xhci_plat_data = kmemdup(&xhci_plat_cdnsp_xhci, + sizeof(struct xhci_plat_priv), GFP_KERNEL); + if (!cdns->xhci_plat_data) { ret = -ENOMEM; goto err1; From 47625b018c6bc788bc10dd654c82696eb0a5ef11 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Thu, 15 Feb 2024 13:16:09 +0100 Subject: [PATCH 121/327] usb: cdnsp: fixed issue with incorrect detecting CDNSP family controllers Cadence have several controllers from 0x000403xx family but current driver suuport detecting only one with DID equal 0x0004034E. It causes that if someone uses different CDNSP controller then driver will use incorrect version and register space. Patch fix this issue. cc: stable@vger.kernel.org Fixes: 3d82904559f4 ("usb: cdnsp: cdns3 Add main part of Cadence USBSSP DRD Driver") Signed-off-by: Pawel Laszczak Link: https://lore.kernel.org/r/20240215121609.259772-1-pawell@cadence.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/core.c | 1 - drivers/usb/cdns3/drd.c | 13 +++++++++---- drivers/usb/cdns3/drd.h | 6 +++++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/usb/cdns3/core.c b/drivers/usb/cdns3/core.c index 33548771a0d3..465e9267b49c 100644 --- a/drivers/usb/cdns3/core.c +++ b/drivers/usb/cdns3/core.c @@ -395,7 +395,6 @@ pm_put: return ret; } - /** * cdns_wakeup_irq - interrupt handler for wakeup events * @irq: irq number for cdns3/cdnsp core device diff --git a/drivers/usb/cdns3/drd.c b/drivers/usb/cdns3/drd.c index 04b6d12f2b9a..ee917f1b091c 100644 --- a/drivers/usb/cdns3/drd.c +++ b/drivers/usb/cdns3/drd.c @@ -156,7 +156,8 @@ bool cdns_is_device(struct cdns *cdns) */ static void cdns_otg_disable_irq(struct cdns *cdns) { - writel(0, &cdns->otg_irq_regs->ien); + if (cdns->version) + writel(0, &cdns->otg_irq_regs->ien); } /** @@ -422,15 +423,20 @@ int cdns_drd_init(struct cdns *cdns) cdns->otg_regs = (void __iomem *)&cdns->otg_v1_regs->cmd; - if (readl(&cdns->otg_cdnsp_regs->did) == OTG_CDNSP_DID) { + state = readl(&cdns->otg_cdnsp_regs->did); + + if (OTG_CDNSP_CHECK_DID(state)) { cdns->otg_irq_regs = (struct cdns_otg_irq_regs __iomem *) &cdns->otg_cdnsp_regs->ien; cdns->version = CDNSP_CONTROLLER_V2; - } else { + } else if (OTG_CDNS3_CHECK_DID(state)) { cdns->otg_irq_regs = (struct cdns_otg_irq_regs __iomem *) &cdns->otg_v1_regs->ien; writel(1, &cdns->otg_v1_regs->simulate); cdns->version = CDNS3_CONTROLLER_V1; + } else { + dev_err(cdns->dev, "not supporte DID=0x%08x\n", state); + return -EINVAL; } dev_dbg(cdns->dev, "DRD version v1 (ID: %08x, rev: %08x)\n", @@ -483,7 +489,6 @@ int cdns_drd_exit(struct cdns *cdns) return 0; } - /* Indicate the cdns3 core was power lost before */ bool cdns_power_is_lost(struct cdns *cdns) { diff --git a/drivers/usb/cdns3/drd.h b/drivers/usb/cdns3/drd.h index cbdf94f73ed9..d72370c321d3 100644 --- a/drivers/usb/cdns3/drd.h +++ b/drivers/usb/cdns3/drd.h @@ -79,7 +79,11 @@ struct cdnsp_otg_regs { __le32 susp_timing_ctrl; }; -#define OTG_CDNSP_DID 0x0004034E +/* CDNSP driver supports 0x000403xx Cadence USB controller family. */ +#define OTG_CDNSP_CHECK_DID(did) (((did) & GENMASK(31, 8)) == 0x00040300) + +/* CDNS3 driver supports 0x000402xx Cadence USB controller family. */ +#define OTG_CDNS3_CHECK_DID(did) (((did) & GENMASK(31, 8)) == 0x00040200) /* * Common registers interface for both CDNS3 and CDNSP version of DRD. From 1c9be13846c0b2abc2480602f8ef421360e1ad9e Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 29 Jan 2024 17:37:38 +0800 Subject: [PATCH 122/327] usb: roles: fix NULL pointer issue when put module's reference In current design, usb role class driver will get usb_role_switch parent's module reference after the user get usb_role_switch device and put the reference after the user put the usb_role_switch device. However, the parent device of usb_role_switch may be removed before the user put the usb_role_switch. If so, then, NULL pointer issue will be met when the user put the parent module's reference. This will save the module pointer in structure of usb_role_switch. Then, we don't need to find module by iterating long relations. Fixes: 5c54fcac9a9d ("usb: roles: Take care of driver module reference counting") cc: stable@vger.kernel.org Signed-off-by: Xu Yang Acked-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240129093739.2371530-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/roles/class.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c index ae41578bd014..2bad038fb9ad 100644 --- a/drivers/usb/roles/class.c +++ b/drivers/usb/roles/class.c @@ -21,6 +21,7 @@ static const struct class role_class = { struct usb_role_switch { struct device dev; struct mutex lock; /* device lock*/ + struct module *module; /* the module this device depends on */ enum usb_role role; /* From descriptor */ @@ -135,7 +136,7 @@ struct usb_role_switch *usb_role_switch_get(struct device *dev) usb_role_switch_match); if (!IS_ERR_OR_NULL(sw)) - WARN_ON(!try_module_get(sw->dev.parent->driver->owner)); + WARN_ON(!try_module_get(sw->module)); return sw; } @@ -157,7 +158,7 @@ struct usb_role_switch *fwnode_usb_role_switch_get(struct fwnode_handle *fwnode) sw = fwnode_connection_find_match(fwnode, "usb-role-switch", NULL, usb_role_switch_match); if (!IS_ERR_OR_NULL(sw)) - WARN_ON(!try_module_get(sw->dev.parent->driver->owner)); + WARN_ON(!try_module_get(sw->module)); return sw; } @@ -172,7 +173,7 @@ EXPORT_SYMBOL_GPL(fwnode_usb_role_switch_get); void usb_role_switch_put(struct usb_role_switch *sw) { if (!IS_ERR_OR_NULL(sw)) { - module_put(sw->dev.parent->driver->owner); + module_put(sw->module); put_device(&sw->dev); } } @@ -189,15 +190,18 @@ struct usb_role_switch * usb_role_switch_find_by_fwnode(const struct fwnode_handle *fwnode) { struct device *dev; + struct usb_role_switch *sw = NULL; if (!fwnode) return NULL; dev = class_find_device_by_fwnode(&role_class, fwnode); - if (dev) - WARN_ON(!try_module_get(dev->parent->driver->owner)); + if (dev) { + sw = to_role_switch(dev); + WARN_ON(!try_module_get(sw->module)); + } - return dev ? to_role_switch(dev) : NULL; + return sw; } EXPORT_SYMBOL_GPL(usb_role_switch_find_by_fwnode); @@ -338,6 +342,7 @@ usb_role_switch_register(struct device *parent, sw->set = desc->set; sw->get = desc->get; + sw->module = parent->driver->owner; sw->dev.parent = parent; sw->dev.fwnode = desc->fwnode; sw->dev.class = &role_class; From b787a3e781759026a6212736ef8e52cf83d1821a Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 29 Jan 2024 17:37:39 +0800 Subject: [PATCH 123/327] usb: roles: don't get/set_role() when usb_role_switch is unregistered There is a possibility that usb_role_switch device is unregistered before the user put usb_role_switch. In this case, the user may still want to get/set_role() since the user can't sense the changes of usb_role_switch. This will add a flag to show if usb_role_switch is already registered and avoid unwanted behaviors. Fixes: fde0aa6c175a ("usb: common: Small class for USB role switches") cc: stable@vger.kernel.org Signed-off-by: Xu Yang Acked-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240129093739.2371530-2-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/roles/class.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c index 2bad038fb9ad..70165dd86b5d 100644 --- a/drivers/usb/roles/class.c +++ b/drivers/usb/roles/class.c @@ -23,6 +23,7 @@ struct usb_role_switch { struct mutex lock; /* device lock*/ struct module *module; /* the module this device depends on */ enum usb_role role; + bool registered; /* From descriptor */ struct device *usb2_port; @@ -49,6 +50,9 @@ int usb_role_switch_set_role(struct usb_role_switch *sw, enum usb_role role) if (IS_ERR_OR_NULL(sw)) return 0; + if (!sw->registered) + return -EOPNOTSUPP; + mutex_lock(&sw->lock); ret = sw->set(sw, role); @@ -74,7 +78,7 @@ enum usb_role usb_role_switch_get_role(struct usb_role_switch *sw) { enum usb_role role; - if (IS_ERR_OR_NULL(sw)) + if (IS_ERR_OR_NULL(sw) || !sw->registered) return USB_ROLE_NONE; mutex_lock(&sw->lock); @@ -357,6 +361,8 @@ usb_role_switch_register(struct device *parent, return ERR_PTR(ret); } + sw->registered = true; + /* TODO: Symlinks for the host port and the device controller. */ return sw; @@ -371,8 +377,10 @@ EXPORT_SYMBOL_GPL(usb_role_switch_register); */ void usb_role_switch_unregister(struct usb_role_switch *sw) { - if (!IS_ERR_OR_NULL(sw)) + if (!IS_ERR_OR_NULL(sw)) { + sw->registered = false; device_unregister(&sw->dev); + } } EXPORT_SYMBOL_GPL(usb_role_switch_unregister); From cd45f99034b0c8c9cb346dd0d6407a95ca3d36f6 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 2 Feb 2024 10:42:16 -0500 Subject: [PATCH 124/327] usb: cdns3: fixed memory use after free at cdns3_gadget_ep_disable() ... cdns3_gadget_ep_free_request(&priv_ep->endpoint, &priv_req->request); list_del_init(&priv_req->list); ... 'priv_req' actually free at cdns3_gadget_ep_free_request(). But list_del_init() use priv_req->list after it. [ 1542.642868][ T534] BUG: KFENCE: use-after-free read in __list_del_entry_valid+0x10/0xd4 [ 1542.642868][ T534] [ 1542.653162][ T534] Use-after-free read at 0x000000009ed0ba99 (in kfence-#3): [ 1542.660311][ T534] __list_del_entry_valid+0x10/0xd4 [ 1542.665375][ T534] cdns3_gadget_ep_disable+0x1f8/0x388 [cdns3] [ 1542.671571][ T534] usb_ep_disable+0x44/0xe4 [ 1542.675948][ T534] ffs_func_eps_disable+0x64/0xc8 [ 1542.680839][ T534] ffs_func_set_alt+0x74/0x368 [ 1542.685478][ T534] ffs_func_disable+0x18/0x28 Move list_del_init() before cdns3_gadget_ep_free_request() to resolve this problem. Cc: stable@vger.kernel.org Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver") Signed-off-by: Frank Li Reviewed-by: Roger Quadros Acked-by: Peter Chen Link: https://lore.kernel.org/r/20240202154217.661867-1-Frank.Li@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdns3-gadget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c index aeca902ab6cc..d6723d31fc6e 100644 --- a/drivers/usb/cdns3/cdns3-gadget.c +++ b/drivers/usb/cdns3/cdns3-gadget.c @@ -2540,11 +2540,11 @@ static int cdns3_gadget_ep_disable(struct usb_ep *ep) while (!list_empty(&priv_ep->wa2_descmiss_req_list)) { priv_req = cdns3_next_priv_request(&priv_ep->wa2_descmiss_req_list); + list_del_init(&priv_req->list); kfree(priv_req->request.buf); cdns3_gadget_ep_free_request(&priv_ep->endpoint, &priv_req->request); - list_del_init(&priv_req->list); --priv_ep->wa2_counter; } From 5fd9e45f1ebcd57181358af28506e8a661a260b3 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 2 Feb 2024 10:42:17 -0500 Subject: [PATCH 125/327] usb: cdns3: fix memory double free when handle zero packet 829 if (request->complete) { 830 spin_unlock(&priv_dev->lock); 831 usb_gadget_giveback_request(&priv_ep->endpoint, 832 request); 833 spin_lock(&priv_dev->lock); 834 } 835 836 if (request->buf == priv_dev->zlp_buf) 837 cdns3_gadget_ep_free_request(&priv_ep->endpoint, request); Driver append an additional zero packet request when queue a packet, which length mod max packet size is 0. When transfer complete, run to line 831, usb_gadget_giveback_request() will free this requestion. 836 condition is true, so cdns3_gadget_ep_free_request() free this request again. Log: [ 1920.140696][ T150] BUG: KFENCE: use-after-free read in cdns3_gadget_giveback+0x134/0x2c0 [cdns3] [ 1920.140696][ T150] [ 1920.151837][ T150] Use-after-free read at 0x000000003d1cd10b (in kfence-#36): [ 1920.159082][ T150] cdns3_gadget_giveback+0x134/0x2c0 [cdns3] [ 1920.164988][ T150] cdns3_transfer_completed+0x438/0x5f8 [cdns3] Add check at line 829, skip call usb_gadget_giveback_request() if it is additional zero length packet request. Needn't call usb_gadget_giveback_request() because it is allocated in this driver. Cc: stable@vger.kernel.org Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver") Signed-off-by: Frank Li Reviewed-by: Roger Quadros Acked-by: Peter Chen Link: https://lore.kernel.org/r/20240202154217.661867-2-Frank.Li@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdns3-gadget.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c index d6723d31fc6e..fd1beb10bba7 100644 --- a/drivers/usb/cdns3/cdns3-gadget.c +++ b/drivers/usb/cdns3/cdns3-gadget.c @@ -828,7 +828,11 @@ void cdns3_gadget_giveback(struct cdns3_endpoint *priv_ep, return; } - if (request->complete) { + /* + * zlp request is appended by driver, needn't call usb_gadget_giveback_request() to notify + * gadget composite driver. + */ + if (request->complete && request->buf != priv_dev->zlp_buf) { spin_unlock(&priv_dev->lock); usb_gadget_giveback_request(&priv_ep->endpoint, request); From b191a18cb5c47109ca696370a74a5062a70adfd0 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Fri, 16 Feb 2024 00:41:02 +0000 Subject: [PATCH 126/327] usb: dwc3: gadget: Don't disconnect if not started Don't go through soft-disconnection sequence if the controller hasn't started. Otherwise, there will be timeout and warning reports from the soft-disconnection flow. Cc: stable@vger.kernel.org Fixes: 61a348857e86 ("usb: dwc3: gadget: Fix NULL pointer dereference in dwc3_gadget_suspend") Reported-by: Marek Szyprowski Closes: https://lore.kernel.org/linux-usb/20240215233536.7yejlj3zzkl23vjd@synopsys.com/T/#mb0661cd5f9272602af390c18392b9a36da4f96e6 Tested-by: Marek Szyprowski Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/e3be9b929934e0680a6f4b8f6eb11b18ae9c7e07.1708043922.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 4c8dd6724678..28f49400f3e8 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2650,6 +2650,11 @@ static int dwc3_gadget_soft_disconnect(struct dwc3 *dwc) int ret; spin_lock_irqsave(&dwc->lock, flags); + if (!dwc->pullups_connected) { + spin_unlock_irqrestore(&dwc->lock, flags); + return 0; + } + dwc->connected = false; /* From 858a74cb512833e276d96a72acb560ce8c138bec Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Sat, 17 Feb 2024 21:20:42 +0200 Subject: [PATCH 127/327] usb: gadget: omap_udc: fix USB gadget regression on Palm TE When upgrading from 6.1 LTS to 6.6 LTS, I noticed the ethernet gadget stopped working on Palm TE. Commit 8825acd7cc8a ("ARM: omap1: remove dead code") deleted Palm TE from machine_without_vbus_sense(), although the board is still used. Fix that. Fixes: 8825acd7cc8a ("ARM: omap1: remove dead code") Cc: stable Signed-off-by: Aaro Koskinen Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240217192042.GA372205@darkstar.musicnaut.iki.fi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/omap_udc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c index 10c5d7f726a1..f90eeecf27de 100644 --- a/drivers/usb/gadget/udc/omap_udc.c +++ b/drivers/usb/gadget/udc/omap_udc.c @@ -2036,7 +2036,8 @@ static irqreturn_t omap_udc_iso_irq(int irq, void *_dev) static inline int machine_without_vbus_sense(void) { - return machine_is_omap_osk() || machine_is_sx1(); + return machine_is_omap_osk() || machine_is_omap_palmte() || + machine_is_sx1(); } static int omap_udc_start(struct usb_gadget *g, From 23b1d2d99b0f55326f05e7d757fa197c4a95dc5c Mon Sep 17 00:00:00 2001 From: Ondrej Jirman Date: Sat, 17 Feb 2024 17:20:21 +0100 Subject: [PATCH 128/327] Revert "usb: typec: tcpm: reset counter when enter into unattached state after try role" The reverted commit makes the state machine only ever go from SRC_ATTACH_WAIT to SNK_TRY in endless loop when toggling. After revert it goes to SRC_ATTACHED after initially trying SNK_TRY earlier, as it should for toggling to ever detect the power source mode and the port is again able to provide power to attached power sinks. This reverts commit 2d6d80127006ae3da26b1f21a65eccf957f2d1e5. Cc: stable@vger.kernel.org Fixes: 2d6d80127006 ("usb: typec: tcpm: reset counter when enter into unattached state after try role") Signed-off-by: Ondrej Jirman Link: https://lore.kernel.org/r/20240217162023.1719738-1-megi@xff.cz Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index f7d7daa60c8d..295ae7eb912c 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -3743,9 +3743,6 @@ static void tcpm_detach(struct tcpm_port *port) if (tcpm_port_is_disconnected(port)) port->hard_reset_count = 0; - port->try_src_count = 0; - port->try_snk_count = 0; - if (!port->attached) return; From 76c51146820c5dac629f21deafab0a7039bc3ccd Mon Sep 17 00:00:00 2001 From: Krishna Kurapati Date: Mon, 5 Feb 2024 13:16:50 +0530 Subject: [PATCH 129/327] usb: gadget: ncm: Avoid dropping datagrams of properly parsed NTBs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is observed sometimes when tethering is used over NCM with Windows 11 as host, at some instances, the gadget_giveback has one byte appended at the end of a proper NTB. When the NTB is parsed, unwrap call looks for any leftover bytes in SKB provided by u_ether and if there are any pending bytes, it treats them as a separate NTB and parses it. But in case the second NTB (as per unwrap call) is faulty/corrupt, all the datagrams that were parsed properly in the first NTB and saved in rx_list are dropped. Adding a few custom traces showed the following: [002] d..1 7828.532866: dwc3_gadget_giveback: ep1out: req 000000003868811a length 1025/16384 zsI ==> 0 [002] d..1 7828.532867: ncm_unwrap_ntb: K: ncm_unwrap_ntb toprocess: 1025 [002] d..1 7828.532867: ncm_unwrap_ntb: K: ncm_unwrap_ntb nth: 1751999342 [002] d..1 7828.532868: ncm_unwrap_ntb: K: ncm_unwrap_ntb seq: 0xce67 [002] d..1 7828.532868: ncm_unwrap_ntb: K: ncm_unwrap_ntb blk_len: 0x400 [002] d..1 7828.532868: ncm_unwrap_ntb: K: ncm_unwrap_ntb ndp_len: 0x10 [002] d..1 7828.532869: ncm_unwrap_ntb: K: Parsed NTB with 1 frames In this case, the giveback is of 1025 bytes and block length is 1024. The rest 1 byte (which is 0x00) won't be parsed resulting in drop of all datagrams in rx_list. Same is case with packets of size 2048: [002] d..1 7828.557948: dwc3_gadget_giveback: ep1out: req 0000000011dfd96e length 2049/16384 zsI ==> 0 [002] d..1 7828.557949: ncm_unwrap_ntb: K: ncm_unwrap_ntb nth: 1751999342 [002] d..1 7828.557950: ncm_unwrap_ntb: K: ncm_unwrap_ntb blk_len: 0x800 Lecroy shows one byte coming in extra confirming that the byte is coming in from PC: Transfer 2959 - Bytes Transferred(1025) Timestamp((18.524 843 590) - Transaction 8391 - Data(1025 bytes) Timestamp(18.524 843 590) --- Packet 4063861 Data(1024 bytes) Duration(2.117us) Idle(14.700ns) Timestamp(18.524 843 590) --- Packet 4063863 Data(1 byte) Duration(66.160ns) Time(282.000ns) Timestamp(18.524 845 722) According to Windows driver, no ZLP is needed if wBlockLength is non-zero, because the non-zero wBlockLength has already told the function side the size of transfer to be expected. However, there are in-market NCM devices that rely on ZLP as long as the wBlockLength is multiple of wMaxPacketSize. To deal with such devices, it pads an extra 0 at end so the transfer is no longer multiple of wMaxPacketSize. Cc: Fixes: 9f6ce4240a2b ("usb: gadget: f_ncm.c added") Signed-off-by: Krishna Kurapati Reviewed-by: Maciej Żenczykowski Link: https://lore.kernel.org/r/20240205074650.200304-1-quic_kriskura@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_ncm.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index ca5d5f564998..e2a059cfda2c 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1338,7 +1338,15 @@ parse_ntb: "Parsed NTB with %d frames\n", dgram_counter); to_process -= block_len; - if (to_process != 0) { + + /* + * Windows NCM driver avoids USB ZLPs by adding a 1-byte + * zero pad as needed. + */ + if (to_process == 1 && + (*(unsigned char *)(ntb_ptr + block_len) == 0x00)) { + to_process--; + } else if (to_process > 0) { ntb_ptr = (unsigned char *)(ntb_ptr + block_len); goto parse_ntb; } From 84b6238aff3db8f4fa72bc5e451ba34ce288d30a Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 15 Feb 2024 12:20:39 -0800 Subject: [PATCH 130/327] MAINTAINERS: Drop myself as maintainer of TYPEC port controller drivers I am no longer involved in Type-C development and not really current on its status and progress. Recently I have been doing more damage than good. It is time to go. Cc: Heikki Krogerus Cc: Greg Kroah-Hartman Signed-off-by: Guenter Roeck Link: https://lore.kernel.org/r/20240215202039.1982539-1-linux@roeck-us.net Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 9ed4d3868539..5cfa6939a104 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22878,9 +22878,8 @@ S: Maintained F: drivers/usb/typec/mux/pi3usb30532.c USB TYPEC PORT CONTROLLER DRIVERS -M: Guenter Roeck L: linux-usb@vger.kernel.org -S: Maintained +S: Orphan F: drivers/usb/typec/tcpm/ USB UHCI DRIVER From bd915ae73a2d78559b376ad2caf5e4ef51de2455 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Thu, 15 Feb 2024 23:04:42 +0100 Subject: [PATCH 131/327] drm/meson: Don't remove bridges which are created by other drivers Stop calling drm_bridge_remove() for bridges allocated/managed by other drivers in the remove paths of meson_encoder_{cvbs,dsi,hdmi}. drm_bridge_remove() unregisters the bridge so it cannot be used anymore. Doing so for bridges we don't own can lead to the video pipeline not being able to come up after -EPROBE_DEFER of the VPU because we're unregistering a bridge that's managed by another driver. The other driver doesn't know that we have unregistered it's bridge and on subsequent .probe() we're not able to find those bridges anymore (since nobody re-creates them). This fixes probe errors on Meson8b boards with the CVBS outputs enabled. Fixes: 09847723c12f ("drm/meson: remove drm bridges at aggregate driver unbind time") Fixes: 42dcf15f901c ("drm/meson: add DSI encoder") Cc: Reported-by: Steve Morvai Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Tested-by: Steve Morvai Link: https://lore.kernel.org/r/20240215220442.1343152-1-martin.blumenstingl@googlemail.com Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240215220442.1343152-1-martin.blumenstingl@googlemail.com --- drivers/gpu/drm/meson/meson_encoder_cvbs.c | 1 - drivers/gpu/drm/meson/meson_encoder_dsi.c | 1 - drivers/gpu/drm/meson/meson_encoder_hdmi.c | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_encoder_cvbs.c b/drivers/gpu/drm/meson/meson_encoder_cvbs.c index 3f73b211fa8e..3407450435e2 100644 --- a/drivers/gpu/drm/meson/meson_encoder_cvbs.c +++ b/drivers/gpu/drm/meson/meson_encoder_cvbs.c @@ -294,6 +294,5 @@ void meson_encoder_cvbs_remove(struct meson_drm *priv) if (priv->encoders[MESON_ENC_CVBS]) { meson_encoder_cvbs = priv->encoders[MESON_ENC_CVBS]; drm_bridge_remove(&meson_encoder_cvbs->bridge); - drm_bridge_remove(meson_encoder_cvbs->next_bridge); } } diff --git a/drivers/gpu/drm/meson/meson_encoder_dsi.c b/drivers/gpu/drm/meson/meson_encoder_dsi.c index 3f93c70488ca..311b91630fbe 100644 --- a/drivers/gpu/drm/meson/meson_encoder_dsi.c +++ b/drivers/gpu/drm/meson/meson_encoder_dsi.c @@ -168,6 +168,5 @@ void meson_encoder_dsi_remove(struct meson_drm *priv) if (priv->encoders[MESON_ENC_DSI]) { meson_encoder_dsi = priv->encoders[MESON_ENC_DSI]; drm_bridge_remove(&meson_encoder_dsi->bridge); - drm_bridge_remove(meson_encoder_dsi->next_bridge); } } diff --git a/drivers/gpu/drm/meson/meson_encoder_hdmi.c b/drivers/gpu/drm/meson/meson_encoder_hdmi.c index 25ea76558690..c4686568c9ca 100644 --- a/drivers/gpu/drm/meson/meson_encoder_hdmi.c +++ b/drivers/gpu/drm/meson/meson_encoder_hdmi.c @@ -474,6 +474,5 @@ void meson_encoder_hdmi_remove(struct meson_drm *priv) if (priv->encoders[MESON_ENC_HDMI]) { meson_encoder_hdmi = priv->encoders[MESON_ENC_HDMI]; drm_bridge_remove(&meson_encoder_hdmi->bridge); - drm_bridge_remove(meson_encoder_hdmi->next_bridge); } } From 8b79d4e994074a058b6876dce843ee112656258d Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Wed, 14 Feb 2024 07:34:30 -0800 Subject: [PATCH 132/327] tty: hvc: Don't enable the RISC-V SBI console by default The new SBI console has the same problem as the old one: there's only one shared backing hardware and no synchronization, so the two drivers end up stepping on each other. This was the same issue the old SBI-0.1 console drivers had, but that was disabled by default when SBI-0.1 was. So just mark the new driver as nonportable. Reported-by: Emil Renner Berthing Fixes: 88ead68e764c ("tty: Add SBI debug console support to HVC SBI driver") Signed-off-by: Palmer Dabbelt Reviewed-by: Paul Walmsley Reviewed-by: Anup Patel Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240214153429.16484-2-palmer@rivosinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/hvc/Kconfig | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig index 6e05c5c7bca1..c2a4e88b328f 100644 --- a/drivers/tty/hvc/Kconfig +++ b/drivers/tty/hvc/Kconfig @@ -108,13 +108,15 @@ config HVC_DCC_SERIALIZE_SMP config HVC_RISCV_SBI bool "RISC-V SBI console support" - depends on RISCV_SBI + depends on RISCV_SBI && NONPORTABLE select HVC_DRIVER help This enables support for console output via RISC-V SBI calls, which - is normally used only during boot to output printk. + is normally used only during boot to output printk. This driver + conflicts with real console drivers and should not be enabled on + systems that directly access the console. - If you don't know what do to here, say Y. + If you don't know what do to here, say N. config HVCS tristate "IBM Hypervisor Virtual Console Server support" From f418ae73311deb901c0110b08d1bbafc20c1820e Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Fri, 16 Feb 2024 23:47:07 +0100 Subject: [PATCH 133/327] serial: stm32: do not always set SER_RS485_RX_DURING_TX if RS485 is enabled Before commit 07c30ea5861f ("serial: Do not hold the port lock when setting rx-during-tx GPIO") the SER_RS485_RX_DURING_TX flag was only set if the rx-during-tx mode was not controlled by a GPIO. Now the flag is set unconditionally when RS485 is enabled. This results in an incorrect setting if the rx-during-tx GPIO is not asserted. Fix this by setting the flag only if the rx-during-tx mode is not controlled by a GPIO and thus restore the correct behaviour. Cc: stable@vger.kernel.org # 6.6+ Fixes: 07c30ea5861f ("serial: Do not hold the port lock when setting rx-during-tx GPIO") Signed-off-by: Lino Sanfilippo Link: https://lore.kernel.org/r/20240216224709.9928-1-l.sanfilippo@kunbus.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/stm32-usart.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index 794b77512740..693e932d6feb 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -251,7 +251,9 @@ static int stm32_usart_config_rs485(struct uart_port *port, struct ktermios *ter writel_relaxed(cr3, port->membase + ofs->cr3); writel_relaxed(cr1, port->membase + ofs->cr1); - rs485conf->flags |= SER_RS485_RX_DURING_TX; + if (!port->rs485_rx_during_tx_gpio) + rs485conf->flags |= SER_RS485_RX_DURING_TX; + } else { stm32_usart_clr_bits(port, ofs->cr3, USART_CR3_DEM | USART_CR3_DEP); From 3b69e32e151bc4a4e3c785cbdb1f918d5ee337ed Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Fri, 16 Feb 2024 23:47:08 +0100 Subject: [PATCH 134/327] serial: amba-pl011: Fix DMA transmission in RS485 mode When DMA is used in RS485 mode make sure that the UARTs tx section is enabled before the DMA buffers are queued for transmission. Cc: stable@vger.kernel.org Fixes: 8d479237727c ("serial: amba-pl011: add RS485 support") Signed-off-by: Lino Sanfilippo Link: https://lore.kernel.org/r/20240216224709.9928-2-l.sanfilippo@kunbus.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/amba-pl011.c | 60 ++++++++++++++++----------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index fccec1698a54..cf2c890a560f 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -1339,11 +1339,41 @@ static void pl011_start_tx_pio(struct uart_amba_port *uap) } } +static void pl011_rs485_tx_start(struct uart_amba_port *uap) +{ + struct uart_port *port = &uap->port; + u32 cr; + + /* Enable transmitter */ + cr = pl011_read(uap, REG_CR); + cr |= UART011_CR_TXE; + + /* Disable receiver if half-duplex */ + if (!(port->rs485.flags & SER_RS485_RX_DURING_TX)) + cr &= ~UART011_CR_RXE; + + if (port->rs485.flags & SER_RS485_RTS_ON_SEND) + cr &= ~UART011_CR_RTS; + else + cr |= UART011_CR_RTS; + + pl011_write(cr, uap, REG_CR); + + if (port->rs485.delay_rts_before_send) + mdelay(port->rs485.delay_rts_before_send); + + uap->rs485_tx_started = true; +} + static void pl011_start_tx(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); + if ((uap->port.rs485.flags & SER_RS485_ENABLED) && + !uap->rs485_tx_started) + pl011_rs485_tx_start(uap); + if (!pl011_dma_tx_start(uap)) pl011_start_tx_pio(uap); } @@ -1424,42 +1454,12 @@ static bool pl011_tx_char(struct uart_amba_port *uap, unsigned char c, return true; } -static void pl011_rs485_tx_start(struct uart_amba_port *uap) -{ - struct uart_port *port = &uap->port; - u32 cr; - - /* Enable transmitter */ - cr = pl011_read(uap, REG_CR); - cr |= UART011_CR_TXE; - - /* Disable receiver if half-duplex */ - if (!(port->rs485.flags & SER_RS485_RX_DURING_TX)) - cr &= ~UART011_CR_RXE; - - if (port->rs485.flags & SER_RS485_RTS_ON_SEND) - cr &= ~UART011_CR_RTS; - else - cr |= UART011_CR_RTS; - - pl011_write(cr, uap, REG_CR); - - if (port->rs485.delay_rts_before_send) - mdelay(port->rs485.delay_rts_before_send); - - uap->rs485_tx_started = true; -} - /* Returns true if tx interrupts have to be (kept) enabled */ static bool pl011_tx_chars(struct uart_amba_port *uap, bool from_irq) { struct circ_buf *xmit = &uap->port.state->xmit; int count = uap->fifosize >> 1; - if ((uap->port.rs485.flags & SER_RS485_ENABLED) && - !uap->rs485_tx_started) - pl011_rs485_tx_start(uap); - if (uap->port.x_char) { if (!pl011_tx_char(uap, uap->port.x_char, from_irq)) return true; From cd65c48d66920457129584553f217005d09b1edb Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 15 Feb 2024 10:33:25 +0800 Subject: [PATCH 135/327] selftests: bonding: set active slave to primary eth1 specifically In bond priority testing, we set the primary interface to eth1 and add eth0,1,2 to bond in serial. This is OK in normal times. But when in debug kernel, the bridge port that eth0,1,2 connected would start slowly (enter blocking, forwarding state), which caused the primary interface down for a while after enslaving and active slave changed. Here is a test log from Jakub's debug test[1]. [ 400.399070][ T50] br0: port 1(s0) entered disabled state [ 400.400168][ T50] br0: port 4(s2) entered disabled state [ 400.941504][ T2791] bond0: (slave eth0): making interface the new active one [ 400.942603][ T2791] bond0: (slave eth0): Enslaving as an active interface with an up link [ 400.943633][ T2766] br0: port 1(s0) entered blocking state [ 400.944119][ T2766] br0: port 1(s0) entered forwarding state [ 401.128792][ T2792] bond0: (slave eth1): making interface the new active one [ 401.130771][ T2792] bond0: (slave eth1): Enslaving as an active interface with an up link [ 401.131643][ T69] br0: port 2(s1) entered blocking state [ 401.132067][ T69] br0: port 2(s1) entered forwarding state [ 401.346201][ T2793] bond0: (slave eth2): Enslaving as a backup interface with an up link [ 401.348414][ T50] br0: port 4(s2) entered blocking state [ 401.348857][ T50] br0: port 4(s2) entered forwarding state [ 401.519669][ T250] bond0: (slave eth0): link status definitely down, disabling slave [ 401.526522][ T250] bond0: (slave eth1): link status definitely down, disabling slave [ 401.526986][ T250] bond0: (slave eth2): making interface the new active one [ 401.629470][ T250] bond0: (slave eth0): link status definitely up [ 401.630089][ T250] bond0: (slave eth1): link status definitely up [...] # TEST: prio (active-backup ns_ip6_target primary_reselect 1) [FAIL] # Current active slave is eth2 but not eth1 Fix it by setting active slave to primary slave specifically before testing. [1] https://netdev-3.bots.linux.dev/vmksft-bonding-dbg/results/464301/1-bond-options-sh/stdout Fixes: 481b56e0391e ("selftests: bonding: re-format bond option tests") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/bonding/bond_options.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index d508486cc0bd..9a3d3c389dad 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -62,6 +62,8 @@ prio_test() # create bond bond_reset "${param}" + # set active_slave to primary eth1 specifically + ip -n ${s_ns} link set bond0 type bond active_slave eth1 # check bonding member prio value ip -n ${s_ns} link set eth0 type bond_slave prio 0 From 9815e39617541ef52d0dfac4be274ad378c6dc09 Mon Sep 17 00:00:00 2001 From: "Andrey Jr. Melnikov" Date: Wed, 14 Feb 2024 17:57:57 +0100 Subject: [PATCH 136/327] ahci: asm1064: correct count of reported ports The ASM1064 SATA host controller always reports wrongly, that it has 24 ports. But in reality, it only has four ports. before: ahci 0000:04:00.0: SSS flag set, parallel bus scan disabled ahci 0000:04:00.0: AHCI 0001.0301 32 slots 24 ports 6 Gbps 0xffff0f impl SATA mode ahci 0000:04:00.0: flags: 64bit ncq sntf stag pm led only pio sxs deso sadm sds apst after: ahci 0000:04:00.0: ASM1064 has only four ports ahci 0000:04:00.0: forcing port_map 0xffff0f -> 0xf ahci 0000:04:00.0: SSS flag set, parallel bus scan disabled ahci 0000:04:00.0: AHCI 0001.0301 32 slots 24 ports 6 Gbps 0xf impl SATA mode ahci 0000:04:00.0: flags: 64bit ncq sntf stag pm led only pio sxs deso sadm sds apst Signed-off-by: "Andrey Jr. Melnikov" Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index da2e74fce2d9..682ff550ccfb 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -671,9 +671,17 @@ MODULE_PARM_DESC(mobile_lpm_policy, "Default LPM policy for mobile chipsets"); static void ahci_pci_save_initial_config(struct pci_dev *pdev, struct ahci_host_priv *hpriv) { - if (pdev->vendor == PCI_VENDOR_ID_ASMEDIA && pdev->device == 0x1166) { - dev_info(&pdev->dev, "ASM1166 has only six ports\n"); - hpriv->saved_port_map = 0x3f; + if (pdev->vendor == PCI_VENDOR_ID_ASMEDIA) { + switch (pdev->device) { + case 0x1166: + dev_info(&pdev->dev, "ASM1166 has only six ports\n"); + hpriv->saved_port_map = 0x3f; + break; + case 0x1064: + dev_info(&pdev->dev, "ASM1064 has only four ports\n"); + hpriv->saved_port_map = 0xf; + break; + } } if (pdev->vendor == PCI_VENDOR_ID_JMICRON && pdev->device == 0x2361) { From 26c8404e162b43dddcb037ba2d0cb58c0ed60aab Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Fri, 16 Feb 2024 23:44:57 +0530 Subject: [PATCH 137/327] ata: ahci_ceva: fix error handling for Xilinx GT PHY support Platform clock and phy error resources are not cleaned up in Xilinx GT PHY error path. To fix introduce the function ceva_ahci_platform_enable_resources() which is a customized version of ahci_platform_enable_resources() and inline with SATA IP programming sequence it does: - Assert SATA reset - Program PS GTR phy - Bring SATA by de-asserting the reset - Wait for GT lane PLL to be locked ceva_ahci_platform_enable_resources() is also used in the resume path as the same SATA programming sequence (as in probe) should be followed. Also cleanup the mixed usage of ahci_platform_enable_resources() and custom implementation in the probe function as both are not required. Fixes: 9a9d3abe24bb ("ata: ahci: ceva: Update the driver to support xilinx GT phy") Signed-off-by: Radhey Shyam Pandey Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/ahci_ceva.c | 125 +++++++++++++++++++++++++--------------- 1 file changed, 79 insertions(+), 46 deletions(-) diff --git a/drivers/ata/ahci_ceva.c b/drivers/ata/ahci_ceva.c index 64f7f7d6ba84..11a2c199a7c2 100644 --- a/drivers/ata/ahci_ceva.c +++ b/drivers/ata/ahci_ceva.c @@ -88,7 +88,6 @@ struct ceva_ahci_priv { u32 axicc; bool is_cci_enabled; int flags; - struct reset_control *rst; }; static unsigned int ceva_ahci_read_id(struct ata_device *dev, @@ -189,6 +188,60 @@ static const struct scsi_host_template ahci_platform_sht = { AHCI_SHT(DRV_NAME), }; +static int ceva_ahci_platform_enable_resources(struct ahci_host_priv *hpriv) +{ + int rc, i; + + rc = ahci_platform_enable_regulators(hpriv); + if (rc) + return rc; + + rc = ahci_platform_enable_clks(hpriv); + if (rc) + goto disable_regulator; + + /* Assert the controller reset */ + rc = ahci_platform_assert_rsts(hpriv); + if (rc) + goto disable_clks; + + for (i = 0; i < hpriv->nports; i++) { + rc = phy_init(hpriv->phys[i]); + if (rc) + goto disable_rsts; + } + + /* De-assert the controller reset */ + ahci_platform_deassert_rsts(hpriv); + + for (i = 0; i < hpriv->nports; i++) { + rc = phy_power_on(hpriv->phys[i]); + if (rc) { + phy_exit(hpriv->phys[i]); + goto disable_phys; + } + } + + return 0; + +disable_rsts: + ahci_platform_deassert_rsts(hpriv); + +disable_phys: + while (--i >= 0) { + phy_power_off(hpriv->phys[i]); + phy_exit(hpriv->phys[i]); + } + +disable_clks: + ahci_platform_disable_clks(hpriv); + +disable_regulator: + ahci_platform_disable_regulators(hpriv); + + return rc; +} + static int ceva_ahci_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; @@ -203,47 +256,19 @@ static int ceva_ahci_probe(struct platform_device *pdev) return -ENOMEM; cevapriv->ahci_pdev = pdev; - - cevapriv->rst = devm_reset_control_get_optional_exclusive(&pdev->dev, - NULL); - if (IS_ERR(cevapriv->rst)) - dev_err_probe(&pdev->dev, PTR_ERR(cevapriv->rst), - "failed to get reset\n"); - hpriv = ahci_platform_get_resources(pdev, 0); if (IS_ERR(hpriv)) return PTR_ERR(hpriv); - if (!cevapriv->rst) { - rc = ahci_platform_enable_resources(hpriv); - if (rc) - return rc; - } else { - int i; + hpriv->rsts = devm_reset_control_get_optional_exclusive(&pdev->dev, + NULL); + if (IS_ERR(hpriv->rsts)) + return dev_err_probe(&pdev->dev, PTR_ERR(hpriv->rsts), + "failed to get reset\n"); - rc = ahci_platform_enable_clks(hpriv); - if (rc) - return rc; - /* Assert the controller reset */ - reset_control_assert(cevapriv->rst); - - for (i = 0; i < hpriv->nports; i++) { - rc = phy_init(hpriv->phys[i]); - if (rc) - return rc; - } - - /* De-assert the controller reset */ - reset_control_deassert(cevapriv->rst); - - for (i = 0; i < hpriv->nports; i++) { - rc = phy_power_on(hpriv->phys[i]); - if (rc) { - phy_exit(hpriv->phys[i]); - return rc; - } - } - } + rc = ceva_ahci_platform_enable_resources(hpriv); + if (rc) + return rc; if (of_property_read_bool(np, "ceva,broken-gen2")) cevapriv->flags = CEVA_FLAG_BROKEN_GEN2; @@ -252,52 +277,60 @@ static int ceva_ahci_probe(struct platform_device *pdev) if (of_property_read_u8_array(np, "ceva,p0-cominit-params", (u8 *)&cevapriv->pp2c[0], 4) < 0) { dev_warn(dev, "ceva,p0-cominit-params property not defined\n"); - return -EINVAL; + rc = -EINVAL; + goto disable_resources; } if (of_property_read_u8_array(np, "ceva,p1-cominit-params", (u8 *)&cevapriv->pp2c[1], 4) < 0) { dev_warn(dev, "ceva,p1-cominit-params property not defined\n"); - return -EINVAL; + rc = -EINVAL; + goto disable_resources; } /* Read OOB timing value for COMWAKE from device-tree*/ if (of_property_read_u8_array(np, "ceva,p0-comwake-params", (u8 *)&cevapriv->pp3c[0], 4) < 0) { dev_warn(dev, "ceva,p0-comwake-params property not defined\n"); - return -EINVAL; + rc = -EINVAL; + goto disable_resources; } if (of_property_read_u8_array(np, "ceva,p1-comwake-params", (u8 *)&cevapriv->pp3c[1], 4) < 0) { dev_warn(dev, "ceva,p1-comwake-params property not defined\n"); - return -EINVAL; + rc = -EINVAL; + goto disable_resources; } /* Read phy BURST timing value from device-tree */ if (of_property_read_u8_array(np, "ceva,p0-burst-params", (u8 *)&cevapriv->pp4c[0], 4) < 0) { dev_warn(dev, "ceva,p0-burst-params property not defined\n"); - return -EINVAL; + rc = -EINVAL; + goto disable_resources; } if (of_property_read_u8_array(np, "ceva,p1-burst-params", (u8 *)&cevapriv->pp4c[1], 4) < 0) { dev_warn(dev, "ceva,p1-burst-params property not defined\n"); - return -EINVAL; + rc = -EINVAL; + goto disable_resources; } /* Read phy RETRY interval timing value from device-tree */ if (of_property_read_u16_array(np, "ceva,p0-retry-params", (u16 *)&cevapriv->pp5c[0], 2) < 0) { dev_warn(dev, "ceva,p0-retry-params property not defined\n"); - return -EINVAL; + rc = -EINVAL; + goto disable_resources; } if (of_property_read_u16_array(np, "ceva,p1-retry-params", (u16 *)&cevapriv->pp5c[1], 2) < 0) { dev_warn(dev, "ceva,p1-retry-params property not defined\n"); - return -EINVAL; + rc = -EINVAL; + goto disable_resources; } /* @@ -335,7 +368,7 @@ static int __maybe_unused ceva_ahci_resume(struct device *dev) struct ahci_host_priv *hpriv = host->private_data; int rc; - rc = ahci_platform_enable_resources(hpriv); + rc = ceva_ahci_platform_enable_resources(hpriv); if (rc) return rc; From 335126937753844d36036984e96a8f343538a778 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 15 Feb 2024 17:44:32 +0000 Subject: [PATCH 138/327] drm/tests/drm_buddy: fix 32b build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Doesn't seem to compile on 32b, presumably due to u64 mod/division. Simplest is to just switch over to u32 here. Also make print modifiers consistent with that. Fixes: a64056bb5a32 ("drm/tests/drm_buddy: add alloc_contiguous test") Reported-by: Geert Uytterhoeven Signed-off-by: Matthew Auld Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20240215174431.285069-7-matthew.auld@intel.com Signed-off-by: Christian König --- drivers/gpu/drm/tests/drm_buddy_test.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index fee6bec757d1..edacc1adb28f 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -21,7 +21,7 @@ static inline u64 get_size(int order, u64 chunk_size) static void drm_test_buddy_alloc_contiguous(struct kunit *test) { - u64 mm_size, ps = SZ_4K, i, n_pages, total; + u32 mm_size, ps = SZ_4K, i, n_pages, total; struct drm_buddy_block *block; struct drm_buddy mm; LIST_HEAD(left); @@ -56,30 +56,30 @@ static void drm_test_buddy_alloc_contiguous(struct kunit *test) KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, ps, ps, list, 0), - "buddy_alloc hit an error size=%d\n", + "buddy_alloc hit an error size=%u\n", ps); } while (++i < n_pages); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%d\n", 3 * ps); + "buddy_alloc didn't error size=%u\n", 3 * ps); drm_buddy_free_list(&mm, &middle); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%llu\n", 3 * ps); + "buddy_alloc didn't error size=%u\n", 3 * ps); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, 2 * ps, ps, &allocated, DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%llu\n", 2 * ps); + "buddy_alloc didn't error size=%u\n", 2 * ps); drm_buddy_free_list(&mm, &right); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%llu\n", 3 * ps); + "buddy_alloc didn't error size=%u\n", 3 * ps); /* * At this point we should have enough contiguous space for 2 blocks, * however they are never buddies (since we freed middle and right) so @@ -88,13 +88,13 @@ static void drm_test_buddy_alloc_contiguous(struct kunit *test) KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, 2 * ps, ps, &allocated, DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc hit an error size=%d\n", 2 * ps); + "buddy_alloc hit an error size=%u\n", 2 * ps); drm_buddy_free_list(&mm, &left); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc hit an error size=%d\n", 3 * ps); + "buddy_alloc hit an error size=%u\n", 3 * ps); total = 0; list_for_each_entry(block, &allocated, link) From e42b9d8b9ea2672811285e6a7654887ff64d23f3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 7 Feb 2024 10:00:42 +1030 Subject: [PATCH 139/327] btrfs: defrag: avoid unnecessary defrag caused by incorrect extent size [BUG] With the following file extent layout, defrag would do unnecessary IO and result more on-disk space usage. # mkfs.btrfs -f $dev # mount $dev $mnt # xfs_io -f -c "pwrite 0 40m" $mnt/foobar # sync # xfs_io -f -c "pwrite 40m 16k" $mnt/foobar # sync Above command would lead to the following file extent layout: item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53 generation 7 type 1 (regular) extent data disk byte 298844160 nr 41943040 extent data offset 0 nr 41943040 ram 41943040 extent compression 0 (none) item 7 key (257 EXTENT_DATA 41943040) itemoff 15763 itemsize 53 generation 8 type 1 (regular) extent data disk byte 13631488 nr 16384 extent data offset 0 nr 16384 ram 16384 extent compression 0 (none) Which is mostly fine. We can allow the final 16K to be merged with the previous 40M, but it's upon the end users' preference. But if we defrag the file using the default parameters, it would result worse file layout: # btrfs filesystem defrag $mnt/foobar # sync item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53 generation 7 type 1 (regular) extent data disk byte 298844160 nr 41943040 extent data offset 0 nr 8650752 ram 41943040 extent compression 0 (none) item 7 key (257 EXTENT_DATA 8650752) itemoff 15763 itemsize 53 generation 9 type 1 (regular) extent data disk byte 340787200 nr 33292288 extent data offset 0 nr 33292288 ram 33292288 extent compression 0 (none) item 8 key (257 EXTENT_DATA 41943040) itemoff 15710 itemsize 53 generation 8 type 1 (regular) extent data disk byte 13631488 nr 16384 extent data offset 0 nr 16384 ram 16384 extent compression 0 (none) Note the original 40M extent is still there, but a new 32M extent is created for no benefit at all. [CAUSE] There is an existing check to make sure we won't defrag a large enough extent (the threshold is by default 32M). But the check is using the length to the end of the extent: range_len = em->len - (cur - em->start); /* Skip too large extent */ if (range_len >= extent_thresh) goto next; This means, for the first 8MiB of the extent, the range_len is always smaller than the default threshold, and would not be defragged. But after the first 8MiB, the remaining part would fit the requirement, and be defragged. Such different behavior inside the same extent caused the above problem, and we should avoid different defrag decision inside the same extent. [FIX] Instead of using @range_len, just use @em->len, so that we have a consistent decision among the same file extent. Now with this fix, we won't touch the extent, thus not making it any worse. Reported-by: Filipe Manana Fixes: 0cb5950f3f3b ("btrfs: fix deadlock when reserving space during defrag") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Boris Burkov Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index c276b136ab63..5b0b64571418 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1046,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto add; /* Skip too large extent */ - if (range_len >= extent_thresh) + if (em->len >= extent_thresh) goto next; /* From b0ad381fa7690244802aed119b478b4bdafc31dd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Feb 2024 11:56:02 -0500 Subject: [PATCH 140/327] btrfs: fix deadlock with fiemap and extent locking While working on the patchset to remove extent locking I got a lockdep splat with fiemap and pagefaulting with my new extent lock replacement lock. This deadlock exists with our normal code, we just don't have lockdep annotations with the extent locking so we've never noticed it. Since we're copying the fiemap extent to user space on every iteration we have the chance of pagefaulting. Because we hold the extent lock for the entire range we could mkwrite into a range in the file that we have mmap'ed. This would deadlock with the following stack trace [<0>] lock_extent+0x28d/0x2f0 [<0>] btrfs_page_mkwrite+0x273/0x8a0 [<0>] do_page_mkwrite+0x50/0xb0 [<0>] do_fault+0xc1/0x7b0 [<0>] __handle_mm_fault+0x2fa/0x460 [<0>] handle_mm_fault+0xa4/0x330 [<0>] do_user_addr_fault+0x1f4/0x800 [<0>] exc_page_fault+0x7c/0x1e0 [<0>] asm_exc_page_fault+0x26/0x30 [<0>] rep_movs_alternative+0x33/0x70 [<0>] _copy_to_user+0x49/0x70 [<0>] fiemap_fill_next_extent+0xc8/0x120 [<0>] emit_fiemap_extent+0x4d/0xa0 [<0>] extent_fiemap+0x7f8/0xad0 [<0>] btrfs_fiemap+0x49/0x80 [<0>] __x64_sys_ioctl+0x3e1/0xb50 [<0>] do_syscall_64+0x94/0x1a0 [<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76 I wrote an fstest to reproduce this deadlock without my replacement lock and verified that the deadlock exists with our existing locking. To fix this simply don't take the extent lock for the entire duration of the fiemap. This is safe in general because we keep track of where we are when we're searching the tree, so if an ordered extent updates in the middle of our fiemap call we'll still emit the correct extents because we know what offset we were on before. The only place we maintain the lock is searching delalloc. Since the delalloc stuff can change during writeback we want to lock the extent range so we have a consistent view of delalloc at the time we're checking to see if we need to set the delalloc flag. With this patch applied we no longer deadlock with my testcase. CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 62 ++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a0ffd41c5cc1..61d961a30dee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2689,16 +2689,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode, * it beyond i_size. */ while (cur_offset < end && cur_offset < i_size) { + struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; u64 prealloc_start; + u64 lockstart; + u64 lockend; u64 prealloc_len = 0; bool delalloc; + lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize); + lockend = round_up(end, inode->root->fs_info->sectorsize); + + /* + * We are only locking for the delalloc range because that's the + * only thing that can change here. With fiemap we have a lock + * on the inode, so no buffered or direct writes can happen. + * + * However mmaps and normal page writeback will cause this to + * change arbitrarily. We have to lock the extent lock here to + * make sure that nobody messes with the tree while we're doing + * btrfs_find_delalloc_in_range. + */ + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, delalloc_cached_state, &delalloc_start, &delalloc_end); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) break; @@ -2866,15 +2884,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { const u64 ino = btrfs_ino(inode); - struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end; u64 prev_extent_end; - u64 lockstart; - u64 lockend; + u64 range_start; + u64 range_end; + const u64 sectorsize = inode->root->fs_info->sectorsize; bool stopped = false; int ret; @@ -2885,12 +2903,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - lockstart = round_down(start, inode->root->fs_info->sectorsize); - lockend = round_up(start + len, inode->root->fs_info->sectorsize); - prev_extent_end = lockstart; + range_start = round_down(start, sectorsize); + range_end = round_up(start + len, sectorsize); + prev_extent_end = range_start; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) @@ -2898,7 +2915,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, btrfs_release_path(path); path->reada = READA_FORWARD; - ret = fiemap_search_slot(inode, path, lockstart); + ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { goto out_unlock; } else if (ret > 0) { @@ -2910,7 +2927,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto check_eof_delalloc; } - while (prev_extent_end < lockend) { + while (prev_extent_end < range_end) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *ei; struct btrfs_key key; @@ -2933,19 +2950,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, * The first iteration can leave us at an extent item that ends * before our range's start. Move to the next item. */ - if (extent_end <= lockstart) + if (extent_end <= range_start) goto next_item; backref_ctx->curr_leaf_bytenr = leaf->start; /* We have in implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { - const u64 range_end = min(key.offset, lockend) - 1; + const u64 hole_end = min(key.offset, range_end) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, - prev_extent_end, range_end); + prev_extent_end, hole_end); if (ret < 0) { goto out_unlock; } else if (ret > 0) { @@ -2955,7 +2972,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } /* We've reached the end of the fiemap range, stop. */ - if (key.offset >= lockend) { + if (key.offset >= range_end) { stopped = true; break; } @@ -3049,29 +3066,41 @@ check_eof_delalloc: btrfs_free_path(path); path = NULL; - if (!stopped && prev_extent_end < lockend) { + if (!stopped && prev_extent_end < range_end) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, - 0, 0, 0, prev_extent_end, lockend - 1); + 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) goto out_unlock; - prev_extent_end = lockend; + prev_extent_end = range_end; } if (cache.cached && cache.offset + cache.len >= last_extent_end) { const u64 i_size = i_size_read(&inode->vfs_inode); if (prev_extent_end < i_size) { + struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; + u64 lockstart; + u64 lockend; bool delalloc; + lockstart = round_down(prev_extent_end, sectorsize); + lockend = round_up(i_size, sectorsize); + + /* + * See the comment in fiemap_process_hole as to why + * we're doing the locking here. + */ + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, &delalloc_cached_state, &delalloc_start, &delalloc_end); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) cache.flags |= FIEMAP_EXTENT_LAST; } else { @@ -3082,7 +3111,6 @@ check_eof_delalloc: ret = emit_last_fiemap_cache(fieinfo, &cache); out_unlock: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); From 121e4dcba3700b30e63f25203d09ddfccbab4a09 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 16 Feb 2024 14:52:59 -0800 Subject: [PATCH 141/327] ionic: use pci_is_enabled not open code Since there is a utility available for this, use the API rather than open code. Fixes: 13943d6c8273 ("ionic: prevent pci disable of already disabled device") Reviewed-by: Brett Creeley Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c index c49aa358e424..10a9d80db32c 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c @@ -223,7 +223,7 @@ static void ionic_clear_pci(struct ionic *ionic) ionic_unmap_bars(ionic); pci_release_regions(ionic->pdev); - if (atomic_read(&ionic->pdev->enable_cnt) > 0) + if (pci_is_enabled(ionic->pdev)) pci_disable_device(ionic->pdev); } From 40b9385dd8e6a0515e1c9cd06a277483556b7286 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 16 Feb 2024 15:30:05 -0800 Subject: [PATCH 142/327] enic: Avoid false positive under FORTIFY_SOURCE FORTIFY_SOURCE has been ignoring 0-sized destinations while the kernel code base has been converted to flexible arrays. In order to enforce the 0-sized destinations (e.g. with __counted_by), the remaining 0-sized destinations need to be handled. Unfortunately, struct vic_provinfo resists full conversion, as it contains a flexible array of flexible arrays, which is only possible with the 0-sized fake flexible array. Use unsafe_memcpy() to avoid future false positives under CONFIG_FORTIFY_SOURCE. Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- drivers/net/ethernet/cisco/enic/vnic_vic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cisco/enic/vnic_vic.c b/drivers/net/ethernet/cisco/enic/vnic_vic.c index 20fcb20b42ed..66b577835338 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_vic.c +++ b/drivers/net/ethernet/cisco/enic/vnic_vic.c @@ -49,7 +49,8 @@ int vic_provinfo_add_tlv(struct vic_provinfo *vp, u16 type, u16 length, tlv->type = htons(type); tlv->length = htons(length); - memcpy(tlv->value, value, length); + unsafe_memcpy(tlv->value, value, length, + /* Flexible array of flexible arrays */); vp->num_tlvs = htonl(ntohl(vp->num_tlvs) + 1); vp->length = htonl(ntohl(vp->length) + From 0281b919e175bb9c3128bd3872ac2903e9436e3f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 15 Feb 2024 13:12:17 -0800 Subject: [PATCH 143/327] bpf: Fix racing between bpf_timer_cancel_and_free and bpf_timer_cancel The following race is possible between bpf_timer_cancel_and_free and bpf_timer_cancel. It will lead a UAF on the timer->timer. bpf_timer_cancel(); spin_lock(); t = timer->time; spin_unlock(); bpf_timer_cancel_and_free(); spin_lock(); t = timer->timer; timer->timer = NULL; spin_unlock(); hrtimer_cancel(&t->timer); kfree(t); /* UAF on t */ hrtimer_cancel(&t->timer); In bpf_timer_cancel_and_free, this patch frees the timer->timer after a rcu grace period. This requires a rcu_head addition to the "struct bpf_hrtimer". Another kfree(t) happens in bpf_timer_init, this does not need a kfree_rcu because it is still under the spin_lock and timer->timer has not been visible by others yet. In bpf_timer_cancel, rcu_read_lock() is added because this helper can be used in a non rcu critical section context (e.g. from a sleepable bpf prog). Other timer->timer usages in helpers.c have been audited, bpf_timer_cancel() is the only place where timer->timer is used outside of the spin_lock. Another solution considered is to mark a t->flag in bpf_timer_cancel and clear it after hrtimer_cancel() is done. In bpf_timer_cancel_and_free, it busy waits for the flag to be cleared before kfree(t). This patch goes with a straight forward solution and frees timer->timer after a rcu grace period. Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Suggested-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Hou Tao Link: https://lore.kernel.org/bpf/20240215211218.990808-1-martin.lau@linux.dev --- kernel/bpf/helpers.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be72824f32b2..d19cd863d294 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1101,6 +1101,7 @@ struct bpf_hrtimer { struct bpf_prog *prog; void __rcu *callback_fn; void *value; + struct rcu_head rcu; }; /* the actual struct hidden inside uapi struct bpf_timer */ @@ -1332,6 +1333,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) if (in_nmi()) return -EOPNOTSUPP; + rcu_read_lock(); __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t) { @@ -1353,6 +1355,7 @@ out: * if it was running. */ ret = ret ?: hrtimer_cancel(&t->timer); + rcu_read_unlock(); return ret; } @@ -1407,7 +1410,7 @@ out: */ if (this_cpu_read(hrtimer_running) != t) hrtimer_cancel(&t->timer); - kfree(t); + kfree_rcu(t, rcu); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) From 3f00e4a9c96f4488a924aff4e35b77c8eced897e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 15 Feb 2024 13:12:18 -0800 Subject: [PATCH 144/327] selftests/bpf: Test racing between bpf_timer_cancel_and_free and bpf_timer_cancel This selftest is based on a Alexei's test adopted from an internal user to troubleshoot another bug. During this exercise, a separate racing bug was discovered between bpf_timer_cancel_and_free and bpf_timer_cancel. The details can be found in the previous patch. This patch is to add a selftest that can trigger the bug. I can trigger the UAF everytime in my qemu setup with KASAN. The idea is to have multiple user space threads running in a tight loop to exercise both bpf_map_update_elem (which calls into bpf_timer_cancel_and_free) and bpf_timer_cancel. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Hou Tao Link: https://lore.kernel.org/bpf/20240215211218.990808-2-martin.lau@linux.dev --- .../testing/selftests/bpf/prog_tests/timer.c | 35 ++++++++++++++++++- tools/testing/selftests/bpf/progs/timer.c | 34 +++++++++++++++++- 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index 760ad96b4be0..d66687f1ee6a 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -4,10 +4,29 @@ #include "timer.skel.h" #include "timer_failure.skel.h" +#define NUM_THR 8 + +static void *spin_lock_thread(void *arg) +{ + int i, err, prog_fd = *(int *)arg; + LIBBPF_OPTS(bpf_test_run_opts, topts); + + for (i = 0; i < 10000; i++) { + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err") || + !ASSERT_OK(topts.retval, "test_run_opts retval")) + break; + } + + pthread_exit(arg); +} + static int timer(struct timer *timer_skel) { - int err, prog_fd; + int i, err, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts); + pthread_t thread_id[NUM_THR]; + void *ret; err = timer__attach(timer_skel); if (!ASSERT_OK(err, "timer_attach")) @@ -43,6 +62,20 @@ static int timer(struct timer *timer_skel) /* check that code paths completed */ ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); + prog_fd = bpf_program__fd(timer_skel->progs.race); + for (i = 0; i < NUM_THR; i++) { + err = pthread_create(&thread_id[i], NULL, + &spin_lock_thread, &prog_fd); + if (!ASSERT_OK(err, "pthread_create")) + break; + } + + while (i) { + err = pthread_join(thread_id[--i], &ret); + if (ASSERT_OK(err, "pthread_join")) + ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join"); + } + return 0; } diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index 8b946c8188c6..f615da97df26 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -51,7 +51,8 @@ struct { __uint(max_entries, 1); __type(key, int); __type(value, struct elem); -} abs_timer SEC(".maps"), soft_timer_pinned SEC(".maps"), abs_timer_pinned SEC(".maps"); +} abs_timer SEC(".maps"), soft_timer_pinned SEC(".maps"), abs_timer_pinned SEC(".maps"), + race_array SEC(".maps"); __u64 bss_data; __u64 abs_data; @@ -390,3 +391,34 @@ int BPF_PROG2(test5, int, a) return 0; } + +static int race_timer_callback(void *race_array, int *race_key, struct bpf_timer *timer) +{ + bpf_timer_start(timer, 1000000, 0); + return 0; +} + +SEC("syscall") +int race(void *ctx) +{ + struct bpf_timer *timer; + int err, race_key = 0; + struct elem init; + + __builtin_memset(&init, 0, sizeof(struct elem)); + bpf_map_update_elem(&race_array, &race_key, &init, BPF_ANY); + + timer = bpf_map_lookup_elem(&race_array, &race_key); + if (!timer) + return 1; + + err = bpf_timer_init(timer, &race_array, CLOCK_MONOTONIC); + if (err && err != -EBUSY) + return 1; + + bpf_timer_set_callback(timer, race_timer_callback); + bpf_timer_start(timer, 0, 0); + bpf_timer_cancel(timer); + + return 0; +} From 5f2ae606cb5a90839a9be9d22388c4200f820e75 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 17 Feb 2024 19:41:51 +0800 Subject: [PATCH 145/327] bpf: Fix an issue due to uninitialized bpf_iter_task Failure to initialize it->pos, coupled with the presence of an invalid value in the flags variable, can lead to it->pos referencing an invalid task, potentially resulting in a kernel panic. To mitigate this risk, it's crucial to ensure proper initialization of it->pos to NULL. Fixes: ac8148d957f5 ("bpf: bpf_iter_task_next: use next_task(kit->task) rather than next_task(kit->pos)") Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Oleg Nesterov Link: https://lore.kernel.org/bpf/20240217114152.1623-2-laoar.shao@gmail.com --- kernel/bpf/task_iter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index e5c3500443c6..ec4e97c61eef 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -978,6 +978,8 @@ __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != __alignof__(struct bpf_iter_task)); + kit->pos = NULL; + switch (flags) { case BPF_TASK_ITER_ALL_THREADS: case BPF_TASK_ITER_ALL_PROCS: From 5c138a8a4abe152fcbef1ed40a6a4b5727b2991b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 17 Feb 2024 19:41:52 +0800 Subject: [PATCH 146/327] selftests/bpf: Add negtive test cases for task iter Incorporate a test case to assess the handling of invalid flags or task__nullable parameters passed to bpf_iter_task_new(). Prior to the preceding commit, this scenario could potentially trigger a kernel panic. However, with the previous commit, this test case is expected to function correctly. Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240217114152.1623-3-laoar.shao@gmail.com --- tools/testing/selftests/bpf/prog_tests/iters.c | 1 + tools/testing/selftests/bpf/progs/iters_task.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index bf84d4a1d9ae..3c440370c1f0 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -193,6 +193,7 @@ static void subtest_task_iters(void) ASSERT_EQ(skel->bss->procs_cnt, 1, "procs_cnt"); ASSERT_EQ(skel->bss->threads_cnt, thread_num + 1, "threads_cnt"); ASSERT_EQ(skel->bss->proc_threads_cnt, thread_num + 1, "proc_threads_cnt"); + ASSERT_EQ(skel->bss->invalid_cnt, 0, "invalid_cnt"); pthread_mutex_unlock(&do_nothing_mutex); for (int i = 0; i < thread_num; i++) ASSERT_OK(pthread_join(thread_ids[i], &ret), "pthread_join"); diff --git a/tools/testing/selftests/bpf/progs/iters_task.c b/tools/testing/selftests/bpf/progs/iters_task.c index c9b4055cd410..e4d53e40ff20 100644 --- a/tools/testing/selftests/bpf/progs/iters_task.c +++ b/tools/testing/selftests/bpf/progs/iters_task.c @@ -10,7 +10,7 @@ char _license[] SEC("license") = "GPL"; pid_t target_pid; -int procs_cnt, threads_cnt, proc_threads_cnt; +int procs_cnt, threads_cnt, proc_threads_cnt, invalid_cnt; void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; @@ -26,6 +26,16 @@ int iter_task_for_each_sleep(void *ctx) procs_cnt = threads_cnt = proc_threads_cnt = 0; bpf_rcu_read_lock(); + bpf_for_each(task, pos, NULL, ~0U) { + /* Below instructions shouldn't be executed for invalid flags */ + invalid_cnt++; + } + + bpf_for_each(task, pos, NULL, BPF_TASK_ITER_PROC_THREADS) { + /* Below instructions shouldn't be executed for invalid task__nullable */ + invalid_cnt++; + } + bpf_for_each(task, pos, NULL, BPF_TASK_ITER_ALL_PROCS) if (pos->pid == target_pid) procs_cnt++; From 6f7d0f5fd8e440c3446560100ac4ff9a55eec340 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Fri, 9 Feb 2024 10:23:47 -0500 Subject: [PATCH 147/327] platform/x86: think-lmi: Fix password opcode ordering for workstations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Lenovo workstations require the password opcode to be run before the attribute value is changed (if Admin password is enabled). Tested on some Thinkpads to confirm they are OK with this order too. Signed-off-by: Mark Pearson Fixes: 640a5fa50a42 ("platform/x86: think-lmi: Opcode support") Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240209152359.528919-1-mpearson-lenovo@squebb.ca Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/think-lmi.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/think-lmi.c b/drivers/platform/x86/think-lmi.c index 3a396b763c49..ce3e08815a8e 100644 --- a/drivers/platform/x86/think-lmi.c +++ b/drivers/platform/x86/think-lmi.c @@ -1009,7 +1009,16 @@ static ssize_t current_value_store(struct kobject *kobj, * Note - this sets the variable and then the password as separate * WMI calls. Function tlmi_save_bios_settings will error if the * password is incorrect. + * Workstation's require the opcode to be set before changing the + * attribute. */ + if (tlmi_priv.pwd_admin->valid && tlmi_priv.pwd_admin->password[0]) { + ret = tlmi_opcode_setting("WmiOpcodePasswordAdmin", + tlmi_priv.pwd_admin->password); + if (ret) + goto out; + } + set_str = kasprintf(GFP_KERNEL, "%s,%s;", setting->display_name, new_setting); if (!set_str) { @@ -1021,17 +1030,10 @@ static ssize_t current_value_store(struct kobject *kobj, if (ret) goto out; - if (tlmi_priv.save_mode == TLMI_SAVE_BULK) { + if (tlmi_priv.save_mode == TLMI_SAVE_BULK) tlmi_priv.save_required = true; - } else { - if (tlmi_priv.pwd_admin->valid && tlmi_priv.pwd_admin->password[0]) { - ret = tlmi_opcode_setting("WmiOpcodePasswordAdmin", - tlmi_priv.pwd_admin->password); - if (ret) - goto out; - } + else ret = tlmi_save_bios_settings(""); - } } else { /* old non-opcode based authentication method (deprecated) */ if (tlmi_priv.pwd_admin->valid && tlmi_priv.pwd_admin->password[0]) { auth_str = kasprintf(GFP_KERNEL, "%s,%s,%s;", From 8f812373d195810c68e1beeabe2317f8c6a31012 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 10 Feb 2024 12:01:49 +0100 Subject: [PATCH 148/327] platform/x86: intel: int0002_vgpio: Pass IRQF_ONESHOT to request_irq() Since commit 7a36b901a6eb ("ACPI: OSL: Use a threaded interrupt handler for SCI") the ACPI OSL code passes IRQF_ONESHOT when requesting the SCI. Since the INT0002 GPIO is typically shared with the ACPI SCI the INT0002 driver must pass the same flags. This fixes the INT0002 driver failing to probe due to following error + as well as removing the backtrace that follows this error: "genirq: Flags mismatch irq 9. 00000084 (INT0002) vs. 00002080 (acpi)" Fixes: 7a36b901a6eb ("ACPI: OSL: Use a threaded interrupt handler for SCI") Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240210110149.12803-1-hdegoede@redhat.com --- drivers/platform/x86/intel/int0002_vgpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/int0002_vgpio.c b/drivers/platform/x86/intel/int0002_vgpio.c index b6708bab7c53..527d8fbc7cc1 100644 --- a/drivers/platform/x86/intel/int0002_vgpio.c +++ b/drivers/platform/x86/intel/int0002_vgpio.c @@ -196,7 +196,7 @@ static int int0002_probe(struct platform_device *pdev) * IRQs into gpiolib. */ ret = devm_request_irq(dev, irq, int0002_irq, - IRQF_SHARED, "INT0002", chip); + IRQF_ONESHOT | IRQF_SHARED, "INT0002", chip); if (ret) { dev_err(dev, "Error requesting IRQ %d: %d\n", irq, ret); return ret; From dbcbfd662a725641d118fb3ae5ffb7be4e3d0fb0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 12 Feb 2024 13:06:07 +0100 Subject: [PATCH 149/327] platform/x86: touchscreen_dmi: Allow partial (prefix) matches for ACPI names On some devices the ACPI name of the touchscreen is e.g. either MSSL1680:00 or MSSL1680:01 depending on the BIOS version. This happens for example on the "Chuwi Hi8 Air" tablet where the initial commit's ts_data uses "MSSL1680:00" but the tablets from the github issue and linux-hardware.org probe linked below both use "MSSL1680:01". Replace the strcmp() match on ts_data->acpi_name with a strstarts() check to allow using a partial match on just the ACPI HID of "MSSL1680" and change the ts_data->acpi_name for the "Chuwi Hi8 Air" accordingly to fix the touchscreen not working on models where it is "MSSL1680:01". Note this drops the length check for I2C_NAME_SIZE. This never was necessary since the ACPI names used are never more then 11 chars and I2C_NAME_SIZE is 20 so the replaced strncmp() would always stop long before reaching I2C_NAME_SIZE. Link: https://linux-hardware.org/?computer=AC4301C0542A Fixes: bbb97d728f77 ("platform/x86: touchscreen_dmi: Add info for the Chuwi Hi8 Air tablet") Closes: https://github.com/onitake/gsl-firmware/issues/91 Cc: stable@vger.kernel.org Reviewed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240212120608.30469-1-hdegoede@redhat.com --- drivers/platform/x86/touchscreen_dmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 7aee5e9ff2b8..969477c83e56 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -81,7 +81,7 @@ static const struct property_entry chuwi_hi8_air_props[] = { }; static const struct ts_dmi_data chuwi_hi8_air_data = { - .acpi_name = "MSSL1680:00", + .acpi_name = "MSSL1680", .properties = chuwi_hi8_air_props, }; @@ -1821,7 +1821,7 @@ static void ts_dmi_add_props(struct i2c_client *client) int error; if (has_acpi_companion(dev) && - !strncmp(ts_data->acpi_name, client->name, I2C_NAME_SIZE)) { + strstarts(client->name, ts_data->acpi_name)) { error = device_create_managed_software_node(dev, ts_data->properties, NULL); if (error) dev_err(dev, "failed to add properties: %d\n", error); From f0ddb8a9021305e4e4e4179a2e47ecea6170d55d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 12 Feb 2024 13:06:08 +0100 Subject: [PATCH 150/327] platform/x86: touchscreen_dmi: Consolidate Goodix upside-down touchscreen data Now that prefix matches for ACPI names are supported, the ts_dmi_data structs for "GDIX1001:00" and "GDIX1001:01" can be consolidated into a single match matching on "GDIX1001". For consistency also change gdix1002_00_upside_down_data to match. Reviewed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240212120608.30469-2-hdegoede@redhat.com --- drivers/platform/x86/touchscreen_dmi.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 969477c83e56..975cf24ae359 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -415,18 +415,13 @@ static const struct property_entry gdix1001_upside_down_props[] = { { } }; -static const struct ts_dmi_data gdix1001_00_upside_down_data = { - .acpi_name = "GDIX1001:00", +static const struct ts_dmi_data gdix1001_upside_down_data = { + .acpi_name = "GDIX1001", .properties = gdix1001_upside_down_props, }; -static const struct ts_dmi_data gdix1001_01_upside_down_data = { - .acpi_name = "GDIX1001:01", - .properties = gdix1001_upside_down_props, -}; - -static const struct ts_dmi_data gdix1002_00_upside_down_data = { - .acpi_name = "GDIX1002:00", +static const struct ts_dmi_data gdix1002_upside_down_data = { + .acpi_name = "GDIX1002", .properties = gdix1001_upside_down_props, }; @@ -1412,7 +1407,7 @@ const struct dmi_system_id touchscreen_dmi_table[] = { }, { /* Juno Tablet */ - .driver_data = (void *)&gdix1002_00_upside_down_data, + .driver_data = (void *)&gdix1002_upside_down_data, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Default string"), /* Both product- and board-name being "Default string" is somewhat rare */ @@ -1658,7 +1653,7 @@ const struct dmi_system_id touchscreen_dmi_table[] = { }, { /* Teclast X89 (Android version / BIOS) */ - .driver_data = (void *)&gdix1001_00_upside_down_data, + .driver_data = (void *)&gdix1001_upside_down_data, .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "WISKY"), DMI_MATCH(DMI_BOARD_NAME, "3G062i"), @@ -1666,7 +1661,7 @@ const struct dmi_system_id touchscreen_dmi_table[] = { }, { /* Teclast X89 (Windows version / BIOS) */ - .driver_data = (void *)&gdix1001_01_upside_down_data, + .driver_data = (void *)&gdix1001_upside_down_data, .matches = { /* tPAD is too generic, also match on bios date */ DMI_MATCH(DMI_BOARD_VENDOR, "TECLAST"), @@ -1684,7 +1679,7 @@ const struct dmi_system_id touchscreen_dmi_table[] = { }, { /* Teclast X98 Pro */ - .driver_data = (void *)&gdix1001_00_upside_down_data, + .driver_data = (void *)&gdix1001_upside_down_data, .matches = { /* * Only match BIOS date, because the manufacturers @@ -1788,7 +1783,7 @@ const struct dmi_system_id touchscreen_dmi_table[] = { }, { /* "WinBook TW100" */ - .driver_data = (void *)&gdix1001_00_upside_down_data, + .driver_data = (void *)&gdix1001_upside_down_data, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "WinBook"), DMI_MATCH(DMI_PRODUCT_NAME, "TW100") @@ -1796,7 +1791,7 @@ const struct dmi_system_id touchscreen_dmi_table[] = { }, { /* WinBook TW700 */ - .driver_data = (void *)&gdix1001_00_upside_down_data, + .driver_data = (void *)&gdix1001_upside_down_data, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "WinBook"), DMI_MATCH(DMI_PRODUCT_NAME, "TW700") From 3da01394c0f727baaee728de290eb1ecaad099fb Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Fri, 16 Feb 2024 12:11:11 +0530 Subject: [PATCH 151/327] platform/x86/amd/pmf: Remove smart_pc_status enum Improve code readability by removing smart_pc_status enum, as the same can be done with a simple true/false check; Update the code checks accordingly. Also add a missing return on amd_pmf_init_smart_pc() success, to skip trying to setup the auto / slider modes which should not be used in this case. Signed-off-by: Shyam Sundar S K Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20240216064112.962582-1-Shyam-sundar.S-k@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/core.c | 11 ++++++++--- drivers/platform/x86/amd/pmf/pmf.h | 5 ----- drivers/platform/x86/amd/pmf/tee-if.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index feaa09f5b35a..1d6dbd246d65 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -330,9 +330,14 @@ static void amd_pmf_init_features(struct amd_pmf_dev *dev) dev_dbg(dev->dev, "SPS enabled and Platform Profiles registered\n"); } - if (!amd_pmf_init_smart_pc(dev)) { + amd_pmf_init_smart_pc(dev); + if (dev->smart_pc_enabled) { dev_dbg(dev->dev, "Smart PC Solution Enabled\n"); - } else if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { + /* If Smart PC is enabled, no need to check for other features */ + return; + } + + if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { amd_pmf_init_auto_mode(dev); dev_dbg(dev->dev, "Auto Mode Init done\n"); } else if (is_apmf_func_supported(dev, APMF_FUNC_DYN_SLIDER_AC) || @@ -351,7 +356,7 @@ static void amd_pmf_deinit_features(struct amd_pmf_dev *dev) amd_pmf_deinit_sps(dev); } - if (!dev->smart_pc_enabled) { + if (dev->smart_pc_enabled) { amd_pmf_deinit_smart_pc(dev); } else if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { amd_pmf_deinit_auto_mode(dev); diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 16999c5b334f..66cae1cca73c 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -441,11 +441,6 @@ struct apmf_dyn_slider_output { struct apmf_cnqf_power_set ps[APMF_CNQF_MAX]; } __packed; -enum smart_pc_status { - PMF_SMART_PC_ENABLED, - PMF_SMART_PC_DISABLED, -}; - /* Smart PC - TA internals */ enum system_state { SYSTEM_STATE_S0i3, diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index f8c0177afb0d..8b7e3f87702e 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -260,7 +260,7 @@ static int amd_pmf_start_policy_engine(struct amd_pmf_dev *dev) res = amd_pmf_invoke_cmd_init(dev); if (res == TA_PMF_TYPE_SUCCESS) { /* Now its safe to announce that smart pc is enabled */ - dev->smart_pc_enabled = PMF_SMART_PC_ENABLED; + dev->smart_pc_enabled = true; /* * Start collecting the data from TA FW after a small delay * or else, we might end up getting stale values. @@ -268,7 +268,7 @@ static int amd_pmf_start_policy_engine(struct amd_pmf_dev *dev) schedule_delayed_work(&dev->pb_work, msecs_to_jiffies(pb_actions_ms * 3)); } else { dev_err(dev->dev, "ta invoke cmd init failed err: %x\n", res); - dev->smart_pc_enabled = PMF_SMART_PC_DISABLED; + dev->smart_pc_enabled = false; return res; } From 11e298f3548a6fe5e6ad78f811abfba15e6ebbc1 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Fri, 16 Feb 2024 12:11:12 +0530 Subject: [PATCH 152/327] platform/x86/amd/pmf: Fix TEE enact command failure after suspend and resume TEE enact command failures are seen after each suspend/resume cycle; fix this by cancelling the policy builder workqueue before going into suspend and reschedule the workqueue after resume. [ 629.516792] ccp 0000:c2:00.2: tee: command 0x5 timed out, disabling PSP [ 629.516835] amd-pmf AMDI0102:00: TEE enact cmd failed. err: ffff000e, ret:0 [ 630.550464] amd-pmf AMDI0102:00: AMD_PMF_REGISTER_RESPONSE:1 [ 630.550511] amd-pmf AMDI0102:00: AMD_PMF_REGISTER_ARGUMENT:7 [ 630.550548] amd-pmf AMDI0102:00: AMD_PMF_REGISTER_MESSAGE:16 Fixes: ae82cef7d9c5 ("platform/x86/amd/pmf: Add support for PMF-TA interaction") Co-developed-by: Patil Rajesh Reddy Signed-off-by: Patil Rajesh Reddy Signed-off-by: Shyam Sundar S K Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20240216064112.962582-2-Shyam-sundar.S-k@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 1d6dbd246d65..853158933510 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -296,6 +296,9 @@ static int amd_pmf_suspend_handler(struct device *dev) { struct amd_pmf_dev *pdev = dev_get_drvdata(dev); + if (pdev->smart_pc_enabled) + cancel_delayed_work_sync(&pdev->pb_work); + kfree(pdev->buf); return 0; @@ -312,6 +315,9 @@ static int amd_pmf_resume_handler(struct device *dev) return ret; } + if (pdev->smart_pc_enabled) + schedule_delayed_work(&pdev->pb_work, msecs_to_jiffies(2000)); + return 0; } From b2b6fa6f5c57848c95ca7bf1f0a7d1bb340c3460 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 16 Feb 2024 18:52:16 -0600 Subject: [PATCH 153/327] platform/x86/amd/pmf: Fix a suspend hang on Framework 13 The buffer is cleared in the suspend handler but used in the delayed work for amd_pmf_get_metrics(). Stop clearing it to fix the hang. Reported-by: Trolli Schmittlauch Closes: https://lore.kernel.org/regressions/ed2226ff-257b-4cfd-afd6-bf3be9785474@localhost/ Closes: https://community.frame.work/t/kernel-6-8-rc-system-freezes-after-resuming-from-suspend-reproducers-wanted/45381 Fixes: 2b3a7f06caaf ("platform/x86/amd/pmf: Change return type of amd_pmf_set_dram_addr()") Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240217005216.113408-1-mario.limonciello@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 853158933510..4f734e049f4a 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -299,8 +299,6 @@ static int amd_pmf_suspend_handler(struct device *dev) if (pdev->smart_pc_enabled) cancel_delayed_work_sync(&pdev->pb_work); - kfree(pdev->buf); - return 0; } From 20545af302bb1f6a67c0348b64032c11ecfa77ba Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 16 Feb 2024 19:41:06 -0600 Subject: [PATCH 154/327] platform/x86/amd/pmf: Add debugging message for missing policy data If a machine advertises Smart PC support but is missing policy data show a debugging message to help clarify why Smart PC wasn't enabled. Reviewed-by: Hans de Goede Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240217014107.113749-2-mario.limonciello@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/tee-if.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 8b7e3f87702e..1359ab340f7c 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -252,8 +252,10 @@ static int amd_pmf_start_policy_engine(struct amd_pmf_dev *dev) cookie = readl(dev->policy_buf + POLICY_COOKIE_OFFSET); length = readl(dev->policy_buf + POLICY_COOKIE_LEN); - if (cookie != POLICY_SIGN_COOKIE || !length) + if (cookie != POLICY_SIGN_COOKIE || !length) { + dev_dbg(dev->dev, "cookie doesn't match\n"); return -EINVAL; + } /* Update the actual length */ dev->policy_sz = length + 512; From e7096150580849429ff3d43dd69a718bb2036be4 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 16 Feb 2024 19:41:07 -0600 Subject: [PATCH 155/327] platform/x86/amd/pmf: Fixup error handling for amd_pmf_init_smart_pc() amd_pmf_init_smart_pc() calls out to amd_pmf_get_bios_buffer() but the error handling flow doesn't clean everything up allocated memory. As amd_pmf_get_bios_buffer() is only called by amd_pmf_init_smart_pc(), fold it into the function and add labels to clean up any step that can fail along the way. Explicitly set everything allocated to NULL as there are other features that may access some of the same variables. Fixes: 7c45534afa44 ("platform/x86/amd/pmf: Add support for PMF Policy Binary") Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240217014107.113749-3-mario.limonciello@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/tee-if.c | 67 ++++++++++++++++----------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 1359ab340f7c..4f74de680654 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -338,25 +338,6 @@ static void amd_pmf_remove_pb(struct amd_pmf_dev *dev) {} static void amd_pmf_hex_dump_pb(struct amd_pmf_dev *dev) {} #endif -static int amd_pmf_get_bios_buffer(struct amd_pmf_dev *dev) -{ - dev->policy_buf = kzalloc(dev->policy_sz, GFP_KERNEL); - if (!dev->policy_buf) - return -ENOMEM; - - dev->policy_base = devm_ioremap(dev->dev, dev->policy_addr, dev->policy_sz); - if (!dev->policy_base) - return -ENOMEM; - - memcpy(dev->policy_buf, dev->policy_base, dev->policy_sz); - - amd_pmf_hex_dump_pb(dev); - if (pb_side_load) - amd_pmf_open_pb(dev, dev->dbgfs_dir); - - return amd_pmf_start_policy_engine(dev); -} - static int amd_pmf_amdtee_ta_match(struct tee_ioctl_version_data *ver, const void *data) { return ver->impl_id == TEE_IMPL_ID_AMDTEE; @@ -455,22 +436,56 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) return ret; INIT_DELAYED_WORK(&dev->pb_work, amd_pmf_invoke_cmd); - amd_pmf_set_dram_addr(dev, true); - amd_pmf_get_bios_buffer(dev); + + ret = amd_pmf_set_dram_addr(dev, true); + if (ret) + goto error; + + dev->policy_base = devm_ioremap(dev->dev, dev->policy_addr, dev->policy_sz); + if (!dev->policy_base) { + ret = -ENOMEM; + goto error; + } + + dev->policy_buf = kzalloc(dev->policy_sz, GFP_KERNEL); + if (!dev->policy_buf) { + ret = -ENOMEM; + goto error; + } + + memcpy(dev->policy_buf, dev->policy_base, dev->policy_sz); + + amd_pmf_hex_dump_pb(dev); + if (pb_side_load) + amd_pmf_open_pb(dev, dev->dbgfs_dir); + dev->prev_data = kzalloc(sizeof(*dev->prev_data), GFP_KERNEL); if (!dev->prev_data) - return -ENOMEM; + goto error; - return dev->smart_pc_enabled; + ret = amd_pmf_start_policy_engine(dev); + if (ret) + goto error; + + return 0; + +error: + amd_pmf_deinit_smart_pc(dev); + + return ret; } void amd_pmf_deinit_smart_pc(struct amd_pmf_dev *dev) { - if (pb_side_load) + if (pb_side_load && dev->esbin) amd_pmf_remove_pb(dev); - kfree(dev->prev_data); - kfree(dev->policy_buf); cancel_delayed_work_sync(&dev->pb_work); + kfree(dev->prev_data); + dev->prev_data = NULL; + kfree(dev->policy_buf); + dev->policy_buf = NULL; + kfree(dev->buf); + dev->buf = NULL; amd_pmf_tee_deinit(dev); } From 76d41fb063338781e936765b7ed74224215ca178 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 16 Feb 2024 19:56:42 -0600 Subject: [PATCH 156/327] platform/x86/amd/pmf: Fix a potential race with policy binary sideload The debugfs `update_policy` file is created before amd_pmf_start_policy_engine() has completed, and thus there could be a possible (albeit unlikely) race between sideloading a policy and the BIOS policy getting setup. Move the debugfs file creation after all BIOS policy is setup. Fixes: 10817f28e533 ("platform/x86/amd/pmf: Add capability to sideload of policy binary") Reported-by: Hans de Goede Closes: https://lore.kernel.org/platform-driver-x86/15df7d02-b0aa-457a-954a-9d280a592843@redhat.com/T/#m2c445f135e5ef9b53184be7fc9df84e15f89d4d9 Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240217015642.113806-1-mario.limonciello@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/tee-if.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 4f74de680654..8527dca9cf56 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -456,8 +456,6 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) memcpy(dev->policy_buf, dev->policy_base, dev->policy_sz); amd_pmf_hex_dump_pb(dev); - if (pb_side_load) - amd_pmf_open_pb(dev, dev->dbgfs_dir); dev->prev_data = kzalloc(sizeof(*dev->prev_data), GFP_KERNEL); if (!dev->prev_data) @@ -467,6 +465,9 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) if (ret) goto error; + if (pb_side_load) + amd_pmf_open_pb(dev, dev->dbgfs_dir); + return 0; error: From 9c92006b896c767218aabe8947b62026a571cfd0 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 31 Jan 2024 09:19:33 +0100 Subject: [PATCH 157/327] irqchip/sifive-plic: Enable interrupt if needed before EOI RISC-V PLIC cannot "end-of-interrupt" (EOI) disabled interrupts, as explained in the description of Interrupt Completion in the PLIC spec: "The PLIC signals it has completed executing an interrupt handler by writing the interrupt ID it received from the claim to the claim/complete register. The PLIC does not check whether the completion ID is the same as the last claim ID for that target. If the completion ID does not match an interrupt source that *is currently enabled* for the target, the completion is silently ignored." Commit 69ea463021be ("irqchip/sifive-plic: Fixup EOI failed when masked") ensured that EOI is successful by enabling interrupt first, before EOI. Commit a1706a1c5062 ("irqchip/sifive-plic: Separate the enable and mask operations") removed the interrupt enabling code from the previous commit, because it assumes that interrupt should already be enabled at the point of EOI. However, this is incorrect: there is a window after a hart claiming an interrupt and before irq_desc->lock getting acquired, interrupt can be disabled during this window. Thus, EOI can be invoked while the interrupt is disabled, effectively nullify this EOI. This results in the interrupt never gets asserted again, and the device who uses this interrupt appears frozen. Make sure that interrupt is really enabled before EOI. Fixes: a1706a1c5062 ("irqchip/sifive-plic: Separate the enable and mask operations") Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Samuel Holland Cc: Marc Zyngier Cc: Guo Ren Cc: linux-riscv@lists.infradead.org Cc: Link: https://lore.kernel.org/r/20240131081933.144512-1-namcao@linutronix.de --- drivers/irqchip/irq-sifive-plic.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 5b7bc4fd9517..bf0b40b0fad4 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -148,7 +148,13 @@ static void plic_irq_eoi(struct irq_data *d) { struct plic_handler *handler = this_cpu_ptr(&plic_handlers); - writel(d->hwirq, handler->hart_base + CONTEXT_CLAIM); + if (unlikely(irqd_irq_disabled(d))) { + plic_toggle(handler, d->hwirq, 1); + writel(d->hwirq, handler->hart_base + CONTEXT_CLAIM); + plic_toggle(handler, d->hwirq, 0); + } else { + writel(d->hwirq, handler->hart_base + CONTEXT_CLAIM); + } } #ifdef CONFIG_SMP From db744ddd59be798c2627efbfc71f707f5a935a40 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Mon, 15 Jan 2024 19:26:49 +0530 Subject: [PATCH 158/327] PCI/MSI: Prevent MSI hardware interrupt number truncation While calculating the hardware interrupt number for a MSI interrupt, the higher bits (i.e. from bit-5 onwards a.k.a domain_nr >= 32) of the PCI domain number gets truncated because of the shifted value casting to return type of pci_domain_nr() which is 'int'. This for example is resulting in same hardware interrupt number for devices 0019:00:00.0 and 0039:00:00.0. To address this cast the PCI domain number to 'irq_hw_number_t' before left shifting it to calculate the hardware interrupt number. Please note that this fixes the issue only on 64-bit systems and doesn't change the behavior for 32-bit systems i.e. the 32-bit systems continue to have the issue. Since the issue surfaces only if there are too many PCIe controllers in the system which usually is the case in modern server systems and they don't tend to run 32-bit kernels. Fixes: 3878eaefb89a ("PCI/MSI: Enhance core to support hierarchy irqdomain") Signed-off-by: Vidya Sagar Signed-off-by: Thomas Gleixner Tested-by: Shanker Donthineni Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240115135649.708536-1-vidyas@nvidia.com --- drivers/pci/msi/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index c8be056c248d..cfd84a899c82 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -61,7 +61,7 @@ static irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc) return (irq_hw_number_t)desc->msi_index | pci_dev_id(dev) << 11 | - (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 27; + ((irq_hw_number_t)(pci_domain_nr(dev->bus) & 0xFFFFFFFF)) << 27; } static void pci_msi_domain_set_desc(msi_alloc_info_t *arg, From 882a2a724ee964c1ebe7268a91d5c8c8ddc796bf Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 15 Feb 2024 13:51:45 -0800 Subject: [PATCH 159/327] parisc: Fix stack unwinder Debugging shows a large number of unaligned access traps in the unwinder code. Code analysis reveals a number of issues with this code: - handle_interruption is passed twice through dereference_kernel_function_descriptor() - ret_from_kernel_thread, syscall_exit, intr_return, _switch_to_ret, and _call_on_stack are passed through dereference_kernel_function_descriptor() even though they are not declared as function pointers. To fix the problems, drop one of the calls to dereference_kernel_function_descriptor() for handle_interruption, and compare the other pointers directly. Fixes: 6414b30b39f9 ("parisc: unwind: Avoid missing prototype warning for handle_interruption()") Fixes: 8e0ba125c2bf ("parisc/unwind: fix unwinder when CONFIG_64BIT is enabled") Cc: Helge Deller Cc: Sven Schnelle Cc: John David Anglin Cc: Charlie Jenkins Cc: David Laight Signed-off-by: Guenter Roeck Signed-off-by: Helge Deller --- arch/parisc/kernel/unwind.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 27ae40a443b8..f7e0fee5ee55 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -228,10 +228,8 @@ static int unwind_special(struct unwind_frame_info *info, unsigned long pc, int #ifdef CONFIG_IRQSTACKS extern void * const _call_on_stack; #endif /* CONFIG_IRQSTACKS */ - void *ptr; - ptr = dereference_kernel_function_descriptor(&handle_interruption); - if (pc_is_kernel_fn(pc, ptr)) { + if (pc_is_kernel_fn(pc, handle_interruption)) { struct pt_regs *regs = (struct pt_regs *)(info->sp - frame_size - PT_SZ_ALGN); dbg("Unwinding through handle_interruption()\n"); info->prev_sp = regs->gr[30]; @@ -239,13 +237,13 @@ static int unwind_special(struct unwind_frame_info *info, unsigned long pc, int return 1; } - if (pc_is_kernel_fn(pc, ret_from_kernel_thread) || - pc_is_kernel_fn(pc, syscall_exit)) { + if (pc == (unsigned long)&ret_from_kernel_thread || + pc == (unsigned long)&syscall_exit) { info->prev_sp = info->prev_ip = 0; return 1; } - if (pc_is_kernel_fn(pc, intr_return)) { + if (pc == (unsigned long)&intr_return) { struct pt_regs *regs; dbg("Found intr_return()\n"); @@ -257,14 +255,14 @@ static int unwind_special(struct unwind_frame_info *info, unsigned long pc, int } if (pc_is_kernel_fn(pc, _switch_to) || - pc_is_kernel_fn(pc, _switch_to_ret)) { + pc == (unsigned long)&_switch_to_ret) { info->prev_sp = info->sp - CALLEE_SAVE_FRAME_SIZE; info->prev_ip = *(unsigned long *)(info->prev_sp - RP_OFFSET); return 1; } #ifdef CONFIG_IRQSTACKS - if (pc_is_kernel_fn(pc, _call_on_stack)) { + if (pc == (unsigned long)&_call_on_stack) { info->prev_sp = *(unsigned long *)(info->sp - FRAME_SIZE - REG_SZ); info->prev_ip = *(unsigned long *)(info->sp - FRAME_SIZE - RP_OFFSET); return 1; From baf8361e54550a48a7087b603313ad013cc13386 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Tue, 13 Feb 2024 18:21:35 -0800 Subject: [PATCH 160/327] x86/bugs: Add asm helpers for executing VERW MDS mitigation requires clearing the CPU buffers before returning to user. This needs to be done late in the exit-to-user path. Current location of VERW leaves a possibility of kernel data ending up in CPU buffers for memory accesses done after VERW such as: 1. Kernel data accessed by an NMI between VERW and return-to-user can remain in CPU buffers since NMI returning to kernel does not execute VERW to clear CPU buffers. 2. Alyssa reported that after VERW is executed, CONFIG_GCC_PLUGIN_STACKLEAK=y scrubs the stack used by a system call. Memory accesses during stack scrubbing can move kernel stack contents into CPU buffers. 3. When caller saved registers are restored after a return from function executing VERW, the kernel stack accesses can remain in CPU buffers(since they occur after VERW). To fix this VERW needs to be moved very late in exit-to-user path. In preparation for moving VERW to entry/exit asm code, create macros that can be used in asm. Also make VERW patching depend on a new feature flag X86_FEATURE_CLEAR_CPU_BUF. Reported-by: Alyssa Milburn Suggested-by: Andrew Cooper Suggested-by: Peter Zijlstra Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240213-delay-verw-v8-1-a6216d83edb7%40linux.intel.com --- arch/x86/entry/entry.S | 23 +++++++++++++++++++++++ arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/nospec-branch.h | 13 +++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 8c8d38f0cb1d..003379049924 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include .pushsection .noinstr.text, "ax" @@ -20,3 +23,23 @@ SYM_FUNC_END(entry_ibpb) EXPORT_SYMBOL_GPL(entry_ibpb); .popsection + +/* + * Define the VERW operand that is disguised as entry code so that + * it can be referenced with KPTI enabled. This ensure VERW can be + * used late in exit-to-user path after page tables are switched. + */ +.pushsection .entry.text, "ax" + +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_START_NOALIGN(mds_verw_sel) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + .word __KERNEL_DS +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_END(mds_verw_sel); +/* For KVM */ +EXPORT_SYMBOL_GPL(mds_verw_sel); + +.popsection + diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index fdf723b6f6d0..2b62cdd8dd12 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -95,7 +95,7 @@ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 262e65539f83..077083ec81cb 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -315,6 +315,17 @@ #endif .endm +/* + * Macro to execute VERW instruction that mitigate transient data sampling + * attacks such as MDS. On affected systems a microcode update overloaded VERW + * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF. + * + * Note: Only the memory operand variant of VERW clears the CPU buffers. + */ +.macro CLEAR_CPU_BUFFERS + ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF +.endm + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ @@ -536,6 +547,8 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); +extern u16 mds_verw_sel; + #include /** From 3c7501722e6b31a6e56edd23cea5e77dbb9ffd1a Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Tue, 13 Feb 2024 18:21:52 -0800 Subject: [PATCH 161/327] x86/entry_64: Add VERW just before userspace transition Mitigation for MDS is to use VERW instruction to clear any secrets in CPU Buffers. Any memory accesses after VERW execution can still remain in CPU buffers. It is safer to execute VERW late in return to user path to minimize the window in which kernel data can end up in CPU buffers. There are not many kernel secrets to be had after SWITCH_TO_USER_CR3. Add support for deploying VERW mitigation after user register state is restored. This helps minimize the chances of kernel data ending up into CPU buffers after executing VERW. Note that the mitigation at the new location is not yet enabled. Corner case not handled ======================= Interrupts returning to kernel don't clear CPUs buffers since the exit-to-user path is expected to do that anyways. But, there could be a case when an NMI is generated in kernel after the exit-to-user path has cleared the buffers. This case is not handled and NMI returning to kernel don't clear CPU buffers because: 1. It is rare to get an NMI after VERW, but before returning to userspace. 2. For an unprivileged user, there is no known way to make that NMI less rare or target it. 3. It would take a large number of these precisely-timed NMIs to mount an actual attack. There's presumably not enough bandwidth. 4. The NMI in question occurs after a VERW, i.e. when user state is restored and most interesting data is already scrubbed. Whats left is only the data that NMI touches, and that may or may not be of any interest. Suggested-by: Dave Hansen Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240213-delay-verw-v8-2-a6216d83edb7%40linux.intel.com --- arch/x86/entry/entry_64.S | 11 +++++++++++ arch/x86/entry/entry_64_compat.S | 1 + 2 files changed, 12 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c40f89ab1b4c..9bb485977629 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -161,6 +161,7 @@ syscall_return_via_sysret: SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR swapgs + CLEAR_CPU_BUFFERS sysretq SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -573,6 +574,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) .Lswapgs_and_iret: swapgs + CLEAR_CPU_BUFFERS /* Assert that the IRET frame indicates user mode. */ testb $3, 8(%rsp) jnz .Lnative_iret @@ -723,6 +725,8 @@ native_irq_return_ldt: */ popq %rax /* Restore user RAX */ + CLEAR_CPU_BUFFERS + /* * RSP now points to an ordinary IRET frame, except that the page * is read-only and RSP[31:16] are preloaded with the userspace @@ -1449,6 +1453,12 @@ nmi_restore: std movq $0, 5*8(%rsp) /* clear "NMI executing" */ + /* + * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like + * NMI in kernel after user state is restored. For an unprivileged user + * these conditions are hard to meet. + */ + /* * iretq reads the "iret" frame and exits the NMI stack in a * single instruction. We are returning to kernel mode, so this @@ -1466,6 +1476,7 @@ SYM_CODE_START(entry_SYSCALL32_ignore) UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax + CLEAR_CPU_BUFFERS sysretl SYM_CODE_END(entry_SYSCALL32_ignore) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index de94e2e84ecc..eabf48c4d4b4 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -270,6 +270,7 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) xorl %r9d, %r9d xorl %r10d, %r10d swapgs + CLEAR_CPU_BUFFERS sysretl SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR From a0e2dab44d22b913b4c228c8b52b2a104434b0b3 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Tue, 13 Feb 2024 18:22:08 -0800 Subject: [PATCH 162/327] x86/entry_32: Add VERW just before userspace transition As done for entry_64, add support for executing VERW late in exit to user path for 32-bit mode. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240213-delay-verw-v8-3-a6216d83edb7%40linux.intel.com --- arch/x86/entry/entry_32.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c73047bf9f4b..fba427646805 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -885,6 +885,7 @@ SYM_FUNC_START(entry_SYSENTER_32) BUG_IF_WRONG_CR3 no_user_check=1 popfl popl %eax + CLEAR_CPU_BUFFERS /* * Return back to the vDSO, which will pop ecx and edx. @@ -954,6 +955,7 @@ restore_all_switch_stack: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code + CLEAR_CPU_BUFFERS .Lirq_return: /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization @@ -1146,6 +1148,7 @@ SYM_CODE_START(asm_exc_nmi) /* Not on SYSENTER stack. */ call exc_nmi + CLEAR_CPU_BUFFERS jmp .Lnmi_return .Lnmi_from_sysenter_stack: From 6613d82e617dd7eb8b0c40b2fe3acea655b1d611 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Tue, 13 Feb 2024 18:22:24 -0800 Subject: [PATCH 163/327] x86/bugs: Use ALTERNATIVE() instead of mds_user_clear static key The VERW mitigation at exit-to-user is enabled via a static branch mds_user_clear. This static branch is never toggled after boot, and can be safely replaced with an ALTERNATIVE() which is convenient to use in asm. Switch to ALTERNATIVE() to use the VERW mitigation late in exit-to-user path. Also remove the now redundant VERW in exc_nmi() and arch_exit_to_user_mode(). Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240213-delay-verw-v8-4-a6216d83edb7%40linux.intel.com --- Documentation/arch/x86/mds.rst | 34 ++++++++++++++++++++-------- arch/x86/include/asm/entry-common.h | 1 - arch/x86/include/asm/nospec-branch.h | 12 ---------- arch/x86/kernel/cpu/bugs.c | 15 +++++------- arch/x86/kernel/nmi.c | 3 --- arch/x86/kvm/vmx/vmx.c | 2 +- 6 files changed, 32 insertions(+), 35 deletions(-) diff --git a/Documentation/arch/x86/mds.rst b/Documentation/arch/x86/mds.rst index e73fdff62c0a..c58c72362911 100644 --- a/Documentation/arch/x86/mds.rst +++ b/Documentation/arch/x86/mds.rst @@ -95,6 +95,9 @@ The kernel provides a function to invoke the buffer clearing: mds_clear_cpu_buffers() +Also macro CLEAR_CPU_BUFFERS can be used in ASM late in exit-to-user path. +Other than CFLAGS.ZF, this macro doesn't clobber any registers. + The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state (idle) transitions. @@ -138,17 +141,30 @@ Mitigation points When transitioning from kernel to user space the CPU buffers are flushed on affected CPUs when the mitigation is not disabled on the kernel - command line. The migitation is enabled through the static key - mds_user_clear. + command line. The mitigation is enabled through the feature flag + X86_FEATURE_CLEAR_CPU_BUF. - The mitigation is invoked in prepare_exit_to_usermode() which covers - all but one of the kernel to user space transitions. The exception - is when we return from a Non Maskable Interrupt (NMI), which is - handled directly in do_nmi(). + The mitigation is invoked just before transitioning to userspace after + user registers are restored. This is done to minimize the window in + which kernel data could be accessed after VERW e.g. via an NMI after + VERW. - (The reason that NMI is special is that prepare_exit_to_usermode() can - enable IRQs. In NMI context, NMIs are blocked, and we don't want to - enable IRQs with NMIs blocked.) + **Corner case not handled** + Interrupts returning to kernel don't clear CPUs buffers since the + exit-to-user path is expected to do that anyways. But, there could be + a case when an NMI is generated in kernel after the exit-to-user path + has cleared the buffers. This case is not handled and NMI returning to + kernel don't clear CPU buffers because: + + 1. It is rare to get an NMI after VERW, but before returning to userspace. + 2. For an unprivileged user, there is no known way to make that NMI + less rare or target it. + 3. It would take a large number of these precisely-timed NMIs to mount + an actual attack. There's presumably not enough bandwidth. + 4. The NMI in question occurs after a VERW, i.e. when user state is + restored and most interesting data is already scrubbed. Whats left + is only the data that NMI touches, and that may or may not be of + any interest. 2. C-State transition diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index ce8f50192ae3..7e523bb3d2d3 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -91,7 +91,6 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, static __always_inline void arch_exit_to_user_mode(void) { - mds_user_clear_cpu_buffers(); amd_clear_divider(); } #define arch_exit_to_user_mode arch_exit_to_user_mode diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 077083ec81cb..2aa52cab1e46 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -540,7 +540,6 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); @@ -574,17 +573,6 @@ static __always_inline void mds_clear_cpu_buffers(void) asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); } -/** - * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability - * - * Clear CPU buffers if the corresponding static key is enabled - */ -static __always_inline void mds_user_clear_cpu_buffers(void) -{ - if (static_branch_likely(&mds_user_clear)) - mds_clear_cpu_buffers(); -} - /** * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability * diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bb0ab8466b91..48d049cd74e7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -111,9 +111,6 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -/* Control MDS CPU buffer clear before returning to user space */ -DEFINE_STATIC_KEY_FALSE(mds_user_clear); -EXPORT_SYMBOL_GPL(mds_user_clear); /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); @@ -252,7 +249,7 @@ static void __init mds_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) @@ -356,7 +353,7 @@ static void __init taa_select_mitigation(void) * For guests that can't determine whether the correct microcode is * present on host, enable the mitigation for UCODE_NEEDED as well. */ - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); @@ -424,7 +421,7 @@ static void __init mmio_select_mitigation(void) */ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM))) - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else static_branch_enable(&mmio_stale_data_clear); @@ -484,12 +481,12 @@ static void __init md_clear_update_mitigation(void) if (cpu_mitigations_off()) return; - if (!static_key_enabled(&mds_user_clear)) + if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) goto out; /* - * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data - * mitigation, if necessary. + * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO + * Stale Data mitigation, if necessary. */ if (mds_mitigation == MDS_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_MDS)) { diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 17e955ab69fe..3082cf24b69e 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -563,9 +563,6 @@ nmi_restart: } if (this_cpu_dec_return(nmi_state)) goto nmi_restart; - - if (user_mode(regs)) - mds_user_clear_cpu_buffers(); } #if IS_ENABLED(CONFIG_KVM_INTEL) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1111d9d08903..db8a5fe7edf6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7227,7 +7227,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mds_user_clear)) + else if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF)) mds_clear_cpu_buffers(); else if (static_branch_unlikely(&mmio_stale_data_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) From 706a189dcf74d3b3f955e9384785e726ed6c7c80 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Feb 2024 18:22:40 -0800 Subject: [PATCH 164/327] KVM/VMX: Use BT+JNC, i.e. EFLAGS.CF to select VMRESUME vs. VMLAUNCH Use EFLAGS.CF instead of EFLAGS.ZF to track whether to use VMRESUME versus VMLAUNCH. Freeing up EFLAGS.ZF will allow doing VERW, which clobbers ZF, for MDS mitigations as late as possible without needing to duplicate VERW for both paths. Signed-off-by: Sean Christopherson Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/all/20240213-delay-verw-v8-5-a6216d83edb7%40linux.intel.com --- arch/x86/kvm/vmx/run_flags.h | 7 +++++-- arch/x86/kvm/vmx/vmenter.S | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h index edc3f16cc189..6a9bfdfbb6e5 100644 --- a/arch/x86/kvm/vmx/run_flags.h +++ b/arch/x86/kvm/vmx/run_flags.h @@ -2,7 +2,10 @@ #ifndef __KVM_X86_VMX_RUN_FLAGS_H #define __KVM_X86_VMX_RUN_FLAGS_H -#define VMX_RUN_VMRESUME (1 << 0) -#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1) +#define VMX_RUN_VMRESUME_SHIFT 0 +#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 + +#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) +#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) #endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 906ecd001511..ef7cfbad4d57 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -139,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - test $VMX_RUN_VMRESUME, %ebx + bt $VMX_RUN_VMRESUME_SHIFT, %ebx /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX @@ -161,8 +161,8 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */ - jz .Lvmlaunch + /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */ + jnc .Lvmlaunch /* * After a successful VMRESUME/VMLAUNCH, control flow "magically" From 43fb862de8f628c5db5e96831c915b9aebf62d33 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Tue, 13 Feb 2024 18:22:56 -0800 Subject: [PATCH 165/327] KVM/VMX: Move VERW closer to VMentry for MDS mitigation During VMentry VERW is executed to mitigate MDS. After VERW, any memory access like register push onto stack may put host data in MDS affected CPU buffers. A guest can then use MDS to sample host data. Although likelihood of secrets surviving in registers at current VERW callsite is less, but it can't be ruled out. Harden the MDS mitigation by moving the VERW mitigation late in VMentry path. Note that VERW for MMIO Stale Data mitigation is unchanged because of the complexity of per-guest conditional VERW which is not easy to handle that late in asm with no GPRs available. If the CPU is also affected by MDS, VERW is unconditionally executed late in asm regardless of guest having MMIO access. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Acked-by: Sean Christopherson Link: https://lore.kernel.org/all/20240213-delay-verw-v8-6-a6216d83edb7%40linux.intel.com --- arch/x86/kvm/vmx/vmenter.S | 3 +++ arch/x86/kvm/vmx/vmx.c | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index ef7cfbad4d57..2bfbf758d061 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -161,6 +161,9 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX + /* Clobbers EFLAGS.ZF */ + CLEAR_CPU_BUFFERS + /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */ jnc .Lvmlaunch diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index db8a5fe7edf6..88a4ff200d04 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -388,7 +388,16 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && + /* + * Disable VERW's behavior of clearing CPU buffers for the guest if the + * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled + * the mitigation. Disabling the clearing behavior provides a + * performance boost for guests that aren't aware that manually clearing + * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry + * and VM-Exit. + */ + vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && + (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && !boot_cpu_has_bug(X86_BUG_MDS) && !boot_cpu_has_bug(X86_BUG_TAA); @@ -7224,11 +7233,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_state_enter_irqoff(); - /* L1D Flush includes CPU buffer clear to mitigate MDS */ + /* + * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW + * mitigation for MDS is done late in VMentry and is still + * executed in spite of L1D Flush. This is because an extra VERW + * should not matter much after the big hammer L1D Flush. + */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF)) - mds_clear_cpu_buffers(); else if (static_branch_unlikely(&mmio_stale_data_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers(); From 2c88c16dc20e88dd54d2f6f4d01ae1dce6cc9654 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 12 Feb 2024 22:44:11 -0500 Subject: [PATCH 166/327] erofs: fix handling kern_mount() failure if you have a variable that holds NULL or a pointer to live struct mount, do not shove ERR_PTR() into it - not if you later treat "not NULL" as "holds a pointer to object". Signed-off-by: Al Viro --- fs/erofs/fscache.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index bc12030393b2..29ad5b1cc7fd 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -381,11 +381,12 @@ static int erofs_fscache_init_domain(struct super_block *sb) goto out; if (!erofs_pseudo_mnt) { - erofs_pseudo_mnt = kern_mount(&erofs_fs_type); - if (IS_ERR(erofs_pseudo_mnt)) { - err = PTR_ERR(erofs_pseudo_mnt); + struct vfsmount *mnt = kern_mount(&erofs_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); goto out; } + erofs_pseudo_mnt = mnt; } domain->volume = sbi->volume; From 69f89168b310878be82d7d97bc0d22068ad858c0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 12 Feb 2024 18:42:13 +0000 Subject: [PATCH 167/327] usb: typec: tpcm: Fix issues with power being removed during reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the merge of b717dfbf73e8 ("Revert "usb: typec: tcpm: fix cc role at port reset"") into mainline the LibreTech Renegade Elite/Firefly has died during boot, the main symptom observed in testing is a sudden stop in console output. Gábor Stefanik identified in review that the patch would cause power to be removed from devices without batteries (like this board), observing that while the patch is correct according to the spec this appears to be an oversight in the spec. Given that the change makes previously working systems unusable let's revert it, there was some discussion of identifying systems that have alternative power and implementing the standards conforming behaviour in only that case. Fixes: b717dfbf73e8 ("Revert "usb: typec: tcpm: fix cc role at port reset"") Cc: stable Cc: Badhri Jagan Sridharan Signed-off-by: Mark Brown Acked-by: Heikki Krogerus Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20240212-usb-fix-renegade-v1-1-22c43c88d635@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 295ae7eb912c..66e532edcece 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -4873,7 +4873,8 @@ static void run_state_machine(struct tcpm_port *port) break; case PORT_RESET: tcpm_reset_port(port); - tcpm_set_cc(port, TYPEC_CC_OPEN); + tcpm_set_cc(port, tcpm_default_state(port) == SNK_UNATTACHED ? + TYPEC_CC_RD : tcpm_rp_cc(port)); tcpm_set_state(port, PORT_RESET_WAIT_OFF, PD_T_ERROR_RECOVERY); break; From e21a2f17566cbd64926fb8f16323972f7a064444 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Sat, 17 Feb 2024 16:14:31 +0800 Subject: [PATCH 168/327] cachefiles: fix memory leak in cachefiles_add_cache() The following memory leak was reported after unbinding /dev/cachefiles: ================================================================== unreferenced object 0xffff9b674176e3c0 (size 192): comm "cachefilesd2", pid 680, jiffies 4294881224 hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc ea38a44b): [] kmem_cache_alloc+0x2d5/0x370 [] prepare_creds+0x26/0x2e0 [] cachefiles_determine_cache_security+0x1f/0x120 [] cachefiles_add_cache+0x13c/0x3a0 [] cachefiles_daemon_write+0x146/0x1c0 [] vfs_write+0xcb/0x520 [] ksys_write+0x69/0xf0 [] do_syscall_64+0x72/0x140 [] entry_SYSCALL_64_after_hwframe+0x6e/0x76 ================================================================== Put the reference count of cache_cred in cachefiles_daemon_unbind() to fix the problem. And also put cache_cred in cachefiles_add_cache() error branch to avoid memory leaks. Fixes: 9ae326a69004 ("CacheFiles: A cache that backs onto a mounted filesystem") CC: stable@vger.kernel.org Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240217081431.796809-1-libaokun1@huawei.com Acked-by: David Howells Reviewed-by: Jingbo Xu Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/cachefiles/cache.c | 2 ++ fs/cachefiles/daemon.c | 1 + 2 files changed, 3 insertions(+) diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index 7077f72e6f47..f449f7340aad 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -168,6 +168,8 @@ error_unsupported: dput(root); error_open_root: cachefiles_end_secure(cache, saved_cred); + put_cred(cache->cache_cred); + cache->cache_cred = NULL; error_getsec: fscache_relinquish_cache(cache_cookie); cache->cache = NULL; diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 3f24905f4066..6465e2574230 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) cachefiles_put_directory(cache->graveyard); cachefiles_put_directory(cache->store); mntput(cache->mnt); + put_cred(cache->cache_cred); kfree(cache->rootdirname); kfree(cache->secctx); From bfacaf71a1482d936804213a3ffa6de73558280e Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Mon, 19 Feb 2024 14:39:02 +0000 Subject: [PATCH 169/327] afs: Fix ignored callbacks over ipv4 When searching for a matching peer, all addresses need to be searched, not just the ipv6 ones in the fs_addresses6 list. Given that the lists no longer contain addresses, there is little reason to splitting things between separate lists, so unify them into a single list. When processing an incoming callback from an ipv4 address, this would lead to a failure to set call->server, resulting in the callback being ignored and the client seeing stale contents. Fixes: 72904d7b9bfb ("rxrpc, afs: Allow afs to pin rxrpc_peer objects") Reported-by: Markus Suvanto Link: https://lists.infradead.org/pipermail/linux-afs/2024-February/008035.html Signed-off-by: Marc Dionne Signed-off-by: David Howells Link: https://lists.infradead.org/pipermail/linux-afs/2024-February/008037.html # v1 Link: https://lists.infradead.org/pipermail/linux-afs/2024-February/008066.html # v2 Link: https://lore.kernel.org/r/20240219143906.138346-2-dhowells@redhat.com Signed-off-by: Christian Brauner --- fs/afs/internal.h | 6 ++---- fs/afs/main.c | 3 +-- fs/afs/server.c | 14 +++++--------- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9c03fcf7ffaa..6ce5a612937c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -321,8 +321,7 @@ struct afs_net { struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ - struct hlist_head fs_addresses4; /* afs_server (by lowest IPv4 addr) */ - struct hlist_head fs_addresses6; /* afs_server (by lowest IPv6 addr) */ + struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */ seqlock_t fs_addr_lock; /* For fs_addresses[46] */ struct work_struct fs_manager; @@ -561,8 +560,7 @@ struct afs_server { struct afs_server __rcu *uuid_next; /* Next server with same UUID */ struct afs_server *uuid_prev; /* Previous server with same UUID */ struct list_head probe_link; /* Link in net->fs_probe_list */ - struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ - struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ + struct hlist_node addr_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct list_head volumes; /* RCU list of afs_server_entry objects */ struct afs_server *gc_next; /* Next server in manager's list */ diff --git a/fs/afs/main.c b/fs/afs/main.c index 1b3bd21c168a..a14f6013e316 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -90,8 +90,7 @@ static int __net_init afs_net_init(struct net *net_ns) INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); - INIT_HLIST_HEAD(&net->fs_addresses4); - INIT_HLIST_HEAD(&net->fs_addresses6); + INIT_HLIST_HEAD(&net->fs_addresses); seqlock_init(&net->fs_addr_lock); INIT_WORK(&net->fs_manager, afs_manage_servers); diff --git a/fs/afs/server.c b/fs/afs/server.c index e169121f603e..038f9d0ae3af 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -38,7 +38,7 @@ struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { + hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) { estate = rcu_dereference(server->endpoint_state); alist = estate->addresses; for (i = 0; i < alist->nr_addrs; i++) @@ -177,10 +177,8 @@ added_dup: * bit, but anything we might want to do gets messy and memory * intensive. */ - if (alist->nr_ipv4 > 0) - hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4); - if (alist->nr_addrs > alist->nr_ipv4) - hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6); + if (alist->nr_addrs > 0) + hlist_add_head_rcu(&server->addr_link, &net->fs_addresses); write_sequnlock(&net->fs_addr_lock); @@ -511,10 +509,8 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) list_del(&server->probe_link); hlist_del_rcu(&server->proc_link); - if (!hlist_unhashed(&server->addr4_link)) - hlist_del_rcu(&server->addr4_link); - if (!hlist_unhashed(&server->addr6_link)) - hlist_del_rcu(&server->addr6_link); + if (!hlist_unhashed(&server->addr_link)) + hlist_del_rcu(&server->addr_link); } write_sequnlock(&net->fs_lock); From 6ea38e2aeb72349cad50e38899b0ba6fbcb2af3d Mon Sep 17 00:00:00 2001 From: Daniil Dulov Date: Mon, 19 Feb 2024 14:39:03 +0000 Subject: [PATCH 170/327] afs: Increase buffer size in afs_update_volume_status() The max length of volume->vid value is 20 characters. So increase idbuf[] size up to 24 to avoid overflow. Found by Linux Verification Center (linuxtesting.org) with SVACE. [DH: Actually, it's 20 + NUL, so increase it to 24 and use snprintf()] Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation") Signed-off-by: Daniil Dulov Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240211150442.3416-1-d.dulov@aladdin.ru/ # v1 Link: https://lore.kernel.org/r/20240212083347.10742-1-d.dulov@aladdin.ru/ # v2 Link: https://lore.kernel.org/r/20240219143906.138346-3-dhowells@redhat.com Signed-off-by: Christian Brauner --- fs/afs/volume.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 020ecd45e476..af3a3f57c1b3 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -353,7 +353,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) { struct afs_server_list *new, *old, *discard; struct afs_vldb_entry *vldb; - char idbuf[16]; + char idbuf[24]; int ret, idsz; _enter(""); @@ -361,7 +361,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) /* We look up an ID by passing it as a decimal string in the * operation's name parameter. */ - idsz = sprintf(idbuf, "%llu", volume->vid); + idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid); vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz); if (IS_ERR(vldb)) { From 5559cea2d5aa3018a5f00dd2aca3427ba09b386b Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Thu, 15 Feb 2024 23:27:17 +0300 Subject: [PATCH 171/327] ipv6: sr: fix possible use-after-free and null-ptr-deref The pernet operations structure for the subsystem must be registered before registering the generic netlink family. Fixes: 915d7e5e5930 ("ipv6: sr: add code base for control plane support of SR-IPv6") Signed-off-by: Vasiliy Kovalev Link: https://lore.kernel.org/r/20240215202717.29815-1-kovalev@altlinux.org Signed-off-by: Paolo Abeni --- net/ipv6/seg6.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 29346a6eec9f..35508abd76f4 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -512,22 +512,24 @@ int __init seg6_init(void) { int err; - err = genl_register_family(&seg6_genl_family); + err = register_pernet_subsys(&ip6_segments_ops); if (err) goto out; - err = register_pernet_subsys(&ip6_segments_ops); + err = genl_register_family(&seg6_genl_family); if (err) - goto out_unregister_genl; + goto out_unregister_pernet; #ifdef CONFIG_IPV6_SEG6_LWTUNNEL err = seg6_iptunnel_init(); if (err) - goto out_unregister_pernet; + goto out_unregister_genl; err = seg6_local_init(); - if (err) - goto out_unregister_pernet; + if (err) { + seg6_iptunnel_exit(); + goto out_unregister_genl; + } #endif #ifdef CONFIG_IPV6_SEG6_HMAC @@ -548,11 +550,11 @@ out_unregister_iptun: #endif #endif #ifdef CONFIG_IPV6_SEG6_LWTUNNEL -out_unregister_pernet: - unregister_pernet_subsys(&ip6_segments_ops); -#endif out_unregister_genl: genl_unregister_family(&seg6_genl_family); +#endif +out_unregister_pernet: + unregister_pernet_subsys(&ip6_segments_ops); goto out; } From def689fc26b9a9622d2e2cb0c4933dd3b1c8071c Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Thu, 15 Feb 2024 23:34:00 +0300 Subject: [PATCH 172/327] devlink: fix possible use-after-free and memory leaks in devlink_init() The pernet operations structure for the subsystem must be registered before registering the generic netlink family. Make an unregister in case of unsuccessful registration. Fixes: 687125b5799c ("devlink: split out core code") Signed-off-by: Vasiliy Kovalev Link: https://lore.kernel.org/r/20240215203400.29976-1-kovalev@altlinux.org Signed-off-by: Paolo Abeni --- net/devlink/core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/devlink/core.c b/net/devlink/core.c index 6a58342752b4..7f0b093208d7 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -529,14 +529,20 @@ static int __init devlink_init(void) { int err; - err = genl_register_family(&devlink_nl_family); - if (err) - goto out; err = register_pernet_subsys(&devlink_pernet_ops); if (err) goto out; + err = genl_register_family(&devlink_nl_family); + if (err) + goto out_unreg_pernet_subsys; err = register_netdevice_notifier(&devlink_port_netdevice_nb); + if (!err) + return 0; + genl_unregister_family(&devlink_nl_family); + +out_unreg_pernet_subsys: + unregister_pernet_subsys(&devlink_pernet_ops); out: WARN_ON(err); return err; From a7d6027790acea24446ddd6632d394096c0f4667 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 Feb 2024 15:05:16 -0800 Subject: [PATCH 173/327] arp: Prevent overflow in arp_req_get(). syzkaller reported an overflown write in arp_req_get(). [0] When ioctl(SIOCGARP) is issued, arp_req_get() looks up an neighbour entry and copies neigh->ha to struct arpreq.arp_ha.sa_data. The arp_ha here is struct sockaddr, not struct sockaddr_storage, so the sa_data buffer is just 14 bytes. In the splat below, 2 bytes are overflown to the next int field, arp_flags. We initialise the field just after the memcpy(), so it's not a problem. However, when dev->addr_len is greater than 22 (e.g. MAX_ADDR_LEN), arp_netmask is overwritten, which could be set as htonl(0xFFFFFFFFUL) in arp_ioctl() before calling arp_req_get(). To avoid the overflow, let's limit the max length of memcpy(). Note that commit b5f0de6df6dc ("net: dev: Convert sa_data to flexible array in struct sockaddr") just silenced syzkaller. [0]: memcpy: detected field-spanning write (size 16) of single field "r->arp_ha.sa_data" at net/ipv4/arp.c:1128 (size 14) WARNING: CPU: 0 PID: 144638 at net/ipv4/arp.c:1128 arp_req_get+0x411/0x4a0 net/ipv4/arp.c:1128 Modules linked in: CPU: 0 PID: 144638 Comm: syz-executor.4 Not tainted 6.1.74 #31 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014 RIP: 0010:arp_req_get+0x411/0x4a0 net/ipv4/arp.c:1128 Code: fd ff ff e8 41 42 de fb b9 0e 00 00 00 4c 89 fe 48 c7 c2 20 6d ab 87 48 c7 c7 80 6d ab 87 c6 05 25 af 72 04 01 e8 5f 8d ad fb <0f> 0b e9 6c fd ff ff e8 13 42 de fb be 03 00 00 00 4c 89 e7 e8 a6 RSP: 0018:ffffc900050b7998 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88803a815000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8641a44a RDI: 0000000000000001 RBP: ffffc900050b7a98 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 203a7970636d656d R12: ffff888039c54000 R13: 1ffff92000a16f37 R14: ffff88803a815084 R15: 0000000000000010 FS: 00007f172bf306c0(0000) GS:ffff88805aa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f172b3569f0 CR3: 0000000057f12005 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: arp_ioctl+0x33f/0x4b0 net/ipv4/arp.c:1261 inet_ioctl+0x314/0x3a0 net/ipv4/af_inet.c:981 sock_do_ioctl+0xdf/0x260 net/socket.c:1204 sock_ioctl+0x3ef/0x650 net/socket.c:1321 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __x64_sys_ioctl+0x18e/0x220 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x37/0x90 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x64/0xce RIP: 0033:0x7f172b262b8d Code: 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f172bf300b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f172b3abf80 RCX: 00007f172b262b8d RDX: 0000000020000000 RSI: 0000000000008954 RDI: 0000000000000003 RBP: 00007f172b2d3493 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007f172b3abf80 R15: 00007f172bf10000 Reported-by: syzkaller Reported-by: Bjoern Doebel Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240215230516.31330-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv4/arp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 9456f5bb35e5..0d0d725b46ad 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1125,7 +1125,8 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) if (neigh) { if (!(READ_ONCE(neigh->nud_state) & NUD_NOARP)) { read_lock_bh(&neigh->lock); - memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + memcpy(r->arp_ha.sa_data, neigh->ha, + min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); r->arp_ha.sa_family = dev->type; From 23f9c2c066e7e5052406fb8f04a115d3d0260b22 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Feb 2024 08:19:45 -0800 Subject: [PATCH 174/327] docs: netdev: update the link to the CI repo Netronome graciously transferred the original NIPA repo to our new netdev umbrella org. Link to that instead of my private fork. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240216161945.2208842-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/process/maintainer-netdev.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index 84ee60fceef2..fd96e4a3cef9 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -431,7 +431,7 @@ patchwork checks Checks in patchwork are mostly simple wrappers around existing kernel scripts, the sources are available at: -https://github.com/kuba-moo/nipa/tree/master/tests +https://github.com/linux-netdev/nipa/tree/master/tests **Do not** post your patches just to run them through the checks. You must ensure that your patches are ready by testing them locally From 20c8c4dafe93e82441583e93bd68c0d256d7bed4 Mon Sep 17 00:00:00 2001 From: Amit Machhiwal Date: Wed, 7 Feb 2024 11:15:26 +0530 Subject: [PATCH 175/327] KVM: PPC: Book3S HV: Fix L2 guest reboot failure due to empty 'arch_compat' Currently, rebooting a pseries nested qemu-kvm guest (L2) results in below error as L1 qemu sends PVR value 'arch_compat' == 0 via ppc_set_compat ioctl. This triggers a condition failure in kvmppc_set_arch_compat() resulting in an EINVAL. qemu-system-ppc64: Unable to set CPU compatibility mode in KVM: Invalid argument Also, a value of 0 for arch_compat generally refers the default compatibility of the host. But, arch_compat, being a Guest Wide Element in nested API v2, cannot be set to 0 in GSB as PowerVM (L0) expects a non-zero value. A value of 0 triggers a kernel trap during a reboot and consequently causes it to fail: [ 22.106360] reboot: Restarting system KVM: unknown exit, hardware reason ffffffffffffffea NIP 0000000000000100 LR 000000000000fe44 CTR 0000000000000000 XER 0000000020040092 CPU#0 MSR 0000000000001000 HID0 0000000000000000 HF 6c000000 iidx 3 didx 3 TB 00000000 00000000 DECR 0 GPR00 0000000000000000 0000000000000000 c000000002a8c300 000000007fe00000 GPR04 0000000000000000 0000000000000000 0000000000001002 8000000002803033 GPR08 000000000a000000 0000000000000000 0000000000000004 000000002fff0000 GPR12 0000000000000000 c000000002e10000 0000000105639200 0000000000000004 GPR16 0000000000000000 000000010563a090 0000000000000000 0000000000000000 GPR20 0000000105639e20 00000001056399c8 00007fffe54abab0 0000000105639288 GPR24 0000000000000000 0000000000000001 0000000000000001 0000000000000000 GPR28 0000000000000000 0000000000000000 c000000002b30840 0000000000000000 CR 00000000 [ - - - - - - - - ] RES 000@ffffffffffffffff SRR0 0000000000000000 SRR1 0000000000000000 PVR 0000000000800200 VRSAVE 0000000000000000 SPRG0 0000000000000000 SPRG1 0000000000000000 SPRG2 0000000000000000 SPRG3 0000000000000000 SPRG4 0000000000000000 SPRG5 0000000000000000 SPRG6 0000000000000000 SPRG7 0000000000000000 HSRR0 0000000000000000 HSRR1 0000000000000000 CFAR 0000000000000000 LPCR 0000000000020400 PTCR 0000000000000000 DAR 0000000000000000 DSISR 0000000000000000 kernel:trap=0xffffffea | pc=0x100 | msr=0x1000 This patch updates kvmppc_set_arch_compat() to use the host PVR value if 'compat_pvr' == 0 indicating that qemu doesn't want to enforce any specific PVR compat mode. The relevant part of the code might need a rework if PowerVM implements a support for `arch_compat == 0` in nestedv2 API. Fixes: 19d31c5f1157 ("KVM: PPC: Add support for nestedv2 guests") Reviewed-by: "Aneesh Kumar K.V (IBM)" Reviewed-by: Vaibhav Jain Signed-off-by: Amit Machhiwal Signed-off-by: Michael Ellerman Link: https://msgid.link/20240207054526.3720087-1-amachhiw@linux.ibm.com --- arch/powerpc/kvm/book3s_hv.c | 26 ++++++++++++++++++++++++-- arch/powerpc/kvm/book3s_hv_nestedv2.c | 20 ++++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 52427fc2a33f..0b921704da45 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -391,6 +391,24 @@ static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) /* Dummy value used in computing PCR value below */ #define PCR_ARCH_31 (PCR_ARCH_300 << 1) +static inline unsigned long map_pcr_to_cap(unsigned long pcr) +{ + unsigned long cap = 0; + + switch (pcr) { + case PCR_ARCH_300: + cap = H_GUEST_CAP_POWER9; + break; + case PCR_ARCH_31: + cap = H_GUEST_CAP_POWER10; + break; + default: + break; + } + + return cap; +} + static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) { unsigned long host_pcr_bit = 0, guest_pcr_bit = 0, cap = 0; @@ -424,11 +442,9 @@ static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) break; case PVR_ARCH_300: guest_pcr_bit = PCR_ARCH_300; - cap = H_GUEST_CAP_POWER9; break; case PVR_ARCH_31: guest_pcr_bit = PCR_ARCH_31; - cap = H_GUEST_CAP_POWER10; break; default: return -EINVAL; @@ -440,6 +456,12 @@ static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) return -EINVAL; if (kvmhv_on_pseries() && kvmhv_is_nestedv2()) { + /* + * 'arch_compat == 0' would mean the guest should default to + * L1's compatibility. In this case, the guest would pick + * host's PCR and evaluate the corresponding capabilities. + */ + cap = map_pcr_to_cap(guest_pcr_bit); if (!(cap & nested_capabilities)) return -EINVAL; } diff --git a/arch/powerpc/kvm/book3s_hv_nestedv2.c b/arch/powerpc/kvm/book3s_hv_nestedv2.c index 5378eb40b162..8e6f5355f08b 100644 --- a/arch/powerpc/kvm/book3s_hv_nestedv2.c +++ b/arch/powerpc/kvm/book3s_hv_nestedv2.c @@ -138,6 +138,7 @@ static int gs_msg_ops_vcpu_fill_info(struct kvmppc_gs_buff *gsb, vector128 v; int rc, i; u16 iden; + u32 arch_compat = 0; vcpu = gsm->data; @@ -347,8 +348,23 @@ static int gs_msg_ops_vcpu_fill_info(struct kvmppc_gs_buff *gsb, break; } case KVMPPC_GSID_LOGICAL_PVR: - rc = kvmppc_gse_put_u32(gsb, iden, - vcpu->arch.vcore->arch_compat); + /* + * Though 'arch_compat == 0' would mean the default + * compatibility, arch_compat, being a Guest Wide + * Element, cannot be filled with a value of 0 in GSB + * as this would result into a kernel trap. + * Hence, when `arch_compat == 0`, arch_compat should + * default to L1's PVR. + */ + if (!vcpu->arch.vcore->arch_compat) { + if (cpu_has_feature(CPU_FTR_ARCH_31)) + arch_compat = PVR_ARCH_31; + else if (cpu_has_feature(CPU_FTR_ARCH_300)) + arch_compat = PVR_ARCH_300; + } else { + arch_compat = vcpu->arch.vcore->arch_compat; + } + rc = kvmppc_gse_put_u32(gsb, iden, arch_compat); break; } From ae366ba8576da0135d7d3db2dfa6304f3338d0c2 Mon Sep 17 00:00:00 2001 From: Emil Renner Berthing Date: Mon, 19 Feb 2024 18:25:13 +0100 Subject: [PATCH 176/327] gpiolib: Handle no pin_ranges in gpiochip_generic_config() Similar to gpiochip_generic_request() and gpiochip_generic_free() the gpiochip_generic_config() function needs to handle the case where there are no pinctrl pins mapped to the GPIOs, usually through the gpio-ranges device tree property. Commit f34fd6ee1be8 ("gpio: dwapb: Use generic request, free and set_config") set the .set_config callback to gpiochip_generic_config() in the dwapb GPIO driver so the GPIO API can set pinctrl configuration for the corresponding pins. Most boards using the dwapb driver do not set the gpio-ranges device tree property though, and in this case gpiochip_generic_config() would return -EPROPE_DEFER rather than the previous -ENOTSUPP return value. This in turn makes gpio_set_config_with_argument_optional() fail and propagate the error to any driver requesting GPIOs. Fixes: 2956b5d94a76 ("pinctrl / gpio: Introduce .set_config() callback for GPIO chips") Reported-by: Jisheng Zhang Closes: https://lore.kernel.org/linux-gpio/ZdC_g3U4l0CJIWzh@xhacker/ Tested-by: Jisheng Zhang Signed-off-by: Emil Renner Berthing Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 8b3a0f45b574..e434e8cc1229 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2042,6 +2042,11 @@ EXPORT_SYMBOL_GPL(gpiochip_generic_free); int gpiochip_generic_config(struct gpio_chip *gc, unsigned int offset, unsigned long config) { +#ifdef CONFIG_PINCTRL + if (list_empty(&gc->gpiodev->pin_ranges)) + return -ENOTSUPP; +#endif + return pinctrl_gpio_set_config(gc, offset, config); } EXPORT_SYMBOL_GPL(gpiochip_generic_config); From 802379b8f9e169293e9ba7089e5f1a6340e2e7a3 Mon Sep 17 00:00:00 2001 From: Hojin Nam Date: Fri, 16 Feb 2024 10:45:22 +0900 Subject: [PATCH 177/327] perf: CXL: fix CPMU filter value mask length CPMU filter value is described as 4B length in CXL r3.0 8.2.7.2.2. However, it is used as 2B length in code and comments. Reviewed-by: Jonathan Cameron Signed-off-by: Hojin Nam Link: https://lore.kernel.org/r/20240216014522.32321-1-hj96.nam@samsung.com Signed-off-by: Will Deacon --- drivers/perf/cxl_pmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/perf/cxl_pmu.c b/drivers/perf/cxl_pmu.c index bc0d414a6aff..308c9969642e 100644 --- a/drivers/perf/cxl_pmu.c +++ b/drivers/perf/cxl_pmu.c @@ -59,7 +59,7 @@ #define CXL_PMU_COUNTER_CFG_EVENT_GRP_ID_IDX_MSK GENMASK_ULL(63, 59) #define CXL_PMU_FILTER_CFG_REG(n, f) (0x400 + 4 * ((f) + (n) * 8)) -#define CXL_PMU_FILTER_CFG_VALUE_MSK GENMASK(15, 0) +#define CXL_PMU_FILTER_CFG_VALUE_MSK GENMASK(31, 0) #define CXL_PMU_COUNTER_REG(n) (0xc00 + 8 * (n)) @@ -314,9 +314,9 @@ static bool cxl_pmu_config1_get_edge(struct perf_event *event) } /* - * CPMU specification allows for 8 filters, each with a 16 bit value... - * So we need to find 8x16bits to store it in. - * As the value used for disable is 0xffff, a separate enable switch + * CPMU specification allows for 8 filters, each with a 32 bit value... + * So we need to find 8x32bits to store it in. + * As the value used for disable is 0xffff_ffff, a separate enable switch * is needed. */ @@ -642,7 +642,7 @@ static void cxl_pmu_event_start(struct perf_event *event, int flags) if (cxl_pmu_config1_hdm_filter_en(event)) cfg = cxl_pmu_config2_get_hdm_decoder(event); else - cfg = GENMASK(15, 0); /* No filtering if 0xFFFF_FFFF */ + cfg = GENMASK(31, 0); /* No filtering if 0xFFFF_FFFF */ writeq(cfg, base + CXL_PMU_FILTER_CFG_REG(hwc->idx, 0)); } From a6b3eb304a82c29665a0ab947cfe276f6d29f523 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Feb 2024 12:05:47 +0000 Subject: [PATCH 178/327] Revert "arm64: jump_label: use constraints "Si" instead of "i"" This reverts commit f9daab0ad01cf9d165dbbbf106ca4e61d06e7fe8. Geert reports that his particular GCC 5.5 vintage toolchain fails to build an arm64 defconfig because of this change: | arch/arm64/include/asm/jump_label.h:25:2: error: invalid 'asm': | invalid operand | asm goto( ^ Aopparently, this is something we claim to support, so let's revert back to the old jump label constraint for now while discussions about raising the minimum GCC version are ongoing. Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/CAMuHMdX+6fnAf8Hm6EqYJPAjrrLO9T7c=Gu3S8V_pqjSDowJ6g@mail.gmail.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/jump_label.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index b7716b215f91..48ddc0f45d22 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ -15,10 +15,6 @@ #define JUMP_LABEL_NOP_SIZE AARCH64_INSN_SIZE -/* - * Prefer the constraint "S" to support PIC with GCC. Clang before 19 does not - * support "S" on a symbol with a constant offset, so we use "i" as a fallback. - */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { @@ -27,9 +23,9 @@ static __always_inline bool arch_static_branch(struct static_key * const key, " .pushsection __jump_table, \"aw\" \n\t" " .align 3 \n\t" " .long 1b - ., %l[l_yes] - . \n\t" - " .quad (%[key] - .) + %[bit0] \n\t" + " .quad %c0 - . \n\t" " .popsection \n\t" - : : [key]"Si"(key), [bit0]"i"(branch) : : l_yes); + : : "i"(&((char *)key)[branch]) : : l_yes); return false; l_yes: @@ -44,9 +40,9 @@ static __always_inline bool arch_static_branch_jump(struct static_key * const ke " .pushsection __jump_table, \"aw\" \n\t" " .align 3 \n\t" " .long 1b - ., %l[l_yes] - . \n\t" - " .quad (%[key] - .) + %[bit0] \n\t" + " .quad %c0 - . \n\t" " .popsection \n\t" - : : [key]"Si"(key), [bit0]"i"(branch) : : l_yes); + : : "i"(&((char *)key)[branch]) : : l_yes); return false; l_yes: From 9533864816fb4a6207c63b7a98396351ce1a9fae Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 13 Feb 2024 23:06:32 +0000 Subject: [PATCH 179/327] arm64/sme: Restore SME registers on exit from suspend The fields in SMCR_EL1 and SMPRI_EL1 reset to an architecturally UNKNOWN value. Since we do not otherwise manage the traps configured in this register at runtime we need to reconfigure them after a suspend in case nothing else was kind enough to preserve them for us. The vector length will be restored as part of restoring the SME state for the next SME using task. Fixes: a1f4ccd25cc2 ("arm64/sme: Provide Kconfig for SME") Reported-by: Jackson Cooper-Driver Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240213-arm64-sme-resume-v3-1-17e05e493471@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/fpsimd.h | 2 ++ arch/arm64/kernel/fpsimd.c | 14 ++++++++++++++ arch/arm64/kernel/suspend.c | 3 +++ 3 files changed, 19 insertions(+) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 481d94416d69..b67b89c54e1c 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -386,6 +386,7 @@ extern void sme_alloc(struct task_struct *task, bool flush); extern unsigned int sme_get_vl(void); extern int sme_set_current_vl(unsigned long arg); extern int sme_get_current_vl(void); +extern void sme_suspend_exit(void); /* * Return how many bytes of memory are required to store the full SME @@ -421,6 +422,7 @@ static inline int sme_max_vl(void) { return 0; } static inline int sme_max_virtualisable_vl(void) { return 0; } static inline int sme_set_current_vl(unsigned long arg) { return -EINVAL; } static inline int sme_get_current_vl(void) { return -EINVAL; } +static inline void sme_suspend_exit(void) { } static inline size_t sme_state_size(struct task_struct const *task) { diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 25ceaee6b025..d52592088afa 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1311,6 +1311,20 @@ void __init sme_setup(void) get_sme_default_vl()); } +void sme_suspend_exit(void) +{ + u64 smcr = 0; + + if (!system_supports_sme()) + return; + + if (system_supports_fa64()) + smcr |= SMCR_ELx_FA64; + + write_sysreg_s(smcr, SYS_SMCR_EL1); + write_sysreg_s(0, SYS_SMPRI_EL1); +} + #endif /* CONFIG_ARM64_SME */ static void sve_init_regs(void) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index eca4d0435211..eaaff94329cd 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,8 @@ void notrace __cpu_suspend_exit(void) */ spectre_v4_enable_mitigation(NULL); + sme_suspend_exit(); + /* Restore additional feature-specific configuration */ ptrauth_suspend_exit(); } From d7b77a0d565b048cb0808fa8a4fb031352b22a01 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 13 Feb 2024 23:06:33 +0000 Subject: [PATCH 180/327] arm64/sme: Restore SMCR_EL1.EZT0 on exit from suspend The fields in SMCR_EL1 reset to an architecturally UNKNOWN value. Since we do not otherwise manage the traps configured in this register at runtime we need to reconfigure them after a suspend in case nothing else was kind enough to preserve them for us. Do so for SMCR_EL1.EZT0. Fixes: d4913eee152d ("arm64/sme: Add basic enumeration for SME2") Reported-by: Jackson Cooper-Driver Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240213-arm64-sme-resume-v3-2-17e05e493471@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index d52592088afa..f27acca550d5 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1320,6 +1320,8 @@ void sme_suspend_exit(void) if (system_supports_fa64()) smcr |= SMCR_ELx_FA64; + if (system_supports_sme2()) + smcr |= SMCR_ELx_EZT0; write_sysreg_s(smcr, SYS_SMCR_EL1); write_sysreg_s(0, SYS_SMPRI_EL1); From bd8905d70944aae5063fd91c667e6f846ee92718 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 16 Feb 2024 21:17:18 +0100 Subject: [PATCH 181/327] platform/x86: x86-android-tablets: Fix keyboard touchscreen on Lenovo Yogabook1 X90 After commit 4014ae236b1d ("platform/x86: x86-android-tablets: Stop using gpiolib private APIs") the touchscreen in the keyboard half of the Lenovo Yogabook1 X90 stopped working with the following error: Goodix-TS i2c-goodix_ts: error -EBUSY: Failed to get irq GPIO The problem is that when getting the IRQ for instantiated i2c_client-s from a GPIO (rather then using an IRQ directly from the IOAPIC), x86_acpi_irq_helper_get() now properly requests the GPIO, which disallows other drivers from requesting it. Normally this is a good thing, but the goodix touchscreen also uses the IRQ as an output during reset to select which of its 2 possible I2C addresses should be used. Add a new free_gpio flag to struct x86_acpi_irq_data to deal with this and release the GPIO after getting the IRQ in this special case. Fixes: 4014ae236b1d ("platform/x86: x86-android-tablets: Stop using gpiolib private APIs") Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240216201721.239791-2-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets/core.c | 3 +++ drivers/platform/x86/x86-android-tablets/lenovo.c | 1 + drivers/platform/x86/x86-android-tablets/x86-android-tablets.h | 1 + 3 files changed, 5 insertions(+) diff --git a/drivers/platform/x86/x86-android-tablets/core.c b/drivers/platform/x86/x86-android-tablets/core.c index f8221a15575b..f6547c9d7584 100644 --- a/drivers/platform/x86/x86-android-tablets/core.c +++ b/drivers/platform/x86/x86-android-tablets/core.c @@ -113,6 +113,9 @@ int x86_acpi_irq_helper_get(const struct x86_acpi_irq_data *data) if (irq_type != IRQ_TYPE_NONE && irq_type != irq_get_trigger_type(irq)) irq_set_irq_type(irq, irq_type); + if (data->free_gpio) + devm_gpiod_put(&x86_android_tablet_device->dev, gpiod); + return irq; case X86_ACPI_IRQ_TYPE_PMIC: status = acpi_get_handle(NULL, data->chip, &handle); diff --git a/drivers/platform/x86/x86-android-tablets/lenovo.c b/drivers/platform/x86/x86-android-tablets/lenovo.c index f1c66a61bfc5..c297391955ad 100644 --- a/drivers/platform/x86/x86-android-tablets/lenovo.c +++ b/drivers/platform/x86/x86-android-tablets/lenovo.c @@ -116,6 +116,7 @@ static const struct x86_i2c_client_info lenovo_yb1_x90_i2c_clients[] __initconst .trigger = ACPI_EDGE_SENSITIVE, .polarity = ACPI_ACTIVE_LOW, .con_id = "goodix_ts_irq", + .free_gpio = true, }, }, { /* Wacom Digitizer in keyboard half */ diff --git a/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h b/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h index 49fed9410adb..468993edfeee 100644 --- a/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h +++ b/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h @@ -39,6 +39,7 @@ struct x86_acpi_irq_data { int index; int trigger; /* ACPI_EDGE_SENSITIVE / ACPI_LEVEL_SENSITIVE */ int polarity; /* ACPI_ACTIVE_HIGH / ACPI_ACTIVE_LOW / ACPI_ACTIVE_BOTH */ + bool free_gpio; /* Release GPIO after getting IRQ (for TYPE_GPIOINT) */ const char *con_id; }; From dc5afd720f84de3c1f5d700eb0b858006a2dc468 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 16 Feb 2024 21:17:19 +0100 Subject: [PATCH 182/327] platform/x86: Add new get_serdev_controller() helper In some cases UART attached devices which require an in kernel driver, e.g. UART attached Bluetooth HCIs are described in the ACPI tables by an ACPI device with a broken or missing UartSerialBusV2() resource. This causes the kernel to create a /dev/ttyS# char-device for the UART instead of creating an in kernel serdev-controller + serdev-device pair for the in kernel driver. The quirk handling in acpi_quirk_skip_serdev_enumeration() makes the kernel create a serdev-controller device for these UARTs instead of a /dev/ttyS#. Instantiating the actual serdev-device to bind to is up to pdx86 code, so far this was handled by the x86-android-tablets code. But since commit b286f4e87e32 ("serial: core: Move tty and serdev to be children of serial core port device") the serdev-controller device has moved in the device hierarchy from (e.g.) /sys/devices/pci0000:00/8086228A:00/serial0 to /sys/devices/pci0000:00/8086228A:00/8086228A:00:0/8086228A:00:0.0/serial0 . This makes this a bit trickier to do and another driver is in the works which will also need this functionality. Add a new helper to get the serdev-controller device, so that the new code for this can be shared. Fixes: b286f4e87e32 ("serial: core: Move tty and serdev to be children of serial core port device") Cc: Tony Lindgren Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240216201721.239791-3-hdegoede@redhat.com --- drivers/platform/x86/serdev_helpers.h | 80 +++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 drivers/platform/x86/serdev_helpers.h diff --git a/drivers/platform/x86/serdev_helpers.h b/drivers/platform/x86/serdev_helpers.h new file mode 100644 index 000000000000..bcf3a0c356ea --- /dev/null +++ b/drivers/platform/x86/serdev_helpers.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * In some cases UART attached devices which require an in kernel driver, + * e.g. UART attached Bluetooth HCIs are described in the ACPI tables + * by an ACPI device with a broken or missing UartSerialBusV2() resource. + * + * This causes the kernel to create a /dev/ttyS# char-device for the UART + * instead of creating an in kernel serdev-controller + serdev-device pair + * for the in kernel driver. + * + * The quirk handling in acpi_quirk_skip_serdev_enumeration() makes the kernel + * create a serdev-controller device for these UARTs instead of a /dev/ttyS#. + * + * Instantiating the actual serdev-device to bind to is up to pdx86 code, + * this header provides a helper for getting the serdev-controller device. + */ +#include +#include +#include +#include +#include +#include + +static inline struct device * +get_serdev_controller(const char *serial_ctrl_hid, + const char *serial_ctrl_uid, + int serial_ctrl_port, + const char *serdev_ctrl_name) +{ + struct device *ctrl_dev, *child; + struct acpi_device *ctrl_adev; + char name[32]; + int i; + + ctrl_adev = acpi_dev_get_first_match_dev(serial_ctrl_hid, serial_ctrl_uid, -1); + if (!ctrl_adev) { + pr_err("error could not get %s/%s serial-ctrl adev\n", + serial_ctrl_hid, serial_ctrl_uid); + return ERR_PTR(-ENODEV); + } + + /* get_first_physical_node() returns a weak ref */ + ctrl_dev = get_device(acpi_get_first_physical_node(ctrl_adev)); + if (!ctrl_dev) { + pr_err("error could not get %s/%s serial-ctrl physical node\n", + serial_ctrl_hid, serial_ctrl_uid); + ctrl_dev = ERR_PTR(-ENODEV); + goto put_ctrl_adev; + } + + /* Walk host -> uart-ctrl -> port -> serdev-ctrl */ + for (i = 0; i < 3; i++) { + switch (i) { + case 0: + snprintf(name, sizeof(name), "%s:0", dev_name(ctrl_dev)); + break; + case 1: + snprintf(name, sizeof(name), "%s.%d", + dev_name(ctrl_dev), serial_ctrl_port); + break; + case 2: + strscpy(name, serdev_ctrl_name, sizeof(name)); + break; + } + + child = device_find_child_by_name(ctrl_dev, name); + put_device(ctrl_dev); + if (!child) { + pr_err("error could not find '%s' device\n", name); + ctrl_dev = ERR_PTR(-ENODEV); + goto put_ctrl_adev; + } + + ctrl_dev = child; + } + +put_ctrl_adev: + acpi_dev_put(ctrl_adev); + return ctrl_dev; +} From 812a79b52b92345d777b6377fa538747d366b6ce Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 16 Feb 2024 21:17:20 +0100 Subject: [PATCH 183/327] platform/x86: x86-android-tablets: Fix serdev instantiation no longer working After commit b286f4e87e32 ("serial: core: Move tty and serdev to be children of serial core port device") x86_instantiate_serdev() no longer works due to the serdev-controller-device moving in the device hierarchy from (e.g.) /sys/devices/pci0000:00/8086228A:00/serial0 to /sys/devices/pci0000:00/8086228A:00/8086228A:00:0/8086228A:00:0.0/serial0 Use the new get_serdev_controller() helper function to fix this. Fixes: b286f4e87e32 ("serial: core: Move tty and serdev to be children of serial core port device") Cc: Tony Lindgren Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240216201721.239791-4-hdegoede@redhat.com --- .../platform/x86/x86-android-tablets/core.c | 35 +++++-------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets/core.c b/drivers/platform/x86/x86-android-tablets/core.c index f6547c9d7584..a3415f1c0b5f 100644 --- a/drivers/platform/x86/x86-android-tablets/core.c +++ b/drivers/platform/x86/x86-android-tablets/core.c @@ -21,6 +21,7 @@ #include #include "x86-android-tablets.h" +#include "../serdev_helpers.h" static struct platform_device *x86_android_tablet_device; @@ -232,38 +233,20 @@ static __init int x86_instantiate_spi_dev(const struct x86_dev_info *dev_info, i static __init int x86_instantiate_serdev(const struct x86_serdev_info *info, int idx) { - struct acpi_device *ctrl_adev, *serdev_adev; + struct acpi_device *serdev_adev; struct serdev_device *serdev; struct device *ctrl_dev; int ret = -ENODEV; - ctrl_adev = acpi_dev_get_first_match_dev(info->ctrl_hid, info->ctrl_uid, -1); - if (!ctrl_adev) { - pr_err("error could not get %s/%s ctrl adev\n", - info->ctrl_hid, info->ctrl_uid); - return -ENODEV; - } + ctrl_dev = get_serdev_controller(info->ctrl_hid, info->ctrl_uid, 0, + info->ctrl_devname); + if (IS_ERR(ctrl_dev)) + return PTR_ERR(ctrl_dev); serdev_adev = acpi_dev_get_first_match_dev(info->serdev_hid, NULL, -1); if (!serdev_adev) { pr_err("error could not get %s serdev adev\n", info->serdev_hid); - goto put_ctrl_adev; - } - - /* get_first_physical_node() returns a weak ref, no need to put() it */ - ctrl_dev = acpi_get_first_physical_node(ctrl_adev); - if (!ctrl_dev) { - pr_err("error could not get %s/%s ctrl physical dev\n", - info->ctrl_hid, info->ctrl_uid); - goto put_serdev_adev; - } - - /* ctrl_dev now points to the controller's parent, get the controller */ - ctrl_dev = device_find_child_by_name(ctrl_dev, info->ctrl_devname); - if (!ctrl_dev) { - pr_err("error could not get %s/%s %s ctrl dev\n", - info->ctrl_hid, info->ctrl_uid, info->ctrl_devname); - goto put_serdev_adev; + goto put_ctrl_dev; } serdev = serdev_device_alloc(to_serdev_controller(ctrl_dev)); @@ -286,8 +269,8 @@ static __init int x86_instantiate_serdev(const struct x86_serdev_info *info, int put_serdev_adev: acpi_dev_put(serdev_adev); -put_ctrl_adev: - acpi_dev_put(ctrl_adev); +put_ctrl_dev: + put_device(ctrl_dev); return ret; } From 8215ca518164d35f10c0b5545c8bb80f538638b8 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 16 Feb 2024 21:17:21 +0100 Subject: [PATCH 184/327] platform/x86: x86-android-tablets: Fix acer_b1_750_goodix_gpios name The Acer B1 750 tablet used a Novatek NVT-ts touchscreen, not a Goodix touchscreen. Rename acer_b1_750_goodix_gpios to acer_b1_750_nvt_ts_gpios to correctly reflect this. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240216201721.239791-5-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets/other.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets/other.c b/drivers/platform/x86/x86-android-tablets/other.c index bc6bbf7ec6ea..278402dcb808 100644 --- a/drivers/platform/x86/x86-android-tablets/other.c +++ b/drivers/platform/x86/x86-android-tablets/other.c @@ -68,7 +68,7 @@ static const struct x86_i2c_client_info acer_b1_750_i2c_clients[] __initconst = }, }; -static struct gpiod_lookup_table acer_b1_750_goodix_gpios = { +static struct gpiod_lookup_table acer_b1_750_nvt_ts_gpios = { .dev_id = "i2c-NVT-ts", .table = { GPIO_LOOKUP("INT33FC:01", 26, "reset", GPIO_ACTIVE_LOW), @@ -77,7 +77,7 @@ static struct gpiod_lookup_table acer_b1_750_goodix_gpios = { }; static struct gpiod_lookup_table * const acer_b1_750_gpios[] = { - &acer_b1_750_goodix_gpios, + &acer_b1_750_nvt_ts_gpios, &int3496_reference_gpios, NULL }; From 84c16d01ff219bc0a5dca5219db6b8b86a6854fb Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 16 Feb 2024 21:33:00 +0100 Subject: [PATCH 185/327] platform/x86: intel-vbtn: Stop calling "VBDL" from notify_handler Commit 14c200b7ca46 ("platform/x86: intel-vbtn: Fix missing tablet-mode-switch events") causes 2 issues on the ThinkPad X1 Tablet Gen2: 1. The ThinkPad will wake up immediately from suspend 2. When put in tablet mode SW_TABLET_MODE reverts to 0 after about 1 second Both these issues are caused by the "VBDL" ACPI method call added at the end of the notify_handler. And it never became entirely clear if this call is even necessary to fix the issue of missing tablet-mode-switch events on the Dell Inspiron 7352. Drop the "VBDL" ACPI method call again to fix the 2 issues this is causing on the ThinkPad X1 Tablet Gen2. Fixes: 14c200b7ca46 ("platform/x86: intel-vbtn: Fix missing tablet-mode-switch events") Reported-by: Alexander Kobel Closes: https://lore.kernel.org/platform-driver-x86/295984ce-bd4b-49bd-adc5-ffe7c898d7f0@a-kobel.de/ Cc: regressions@lists.linux.dev Cc: Arnold Gozum Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Tested-by: Alexander Kobel Link: https://lore.kernel.org/r/20240216203300.245826-1-hdegoede@redhat.com --- drivers/platform/x86/intel/vbtn.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/platform/x86/intel/vbtn.c b/drivers/platform/x86/intel/vbtn.c index 210b0a81b7ec..084c355c86f5 100644 --- a/drivers/platform/x86/intel/vbtn.c +++ b/drivers/platform/x86/intel/vbtn.c @@ -200,9 +200,6 @@ static void notify_handler(acpi_handle handle, u32 event, void *context) autorelease = val && (!ke_rel || ke_rel->type == KE_IGNORE); sparse_keymap_report_event(input_dev, event, val, autorelease); - - /* Some devices need this to report further events */ - acpi_evaluate_object(handle, "VBDL", NULL, NULL); } /* From 427c70dec738318b7f71e1b9d829ff0e9771d493 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 16 Feb 2024 20:23:11 -0600 Subject: [PATCH 186/327] platform/x86: thinkpad_acpi: Only update profile if successfully converted Randomly a Lenovo Z13 will trigger a kernel warning traceback from this condition: ``` if (WARN_ON((profile < 0) || (profile >= ARRAY_SIZE(profile_names)))) ``` This happens because thinkpad-acpi always assumes that convert_dytc_to_profile() successfully updated the profile. On the contrary a condition can occur that when dytc_profile_refresh() is called the profile doesn't get updated as there is a -EOPNOTSUPP branch. Catch this situation and avoid updating the profile. Also log this into dynamic debugging in case any other modes should be added in the future. Fixes: c3bfcd4c6762 ("platform/x86: thinkpad_acpi: Add platform profile support") Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240217022311.113879-1-mario.limonciello@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index c4895e9bc714..5ecd9d33250d 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -10308,6 +10308,7 @@ static int convert_dytc_to_profile(int funcmode, int dytcmode, return 0; default: /* Unknown function */ + pr_debug("unknown function 0x%x\n", funcmode); return -EOPNOTSUPP; } return 0; @@ -10493,8 +10494,8 @@ static void dytc_profile_refresh(void) return; perfmode = (output >> DYTC_GET_MODE_BIT) & 0xF; - convert_dytc_to_profile(funcmode, perfmode, &profile); - if (profile != dytc_current_profile) { + err = convert_dytc_to_profile(funcmode, perfmode, &profile); + if (!err && profile != dytc_current_profile) { dytc_current_profile = profile; platform_profile_notify(); } From eb0d253ff9c74dee30aa92fe460b825eb28acd73 Mon Sep 17 00:00:00 2001 From: Andrzej Kacprowski Date: Tue, 20 Feb 2024 14:16:24 +0100 Subject: [PATCH 187/327] accel/ivpu: Don't enable any tiles by default on VPU40xx There is no point in requesting 1 tile on VPU40xx as the FW will probably need more tiles to run workloads, so it will have to reconfigure PLL anyway. Don't enable any tiles and allow the FW to perform initial tile configuration. This improves NPU boot stability as the tiles are always enabled only by the FW from the same initial state. Fixes: 79cdc56c4a54 ("accel/ivpu: Add initial support for VPU 4") Cc: stable@vger.kernel.org Signed-off-by: Andrzej Kacprowski Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240220131624.1447813-1-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_hw_40xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_hw_40xx.c b/drivers/accel/ivpu/ivpu_hw_40xx.c index 1c995307c113..a1523d0b1ef3 100644 --- a/drivers/accel/ivpu/ivpu_hw_40xx.c +++ b/drivers/accel/ivpu/ivpu_hw_40xx.c @@ -24,7 +24,7 @@ #define SKU_HW_ID_SHIFT 16u #define SKU_HW_ID_MASK 0xffff0000u -#define PLL_CONFIG_DEFAULT 0x1 +#define PLL_CONFIG_DEFAULT 0x0 #define PLL_CDYN_DEFAULT 0x80 #define PLL_EPP_DEFAULT 0x80 #define PLL_REF_CLK_FREQ (50 * 1000000) From fdf87a0dc26d0550c60edc911cda42f9afec3557 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 5 Feb 2024 11:23:34 +0100 Subject: [PATCH 188/327] ARM: ep93xx: Add terminator to gpiod_lookup_table Without the terminator, if a con_id is passed to gpio_find() that does not exist in the lookup table the function will not stop looping correctly, and eventually cause an oops. Cc: stable@vger.kernel.org Fixes: b2e63555592f ("i2c: gpio: Convert to use descriptors") Reported-by: Andy Shevchenko Signed-off-by: Nikita Shubin Reviewed-by: Linus Walleij Acked-by: Alexander Sverdlin Signed-off-by: Alexander Sverdlin Link: https://lore.kernel.org/r/20240205102337.439002-1-alexander.sverdlin@gmail.com Signed-off-by: Arnd Bergmann --- arch/arm/mach-ep93xx/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-ep93xx/core.c b/arch/arm/mach-ep93xx/core.c index 71b113976420..8b1ec60a9a46 100644 --- a/arch/arm/mach-ep93xx/core.c +++ b/arch/arm/mach-ep93xx/core.c @@ -339,6 +339,7 @@ static struct gpiod_lookup_table ep93xx_i2c_gpiod_table = { GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN), GPIO_LOOKUP_IDX("G", 0, NULL, 1, GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN), + { } }, }; From 944d5fe50f3f03daacfea16300e656a1691c4a23 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 4 Feb 2024 15:25:12 +0000 Subject: [PATCH 189/327] sched/membarrier: reduce the ability to hammer on sys_membarrier On some systems, sys_membarrier can be very expensive, causing overall slowdowns for everything. So put a lock on the path in order to serialize the accesses to prevent the ability for this to be called at too high of a frequency and saturate the machine. Signed-off-by: Greg Kroah-Hartman Reviewed-and-tested-by: Mathieu Desnoyers Acked-by: Borislav Petkov Fixes: 22e4ebb97582 ("membarrier: Provide expedited private command") Fixes: c5f58bd58f43 ("membarrier: Provide GLOBAL_EXPEDITED command") Signed-off-by: Linus Torvalds --- kernel/sched/membarrier.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 2ad881d07752..4e715b9b278e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -162,6 +162,9 @@ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ | MEMBARRIER_CMD_GET_REGISTRATIONS) +static DEFINE_MUTEX(membarrier_ipi_mutex); +#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex) + static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. */ @@ -259,6 +262,7 @@ static int membarrier_global_expedited(void) if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; + SERIALIZE_IPI(); cpus_read_lock(); rcu_read_lock(); for_each_online_cpu(cpu) { @@ -347,6 +351,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; + SERIALIZE_IPI(); cpus_read_lock(); if (cpu_id >= 0) { @@ -460,6 +465,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) * between threads which are users of @mm has its membarrier state * updated. */ + SERIALIZE_IPI(); cpus_read_lock(); rcu_read_lock(); for_each_online_cpu(cpu) { From c88f5e553fe38b2ffc4c33d08654e5281b297677 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 19 Feb 2024 21:27:39 +0100 Subject: [PATCH 190/327] dm-integrity: recheck the integrity tag after a failure If a userspace process reads (with O_DIRECT) multiple blocks into the same buffer, dm-integrity reports an error [1]. The error is reported in a log and it may cause RAID leg being kicked out of the array. This commit fixes dm-integrity, so that if integrity verification fails, the data is read again into a kernel buffer (where userspace can't modify it) and the integrity tag is rechecked. If the recheck succeeds, the content of the kernel buffer is copied into the user buffer; if the recheck fails, an integrity error is reported. [1] https://people.redhat.com/~mpatocka/testcases/blk-auth-modify/read2.c Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 93 +++++++++++++++++++++++++++++++++++---- 1 file changed, 84 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index c5f03aab4552..e46c798c5c36 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -278,6 +278,8 @@ struct dm_integrity_c { atomic64_t number_of_mismatches; + mempool_t recheck_pool; + struct notifier_block reboot_notifier; }; @@ -1689,6 +1691,79 @@ failed: get_random_bytes(result, ic->tag_size); } +static void integrity_recheck(struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + struct dm_integrity_c *ic = dio->ic; + struct bvec_iter iter; + struct bio_vec bv; + sector_t sector, logical_sector, area, offset; + char checksum_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + struct page *page; + void *buffer; + + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, + &dio->metadata_offset); + sector = get_data_sector(ic, area, offset); + logical_sector = dio->range.logical_sector; + + page = mempool_alloc(&ic->recheck_pool, GFP_NOIO); + buffer = page_to_virt(page); + + __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { + unsigned pos = 0; + + do { + char *mem; + int r; + struct dm_io_request io_req; + struct dm_io_region io_loc; + io_req.bi_opf = REQ_OP_READ; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = buffer; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = sector; + io_loc.count = ic->sectors_per_block; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dio->bi_status = errno_to_blk_status(r); + goto free_ret; + } + + integrity_sector_checksum(ic, logical_sector, buffer, + checksum_onstack); + r = dm_integrity_rw_tag(ic, checksum_onstack, &dio->metadata_block, + &dio->metadata_offset, ic->tag_size, TAG_CMP); + if (r) { + if (r > 0) { + DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", + bio->bi_bdev, logical_sector); + atomic64_inc(&ic->number_of_mismatches); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", + bio, logical_sector, 0); + r = -EILSEQ; + } + dio->bi_status = errno_to_blk_status(r); + goto free_ret; + } + + mem = bvec_kmap_local(&bv); + memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT); + kunmap_local(mem); + + pos += ic->sectors_per_block << SECTOR_SHIFT; + sector += ic->sectors_per_block; + logical_sector += ic->sectors_per_block; + } while (pos < bv.bv_len); + } +free_ret: + mempool_free(page, &ic->recheck_pool); +} + static void integrity_metadata(struct work_struct *w) { struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); @@ -1776,15 +1851,8 @@ again: checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE); if (unlikely(r)) { if (r > 0) { - sector_t s; - - s = sector - ((r + ic->tag_size - 1) / ic->tag_size); - DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", - bio->bi_bdev, s); - r = -EILSEQ; - atomic64_inc(&ic->number_of_mismatches); - dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", - bio, s, 0); + integrity_recheck(dio); + goto skip_io; } if (likely(checksums != checksums_onstack)) kfree(checksums); @@ -4261,6 +4329,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv goto bad; } + r = mempool_init_page_pool(&ic->recheck_pool, 1, 0); + if (r) { + ti->error = "Cannot allocate mempool"; + goto bad; + } + ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); if (!ic->metadata_wq) { @@ -4609,6 +4683,7 @@ static void dm_integrity_dtr(struct dm_target *ti) kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); + mempool_exit(&ic->recheck_pool); mempool_exit(&ic->journal_io_mempool); if (ic->io) dm_io_client_destroy(ic->io); From 9177f3c0dea6143d05cac1bbd28668fd0e216d11 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 19 Feb 2024 21:28:09 +0100 Subject: [PATCH 191/327] dm-verity: recheck the hash after a failure If a userspace process reads (with O_DIRECT) multiple blocks into the same buffer, dm-verity reports an error [1]. This commit fixes dm-verity, so that if hash verification fails, the data is read again into a kernel buffer (where userspace can't modify it) and the hash is rechecked. If the recheck succeeds, the content of the kernel buffer is copied into the user buffer; if the recheck fails, an error is reported. [1] https://people.redhat.com/~mpatocka/testcases/blk-auth-modify/read2.c Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-target.c | 86 ++++++++++++++++++++++++++++++++--- drivers/md/dm-verity.h | 6 +++ 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 82662f5769c4..224469e1efbc 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -482,6 +482,63 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, return 0; } +static int verity_recheck_copy(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + memcpy(data, io->recheck_buffer, len); + io->recheck_buffer += len; + + return 0; +} + +static int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter start, sector_t cur_block) +{ + struct page *page; + void *buffer; + int r; + struct dm_io_request io_req; + struct dm_io_region io_loc; + + page = mempool_alloc(&v->recheck_pool, GFP_NOIO); + buffer = page_to_virt(page); + + io_req.bi_opf = REQ_OP_READ; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = buffer; + io_req.notify.fn = NULL; + io_req.client = v->io; + io_loc.bdev = v->data_dev->bdev; + io_loc.sector = cur_block << (v->data_dev_block_bits - SECTOR_SHIFT); + io_loc.count = 1 << (v->data_dev_block_bits - SECTOR_SHIFT); + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) + goto free_ret; + + r = verity_hash(v, verity_io_hash_req(v, io), buffer, + 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io), true); + if (unlikely(r)) + goto free_ret; + + if (memcmp(verity_io_real_digest(v, io), + verity_io_want_digest(v, io), v->digest_size)) { + r = -EIO; + goto free_ret; + } + + io->recheck_buffer = buffer; + r = verity_for_bv_block(v, io, &start, verity_recheck_copy); + if (unlikely(r)) + goto free_ret; + + r = 0; +free_ret: + mempool_free(page, &v->recheck_pool); + + return r; +} + static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, u8 *data, size_t len) { @@ -508,9 +565,7 @@ static int verity_verify_io(struct dm_verity_io *io) { bool is_zero; struct dm_verity *v = io->v; -#if defined(CONFIG_DM_VERITY_FEC) struct bvec_iter start; -#endif struct bvec_iter iter_copy; struct bvec_iter *iter; struct crypto_wait wait; @@ -561,10 +616,7 @@ static int verity_verify_io(struct dm_verity_io *io) if (unlikely(r < 0)) return r; -#if defined(CONFIG_DM_VERITY_FEC) - if (verity_fec_is_enabled(v)) - start = *iter; -#endif + start = *iter; r = verity_for_io_block(v, io, iter, &wait); if (unlikely(r < 0)) return r; @@ -586,6 +638,10 @@ static int verity_verify_io(struct dm_verity_io *io) * tasklet since it may sleep, so fallback to work-queue. */ return -EAGAIN; + } else if (verity_recheck(v, io, start, cur_block) == 0) { + if (v->validated_blocks) + set_bit(cur_block, v->validated_blocks); + continue; #if defined(CONFIG_DM_VERITY_FEC) } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, cur_block, NULL, &start) == 0) { @@ -941,6 +997,10 @@ static void verity_dtr(struct dm_target *ti) if (v->verify_wq) destroy_workqueue(v->verify_wq); + mempool_exit(&v->recheck_pool); + if (v->io) + dm_io_client_destroy(v->io); + if (v->bufio) dm_bufio_client_destroy(v->bufio); @@ -1379,6 +1439,20 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } v->hash_blocks = hash_position; + r = mempool_init_page_pool(&v->recheck_pool, 1, 0); + if (unlikely(r)) { + ti->error = "Cannot allocate mempool"; + goto bad; + } + + v->io = dm_io_client_create(); + if (IS_ERR(v->io)) { + r = PTR_ERR(v->io); + v->io = NULL; + ti->error = "Cannot allocate dm io"; + goto bad; + } + v->bufio = dm_bufio_client_create(v->hash_dev->bdev, 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), dm_bufio_alloc_callback, NULL, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index f3f607008419..4620a98c9956 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -11,6 +11,7 @@ #ifndef DM_VERITY_H #define DM_VERITY_H +#include #include #include #include @@ -68,6 +69,9 @@ struct dm_verity { unsigned long *validated_blocks; /* bitset blocks validated */ char *signature_key_desc; /* signature keyring reference */ + + struct dm_io_client *io; + mempool_t recheck_pool; }; struct dm_verity_io { @@ -84,6 +88,8 @@ struct dm_verity_io { struct work_struct work; + char *recheck_buffer; + /* * Three variably-size fields follow this struct: * From 50c70240097ce41fe6bce6478b80478281e4d0f7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 19 Feb 2024 21:30:10 +0100 Subject: [PATCH 192/327] dm-crypt: don't modify the data when using authenticated encryption It was said that authenticated encryption could produce invalid tag when the data that is being encrypted is modified [1]. So, fix this problem by copying the data into the clone bio first and then encrypt them inside the clone bio. This may reduce performance, but it is needed to prevent the user from corrupting the device by writing data with O_DIRECT and modifying them at the same time. [1] https://lore.kernel.org/all/20240207004723.GA35324@sol.localdomain/T/ Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f745f8508243..14c5be6eda3b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2071,6 +2071,12 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) io->ctx.bio_out = clone; io->ctx.iter_out = clone->bi_iter; + if (crypt_integrity_aead(cc)) { + bio_copy_data(clone, io->base_bio); + io->ctx.bio_in = clone; + io->ctx.iter_in = clone->bi_iter; + } + sector += bio_sectors(clone); crypt_inc_pending(io); From 42e15d12070b4ff9af2b980f1b65774c2dab0507 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 19 Feb 2024 21:31:11 +0100 Subject: [PATCH 193/327] dm-crypt: recheck the integrity tag after a failure If a userspace process reads (with O_DIRECT) multiple blocks into the same buffer, dm-crypt reports an authentication error [1]. The error is reported in a log and it may cause RAID leg being kicked out of the array. This commit fixes dm-crypt, so that if integrity verification fails, the data is read again into a kernel buffer (where userspace can't modify it) and the integrity tag is rechecked. If the recheck succeeds, the content of the kernel buffer is copied into the user buffer; if the recheck fails, an integrity error is reported. [1] https://people.redhat.com/~mpatocka/testcases/blk-auth-modify/read2.c Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 89 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 14c5be6eda3b..6044614bab76 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -62,6 +62,8 @@ struct convert_context { struct skcipher_request *req; struct aead_request *req_aead; } r; + bool aead_recheck; + bool aead_failed; }; @@ -82,6 +84,8 @@ struct dm_crypt_io { blk_status_t error; sector_t sector; + struct bvec_iter saved_bi_iter; + struct rb_node rb_node; } CRYPTO_MINALIGN_ATTR; @@ -1370,10 +1374,13 @@ static int crypt_convert_block_aead(struct crypt_config *cc, if (r == -EBADMSG) { sector_t s = le64_to_cpu(*sector); - DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", - ctx->bio_in->bi_bdev, s); - dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", - ctx->bio_in, s, 0); + ctx->aead_failed = true; + if (ctx->aead_recheck) { + DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", + ctx->bio_in->bi_bdev, s); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", + ctx->bio_in, s, 0); + } } if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) @@ -1757,6 +1764,8 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, io->base_bio = bio; io->sector = sector; io->error = 0; + io->ctx.aead_recheck = false; + io->ctx.aead_failed = false; io->ctx.r.req = NULL; io->integrity_metadata = NULL; io->integrity_metadata_from_pool = false; @@ -1768,6 +1777,8 @@ static void crypt_inc_pending(struct dm_crypt_io *io) atomic_inc(&io->io_pending); } +static void kcryptd_queue_read(struct dm_crypt_io *io); + /* * One of the bios was finished. Check for completion of * the whole request and correctly clean up the buffer. @@ -1781,6 +1792,15 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (!atomic_dec_and_test(&io->io_pending)) return; + if (likely(!io->ctx.aead_recheck) && unlikely(io->ctx.aead_failed) && + cc->on_disk_tag_size && bio_data_dir(base_bio) == READ) { + io->ctx.aead_recheck = true; + io->ctx.aead_failed = false; + io->error = 0; + kcryptd_queue_read(io); + return; + } + if (io->ctx.r.req) crypt_free_req(cc, io->ctx.r.req, base_bio); @@ -1816,15 +1836,19 @@ static void crypt_endio(struct bio *clone) struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->cc; unsigned int rw = bio_data_dir(clone); - blk_status_t error; + blk_status_t error = clone->bi_status; + + if (io->ctx.aead_recheck && !error) { + kcryptd_queue_crypt(io); + return; + } /* * free the processed pages */ - if (rw == WRITE) + if (rw == WRITE || io->ctx.aead_recheck) crypt_free_buffer_pages(cc, clone); - error = clone->bi_status; bio_put(clone); if (rw == READ && !error) { @@ -1845,6 +1869,22 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) struct crypt_config *cc = io->cc; struct bio *clone; + if (io->ctx.aead_recheck) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) + return 1; + crypt_inc_pending(io); + clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); + if (unlikely(!clone)) { + crypt_dec_pending(io); + return 1; + } + clone->bi_iter.bi_sector = cc->start + io->sector; + crypt_convert_init(cc, &io->ctx, clone, clone, io->sector); + io->saved_bi_iter = clone->bi_iter; + dm_submit_bio_remap(io->base_bio, clone); + return 0; + } + /* * We need the original biovec array in order to decrypt the whole bio * data *afterwards* -- thanks to immutable biovecs we don't need to @@ -2113,6 +2153,14 @@ dec: static void kcryptd_crypt_read_done(struct dm_crypt_io *io) { + if (io->ctx.aead_recheck) { + if (!io->error) { + io->ctx.bio_in->bi_iter = io->saved_bi_iter; + bio_copy_data(io->base_bio, io->ctx.bio_in); + } + crypt_free_buffer_pages(io->cc, io->ctx.bio_in); + bio_put(io->ctx.bio_in); + } crypt_dec_pending(io); } @@ -2142,11 +2190,17 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) crypt_inc_pending(io); - crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, - io->sector); + if (io->ctx.aead_recheck) { + io->ctx.cc_sector = io->sector + cc->iv_offset; + r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); + } else { + crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, + io->sector); - r = crypt_convert(cc, &io->ctx, - test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); + r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); + } /* * Crypto API backlogged the request, because its queue was full * and we're in softirq context, so continue from a workqueue @@ -2188,10 +2242,13 @@ static void kcryptd_async_done(void *data, int error) if (error == -EBADMSG) { sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)); - DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", - ctx->bio_in->bi_bdev, s); - dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", - ctx->bio_in, s, 0); + ctx->aead_failed = true; + if (ctx->aead_recheck) { + DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", + ctx->bio_in->bi_bdev, s); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", + ctx->bio_in, s, 0); + } io->error = BLK_STS_PROTECTION; } else if (error < 0) io->error = BLK_STS_IOERR; @@ -3116,7 +3173,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar sval = strchr(opt_string + strlen("integrity:"), ':') + 1; if (!strcasecmp(sval, "aead")) { set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); - } else if (strcasecmp(sval, "none")) { + } else if (strcasecmp(sval, "none")) { ti->error = "Unknown integrity profile"; return -EINVAL; } From 787f1b2800464aa277236a66eb3c279535edd460 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 20 Feb 2024 19:11:51 +0100 Subject: [PATCH 194/327] dm-verity, dm-crypt: align "struct bvec_iter" correctly "struct bvec_iter" is defined with the __packed attribute, so it is aligned on a single byte. On X86 (and on other architectures that support unaligned addresses in hardware), "struct bvec_iter" is accessed using the 8-byte and 4-byte memory instructions, however these instructions are less efficient if they operate on unaligned addresses. (on RISC machines that don't have unaligned access in hardware, GCC generates byte-by-byte accesses that are very inefficient - see [1]) This commit reorders the entries in "struct dm_verity_io" and "struct convert_context", so that "struct bvec_iter" is aligned on 8 bytes. [1] https://lore.kernel.org/all/ZcLuWUNRZadJr0tQ@fedora/T/ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 4 ++-- drivers/md/dm-verity.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6044614bab76..f54cd6714af3 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -53,11 +53,11 @@ struct convert_context { struct completion restart; struct bio *bio_in; - struct bio *bio_out; struct bvec_iter iter_in; + struct bio *bio_out; struct bvec_iter iter_out; - u64 cc_sector; atomic_t cc_pending; + u64 cc_sector; union { struct skcipher_request *req; struct aead_request *req_aead; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 4620a98c9956..db93a91169d5 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -80,12 +80,12 @@ struct dm_verity_io { /* original value of bio->bi_end_io */ bio_end_io_t *orig_bi_end_io; + struct bvec_iter iter; + sector_t block; unsigned int n_blocks; bool in_tasklet; - struct bvec_iter iter; - struct work_struct work; char *recheck_buffer; From 0e0c50e85a364bb7f1c52c84affe7f9e88c57da7 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 20 Feb 2024 13:32:52 -0500 Subject: [PATCH 195/327] dm-crypt, dm-integrity, dm-verity: bump target version Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-verity-target.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f54cd6714af3..59445763e55a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3702,7 +3702,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 24, 0}, + .version = {1, 25, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index e46c798c5c36..44a1dcc06dd7 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4736,7 +4736,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 224469e1efbc..06c8b637849c 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1560,7 +1560,7 @@ int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest, unsigned i static struct target_type verity_target = { .name = "verity", .features = DM_TARGET_IMMUTABLE, - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, From fca7526b7d8910c6125cb1ebc3e78ccd5f50ec52 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 20 Feb 2024 12:16:47 -0800 Subject: [PATCH 196/327] drm/tests/drm_buddy: fix build failure on 32-bit targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guenter Roeck reports that commit a64056bb5a32 ("drm/tests/drm_buddy: add alloc_contiguous test") causes build failures on 32-bit targets: "This patch breaks the build on all 32-bit systems since it introduces an unhandled direct 64-bit divide operation. ERROR: modpost: "__umoddi3" [drivers/gpu/drm/tests/drm_buddy_test.ko] undefined! ERROR: modpost: "__moddi3" [drivers/gpu/drm/tests/drm_buddy_test.ko] undefined!" and the uses of 'u64' are all entirely pointless. Yes, the arguments to drm_buddy_init() and drm_buddy_alloc_blocks() are in fact of type 'u64', but none of the values here are remotely relevant, and the compiler will happily just do the type expansion. Of course, in a perfect world the compiler would also have just noticed that all the values in question are tiny, and range analysis would have shown that doing a 64-bit divide is pointless, but that is admittedly expecting a fair amount of the compiler. IOW, we shouldn't write code that the compiler then has to notice is unnecessarily complicated just to avoid extra work. We do have fairly high expectations of compilers, but kernel code should be reasonable to begin with. It turns out that there are also other issues with this code: the KUnit assertion messages have incorrect types in the format strings, but that's a widely spread issue caused by the KUnit infrastructure not having enabled format string verification. We'll get that sorted out separately. Reported-by: Guenter Roeck Fixes: a64056bb5a32 ("drm/tests/drm_buddy: add alloc_contiguous test") Link: https://lore.kernel.org/all/538327ff-8d34-41d5-a9ae-1a334744f5ae@roeck-us.net/ Cc: Matthew Auld Cc: Arunpravin Paneer Selvam Cc: Christian König Signed-off-by: Linus Torvalds --- drivers/gpu/drm/tests/drm_buddy_test.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index fee6bec757d1..8a464f7f4c61 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -21,7 +21,8 @@ static inline u64 get_size(int order, u64 chunk_size) static void drm_test_buddy_alloc_contiguous(struct kunit *test) { - u64 mm_size, ps = SZ_4K, i, n_pages, total; + const unsigned long ps = SZ_4K, mm_size = 16 * 3 * SZ_4K; + unsigned long i, n_pages, total; struct drm_buddy_block *block; struct drm_buddy mm; LIST_HEAD(left); @@ -29,8 +30,6 @@ static void drm_test_buddy_alloc_contiguous(struct kunit *test) LIST_HEAD(right); LIST_HEAD(allocated); - mm_size = 16 * 3 * SZ_4K; - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); /* From e08f65491c0e467127310b13333dd61e89c14bc1 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 13 Feb 2024 13:34:25 -0600 Subject: [PATCH 197/327] arm64: dts: freescale: Disable interrupt_map check Several Freescale Layerscape platforms extirq binding use a malformed interrupt-map property missing parent address cells. These are documented in of_irq_imap_abusers list in drivers/of/irq.c. In order to enable dtc interrupt_map check tree wide, we need to disable it for these platforms which will not be fixed (as that would break compatibility). Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20240213-arm-dt-cleanups-v1-1-f2dee1292525@kernel.org Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/freescale/Makefile | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/Makefile b/arch/arm64/boot/dts/freescale/Makefile index 2e027675d7bb..2cb0212b63c6 100644 --- a/arch/arm64/boot/dts/freescale/Makefile +++ b/arch/arm64/boot/dts/freescale/Makefile @@ -20,23 +20,41 @@ dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1046a-frwy.dtb dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1046a-qds.dtb dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1046a-rdb.dtb dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1046a-tqmls1046a-mbls10xxa.dtb +DTC_FLAGS_fsl-ls1088a-qds := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1088a-qds.dtb +DTC_FLAGS_fsl-ls1088a-rdb := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1088a-rdb.dtb +DTC_FLAGS_fsl-ls1088a-ten64 := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1088a-ten64.dtb +DTC_FLAGS_fsl-ls1088a-tqmls1088a-mbls10xxa := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1088a-tqmls1088a-mbls10xxa.dtb +DTC_FLAGS_fsl-ls2080a-qds := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls2080a-qds.dtb +DTC_FLAGS_fsl-ls2080a-rdb := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls2080a-rdb.dtb +DTC_FLAGS_fsl-ls2081a-rdb := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls2081a-rdb.dtb +DTC_FLAGS_fsl-ls2080a-simu := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls2080a-simu.dtb +DTC_FLAGS_fsl-ls2088a-qds := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls2088a-qds.dtb +DTC_FLAGS_fsl-ls2088a-rdb := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls2088a-rdb.dtb +DTC_FLAGS_fsl-lx2160a-bluebox3 := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2160a-bluebox3.dtb +DTC_FLAGS_fsl-lx2160a-bluebox3-rev-a := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2160a-bluebox3-rev-a.dtb +DTC_FLAGS_fsl-lx2160a-clearfog-cx := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2160a-clearfog-cx.dtb +DTC_FLAGS_fsl-lx2160a-honeycomb := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2160a-honeycomb.dtb +DTC_FLAGS_fsl-lx2160a-qds := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2160a-qds.dtb +DTC_FLAGS_fsl-lx2160a-rdb := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2160a-rdb.dtb +DTC_FLAGS_fsl-lx2162a-clearfog := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2162a-clearfog.dtb +DTC_FLAGS_fsl-lx2162a-qds := -Wno-interrupt_map dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2162a-qds.dtb fsl-ls1028a-qds-13bb-dtbs := fsl-ls1028a-qds.dtb fsl-ls1028a-qds-13bb.dtbo @@ -53,6 +71,7 @@ dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1028a-qds-85bb.dtb dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1028a-qds-899b.dtb dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-ls1028a-qds-9999.dtb +DTC_FLAGS_fsl-lx2160a-tqmlx2160a-mblx2160a := -Wno-interrupt_map fsl-lx2160a-tqmlx2160a-mblx2160a-12-11-x-dtbs := fsl-lx2160a-tqmlx2160a-mblx2160a.dtb \ fsl-lx2160a-tqmlx2160a-mblx2160a_12_x_x.dtbo \ fsl-lx2160a-tqmlx2160a-mblx2160a_x_11_x.dtbo From 96fd598e9c34cfa68402a4da3020c9236cfacf35 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 13 Feb 2024 13:34:26 -0600 Subject: [PATCH 198/327] arm: dts: Fix dtc interrupt_provider warnings The dtc interrupt_provider warning is off by default. Fix all the warnings so it can be enabled. Signed-off-by: Rob Herring Reviewed-by: Andrew Jeffery Reviewed-by: Alexandre Torgue Acked-by: Florian Fainelli #Broadcom Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20240213-arm-dt-cleanups-v1-2-f2dee1292525@kernel.org Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/amazon/alpine.dtsi | 1 - arch/arm/boot/dts/aspeed/aspeed-g4.dtsi | 14 -------------- arch/arm/boot/dts/aspeed/aspeed-g5.dtsi | 15 +-------------- arch/arm/boot/dts/aspeed/aspeed-g6.dtsi | 18 ++---------------- arch/arm/boot/dts/broadcom/bcm-cygnus.dtsi | 3 +++ arch/arm/boot/dts/broadcom/bcm-hr2.dtsi | 1 + arch/arm/boot/dts/broadcom/bcm-nsp.dtsi | 2 ++ arch/arm/boot/dts/marvell/kirkwood-l-50.dts | 2 ++ arch/arm/boot/dts/nuvoton/nuvoton-wpcm450.dtsi | 2 ++ .../boot/dts/nvidia/tegra30-apalis-v1.1.dtsi | 1 - arch/arm/boot/dts/nvidia/tegra30-apalis.dtsi | 1 - arch/arm/boot/dts/nvidia/tegra30-colibri.dtsi | 1 - arch/arm/boot/dts/nxp/imx/imx6q-b850v3.dts | 3 --- arch/arm/boot/dts/nxp/imx/imx6q-bx50v3.dtsi | 2 +- arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi | 1 - arch/arm/boot/dts/nxp/imx/imx6qdl-colibri.dtsi | 1 - arch/arm/boot/dts/nxp/imx/imx6qdl-emcon.dtsi | 1 - .../dts/nxp/imx/imx6qdl-phytec-pfla02.dtsi | 1 + .../nxp/imx/imx6qdl-phytec-phycore-som.dtsi | 1 + arch/arm/boot/dts/nxp/imx/imx7d-pico-dwarf.dts | 1 + .../boot/dts/nxp/vf/vf610-zii-dev-rev-b.dts | 1 + arch/arm/boot/dts/st/stm32429i-eval.dts | 1 - arch/arm/boot/dts/st/stm32mp157c-dk2.dts | 1 - .../boot/dts/ti/omap/am5729-beagleboneai.dts | 1 - 24 files changed, 18 insertions(+), 58 deletions(-) diff --git a/arch/arm/boot/dts/amazon/alpine.dtsi b/arch/arm/boot/dts/amazon/alpine.dtsi index ff68dfb4eb78..90bd12feac01 100644 --- a/arch/arm/boot/dts/amazon/alpine.dtsi +++ b/arch/arm/boot/dts/amazon/alpine.dtsi @@ -167,7 +167,6 @@ msix: msix@fbe00000 { compatible = "al,alpine-msix"; reg = <0x0 0xfbe00000 0x0 0x100000>; - interrupt-controller; msi-controller; al,msi-base-spi = <96>; al,msi-num-spis = <64>; diff --git a/arch/arm/boot/dts/aspeed/aspeed-g4.dtsi b/arch/arm/boot/dts/aspeed/aspeed-g4.dtsi index 530491ae5eb2..857cb26ed6d7 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-g4.dtsi +++ b/arch/arm/boot/dts/aspeed/aspeed-g4.dtsi @@ -466,7 +466,6 @@ i2c0: i2c-bus@40 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x40 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -482,7 +481,6 @@ i2c1: i2c-bus@80 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x80 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -498,7 +496,6 @@ i2c2: i2c-bus@c0 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0xc0 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -515,7 +512,6 @@ i2c3: i2c-bus@100 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x100 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -532,7 +528,6 @@ i2c4: i2c-bus@140 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x140 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -549,7 +544,6 @@ i2c5: i2c-bus@180 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x180 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -566,7 +560,6 @@ i2c6: i2c-bus@1c0 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x1c0 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -583,7 +576,6 @@ i2c7: i2c-bus@300 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x300 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -600,7 +592,6 @@ i2c8: i2c-bus@340 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x340 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -617,7 +608,6 @@ i2c9: i2c-bus@380 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x380 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -634,7 +624,6 @@ i2c10: i2c-bus@3c0 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x3c0 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -651,7 +640,6 @@ i2c11: i2c-bus@400 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x400 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -668,7 +656,6 @@ i2c12: i2c-bus@440 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x440 0x40>; compatible = "aspeed,ast2400-i2c-bus"; @@ -685,7 +672,6 @@ i2c13: i2c-bus@480 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x480 0x40>; compatible = "aspeed,ast2400-i2c-bus"; diff --git a/arch/arm/boot/dts/aspeed/aspeed-g5.dtsi b/arch/arm/boot/dts/aspeed/aspeed-g5.dtsi index 04f98d1dbb97..e6f3cf3c721e 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-g5.dtsi +++ b/arch/arm/boot/dts/aspeed/aspeed-g5.dtsi @@ -363,6 +363,7 @@ interrupts = <40>; reg = <0x1e780200 0x0100>; clocks = <&syscon ASPEED_CLK_APB>; + #interrupt-cells = <2>; interrupt-controller; bus-frequency = <12000000>; pinctrl-names = "default"; @@ -594,7 +595,6 @@ i2c0: i2c-bus@40 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x40 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -610,7 +610,6 @@ i2c1: i2c-bus@80 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x80 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -626,7 +625,6 @@ i2c2: i2c-bus@c0 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0xc0 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -643,7 +641,6 @@ i2c3: i2c-bus@100 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x100 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -660,7 +657,6 @@ i2c4: i2c-bus@140 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x140 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -677,7 +673,6 @@ i2c5: i2c-bus@180 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x180 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -694,7 +689,6 @@ i2c6: i2c-bus@1c0 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x1c0 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -711,7 +705,6 @@ i2c7: i2c-bus@300 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x300 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -728,7 +721,6 @@ i2c8: i2c-bus@340 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x340 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -745,7 +737,6 @@ i2c9: i2c-bus@380 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x380 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -762,7 +753,6 @@ i2c10: i2c-bus@3c0 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x3c0 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -779,7 +769,6 @@ i2c11: i2c-bus@400 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x400 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -796,7 +785,6 @@ i2c12: i2c-bus@440 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x440 0x40>; compatible = "aspeed,ast2500-i2c-bus"; @@ -813,7 +801,6 @@ i2c13: i2c-bus@480 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x480 0x40>; compatible = "aspeed,ast2500-i2c-bus"; diff --git a/arch/arm/boot/dts/aspeed/aspeed-g6.dtsi b/arch/arm/boot/dts/aspeed/aspeed-g6.dtsi index c4d1faade8be..29f94696d8b1 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-g6.dtsi +++ b/arch/arm/boot/dts/aspeed/aspeed-g6.dtsi @@ -474,6 +474,7 @@ reg = <0x1e780500 0x100>; interrupts = ; clocks = <&syscon ASPEED_CLK_APB2>; + #interrupt-cells = <2>; interrupt-controller; bus-frequency = <12000000>; pinctrl-names = "default"; @@ -488,6 +489,7 @@ reg = <0x1e780600 0x100>; interrupts = ; clocks = <&syscon ASPEED_CLK_APB2>; + #interrupt-cells = <2>; interrupt-controller; bus-frequency = <12000000>; pinctrl-names = "default"; @@ -902,7 +904,6 @@ i2c0: i2c-bus@80 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x80 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -917,7 +918,6 @@ i2c1: i2c-bus@100 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x100 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -932,7 +932,6 @@ i2c2: i2c-bus@180 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x180 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -947,7 +946,6 @@ i2c3: i2c-bus@200 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x200 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -962,7 +960,6 @@ i2c4: i2c-bus@280 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x280 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -977,7 +974,6 @@ i2c5: i2c-bus@300 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x300 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -992,7 +988,6 @@ i2c6: i2c-bus@380 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x380 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -1007,7 +1002,6 @@ i2c7: i2c-bus@400 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x400 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -1022,7 +1016,6 @@ i2c8: i2c-bus@480 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x480 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -1037,7 +1030,6 @@ i2c9: i2c-bus@500 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x500 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -1052,7 +1044,6 @@ i2c10: i2c-bus@580 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x580 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -1067,7 +1058,6 @@ i2c11: i2c-bus@600 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x600 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -1082,7 +1072,6 @@ i2c12: i2c-bus@680 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x680 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -1097,7 +1086,6 @@ i2c13: i2c-bus@700 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x700 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -1112,7 +1100,6 @@ i2c14: i2c-bus@780 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x780 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; @@ -1127,7 +1114,6 @@ i2c15: i2c-bus@800 { #address-cells = <1>; #size-cells = <0>; - #interrupt-cells = <1>; reg = <0x800 0x80>; compatible = "aspeed,ast2600-i2c-bus"; clocks = <&syscon ASPEED_CLK_APB2>; diff --git a/arch/arm/boot/dts/broadcom/bcm-cygnus.dtsi b/arch/arm/boot/dts/broadcom/bcm-cygnus.dtsi index f9f79ed82518..07ca0d993c9f 100644 --- a/arch/arm/boot/dts/broadcom/bcm-cygnus.dtsi +++ b/arch/arm/boot/dts/broadcom/bcm-cygnus.dtsi @@ -167,6 +167,7 @@ #gpio-cells = <2>; gpio-controller; interrupt-controller; + #interrupt-cells = <2>; interrupt-parent = <&mailbox>; interrupts = <0>; }; @@ -247,6 +248,7 @@ gpio-controller; interrupts = ; interrupt-controller; + #interrupt-cells = <2>; }; i2c1: i2c@1800b000 { @@ -518,6 +520,7 @@ gpio-controller; interrupt-controller; + #interrupt-cells = <2>; interrupts = ; gpio-ranges = <&pinctrl 0 42 1>, <&pinctrl 1 44 3>, diff --git a/arch/arm/boot/dts/broadcom/bcm-hr2.dtsi b/arch/arm/boot/dts/broadcom/bcm-hr2.dtsi index 788a6806191a..75545b10ef2f 100644 --- a/arch/arm/boot/dts/broadcom/bcm-hr2.dtsi +++ b/arch/arm/boot/dts/broadcom/bcm-hr2.dtsi @@ -200,6 +200,7 @@ gpio-controller; ngpios = <4>; interrupt-controller; + #interrupt-cells = <2>; interrupts = ; }; diff --git a/arch/arm/boot/dts/broadcom/bcm-nsp.dtsi b/arch/arm/boot/dts/broadcom/bcm-nsp.dtsi index 9d20ba3b1ffb..6a4482c93167 100644 --- a/arch/arm/boot/dts/broadcom/bcm-nsp.dtsi +++ b/arch/arm/boot/dts/broadcom/bcm-nsp.dtsi @@ -180,6 +180,7 @@ gpio-controller; ngpios = <32>; interrupt-controller; + #interrupt-cells = <2>; interrupts = ; gpio-ranges = <&pinctrl 0 0 32>; }; @@ -352,6 +353,7 @@ gpio-controller; ngpios = <4>; interrupt-controller; + #interrupt-cells = <2>; interrupts = ; }; diff --git a/arch/arm/boot/dts/marvell/kirkwood-l-50.dts b/arch/arm/boot/dts/marvell/kirkwood-l-50.dts index dffb9f84e67c..c841eb8e7fb1 100644 --- a/arch/arm/boot/dts/marvell/kirkwood-l-50.dts +++ b/arch/arm/boot/dts/marvell/kirkwood-l-50.dts @@ -65,6 +65,7 @@ gpio2: gpio-expander@20 { #gpio-cells = <2>; #interrupt-cells = <2>; + interrupt-controller; compatible = "semtech,sx1505q"; reg = <0x20>; @@ -79,6 +80,7 @@ gpio3: gpio-expander@21 { #gpio-cells = <2>; #interrupt-cells = <2>; + interrupt-controller; compatible = "semtech,sx1505q"; reg = <0x21>; diff --git a/arch/arm/boot/dts/nuvoton/nuvoton-wpcm450.dtsi b/arch/arm/boot/dts/nuvoton/nuvoton-wpcm450.dtsi index fd671c7a1e5d..6e1f0f164cb4 100644 --- a/arch/arm/boot/dts/nuvoton/nuvoton-wpcm450.dtsi +++ b/arch/arm/boot/dts/nuvoton/nuvoton-wpcm450.dtsi @@ -120,6 +120,7 @@ interrupts = <2 IRQ_TYPE_LEVEL_HIGH>, <3 IRQ_TYPE_LEVEL_HIGH>, <4 IRQ_TYPE_LEVEL_HIGH>; + #interrupt-cells = <2>; interrupt-controller; }; @@ -128,6 +129,7 @@ gpio-controller; #gpio-cells = <2>; interrupts = <5 IRQ_TYPE_LEVEL_HIGH>; + #interrupt-cells = <2>; interrupt-controller; }; diff --git a/arch/arm/boot/dts/nvidia/tegra30-apalis-v1.1.dtsi b/arch/arm/boot/dts/nvidia/tegra30-apalis-v1.1.dtsi index 1640763fd4af..ff0d684622f7 100644 --- a/arch/arm/boot/dts/nvidia/tegra30-apalis-v1.1.dtsi +++ b/arch/arm/boot/dts/nvidia/tegra30-apalis-v1.1.dtsi @@ -997,7 +997,6 @@ compatible = "st,stmpe811"; reg = <0x41>; irq-gpio = <&gpio TEGRA_GPIO(V, 0) GPIO_ACTIVE_LOW>; - interrupt-controller; id = <0>; blocks = <0x5>; irq-trigger = <0x1>; diff --git a/arch/arm/boot/dts/nvidia/tegra30-apalis.dtsi b/arch/arm/boot/dts/nvidia/tegra30-apalis.dtsi index 3b6fad273cab..d38f1dd38a90 100644 --- a/arch/arm/boot/dts/nvidia/tegra30-apalis.dtsi +++ b/arch/arm/boot/dts/nvidia/tegra30-apalis.dtsi @@ -980,7 +980,6 @@ compatible = "st,stmpe811"; reg = <0x41>; irq-gpio = <&gpio TEGRA_GPIO(V, 0) GPIO_ACTIVE_LOW>; - interrupt-controller; id = <0>; blocks = <0x5>; irq-trigger = <0x1>; diff --git a/arch/arm/boot/dts/nvidia/tegra30-colibri.dtsi b/arch/arm/boot/dts/nvidia/tegra30-colibri.dtsi index 4eb526fe9c55..81c8a5fd92cc 100644 --- a/arch/arm/boot/dts/nvidia/tegra30-colibri.dtsi +++ b/arch/arm/boot/dts/nvidia/tegra30-colibri.dtsi @@ -861,7 +861,6 @@ compatible = "st,stmpe811"; reg = <0x41>; irq-gpio = <&gpio TEGRA_GPIO(V, 0) GPIO_ACTIVE_LOW>; - interrupt-controller; id = <0>; blocks = <0x5>; irq-trigger = <0x1>; diff --git a/arch/arm/boot/dts/nxp/imx/imx6q-b850v3.dts b/arch/arm/boot/dts/nxp/imx/imx6q-b850v3.dts index db8c332df6a1..cad112e05475 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6q-b850v3.dts +++ b/arch/arm/boot/dts/nxp/imx/imx6q-b850v3.dts @@ -227,7 +227,6 @@ #address-cells = <3>; #size-cells = <2>; - #interrupt-cells = <1>; bridge@2,1 { compatible = "pci10b5,8605"; @@ -235,7 +234,6 @@ #address-cells = <3>; #size-cells = <2>; - #interrupt-cells = <1>; /* Intel Corporation I210 Gigabit Network Connection */ ethernet@3,0 { @@ -250,7 +248,6 @@ #address-cells = <3>; #size-cells = <2>; - #interrupt-cells = <1>; /* Intel Corporation I210 Gigabit Network Connection */ switch_nic: ethernet@4,0 { diff --git a/arch/arm/boot/dts/nxp/imx/imx6q-bx50v3.dtsi b/arch/arm/boot/dts/nxp/imx/imx6q-bx50v3.dtsi index 99f4f6ac71d4..c1ae7c47b442 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6q-bx50v3.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6q-bx50v3.dtsi @@ -245,6 +245,7 @@ reg = <0x74>; gpio-controller; #gpio-cells = <2>; + #interrupt-cells = <2>; interrupt-controller; interrupt-parent = <&gpio2>; interrupts = <3 IRQ_TYPE_LEVEL_LOW>; @@ -390,7 +391,6 @@ #address-cells = <3>; #size-cells = <2>; - #interrupt-cells = <1>; }; }; diff --git a/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi b/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi index 2ae93f57fe5a..ea40623d12e5 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi @@ -626,7 +626,6 @@ blocks = <0x5>; id = <0>; interrupts = <10 IRQ_TYPE_LEVEL_LOW>; - interrupt-controller; interrupt-parent = <&gpio4>; irq-trigger = <0x1>; pinctrl-names = "default"; diff --git a/arch/arm/boot/dts/nxp/imx/imx6qdl-colibri.dtsi b/arch/arm/boot/dts/nxp/imx/imx6qdl-colibri.dtsi index 55c90f6393ad..d3a7a6eeb8e0 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6qdl-colibri.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6qdl-colibri.dtsi @@ -550,7 +550,6 @@ blocks = <0x5>; interrupts = <20 IRQ_TYPE_LEVEL_LOW>; interrupt-parent = <&gpio6>; - interrupt-controller; id = <0>; irq-trigger = <0x1>; pinctrl-names = "default"; diff --git a/arch/arm/boot/dts/nxp/imx/imx6qdl-emcon.dtsi b/arch/arm/boot/dts/nxp/imx/imx6qdl-emcon.dtsi index a63e73adc1fc..42b2ba23aefc 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6qdl-emcon.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6qdl-emcon.dtsi @@ -225,7 +225,6 @@ pinctrl-0 = <&pinctrl_pmic>; interrupt-parent = <&gpio2>; interrupts = <8 IRQ_TYPE_LEVEL_LOW>; - interrupt-controller; onkey { compatible = "dlg,da9063-onkey"; diff --git a/arch/arm/boot/dts/nxp/imx/imx6qdl-phytec-pfla02.dtsi b/arch/arm/boot/dts/nxp/imx/imx6qdl-phytec-pfla02.dtsi index 113974520d54..c0c47adc5866 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6qdl-phytec-pfla02.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6qdl-phytec-pfla02.dtsi @@ -124,6 +124,7 @@ reg = <0x58>; interrupt-parent = <&gpio2>; interrupts = <9 IRQ_TYPE_LEVEL_LOW>; /* active-low GPIO2_9 */ + #interrupt-cells = <2>; interrupt-controller; regulators { diff --git a/arch/arm/boot/dts/nxp/imx/imx6qdl-phytec-phycore-som.dtsi b/arch/arm/boot/dts/nxp/imx/imx6qdl-phytec-phycore-som.dtsi index 86b4269e0e01..85e278eb2016 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6qdl-phytec-phycore-som.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6qdl-phytec-phycore-som.dtsi @@ -100,6 +100,7 @@ interrupt-parent = <&gpio1>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; gpio-controller; #gpio-cells = <2>; diff --git a/arch/arm/boot/dts/nxp/imx/imx7d-pico-dwarf.dts b/arch/arm/boot/dts/nxp/imx/imx7d-pico-dwarf.dts index 12361fcbe24a..1b965652291b 100644 --- a/arch/arm/boot/dts/nxp/imx/imx7d-pico-dwarf.dts +++ b/arch/arm/boot/dts/nxp/imx/imx7d-pico-dwarf.dts @@ -63,6 +63,7 @@ gpio-controller; #gpio-cells = <2>; #interrupt-cells = <2>; + interrupt-controller; reg = <0x25>; }; diff --git a/arch/arm/boot/dts/nxp/vf/vf610-zii-dev-rev-b.dts b/arch/arm/boot/dts/nxp/vf/vf610-zii-dev-rev-b.dts index b0ed68af0546..029f49be40e3 100644 --- a/arch/arm/boot/dts/nxp/vf/vf610-zii-dev-rev-b.dts +++ b/arch/arm/boot/dts/nxp/vf/vf610-zii-dev-rev-b.dts @@ -338,6 +338,7 @@ reg = <0x22>; gpio-controller; #gpio-cells = <2>; + #interrupt-cells = <2>; interrupt-controller; interrupt-parent = <&gpio3>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; diff --git a/arch/arm/boot/dts/st/stm32429i-eval.dts b/arch/arm/boot/dts/st/stm32429i-eval.dts index 576235ec3c51..afa417b34b25 100644 --- a/arch/arm/boot/dts/st/stm32429i-eval.dts +++ b/arch/arm/boot/dts/st/stm32429i-eval.dts @@ -222,7 +222,6 @@ reg = <0x42>; interrupts = <8 3>; interrupt-parent = <&gpioi>; - interrupt-controller; wakeup-source; stmpegpio: stmpe_gpio { diff --git a/arch/arm/boot/dts/st/stm32mp157c-dk2.dts b/arch/arm/boot/dts/st/stm32mp157c-dk2.dts index 510cca5acb79..7a701f7ef0c7 100644 --- a/arch/arm/boot/dts/st/stm32mp157c-dk2.dts +++ b/arch/arm/boot/dts/st/stm32mp157c-dk2.dts @@ -64,7 +64,6 @@ reg = <0x38>; interrupts = <2 2>; interrupt-parent = <&gpiof>; - interrupt-controller; touchscreen-size-x = <480>; touchscreen-size-y = <800>; status = "okay"; diff --git a/arch/arm/boot/dts/ti/omap/am5729-beagleboneai.dts b/arch/arm/boot/dts/ti/omap/am5729-beagleboneai.dts index c8e55642f9c6..3e834fc7e370 100644 --- a/arch/arm/boot/dts/ti/omap/am5729-beagleboneai.dts +++ b/arch/arm/boot/dts/ti/omap/am5729-beagleboneai.dts @@ -415,7 +415,6 @@ reg = <0x41>; interrupts = <30 IRQ_TYPE_LEVEL_LOW>; interrupt-parent = <&gpio2>; - interrupt-controller; id = <0>; blocks = <0x5>; irq-trigger = <0x1>; From 91adecf911e5df78ea3e8f866e69db2c33416a5c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 13 Feb 2024 13:34:27 -0600 Subject: [PATCH 199/327] arm64: dts: Fix dtc interrupt_provider warnings The dtc interrupt_provider warning is off by default. Fix all the warnings so it can be enabled. Signed-off-by: Rob Herring Reviewed-By: AngeloGioacchino Del Regno # Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Florian Fainelli #Broadcom Acked-by: Chanho Min Link: https://lore.kernel.org/r/20240213-arm-dt-cleanups-v1-3-f2dee1292525@kernel.org Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/amazon/alpine-v2.dtsi | 1 - arch/arm64/boot/dts/amazon/alpine-v3.dtsi | 1 - arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi | 1 + arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi | 1 + arch/arm64/boot/dts/lg/lg1312.dtsi | 1 - arch/arm64/boot/dts/lg/lg1313.dtsi | 1 - arch/arm64/boot/dts/marvell/armada-ap80x.dtsi | 1 - arch/arm64/boot/dts/mediatek/mt8195-demo.dts | 1 + arch/arm64/boot/dts/renesas/ulcb-kf.dtsi | 4 ++++ 9 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/amazon/alpine-v2.dtsi b/arch/arm64/boot/dts/amazon/alpine-v2.dtsi index dccbba6e7f98..dbf2dce8d1d6 100644 --- a/arch/arm64/boot/dts/amazon/alpine-v2.dtsi +++ b/arch/arm64/boot/dts/amazon/alpine-v2.dtsi @@ -145,7 +145,6 @@ msix: msix@fbe00000 { compatible = "al,alpine-msix"; reg = <0x0 0xfbe00000 0x0 0x100000>; - interrupt-controller; msi-controller; al,msi-base-spi = <160>; al,msi-num-spis = <160>; diff --git a/arch/arm64/boot/dts/amazon/alpine-v3.dtsi b/arch/arm64/boot/dts/amazon/alpine-v3.dtsi index 39481d7fd7d4..3ea178acdddf 100644 --- a/arch/arm64/boot/dts/amazon/alpine-v3.dtsi +++ b/arch/arm64/boot/dts/amazon/alpine-v3.dtsi @@ -355,7 +355,6 @@ msix: msix@fbe00000 { compatible = "al,alpine-msix"; reg = <0x0 0xfbe00000 0x0 0x100000>; - interrupt-controller; msi-controller; al,msi-base-spi = <336>; al,msi-num-spis = <959>; diff --git a/arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi b/arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi index 9dcd25ec2c04..896d1f33b5b6 100644 --- a/arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi +++ b/arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi @@ -586,6 +586,7 @@ #gpio-cells = <2>; gpio-controller; interrupt-controller; + #interrupt-cells = <2>; interrupts = ; }; diff --git a/arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi b/arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi index f049687d6b96..d8516ec0dae7 100644 --- a/arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi +++ b/arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi @@ -450,6 +450,7 @@ #gpio-cells = <2>; gpio-controller; interrupt-controller; + #interrupt-cells = <2>; interrupts = ; gpio-ranges = <&pinmux 0 0 16>, <&pinmux 16 71 2>, diff --git a/arch/arm64/boot/dts/lg/lg1312.dtsi b/arch/arm64/boot/dts/lg/lg1312.dtsi index 48ec4ebec0a8..b864ffa74ea8 100644 --- a/arch/arm64/boot/dts/lg/lg1312.dtsi +++ b/arch/arm64/boot/dts/lg/lg1312.dtsi @@ -126,7 +126,6 @@ amba { #address-cells = <2>; #size-cells = <1>; - #interrupt-cells = <3>; compatible = "simple-bus"; interrupt-parent = <&gic>; diff --git a/arch/arm64/boot/dts/lg/lg1313.dtsi b/arch/arm64/boot/dts/lg/lg1313.dtsi index 3869460aa5dc..996fb39bb50c 100644 --- a/arch/arm64/boot/dts/lg/lg1313.dtsi +++ b/arch/arm64/boot/dts/lg/lg1313.dtsi @@ -126,7 +126,6 @@ amba { #address-cells = <2>; #size-cells = <1>; - #interrupt-cells = <3>; compatible = "simple-bus"; interrupt-parent = <&gic>; diff --git a/arch/arm64/boot/dts/marvell/armada-ap80x.dtsi b/arch/arm64/boot/dts/marvell/armada-ap80x.dtsi index 2c920e22cec2..7ec7c789d87e 100644 --- a/arch/arm64/boot/dts/marvell/armada-ap80x.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-ap80x.dtsi @@ -138,7 +138,6 @@ odmi: odmi@300000 { compatible = "marvell,odmi-controller"; - interrupt-controller; msi-controller; marvell,odmi-frames = <4>; reg = <0x300000 0x4000>, diff --git a/arch/arm64/boot/dts/mediatek/mt8195-demo.dts b/arch/arm64/boot/dts/mediatek/mt8195-demo.dts index 69c7f3954ae5..4127cb84eba4 100644 --- a/arch/arm64/boot/dts/mediatek/mt8195-demo.dts +++ b/arch/arm64/boot/dts/mediatek/mt8195-demo.dts @@ -128,6 +128,7 @@ compatible = "mediatek,mt6360"; reg = <0x34>; interrupt-controller; + #interrupt-cells = <1>; interrupts-extended = <&pio 101 IRQ_TYPE_EDGE_FALLING>; interrupt-names = "IRQB"; diff --git a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi index 3885ef3454ff..50de17e4fb3f 100644 --- a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi +++ b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi @@ -234,6 +234,7 @@ gpio-controller; #gpio-cells = <2>; interrupt-controller; + #interrupt-cells = <2>; interrupt-parent = <&gpio6>; interrupts = <8 IRQ_TYPE_EDGE_FALLING>; @@ -294,6 +295,7 @@ gpio-controller; #gpio-cells = <2>; interrupt-controller; + #interrupt-cells = <2>; interrupt-parent = <&gpio6>; interrupts = <4 IRQ_TYPE_EDGE_FALLING>; }; @@ -314,6 +316,7 @@ gpio-controller; #gpio-cells = <2>; interrupt-controller; + #interrupt-cells = <2>; interrupt-parent = <&gpio7>; interrupts = <3 IRQ_TYPE_EDGE_FALLING>; }; @@ -324,6 +327,7 @@ gpio-controller; #gpio-cells = <2>; interrupt-controller; + #interrupt-cells = <2>; interrupt-parent = <&gpio5>; interrupts = <9 IRQ_TYPE_EDGE_FALLING>; }; From f02b0f0dc26fbb77fe47b6e47cc5c211f0432c37 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 13 Feb 2024 13:34:28 -0600 Subject: [PATCH 200/327] arm: dts: Fix dtc interrupt_map warnings The dtc interrupt_map warning is off because its dependency, interrupt_provider, is off by default. Fix all the warnings so it can be enabled. Signed-off-by: Rob Herring Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240213-arm-dt-cleanups-v1-4-f2dee1292525@kernel.org Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/intel/ixp/intel-ixp42x-gateway-7001.dts | 2 ++ .../boot/dts/intel/ixp/intel-ixp42x-goramo-multilink.dts | 2 ++ arch/arm/boot/dts/qcom/qcom-sdx55.dtsi | 8 ++++---- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/intel/ixp/intel-ixp42x-gateway-7001.dts b/arch/arm/boot/dts/intel/ixp/intel-ixp42x-gateway-7001.dts index 4d70f6afd13a..6d5e69035f94 100644 --- a/arch/arm/boot/dts/intel/ixp/intel-ixp42x-gateway-7001.dts +++ b/arch/arm/boot/dts/intel/ixp/intel-ixp42x-gateway-7001.dts @@ -60,6 +60,8 @@ * We have slots (IDSEL) 1 and 2 with one assigned IRQ * each handling all IRQs. */ + #interrupt-cells = <1>; + interrupt-map-mask = <0xf800 0 0 7>; interrupt-map = /* IDSEL 1 */ <0x0800 0 0 1 &gpio0 11 IRQ_TYPE_LEVEL_LOW>, /* INT A on slot 1 is irq 11 */ diff --git a/arch/arm/boot/dts/intel/ixp/intel-ixp42x-goramo-multilink.dts b/arch/arm/boot/dts/intel/ixp/intel-ixp42x-goramo-multilink.dts index 9ec0169bacf8..5f4c849915db 100644 --- a/arch/arm/boot/dts/intel/ixp/intel-ixp42x-goramo-multilink.dts +++ b/arch/arm/boot/dts/intel/ixp/intel-ixp42x-goramo-multilink.dts @@ -89,6 +89,8 @@ * The slots have Ethernet, Ethernet, NEC and MPCI. * The IDSELs are 11, 12, 13, 14. */ + #interrupt-cells = <1>; + interrupt-map-mask = <0xf800 0 0 7>; interrupt-map = /* IDSEL 11 - Ethernet A */ <0x5800 0 0 1 &gpio0 4 IRQ_TYPE_LEVEL_LOW>, /* INT A on slot 11 is irq 4 */ diff --git a/arch/arm/boot/dts/qcom/qcom-sdx55.dtsi b/arch/arm/boot/dts/qcom/qcom-sdx55.dtsi index 2045fc779f88..27429d0fedfb 100644 --- a/arch/arm/boot/dts/qcom/qcom-sdx55.dtsi +++ b/arch/arm/boot/dts/qcom/qcom-sdx55.dtsi @@ -340,10 +340,10 @@ "msi8"; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 0x7>; - interrupt-map = <0 0 0 1 &intc 0 0 0 141 IRQ_TYPE_LEVEL_HIGH>, /* int_a */ - <0 0 0 2 &intc 0 0 0 142 IRQ_TYPE_LEVEL_HIGH>, /* int_b */ - <0 0 0 3 &intc 0 0 0 143 IRQ_TYPE_LEVEL_HIGH>, /* int_c */ - <0 0 0 4 &intc 0 0 0 144 IRQ_TYPE_LEVEL_HIGH>; /* int_d */ + interrupt-map = <0 0 0 1 &intc 0 141 IRQ_TYPE_LEVEL_HIGH>, /* int_a */ + <0 0 0 2 &intc 0 142 IRQ_TYPE_LEVEL_HIGH>, /* int_b */ + <0 0 0 3 &intc 0 143 IRQ_TYPE_LEVEL_HIGH>, /* int_c */ + <0 0 0 4 &intc 0 144 IRQ_TYPE_LEVEL_HIGH>; /* int_d */ clocks = <&gcc GCC_PCIE_PIPE_CLK>, <&gcc GCC_PCIE_AUX_CLK>, From 704dccec0d490f2ad06f3f16ebed254d81906c3a Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 13 Feb 2024 13:34:29 -0600 Subject: [PATCH 201/327] arm64: dts: qcom: Fix interrupt-map cell sizes The PCI node interrupt-map properties have the wrong size as #address-cells in the interrupt parent are not accounted for. The dtc interrupt_map check catches this, but the warning is off because its dependency, interrupt_provider, is off by default. Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20240213-arm-dt-cleanups-v1-5-f2dee1292525@kernel.org Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/qcom/ipq6018.dtsi | 8 ++++---- arch/arm64/boot/dts/qcom/ipq8074.dtsi | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/ipq6018.dtsi b/arch/arm64/boot/dts/qcom/ipq6018.dtsi index 5e1277fea725..61c8fd49c966 100644 --- a/arch/arm64/boot/dts/qcom/ipq6018.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq6018.dtsi @@ -830,10 +830,10 @@ #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 0x7>; - interrupt-map = <0 0 0 1 &intc 0 75 IRQ_TYPE_LEVEL_HIGH>, /* int_a */ - <0 0 0 2 &intc 0 78 IRQ_TYPE_LEVEL_HIGH>, /* int_b */ - <0 0 0 3 &intc 0 79 IRQ_TYPE_LEVEL_HIGH>, /* int_c */ - <0 0 0 4 &intc 0 83 IRQ_TYPE_LEVEL_HIGH>; /* int_d */ + interrupt-map = <0 0 0 1 &intc 0 0 0 75 IRQ_TYPE_LEVEL_HIGH>, /* int_a */ + <0 0 0 2 &intc 0 0 0 78 IRQ_TYPE_LEVEL_HIGH>, /* int_b */ + <0 0 0 3 &intc 0 0 0 79 IRQ_TYPE_LEVEL_HIGH>, /* int_c */ + <0 0 0 4 &intc 0 0 0 83 IRQ_TYPE_LEVEL_HIGH>; /* int_d */ clocks = <&gcc GCC_SYS_NOC_PCIE0_AXI_CLK>, <&gcc GCC_PCIE0_AXI_M_CLK>, diff --git a/arch/arm64/boot/dts/qcom/ipq8074.dtsi b/arch/arm64/boot/dts/qcom/ipq8074.dtsi index cf295bed3299..26441447c866 100644 --- a/arch/arm64/boot/dts/qcom/ipq8074.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq8074.dtsi @@ -814,13 +814,13 @@ interrupt-names = "msi"; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 0x7>; - interrupt-map = <0 0 0 1 &intc 0 142 + interrupt-map = <0 0 0 1 &intc 0 0 142 IRQ_TYPE_LEVEL_HIGH>, /* int_a */ - <0 0 0 2 &intc 0 143 + <0 0 0 2 &intc 0 0 143 IRQ_TYPE_LEVEL_HIGH>, /* int_b */ - <0 0 0 3 &intc 0 144 + <0 0 0 3 &intc 0 0 144 IRQ_TYPE_LEVEL_HIGH>, /* int_c */ - <0 0 0 4 &intc 0 145 + <0 0 0 4 &intc 0 0 145 IRQ_TYPE_LEVEL_HIGH>; /* int_d */ clocks = <&gcc GCC_SYS_NOC_PCIE1_AXI_CLK>, @@ -876,13 +876,13 @@ interrupt-names = "msi"; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 0x7>; - interrupt-map = <0 0 0 1 &intc 0 75 + interrupt-map = <0 0 0 1 &intc 0 0 75 IRQ_TYPE_LEVEL_HIGH>, /* int_a */ - <0 0 0 2 &intc 0 78 + <0 0 0 2 &intc 0 0 78 IRQ_TYPE_LEVEL_HIGH>, /* int_b */ - <0 0 0 3 &intc 0 79 + <0 0 0 3 &intc 0 0 79 IRQ_TYPE_LEVEL_HIGH>, /* int_c */ - <0 0 0 4 &intc 0 83 + <0 0 0 4 &intc 0 0 83 IRQ_TYPE_LEVEL_HIGH>; /* int_d */ clocks = <&gcc GCC_SYS_NOC_PCIE0_AXI_CLK>, From 0df8669f69a8638f04c6a3d1f3b7056c2c18f62c Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 19 Feb 2024 09:05:38 -0700 Subject: [PATCH 202/327] docs: Instruct LaTeX to cope with deeper nesting The addition of the XFS online fsck documentation starting with commit a8f6c2e54ddc ("xfs: document the motivation for online fsck design") added a deeper level of nesting than LaTeX is prepared to deal with. That caused a pdfdocs build failure with the helpful "Too deeply nested" error message buried deeply in Documentation/output/filesystems.log. Increase the "maxlistdepth" parameter to instruct LaTeX that it needs to deal with the deeper nesting whether it wants to or not. Suggested-by: Akira Yokosawa Tested-by: Akira Yokosawa Cc: stable@vger.kernel.org # v6.4+ Link: https://lore.kernel.org/linux-doc/67f6ac60-7957-4b92-9d72-a08fbad0e028@gmail.com/ Signed-off-by: Jonathan Corbet --- Documentation/conf.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/conf.py b/Documentation/conf.py index 5830b01c5642..da64c9fb7e07 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -388,6 +388,12 @@ latex_elements = { verbatimhintsturnover=false, ''', + # + # Some of our authors are fond of deep nesting; tell latex to + # cope. + # + 'maxlistdepth': '10', + # For CJK One-half spacing, need to be in front of hyperref 'extrapackages': r'\usepackage{setspace}', From e3b63e966cac0bf78aaa1efede1827a252815a1d Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 25 Jan 2024 08:51:27 +0000 Subject: [PATCH 203/327] mm: zswap: fix missing folio cleanup in writeback race path In zswap_writeback_entry(), after we get a folio from __read_swap_cache_async(), we grab the tree lock again to check that the swap entry was not invalidated and recycled. If it was, we delete the folio we just added to the swap cache and exit. However, __read_swap_cache_async() returns the folio locked when it is newly allocated, which is always true for this path, and the folio is ref'd. Make sure to unlock and put the folio before returning. This was discovered by code inspection, probably because this path handles a race condition that should not happen often, and the bug would not crash the system, it will only strand the folio indefinitely. Link: https://lkml.kernel.org/r/20240125085127.1327013-1-yosryahmed@google.com Fixes: 04fc7816089c ("mm: fix zswap writeback race condition") Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Domenico Cerasuolo Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index 350dd2fc8159..d2423247acfd 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1440,6 +1440,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); return -ENOMEM; } spin_unlock(&tree->lock); From e9e3db69966d5e9e6f7e7d017b407c0025180fe5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Feb 2024 12:13:06 -0800 Subject: [PATCH 204/327] mm/damon/core: check apply interval in damon_do_apply_schemes() kdamond_apply_schemes() checks apply intervals of schemes and avoid further applying any schemes if no scheme passed its apply interval. However, the following schemes applying function, damon_do_apply_schemes() iterates all schemes without the apply interval check. As a result, the shortest apply interval is applied to all schemes. Fix the problem by checking the apply interval in damon_do_apply_schemes(). Link: https://lkml.kernel.org/r/20240205201306.88562-1-sj@kernel.org Fixes: 42f994b71404 ("mm/damon/core: implement scheme-specific apply interval") Signed-off-by: SeongJae Park Cc: [6.7.x] Signed-off-by: Andrew Morton --- mm/damon/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 36f6f1d21ff0..5b325749fc12 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1026,6 +1026,9 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; + if (c->passed_sample_intervals != s->next_apply_sis) + continue; + if (!s->wmarks.activated) continue; @@ -1176,10 +1179,6 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (c->passed_sample_intervals != s->next_apply_sis) continue; - s->next_apply_sis += - (s->apply_interval_us ? s->apply_interval_us : - c->attrs.aggr_interval) / sample_interval; - if (!s->wmarks.activated) continue; @@ -1195,6 +1194,14 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); } + + damon_for_each_scheme(s, c) { + if (c->passed_sample_intervals != s->next_apply_sis) + continue; + s->next_apply_sis += + (s->apply_interval_us ? s->apply_interval_us : + c->attrs.aggr_interval) / sample_interval; + } } /* From 7efa6f2c803366f84c3c362f01e822490669d72b Mon Sep 17 00:00:00 2001 From: Terry Tritton Date: Mon, 5 Feb 2024 14:50:56 +0000 Subject: [PATCH 205/327] selftests/mm: uffd-unit-test check if huge page size is 0 If HUGETLBFS is not enabled then the default_huge_page_size function will return 0 and cause a divide by 0 error. Add a check to see if the huge page size is 0 and skip the hugetlb tests if it is. Link: https://lkml.kernel.org/r/20240205145055.3545806-2-terry.tritton@linaro.org Fixes: 16a45b57cbf2 ("selftests/mm: add framework for uffd-unit-test") Signed-off-by: Terry Tritton Cc: Peter Griffin Cc: Shuah Khan Cc: Peter Xu Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index cce90a10515a..2b9f8cc52639 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -1517,6 +1517,12 @@ int main(int argc, char *argv[]) continue; uffd_test_start("%s on %s", test->name, mem_type->name); + if ((mem_type->mem_flag == MEM_HUGETLB || + mem_type->mem_flag == MEM_HUGETLB_PRIVATE) && + (default_huge_page_size() == 0)) { + uffd_test_skip("huge page size is 0, feature missing?"); + continue; + } if (!uffd_feature_supported(test)) { uffd_test_skip("feature missing"); continue; From 16e96ba5e92ce06b54f0862d44bb27bfa00d6c23 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 15:24:42 -0800 Subject: [PATCH 206/327] mm/swap_state: update zswap LRU's protection range with the folio locked When a folio is swapped in, the protection size of the corresponding zswap LRU is incremented, so that the zswap shrinker is more conservative with its reclaiming action. This field is embedded within the struct lruvec, so updating it requires looking up the folio's memcg and lruvec. However, currently this lookup can happen after the folio is unlocked, for instance if a new folio is allocated, and swap_read_folio() unlocks the folio before returning. In this scenario, there is no stability guarantee for the binding between a folio and its memcg and lruvec: * A folio's memcg and lruvec can be freed between the lookup and the update, leading to a UAF. * Folio migration can clear the now-unlocked folio's memcg_data, which directs the zswap LRU protection size update towards the root memcg instead of the original memcg. This was recently picked up by the syzbot thanks to a warning in the inlined folio_lruvec() call. Move the zswap LRU protection range update above the swap_read_folio() call, and only when a new page is allocated, to prevent this. [nphamcs@gmail.com: add VM_WARN_ON_ONCE() to zswap_folio_swapin()] Link: https://lkml.kernel.org/r/20240206180855.3987204-1-nphamcs@gmail.com [nphamcs@gmail.com: remove unneeded if (folio) checks] Link: https://lkml.kernel.org/r/20240206191355.83755-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240205232442.3240571-1-nphamcs@gmail.com Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Reported-by: syzbot+17a611d10af7d18a7092@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000ae47f90610803260@google.com/ Signed-off-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/swap_state.c | 10 ++++++---- mm/zswap.c | 7 +++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index e671266ad772..7255c01a1e4e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -680,9 +680,10 @@ skip: /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (unlikely(page_allocated)) + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); - zswap_folio_swapin(folio); + } return folio; } @@ -855,9 +856,10 @@ skip: /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); - if (unlikely(page_allocated)) + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); - zswap_folio_swapin(folio); + } return folio; } diff --git a/mm/zswap.c b/mm/zswap.c index d2423247acfd..36903d938c15 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -377,10 +377,9 @@ void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; - if (folio) { - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - } + VM_WARN_ON_ONCE(!folio_test_locked(folio)); + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } /********************************* From 13ddaf26be324a7f951891ecd9ccd04466d27458 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 7 Feb 2024 02:25:59 +0800 Subject: [PATCH 207/327] mm/swap: fix race when skipping swapcache When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads swapin the same entry at the same time, they get different pages (A, B). Before one thread (T0) finishes the swapin and installs page (A) to the PTE, another thread (T1) could finish swapin of page (B), swap_free the entry, then swap out the possibly modified page reusing the same entry. It breaks the pte_same check in (T0) because PTE value is unchanged, causing ABA problem. Thread (T0) will install a stalled page (A) into the PTE and cause data corruption. One possible callstack is like this: CPU0 CPU1 ---- ---- do_swap_page() do_swap_page() with same entry swap_read_folio() <- read to page A swap_read_folio() <- read to page B ... set_pte_at() swap_free() <- entry is free pte_same() <- Check pass, PTE seems unchanged, but page A is stalled! swap_free() <- page B content lost! set_pte_at() <- staled page A installed! And besides, for ZRAM, swap_free() allows the swap device to discard the entry content, so even if page (B) is not modified, if swap_read_folio() on CPU0 happens later than swap_free() on CPU1, it may also cause data loss. To fix this, reuse swapcache_prepare which will pin the swap entry using the cache flag, and allow only one thread to swap it in, also prevent any parallel code from putting the entry in the cache. Release the pin after PT unlocked. Racers just loop and wait since it's a rare and very short event. A schedule_timeout_uninterruptible(1) call is added to avoid repeated page faults wasting too much CPU, causing livelock or adding too much noise to perf statistics. A similar livelock issue was described in commit 029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead") Reproducer: This race issue can be triggered easily using a well constructed reproducer and patched brd (with a delay in read path) [1]: With latest 6.8 mainline, race caused data loss can be observed easily: $ gcc -g -lpthread test-thread-swap-race.c && ./a.out Polulating 32MB of memory region... Keep swapping out... Starting round 0... Spawning 65536 workers... 32746 workers spawned, wait for done... Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss! Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss! Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss! Round 0 Failed, 15 data loss! This reproducer spawns multiple threads sharing the same memory region using a small swap device. Every two threads updates mapped pages one by one in opposite direction trying to create a race, with one dedicated thread keep swapping out the data out using madvise. The reproducer created a reproduce rate of about once every 5 minutes, so the race should be totally possible in production. After this patch, I ran the reproducer for over a few hundred rounds and no data loss observed. Performance overhead is minimal, microbenchmark swapin 10G from 32G zram: Before: 10934698 us After: 11157121 us Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag) [kasong@tencent.com: v4] Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device") Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel.com/ Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1] Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Acked-by: Yu Zhao Acked-by: David Hildenbrand Acked-by: Chris Li Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Yosry Ahmed Cc: Yu Zhao Cc: Barry Song <21cnbao@gmail.com> Cc: SeongJae Park Cc: Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ mm/memory.c | 20 ++++++++++++++++++++ mm/swap.h | 5 +++++ mm/swapfile.c | 13 +++++++++++++ 4 files changed, 43 insertions(+) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4db00ddad261..8d28f6091a32 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp) return 0; } +static inline int swapcache_prepare(swp_entry_t swp) +{ + return 0; +} + static inline void swap_free(swp_entry_t swp) { } diff --git a/mm/memory.c b/mm/memory.c index 15f8b10ea17c..0bfc8b007c01 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; + bool need_clear_cache = false; bool exclusive = false; swp_entry_t entry; pte_t pte; @@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { + /* + * Prevent parallel swapin from proceeding with + * the cache flag. Otherwise, another thread may + * finish swapin first, free the entry, and swapout + * reusing the same entry. It's undetectable as + * pte_same() returns true due to entry reuse. + */ + if (swapcache_prepare(entry)) { + /* Relax a bit to prevent rapid repeated page faults */ + schedule_timeout_uninterruptible(1); + goto out; + } + need_clear_cache = true; + /* skip swapcache */ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false); @@ -4117,6 +4132,9 @@ unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: + /* Clear the swap cache pin for direct swapin after PTL unlock */ + if (need_clear_cache) + swapcache_clear(si, entry); if (si) put_swap_device(si); return ret; @@ -4131,6 +4149,8 @@ out_release: folio_unlock(swapcache); folio_put(swapcache); } + if (need_clear_cache) + swapcache_clear(si, entry); if (si) put_swap_device(si); return ret; diff --git a/mm/swap.h b/mm/swap.h index 758c46ca671e..fc2f6ade7f80 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); struct folio *filemap_get_incore_folio(struct address_space *mapping, @@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } +static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +{ +} + static inline struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 556ff7347d5f..746aa9da5302 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + unsigned char usage; + + ci = lock_cluster_or_swap_info(si, offset); + usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); + unlock_cluster_or_swap_info(si, ci); + if (!usage) + free_swap_slot(entry); +} + struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return swap_type_to_swap_info(swp_type(entry)); From 1eb1e984379e2da04361763f66eec90dd75cf63e Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 8 Feb 2024 07:30:10 -0800 Subject: [PATCH 208/327] lib/Kconfig.debug: TEST_IOV_ITER depends on MMU Trying to run the iov_iter unit test on a nommu system such as the qemu kc705-nommu emulation results in a crash. KTAP version 1 # Subtest: iov_iter # module: kunit_iov_iter 1..9 BUG: failure at mm/nommu.c:318/vmap()! Kernel panic - not syncing: BUG! The test calls vmap() directly, but vmap() is not supported on nommu systems, causing the crash. TEST_IOV_ITER therefore needs to depend on MMU. Link: https://lkml.kernel.org/r/20240208153010.1439753-1-linux@roeck-us.net Fixes: 2d71340ff1d4 ("iov_iter: Kunit tests for copying to/from an iterator") Signed-off-by: Guenter Roeck Cc: David Howells Cc: Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 975a07f9f1cc..ef36b829ae1f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2235,6 +2235,7 @@ config TEST_DIV64 config TEST_IOV_ITER tristate "Test iov_iter operation" if !KUNIT_ALL_TESTS depends on KUNIT + depends on MMU default KUNIT_ALL_TESTS help Enable this to turn on testing of the operation of the I/O iterator From 678e54d4bb9a4822f8ae99690ac131c5d490cdb1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 8 Feb 2024 02:32:54 +0000 Subject: [PATCH 209/327] mm/zswap: invalidate duplicate entry when !zswap_enabled We have to invalidate any duplicate entry even when !zswap_enabled since zswap can be disabled anytime. If the folio store success before, then got dirtied again but zswap disabled, we won't invalidate the old duplicate entry in the zswap_store(). So later lru writeback may overwrite the new data in swapfile. Link: https://lkml.kernel.org/r/20240208023254.3873823-1-chengming.zhou@linux.dev Fixes: 42c06a0e8ebe ("mm: kill frontswap") Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Nhat Pham Cc: Yosry Ahmed Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 36903d938c15..db4625af65fb 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1518,7 +1518,7 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!zswap_enabled || !tree) + if (!tree) return false; /* @@ -1533,6 +1533,10 @@ bool zswap_store(struct folio *folio) zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); + + if (!zswap_enabled) + return false; + objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); From 4f155af0ae4464134bfcfd9f043b6b727c84e947 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 9 Feb 2024 08:39:12 +0530 Subject: [PATCH 210/327] mm/memblock: add MEMBLOCK_RSRV_NOINIT into flagname[] array The commit 77e6c43e137c ("memblock: introduce MEMBLOCK_RSRV_NOINIT flag") skipped adding this newly introduced memblock flag into flagname[] array, thus preventing a correct memblock flags output for applicable memblock regions. Link: https://lkml.kernel.org/r/20240209030912.1382251-1-anshuman.khandual@arm.com Fixes: 77e6c43e137c ("memblock: introduce MEMBLOCK_RSRV_NOINIT flag") Signed-off-by: Anshuman Khandual Reviewed-by: Mike Rapoport Cc: Signed-off-by: Andrew Morton --- mm/memblock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memblock.c b/mm/memblock.c index 4dcb2ee35eca..d9f4b82cbffe 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2249,6 +2249,7 @@ static const char * const flagname[] = { [ilog2(MEMBLOCK_MIRROR)] = "MIRROR", [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", + [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", }; static int memblock_debug_show(struct seq_file *m, void *private) From 118642d7f606fc9b9c92ee611275420320290ffb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 13 Feb 2024 03:16:34 -0500 Subject: [PATCH 211/327] mm: memcontrol: clarify swapaccount=0 deprecation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The swapaccount deprecation warning is throwing false positives. Since we deprecated the knob and defaulted to enabling, the only reports we've been getting are from folks that set swapaccount=1. While this is a nice affirmation that always-enabling was the right choice, we certainly don't want to warn when users request the supported mode. Only warn when disabling is requested, and clarify the warning. [colin.i.king@gmail.com: spelling: "commdandline" -> "commandline"] Link: https://lkml.kernel.org/r/20240215090544.1649201-1-colin.i.king@gmail.com Link: https://lkml.kernel.org/r/20240213081634.3652326-1-hannes@cmpxchg.org Fixes: b25806dcd3d5 ("mm: memcontrol: deprecate swapaccounting=0 mode") Signed-off-by: Colin Ian King Reported-by: "Jonas Schäfer" Reported-by: Narcis Garcia Suggested-by: Yosry Ahmed Signed-off-by: Johannes Weiner Reviewed-by: Yosry Ahmed Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ed40f9d3a27..61932c9215e7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7971,9 +7971,13 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - pr_warn_once("The swapaccount= commandline option is deprecated. " - "Please report your usecase to linux-mm@kvack.org if you " - "depend on this functionality.\n"); + bool res; + + if (!kstrtobool(s, &res) && !res) + pr_warn_once("The swapaccount=0 commandline option is deprecated " + "in favor of configuring swap control via cgroupfs. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); return 1; } __setup("swapaccount=", setup_swap_account); From 0721a614ef798053a4a54c74e2501b8d15b0eff3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 12 Feb 2024 18:36:32 -0800 Subject: [PATCH 212/327] mm/damon/sysfs-schemes: handle schemes sysfs dir removal before commit_schemes_quota_goals 'commit_schemes_quota_goals' command handler, damos_sysfs_set_quota_scores() assumes the number of schemes sysfs directory will be same to the number of schemes of the DAMON context. The assumption is wrong since users can remove schemes sysfs directories while DAMON is running. In the case, illegal memory accesses can happen. Fix it by checking the case. Link: https://lkml.kernel.org/r/20240213023633.124928-1-sj@kernel.org Fixes: d91beaa505a0 ("mm/damon/sysfs-schemes: implement a command for scheme quota goals only commit") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index dd2fb5127009..ae0f0b314f3a 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1905,6 +1905,10 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; + /* user could have removed the scheme sysfs dir */ + if (i >= sysfs_schemes->nr) + break; + sysfs_scheme = sysfs_schemes->schemes_arr[i]; damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals, &scheme->quota); From 379c5aaa14351aa9faf1f569fe9e3c4f4b02f6a0 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 19 Feb 2024 12:50:50 -0800 Subject: [PATCH 213/327] MAINTAINERS: mailmap: update Shakeel's email address Moving to linux.dev based email for kernel work. Link: https://lkml.kernel.org/r/20240219205050.887810-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index b99a238ee3bd..08f28f2999f0 100644 --- a/.mailmap +++ b/.mailmap @@ -553,6 +553,7 @@ Senthilkumar N L Serge Hallyn Serge Hallyn Seth Forshee +Shakeel Butt Shannon Nelson Shannon Nelson Shannon Nelson diff --git a/MAINTAINERS b/MAINTAINERS index 9ed4d3868539..c3c9cf33595c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5378,7 +5378,7 @@ CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner M: Michal Hocko M: Roman Gushchin -M: Shakeel Butt +M: Shakeel Butt R: Muchun Song L: cgroups@vger.kernel.org L: linux-mm@kvack.org From 1b0ca4e4ff10a2c8402e2cf70132c683e1c772e4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 11:40:24 -0800 Subject: [PATCH 214/327] mm/damon/reclaim: fix quota stauts loss due to online tunings Patch series "mm/damon: fix quota status loss due to online tunings". DAMON_RECLAIM and DAMON_LRU_SORT is not preserving internal quota status when applying new user parameters, and hence could cause temporal quota accuracy degradation. Fix it by preserving the status. This patch (of 2): For online parameters change, DAMON_RECLAIM creates new scheme based on latest values of the parameters and replaces the old scheme with the new one. When creating it, the internal status of the quota of the old scheme is not preserved. As a result, charging of the quota starts from zero after the online tuning. The data that collected to estimate the throughput of the scheme's action is also reset, and therefore the estimation should start from the scratch again. Because the throughput estimation is being used to convert the time quota to the effective size quota, this could result in temporal time quota inaccuracy. It would be recovered over time, though. In short, the quota accuracy could be temporarily degraded after online parameters update. Fix the problem by checking the case and copying the internal fields for the status. Link: https://lkml.kernel.org/r/20240216194025.9207-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240216194025.9207-2-sj@kernel.org Fixes: e035c280f6df ("mm/damon/reclaim: support online inputs update") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index ab974e477d2f..66e190f0374a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_scheme(void) &damon_reclaim_wmarks); } +static void damon_reclaim_copy_quota_status(struct damos_quota *dst, + struct damos_quota *src) +{ + dst->total_charged_sz = src->total_charged_sz; + dst->total_charged_ns = src->total_charged_ns; + dst->charged_sz = src->charged_sz; + dst->charged_from = src->charged_from; + dst->charge_target_from = src->charge_target_from; + dst->charge_addr_from = src->charge_addr_from; +} + static int damon_reclaim_apply_parameters(void) { - struct damos *scheme; + struct damos *scheme, *old_scheme; struct damos_filter *filter; int err = 0; @@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; + if (!list_empty(&ctx->schemes)) { + damon_for_each_scheme(old_scheme, ctx) + damon_reclaim_copy_quota_status(&scheme->quota, + &old_scheme->quota); + } if (skip_anon) { filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); if (!filter) { From 13d0599ab3b2ff17f798353f24bcbef1659d3cfc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 11:40:25 -0800 Subject: [PATCH 215/327] mm/damon/lru_sort: fix quota status loss due to online tunings For online parameters change, DAMON_LRU_SORT creates new schemes based on latest values of the parameters and replaces the old schemes with the new one. When creating it, the internal status of the quotas of the old schemes is not preserved. As a result, charging of the quota starts from zero after the online tuning. The data that collected to estimate the throughput of the scheme's action is also reset, and therefore the estimation should start from the scratch again. Because the throughput estimation is being used to convert the time quota to the effective size quota, this could result in temporal time quota inaccuracy. It would be recovered over time, though. In short, the quota accuracy could be temporarily degraded after online parameters update. Fix the problem by checking the case and copying the internal fields for the status. Link: https://lkml.kernel.org/r/20240216194025.9207-3-sj@kernel.org Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") Signed-off-by: SeongJae Park Cc: [6.0+] Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index f2e5f9431892..3de2916a65c3 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -185,9 +185,21 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } +static void damon_lru_sort_copy_quota_status(struct damos_quota *dst, + struct damos_quota *src) +{ + dst->total_charged_sz = src->total_charged_sz; + dst->total_charged_ns = src->total_charged_ns; + dst->charged_sz = src->charged_sz; + dst->charged_from = src->charged_from; + dst->charge_target_from = src->charge_target_from; + dst->charge_addr_from = src->charge_addr_from; +} + static int damon_lru_sort_apply_parameters(void) { - struct damos *scheme; + struct damos *scheme, *hot_scheme, *cold_scheme; + struct damos *old_hot_scheme = NULL, *old_cold_scheme = NULL; unsigned int hot_thres, cold_thres; int err = 0; @@ -195,18 +207,35 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; + damon_for_each_scheme(scheme, ctx) { + if (!old_hot_scheme) { + old_hot_scheme = scheme; + continue; + } + old_cold_scheme = scheme; + } + hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) * hot_thres_access_freq / 1000; - scheme = damon_lru_sort_new_hot_scheme(hot_thres); - if (!scheme) + hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres); + if (!hot_scheme) return -ENOMEM; - damon_set_schemes(ctx, &scheme, 1); + if (old_hot_scheme) + damon_lru_sort_copy_quota_status(&hot_scheme->quota, + &old_hot_scheme->quota); cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; - scheme = damon_lru_sort_new_cold_scheme(cold_thres); - if (!scheme) + cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres); + if (!cold_scheme) { + damon_destroy_scheme(hot_scheme); return -ENOMEM; - damon_add_scheme(ctx, scheme); + } + if (old_cold_scheme) + damon_lru_sort_copy_quota_status(&cold_scheme->quota, + &old_cold_scheme->quota); + + damon_set_schemes(ctx, &hot_scheme, 1); + damon_add_scheme(ctx, cold_scheme); return damon_set_region_biggest_system_ram_default(target, &monitor_region_start, From 2597c9947b0174fcc71bdd7ab6cb49c2b4291e95 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Tue, 13 Feb 2024 14:39:58 +1100 Subject: [PATCH 216/327] kasan: guard release_free_meta() shadow access with kasan_arch_is_ready() release_free_meta() accesses the shadow directly through the path kasan_slab_free __kasan_slab_free kasan_release_object_meta release_free_meta kasan_mem_to_shadow There are no kasan_arch_is_ready() guards here, allowing an oops when the shadow is not initialized. The oops can be seen on a Power8 KVM guest. This patch adds the guard to release_free_meta(), as it's the first level that specifically requires the shadow. It is safe to put the guard at the start of this function, before the stack put: only kasan_save_free_info() can initialize the saved stack, which itself is guarded with kasan_arch_is_ready() by its caller poison_slab_object(). If the arch becomes ready before release_free_meta() then we will not observe KASAN_SLAB_FREE_META in the object's shadow, so we will not put an uninitialized stack either. Link: https://lkml.kernel.org/r/20240213033958.139383-1-bgray@linux.ibm.com Fixes: 63b85ac56a64 ("kasan: stop leaking stack trace handles") Signed-off-by: Benjamin Gray Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Michael Ellerman Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index df6627f62402..032bf3e98c24 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -522,6 +522,9 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { + if (!kasan_arch_is_ready()) + return; + /* Check if free meta is valid. */ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; From f3e6b3ae9cfc128af11b665c6ef4022ba2683778 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 18 Feb 2024 14:28:57 -0800 Subject: [PATCH 217/327] acpi/ghes: Remove CXL CPER notifications Initial tests with the CXL CPER implementation identified that error reports were being duplicated in the log and the trace event [1]. Then it was discovered that the notification handler took sleeping locks while the GHES event handling runs in spin_lock_irqsave() context [2] While the duplicate reporting was fixed in v6.8-rc4, the fix for the sleeping-lock-vs-atomic collision would enjoy more time to settle and gain some test cycles. Given how late it is in the development cycle, remove the CXL hookup for now and try again during the next merge window. Note that end result is that v6.8 does not emit CXL CPER payloads to the kernel log, but this is in line with the CXL trend to move error reporting to trace events instead of the kernel log. Cc: Ard Biesheuvel Cc: Rafael J. Wysocki Cc: Jonathan Cameron Reviewed-by: Ira Weiny Link: http://lore.kernel.org/r/20240108165855.00002f5a@Huawei.com [1] Closes: http://lore.kernel.org/r/b963c490-2c13-4b79-bbe7-34c6568423c7@moroto.mountain [2] Signed-off-by: Dan Williams --- drivers/acpi/apei/ghes.c | 63 --------------------------------------- drivers/cxl/pci.c | 57 +---------------------------------- include/linux/cxl-event.h | 18 ----------- 3 files changed, 1 insertion(+), 137 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index fe825a432c5b..ab2a82cb1b0b 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -674,52 +673,6 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, schedule_work(&entry->work); } -/* - * Only a single callback can be registered for CXL CPER events. - */ -static DECLARE_RWSEM(cxl_cper_rw_sem); -static cxl_cper_callback cper_callback; - -static void cxl_cper_post_event(enum cxl_event_type event_type, - struct cxl_cper_event_rec *rec) -{ - if (rec->hdr.length <= sizeof(rec->hdr) || - rec->hdr.length > sizeof(*rec)) { - pr_err(FW_WARN "CXL CPER Invalid section length (%u)\n", - rec->hdr.length); - return; - } - - if (!(rec->hdr.validation_bits & CPER_CXL_COMP_EVENT_LOG_VALID)) { - pr_err(FW_WARN "CXL CPER invalid event\n"); - return; - } - - guard(rwsem_read)(&cxl_cper_rw_sem); - if (cper_callback) - cper_callback(event_type, rec); -} - -int cxl_cper_register_callback(cxl_cper_callback callback) -{ - guard(rwsem_write)(&cxl_cper_rw_sem); - if (cper_callback) - return -EINVAL; - cper_callback = callback; - return 0; -} -EXPORT_SYMBOL_NS_GPL(cxl_cper_register_callback, CXL); - -int cxl_cper_unregister_callback(cxl_cper_callback callback) -{ - guard(rwsem_write)(&cxl_cper_rw_sem); - if (callback != cper_callback) - return -EINVAL; - cper_callback = NULL; - return 0; -} -EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_callback, CXL); - static bool ghes_do_proc(struct ghes *ghes, const struct acpi_hest_generic_status *estatus) { @@ -754,22 +707,6 @@ static bool ghes_do_proc(struct ghes *ghes, } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { queued = ghes_handle_arm_hw_error(gdata, sev, sync); - } else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) { - struct cxl_cper_event_rec *rec = - acpi_hest_get_payload(gdata); - - cxl_cper_post_event(CXL_CPER_EVENT_GEN_MEDIA, rec); - } else if (guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID)) { - struct cxl_cper_event_rec *rec = - acpi_hest_get_payload(gdata); - - cxl_cper_post_event(CXL_CPER_EVENT_DRAM, rec); - } else if (guid_equal(sec_type, - &CPER_SEC_CXL_MEM_MODULE_GUID)) { - struct cxl_cper_event_rec *rec = - acpi_hest_get_payload(gdata); - - cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec); } else { void *err = acpi_hest_get_payload(gdata); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 233e7c42c161..2ff361e756d6 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -974,61 +974,6 @@ static struct pci_driver cxl_pci_driver = { }, }; -#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0) -static void cxl_cper_event_call(enum cxl_event_type ev_type, - struct cxl_cper_event_rec *rec) -{ - struct cper_cxl_event_devid *device_id = &rec->hdr.device_id; - struct pci_dev *pdev __free(pci_dev_put) = NULL; - enum cxl_event_log_type log_type; - struct cxl_dev_state *cxlds; - unsigned int devfn; - u32 hdr_flags; - - devfn = PCI_DEVFN(device_id->device_num, device_id->func_num); - pdev = pci_get_domain_bus_and_slot(device_id->segment_num, - device_id->bus_num, devfn); - if (!pdev) - return; - - guard(pci_dev)(pdev); - if (pdev->driver != &cxl_pci_driver) - return; - - cxlds = pci_get_drvdata(pdev); - if (!cxlds) - return; - - /* Fabricate a log type */ - hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags); - log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags); - - cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type, - &uuid_null, &rec->event); -} - -static int __init cxl_pci_driver_init(void) -{ - int rc; - - rc = cxl_cper_register_callback(cxl_cper_event_call); - if (rc) - return rc; - - rc = pci_register_driver(&cxl_pci_driver); - if (rc) - cxl_cper_unregister_callback(cxl_cper_event_call); - - return rc; -} - -static void __exit cxl_pci_driver_exit(void) -{ - pci_unregister_driver(&cxl_pci_driver); - cxl_cper_unregister_callback(cxl_cper_event_call); -} - -module_init(cxl_pci_driver_init); -module_exit(cxl_pci_driver_exit); +module_pci_driver(cxl_pci_driver); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 91125eca4c8a..03fa6d50d46f 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -140,22 +140,4 @@ struct cxl_cper_event_rec { union cxl_event event; } __packed; -typedef void (*cxl_cper_callback)(enum cxl_event_type type, - struct cxl_cper_event_rec *rec); - -#ifdef CONFIG_ACPI_APEI_GHES -int cxl_cper_register_callback(cxl_cper_callback callback); -int cxl_cper_unregister_callback(cxl_cper_callback callback); -#else -static inline int cxl_cper_register_callback(cxl_cper_callback callback) -{ - return 0; -} - -static inline int cxl_cper_unregister_callback(cxl_cper_callback callback) -{ - return 0; -} -#endif - #endif /* _LINUX_CXL_EVENT_H */ From 5c6224bfabbf7f3e491c51ab50fd2c6f92ba1141 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 16 Feb 2024 19:11:34 -0800 Subject: [PATCH 218/327] cxl/acpi: Fix load failures due to single window creation failure The expectation is that cxl_parse_cfwms() continues in the face the of failure as evidenced by code like: cxlrd = cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb); if (IS_ERR(cxlrd)) return 0; There are other error paths in that function which mistakenly follow idiomatic expectations and return an error when they should not. Most of those mistakes are innocuous checks that hardly ever fail in practice. However, a recent change succeed in making the implementation more fragile by applying an idiomatic, but still wrong "fix" [1]. In this failure case the kernel reports: cxl root0: Failed to populate active decoder targets cxl_acpi ACPI0017:00: Failed to add decode range: [mem 0x00000000-0x7fffffff flags 0x200] ...which is a real issue with that one window (to be fixed separately), but ends up failing the entirety of cxl_acpi_probe(). Undo that recent breakage while also removing the confusion about ignoring errors. Update all exits paths to return an error per typical expectations and let an outer wrapper function handle dropping the error. Fixes: 91019b5bc7c2 ("cxl/acpi: Return 'rc' instead of '0' in cxl_parse_cfmws()") [1] Cc: Cc: Breno Leitao Cc: Alison Schofield Cc: Vishal Verma Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index dcf2b39e1048..1a3e6aafbdcc 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -316,31 +316,27 @@ static const struct cxl_root_ops acpi_root_ops = { .qos_class = cxl_acpi_qos_class, }; -static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, - const unsigned long end) +static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, + struct cxl_cfmws_context *ctx) { int target_map[CXL_DECODER_MAX_INTERLEAVE]; - struct cxl_cfmws_context *ctx = arg; struct cxl_port *root_port = ctx->root_port; struct resource *cxl_res = ctx->cxl_res; struct cxl_cxims_context cxims_ctx; struct cxl_root_decoder *cxlrd; struct device *dev = ctx->dev; - struct acpi_cedt_cfmws *cfmws; cxl_calc_hb_fn cxl_calc_hb; struct cxl_decoder *cxld; unsigned int ways, i, ig; struct resource *res; int rc; - cfmws = (struct acpi_cedt_cfmws *) header; - rc = cxl_acpi_cfmws_verify(dev, cfmws); if (rc) { dev_err(dev, "CFMWS range %#llx-%#llx not registered\n", cfmws->base_hpa, cfmws->base_hpa + cfmws->window_size - 1); - return 0; + return rc; } rc = eiw_to_ways(cfmws->interleave_ways, &ways); @@ -376,7 +372,7 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, cxlrd = cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb); if (IS_ERR(cxlrd)) - return 0; + return PTR_ERR(cxlrd); cxld = &cxlrd->cxlsd.cxld; cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions); @@ -420,16 +416,7 @@ err_xormap: put_device(&cxld->dev); else rc = cxl_decoder_autoremove(dev, cxld); - if (rc) { - dev_err(dev, "Failed to add decode range: %pr", res); - return rc; - } - dev_dbg(dev, "add: %s node: %d range [%#llx - %#llx]\n", - dev_name(&cxld->dev), - phys_to_target_node(cxld->hpa_range.start), - cxld->hpa_range.start, cxld->hpa_range.end); - - return 0; + return rc; err_insert: kfree(res->name); @@ -438,6 +425,29 @@ err_name: return -ENOMEM; } +static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + struct acpi_cedt_cfmws *cfmws = (struct acpi_cedt_cfmws *)header; + struct cxl_cfmws_context *ctx = arg; + struct device *dev = ctx->dev; + int rc; + + rc = __cxl_parse_cfmws(cfmws, ctx); + if (rc) + dev_err(dev, + "Failed to add decode range: [%#llx - %#llx] (%d)\n", + cfmws->base_hpa, + cfmws->base_hpa + cfmws->window_size - 1, rc); + else + dev_dbg(dev, "decode range: node: %d range [%#llx - %#llx]\n", + phys_to_target_node(cfmws->base_hpa), cfmws->base_hpa, + cfmws->base_hpa + cfmws->window_size - 1); + + /* never fail cxl_acpi load for a single window failure */ + return 0; +} + __mock struct acpi_device *to_cxl_host_bridge(struct device *host, struct device *dev) { From fb1e881273f432e593f8789f99e725b09304cc97 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 20 Feb 2024 14:12:51 +0100 Subject: [PATCH 219/327] drm/i915/tv: Fix TV mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1fd4a5a36f9f ("drm/connector: Rename legacy TV property") failed to update all the users of the struct drm_tv_connector_state mode field, which resulted in a build failure in i915. However, a subsequent commit in the same series reintroduced a mode field in that structure, with a different semantic but the same type, with the assumption that all previous users were updated. Since that didn't happen, the i915 driver now compiles, but mixes accesses to the legacy_mode field and the newer mode field, but with the previous semantics. This obviously doesn't work very well, so we need to update the accesses that weren't in the legacy renaming commit. Fixes: 1fd4a5a36f9f ("drm/connector: Rename legacy TV property") Reported-by: Ville Syrjälä Signed-off-by: Maxime Ripard Reviewed-by: Rodrigo Vivi Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240220131251.453060-1-mripard@kernel.org (cherry picked from commit bf7626f19d6ff14b9722273e23700400cc4d78ba) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_sdvo.c | 10 +++++----- drivers/gpu/drm/i915/display/intel_tv.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_sdvo.c b/drivers/gpu/drm/i915/display/intel_sdvo.c index acc6b6804105..2915d7afe5cc 100644 --- a/drivers/gpu/drm/i915/display/intel_sdvo.c +++ b/drivers/gpu/drm/i915/display/intel_sdvo.c @@ -1209,7 +1209,7 @@ static bool intel_sdvo_set_tv_format(struct intel_sdvo *intel_sdvo, struct intel_sdvo_tv_format format; u32 format_map; - format_map = 1 << conn_state->tv.mode; + format_map = 1 << conn_state->tv.legacy_mode; memset(&format, 0, sizeof(format)); memcpy(&format, &format_map, min(sizeof(format), sizeof(format_map))); @@ -2298,7 +2298,7 @@ static int intel_sdvo_get_tv_modes(struct drm_connector *connector) * Read the list of supported input resolutions for the selected TV * format. */ - format_map = 1 << conn_state->tv.mode; + format_map = 1 << conn_state->tv.legacy_mode; memcpy(&tv_res, &format_map, min(sizeof(format_map), sizeof(struct intel_sdvo_sdtv_resolution_request))); @@ -2363,7 +2363,7 @@ intel_sdvo_connector_atomic_get_property(struct drm_connector *connector, int i; for (i = 0; i < intel_sdvo_connector->format_supported_num; i++) - if (state->tv.mode == intel_sdvo_connector->tv_format_supported[i]) { + if (state->tv.legacy_mode == intel_sdvo_connector->tv_format_supported[i]) { *val = i; return 0; @@ -2419,7 +2419,7 @@ intel_sdvo_connector_atomic_set_property(struct drm_connector *connector, struct intel_sdvo_connector_state *sdvo_state = to_intel_sdvo_connector_state(state); if (property == intel_sdvo_connector->tv_format) { - state->tv.mode = intel_sdvo_connector->tv_format_supported[val]; + state->tv.legacy_mode = intel_sdvo_connector->tv_format_supported[val]; if (state->crtc) { struct drm_crtc_state *crtc_state = @@ -3076,7 +3076,7 @@ static bool intel_sdvo_tv_create_property(struct intel_sdvo *intel_sdvo, drm_property_add_enum(intel_sdvo_connector->tv_format, i, tv_format_names[intel_sdvo_connector->tv_format_supported[i]]); - intel_sdvo_connector->base.base.state->tv.mode = intel_sdvo_connector->tv_format_supported[0]; + intel_sdvo_connector->base.base.state->tv.legacy_mode = intel_sdvo_connector->tv_format_supported[0]; drm_object_attach_property(&intel_sdvo_connector->base.base.base, intel_sdvo_connector->tv_format, 0); return true; diff --git a/drivers/gpu/drm/i915/display/intel_tv.c b/drivers/gpu/drm/i915/display/intel_tv.c index d4386cb3569e..992a725de751 100644 --- a/drivers/gpu/drm/i915/display/intel_tv.c +++ b/drivers/gpu/drm/i915/display/intel_tv.c @@ -949,7 +949,7 @@ intel_disable_tv(struct intel_atomic_state *state, static const struct tv_mode *intel_tv_mode_find(const struct drm_connector_state *conn_state) { - int format = conn_state->tv.mode; + int format = conn_state->tv.legacy_mode; return &tv_modes[format]; } @@ -1704,7 +1704,7 @@ static void intel_tv_find_better_format(struct drm_connector *connector) break; } - connector->state->tv.mode = i; + connector->state->tv.legacy_mode = i; } static int @@ -1859,7 +1859,7 @@ static int intel_tv_atomic_check(struct drm_connector *connector, old_state = drm_atomic_get_old_connector_state(state, connector); new_crtc_state = drm_atomic_get_new_crtc_state(state, new_state->crtc); - if (old_state->tv.mode != new_state->tv.mode || + if (old_state->tv.legacy_mode != new_state->tv.legacy_mode || old_state->tv.margins.left != new_state->tv.margins.left || old_state->tv.margins.right != new_state->tv.margins.right || old_state->tv.margins.top != new_state->tv.margins.top || @@ -1896,7 +1896,7 @@ static void intel_tv_add_properties(struct drm_connector *connector) conn_state->tv.margins.right = 46; conn_state->tv.margins.bottom = 37; - conn_state->tv.mode = 0; + conn_state->tv.legacy_mode = 0; /* Create TV properties then attach current values */ for (i = 0; i < ARRAY_SIZE(tv_modes); i++) { @@ -1910,7 +1910,7 @@ static void intel_tv_add_properties(struct drm_connector *connector) drm_object_attach_property(&connector->base, i915->drm.mode_config.legacy_tv_mode_property, - conn_state->tv.mode); + conn_state->tv.legacy_mode); drm_object_attach_property(&connector->base, i915->drm.mode_config.tv_left_margin_property, conn_state->tv.margins.left); From 77aebae1ea12de6eae5ce70d05b3d4724eec4023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Fri, 9 Feb 2024 12:34:44 +0100 Subject: [PATCH 220/327] drm/xe/uapi: Remove support for persistent exec_queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Persistent exec_queues delays explicit destruction of exec_queues until they are done executing, but destruction on process exit is still immediate. It turns out no UMD is relying on this functionality, so remove it. If there turns out to be a use-case in the future, let's re-add. Persistent exec_queues were never used for LR VMs v2: - Don't add an "UNUSED" define for the missing property (Lucas, Rodrigo) v3: - Remove the remaining struct xe_exec_queue::persistent state (Niranjana, Lucas) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Rodrigo Vivi Cc: Matthew Brost Cc: David Airlie Cc: Daniel Vetter Cc: Lucas De Marchi Cc: Francois Dugast Signed-off-by: Thomas Hellström Reviewed-by: Lucas De Marchi Acked-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20240209113444.8396-1-thomas.hellstrom@linux.intel.com (cherry picked from commit f1a9abc0cf311375695bede1590364864c05976d) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_device.c | 39 ------------------------ drivers/gpu/drm/xe/xe_device.h | 4 --- drivers/gpu/drm/xe/xe_device_types.h | 8 ----- drivers/gpu/drm/xe/xe_exec_queue.c | 33 +++----------------- drivers/gpu/drm/xe/xe_exec_queue_types.h | 10 ------ drivers/gpu/drm/xe/xe_execlist.c | 2 -- drivers/gpu/drm/xe/xe_guc_submit.c | 2 -- include/uapi/drm/xe_drm.h | 1 - 8 files changed, 5 insertions(+), 94 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 1f0b4b9ce84f..5176c27e4b6a 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -83,9 +83,6 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file) return 0; } -static void device_kill_persistent_exec_queues(struct xe_device *xe, - struct xe_file *xef); - static void xe_file_close(struct drm_device *dev, struct drm_file *file) { struct xe_device *xe = to_xe_device(dev); @@ -102,8 +99,6 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file) mutex_unlock(&xef->exec_queue.lock); xa_destroy(&xef->exec_queue.xa); mutex_destroy(&xef->exec_queue.lock); - device_kill_persistent_exec_queues(xe, xef); - mutex_lock(&xef->vm.lock); xa_for_each(&xef->vm.xa, idx, vm) xe_vm_close_and_put(vm); @@ -255,9 +250,6 @@ struct xe_device *xe_device_create(struct pci_dev *pdev, xa_erase(&xe->usm.asid_to_vm, asid); } - drmm_mutex_init(&xe->drm, &xe->persistent_engines.lock); - INIT_LIST_HEAD(&xe->persistent_engines.list); - spin_lock_init(&xe->pinned.lock); INIT_LIST_HEAD(&xe->pinned.kernel_bo_present); INIT_LIST_HEAD(&xe->pinned.external_vram); @@ -570,37 +562,6 @@ void xe_device_shutdown(struct xe_device *xe) { } -void xe_device_add_persistent_exec_queues(struct xe_device *xe, struct xe_exec_queue *q) -{ - mutex_lock(&xe->persistent_engines.lock); - list_add_tail(&q->persistent.link, &xe->persistent_engines.list); - mutex_unlock(&xe->persistent_engines.lock); -} - -void xe_device_remove_persistent_exec_queues(struct xe_device *xe, - struct xe_exec_queue *q) -{ - mutex_lock(&xe->persistent_engines.lock); - if (!list_empty(&q->persistent.link)) - list_del(&q->persistent.link); - mutex_unlock(&xe->persistent_engines.lock); -} - -static void device_kill_persistent_exec_queues(struct xe_device *xe, - struct xe_file *xef) -{ - struct xe_exec_queue *q, *next; - - mutex_lock(&xe->persistent_engines.lock); - list_for_each_entry_safe(q, next, &xe->persistent_engines.list, - persistent.link) - if (q->persistent.xef == xef) { - xe_exec_queue_kill(q); - list_del_init(&q->persistent.link); - } - mutex_unlock(&xe->persistent_engines.lock); -} - void xe_device_wmb(struct xe_device *xe) { struct xe_gt *gt = xe_root_mmio_gt(xe); diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index 3da83b233206..08d8b72c7731 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -42,10 +42,6 @@ int xe_device_probe(struct xe_device *xe); void xe_device_remove(struct xe_device *xe); void xe_device_shutdown(struct xe_device *xe); -void xe_device_add_persistent_exec_queues(struct xe_device *xe, struct xe_exec_queue *q); -void xe_device_remove_persistent_exec_queues(struct xe_device *xe, - struct xe_exec_queue *q); - void xe_device_wmb(struct xe_device *xe); static inline struct xe_file *to_xe_file(const struct drm_file *file) diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 5dc9127a2029..e8491979a6f2 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -341,14 +341,6 @@ struct xe_device { struct mutex lock; } usm; - /** @persistent_engines: engines that are closed but still running */ - struct { - /** @lock: protects persistent engines */ - struct mutex lock; - /** @list: list of persistent engines */ - struct list_head list; - } persistent_engines; - /** @pinned: pinned BO state */ struct { /** @lock: protected pinned BO list state */ diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 254b1d3af4cb..3acfd4f07666 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -60,7 +60,6 @@ static struct xe_exec_queue *__xe_exec_queue_create(struct xe_device *xe, q->fence_irq = >->fence_irq[hwe->class]; q->ring_ops = gt->ring_ops[hwe->class]; q->ops = gt->exec_queue_ops; - INIT_LIST_HEAD(&q->persistent.link); INIT_LIST_HEAD(&q->compute.link); INIT_LIST_HEAD(&q->multi_gt_link); @@ -326,23 +325,6 @@ static int exec_queue_set_preemption_timeout(struct xe_device *xe, return q->ops->set_preempt_timeout(q, value); } -static int exec_queue_set_persistence(struct xe_device *xe, struct xe_exec_queue *q, - u64 value, bool create) -{ - if (XE_IOCTL_DBG(xe, !create)) - return -EINVAL; - - if (XE_IOCTL_DBG(xe, xe_vm_in_preempt_fence_mode(q->vm))) - return -EINVAL; - - if (value) - q->flags |= EXEC_QUEUE_FLAG_PERSISTENT; - else - q->flags &= ~EXEC_QUEUE_FLAG_PERSISTENT; - - return 0; -} - static int exec_queue_set_job_timeout(struct xe_device *xe, struct xe_exec_queue *q, u64 value, bool create) { @@ -414,7 +396,6 @@ static const xe_exec_queue_set_property_fn exec_queue_set_property_funcs[] = { [DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY] = exec_queue_set_priority, [DRM_XE_EXEC_QUEUE_SET_PROPERTY_TIMESLICE] = exec_queue_set_timeslice, [DRM_XE_EXEC_QUEUE_SET_PROPERTY_PREEMPTION_TIMEOUT] = exec_queue_set_preemption_timeout, - [DRM_XE_EXEC_QUEUE_SET_PROPERTY_PERSISTENCE] = exec_queue_set_persistence, [DRM_XE_EXEC_QUEUE_SET_PROPERTY_JOB_TIMEOUT] = exec_queue_set_job_timeout, [DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_TRIGGER] = exec_queue_set_acc_trigger, [DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_NOTIFY] = exec_queue_set_acc_notify, @@ -441,6 +422,9 @@ static int exec_queue_user_ext_set_property(struct xe_device *xe, return -EINVAL; idx = array_index_nospec(ext.property, ARRAY_SIZE(exec_queue_set_property_funcs)); + if (!exec_queue_set_property_funcs[idx]) + return -EINVAL; + return exec_queue_set_property_funcs[idx](xe, q, ext.value, create); } @@ -704,9 +688,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data, } q = xe_exec_queue_create(xe, vm, logical_mask, - args->width, hwe, - xe_vm_in_lr_mode(vm) ? 0 : - EXEC_QUEUE_FLAG_PERSISTENT); + args->width, hwe, 0); up_read(&vm->lock); xe_vm_put(vm); if (IS_ERR(q)) @@ -728,8 +710,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data, goto kill_exec_queue; } - q->persistent.xef = xef; - mutex_lock(&xef->exec_queue.lock); err = xa_alloc(&xef->exec_queue.xa, &id, q, xa_limit_32b, GFP_KERNEL); mutex_unlock(&xef->exec_queue.lock); @@ -872,10 +852,7 @@ int xe_exec_queue_destroy_ioctl(struct drm_device *dev, void *data, if (XE_IOCTL_DBG(xe, !q)) return -ENOENT; - if (!(q->flags & EXEC_QUEUE_FLAG_PERSISTENT)) - xe_exec_queue_kill(q); - else - xe_device_add_persistent_exec_queues(xe, q); + xe_exec_queue_kill(q); trace_xe_exec_queue_close(q); xe_exec_queue_put(q); diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 8d4b7feb8c30..947bbc4b285d 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -105,16 +105,6 @@ struct xe_exec_queue { struct xe_guc_exec_queue *guc; }; - /** - * @persistent: persistent exec queue state - */ - struct { - /** @xef: file which this exec queue belongs to */ - struct xe_file *xef; - /** @link: link in list of persistent exec queues */ - struct list_head link; - } persistent; - union { /** * @parallel: parallel submission state diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c index 96b5224eb478..42d01bbbf7d0 100644 --- a/drivers/gpu/drm/xe/xe_execlist.c +++ b/drivers/gpu/drm/xe/xe_execlist.c @@ -378,8 +378,6 @@ static void execlist_exec_queue_fini_async(struct work_struct *w) list_del(&exl->active_link); spin_unlock_irqrestore(&exl->port->lock, flags); - if (q->flags & EXEC_QUEUE_FLAG_PERSISTENT) - xe_device_remove_persistent_exec_queues(xe, q); drm_sched_entity_fini(&exl->entity); drm_sched_fini(&exl->sched); kfree(exl); diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 54ffcfcdd41f..f22ae717b0b2 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1028,8 +1028,6 @@ static void __guc_exec_queue_fini_async(struct work_struct *w) if (xe_exec_queue_is_lr(q)) cancel_work_sync(&ge->lr_tdr); - if (q->flags & EXEC_QUEUE_FLAG_PERSISTENT) - xe_device_remove_persistent_exec_queues(gt_to_xe(q->gt), q); release_guc_id(guc, q); xe_sched_entity_fini(&ge->entity); xe_sched_fini(&ge->sched); diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 9fa3ae324731..6d11ee9e571a 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -1046,7 +1046,6 @@ struct drm_xe_exec_queue_create { #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY 0 #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_TIMESLICE 1 #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_PREEMPTION_TIMEOUT 2 -#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_PERSISTENCE 3 #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_JOB_TIMEOUT 4 #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_TRIGGER 5 #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_NOTIFY 6 From 85ce8e1d6d73e8d54cb244d10dd4021771231746 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:52 +0800 Subject: [PATCH 221/327] iommu/vt-d: Track nested domains in parent Today the parent domain (s2_domain) is unaware of which DID's are used by and which devices are attached to nested domains (s1_domain) nested on it. This leads to a problem that some operations (flush iotlb/devtlb and enable dirty tracking) on parent domain only apply to DID's and devices directly tracked in the parent domain hence are incomplete. This tracks the nested domains in list in parent domain. With this, operations on parent domain can loop the nested domains and refer to the devices and iommu_array to ensure the operations on parent domain take effect on all the affected devices and iommus. Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-2-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 18 ++++++++++++++---- drivers/iommu/intel/iommu.h | 6 ++++++ drivers/iommu/intel/nested.c | 12 +++++++++++- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6fb5f6fceea1..e393c62776f3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3883,6 +3883,7 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; struct iommu_domain *domain; /* Must be NESTING domain */ @@ -3908,11 +3909,16 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, if (!domain) return ERR_PTR(-ENOMEM); - if (nested_parent) - to_dmar_domain(domain)->nested_parent = true; + dmar_domain = to_dmar_domain(domain); + + if (nested_parent) { + dmar_domain->nested_parent = true; + INIT_LIST_HEAD(&dmar_domain->s1_domains); + spin_lock_init(&dmar_domain->s1_lock); + } if (dirty_tracking) { - if (to_dmar_domain(domain)->use_first_level) { + if (dmar_domain->use_first_level) { iommu_domain_free(domain); return ERR_PTR(-EOPNOTSUPP); } @@ -3924,8 +3930,12 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, static void intel_iommu_domain_free(struct iommu_domain *domain) { + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + WARN_ON(dmar_domain->nested_parent && + !list_empty(&dmar_domain->s1_domains)); if (domain != &si_domain->domain) - domain_exit(to_dmar_domain(domain)); + domain_exit(dmar_domain); } int prepare_domain_attach_device(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index d02f916d8e59..9b27edb73aa9 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -627,6 +627,10 @@ struct dmar_domain { int agaw; /* maximum mapped address */ u64 max_addr; + /* Protect the s1_domains list */ + spinlock_t s1_lock; + /* Track s1_domains nested on this domain */ + struct list_head s1_domains; }; /* Nested user domain */ @@ -637,6 +641,8 @@ struct dmar_domain { unsigned long s1_pgtbl; /* page table attributes */ struct iommu_hwpt_vtd_s1 s1_cfg; + /* link to parent domain siblings */ + struct list_head s2_link; }; }; diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index f26c7f1c46cc..6a75f6eb18f1 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -70,7 +70,13 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, static void intel_nested_domain_free(struct iommu_domain *domain) { - kfree(to_dmar_domain(domain)); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct dmar_domain *s2_domain = dmar_domain->s2_domain; + + spin_lock(&s2_domain->s1_lock); + list_del(&dmar_domain->s2_link); + spin_unlock(&s2_domain->s1_lock); + kfree(dmar_domain); } static void nested_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, @@ -201,5 +207,9 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, spin_lock_init(&domain->lock); xa_init(&domain->iommu_array); + spin_lock(&s2_domain->s1_lock); + list_add(&domain->s2_link, &s2_domain->s1_domains); + spin_unlock(&s2_domain->s1_lock); + return &domain->domain; } From 0455d317f533e4427ddaa805563ff48711f05810 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:53 +0800 Subject: [PATCH 222/327] iommu/vt-d: Add __iommu_flush_iotlb_psi() Add __iommu_flush_iotlb_psi() to do the psi iotlb flush with a DID input rather than calculating it within the helper. This is useful when flushing cache for parent domain which reuses DIDs of its nested domains. Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-3-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 78 ++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e393c62776f3..dbdb8366c42a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1368,6 +1368,46 @@ static void domain_flush_pasid_iotlb(struct intel_iommu *iommu, spin_unlock_irqrestore(&domain->lock, flags); } +static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, + unsigned long pfn, unsigned int pages, + int ih) +{ + unsigned int aligned_pages = __roundup_pow_of_two(pages); + unsigned long bitmask = aligned_pages - 1; + unsigned int mask = ilog2(aligned_pages); + u64 addr = (u64)pfn << VTD_PAGE_SHIFT; + + /* + * PSI masks the low order bits of the base address. If the + * address isn't aligned to the mask, then compute a mask value + * needed to ensure the target range is flushed. + */ + if (unlikely(bitmask & pfn)) { + unsigned long end_pfn = pfn + pages - 1, shared_bits; + + /* + * Since end_pfn <= pfn + bitmask, the only way bits + * higher than bitmask can differ in pfn and end_pfn is + * by carrying. This means after masking out bitmask, + * high bits starting with the first set bit in + * shared_bits are all equal in both pfn and end_pfn. + */ + shared_bits = ~(pfn ^ end_pfn) & ~bitmask; + mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; + } + + /* + * Fallback to domain selective flush if no PSI support or + * the size is too big. + */ + if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, + DMA_TLB_PSI_FLUSH); +} + static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, struct dmar_domain *domain, unsigned long pfn, unsigned int pages, @@ -1384,42 +1424,10 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, if (ih) ih = 1 << 6; - if (domain->use_first_level) { + if (domain->use_first_level) domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih); - } else { - unsigned long bitmask = aligned_pages - 1; - - /* - * PSI masks the low order bits of the base address. If the - * address isn't aligned to the mask, then compute a mask value - * needed to ensure the target range is flushed. - */ - if (unlikely(bitmask & pfn)) { - unsigned long end_pfn = pfn + pages - 1, shared_bits; - - /* - * Since end_pfn <= pfn + bitmask, the only way bits - * higher than bitmask can differ in pfn and end_pfn is - * by carrying. This means after masking out bitmask, - * high bits starting with the first set bit in - * shared_bits are all equal in both pfn and end_pfn. - */ - shared_bits = ~(pfn ^ end_pfn) & ~bitmask; - mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; - } - - /* - * Fallback to domain selective flush if no PSI support or - * the size is too big. - */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, - DMA_TLB_PSI_FLUSH); - } + else + __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih); /* * In caching mode, changes of pages from non-present to present require From 821985301124c0a5e7ca15fc69a9a531e9442d70 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:54 +0800 Subject: [PATCH 223/327] iommu/vt-d: Add missing iotlb flush for parent domain If a domain is used as the parent in nested translation its mappings might be cached using DID of the nested domain. But the existing code ignores this fact to only invalidate the iotlb entries tagged by the domain's own DID. Loop the s1_domains list, if any, to invalidate all iotlb entries related to the target s2 address range. According to VT-d spec there is no need for software to explicitly flush the affected s1 cache. It's implicitly done by HW when s2 cache is invalidated. Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation") Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-4-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index dbdb8366c42a..e3dbcae95975 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1451,6 +1451,28 @@ static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain * iommu_flush_write_buffer(iommu); } +/* + * Flush the relevant caches in nested translation if the domain + * also serves as a parent + */ +static void parent_domain_flush(struct dmar_domain *domain, + unsigned long pfn, + unsigned long pages, int ih) +{ + struct dmar_domain *s1_domain; + + spin_lock(&domain->s1_lock); + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + struct iommu_domain_info *info; + unsigned long i; + + xa_for_each(&s1_domain->iommu_array, i, info) + __iommu_flush_iotlb_psi(info->iommu, info->did, + pfn, pages, ih); + } + spin_unlock(&domain->s1_lock); +} + static void intel_flush_iotlb_all(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); @@ -1470,6 +1492,9 @@ static void intel_flush_iotlb_all(struct iommu_domain *domain) if (!cap_caching_mode(iommu->cap)) iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); } + + if (dmar_domain->nested_parent) + parent_domain_flush(dmar_domain, 0, -1, 0); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -1993,6 +2018,9 @@ static void switch_to_super_page(struct dmar_domain *domain, iommu_flush_iotlb_psi(info->iommu, domain, start_pfn, lvl_pages, 0, 0); + if (domain->nested_parent) + parent_domain_flush(domain, start_pfn, + lvl_pages, 0); } pte++; @@ -4125,6 +4153,9 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain, start_pfn, nrpages, list_empty(&gather->freelist), 0); + if (dmar_domain->nested_parent) + parent_domain_flush(dmar_domain, start_pfn, nrpages, + list_empty(&gather->freelist)); put_pages_list(&gather->freelist); } From 29e10487d6df050afeee886b7c1da208f389cb5b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:55 +0800 Subject: [PATCH 224/327] iommu/vt-d: Update iotlb in nested domain attach Should call domain_update_iotlb() to update the has_iotlb_device flag of the domain after attaching device to nested domain. Without it, this flag is not set properly and would result in missing device TLB flush. Fixes: 9838f2bb6b6b ("iommu/vt-d: Set the nested domain to a device") Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-5-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 4 +--- drivers/iommu/intel/iommu.h | 1 + drivers/iommu/intel/nested.c | 2 ++ 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e3dbcae95975..711c3e3fe095 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -396,8 +396,6 @@ static int domain_update_device_node(struct dmar_domain *domain) return nid; } -static void domain_update_iotlb(struct dmar_domain *domain); - /* Return the super pagesize bitmap if supported. */ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -1218,7 +1216,7 @@ domain_lookup_dev_info(struct dmar_domain *domain, return NULL; } -static void domain_update_iotlb(struct dmar_domain *domain) +void domain_update_iotlb(struct dmar_domain *domain) { struct dev_pasid_info *dev_pasid; struct device_domain_info *info; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 9b27edb73aa9..4145c04cb1c6 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1066,6 +1066,7 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, */ #define QI_OPT_WAIT_DRAIN BIT(0) +void domain_update_iotlb(struct dmar_domain *domain); int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 6a75f6eb18f1..d5af5925a31c 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -65,6 +65,8 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, list_add(&info->link, &dmar_domain->devices); spin_unlock_irqrestore(&dmar_domain->lock, flags); + domain_update_iotlb(dmar_domain); + return 0; } From 5e54e861f16f37c84ae68d6d4bbd3de54b668317 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:56 +0800 Subject: [PATCH 225/327] iommu/vt-d: Add missing device iotlb flush for parent domain ATS-capable devices cache the result of nested translation. This result relies on the mappings in s2 domain (a.k.a. parent). When there are modifications in the s2 domain, the related nested translation caches on the device should be flushed. This includes the devices that are attached to the s1 domain. However, the existing code ignores this fact to only loops its own devices. As there is no easy way to identify the exact set of nested translations affected by the change of s2 domain. So, this just flushes the entire device iotlb on the device. As above, driver loops the s2 domain's s1_domains list and loops the devices list of each s1_domain to flush the entire device iotlb on the devices. Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation") Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-6-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 711c3e3fe095..2eee83b5441b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1461,12 +1461,30 @@ static void parent_domain_flush(struct dmar_domain *domain, spin_lock(&domain->s1_lock); list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + struct device_domain_info *device_info; struct iommu_domain_info *info; + unsigned long flags; unsigned long i; xa_for_each(&s1_domain->iommu_array, i, info) __iommu_flush_iotlb_psi(info->iommu, info->did, pfn, pages, ih); + + if (!s1_domain->has_iotlb_device) + continue; + + spin_lock_irqsave(&s1_domain->lock, flags); + list_for_each_entry(device_info, &s1_domain->devices, link) + /* + * Address translation cache in device side caches the + * result of nested translation. There is no easy way + * to identify the exact set of nested translations + * affected by a change in S2. So just flush the entire + * device cache. + */ + __iommu_flush_dev_iotlb(device_info, 0, + MAX_AGAW_PFN_WIDTH); + spin_unlock_irqrestore(&s1_domain->lock, flags); } spin_unlock(&domain->s1_lock); } From 56ecaf6c5834ace14941d7f13dceb48bc3327111 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:57 +0800 Subject: [PATCH 226/327] iommu/vt-d: Remove domain parameter for intel_pasid_setup_dirty_tracking() The only usage of input @domain is to get the domain id (DID) to flush cache after setting dirty tracking. However, DID can be obtained from the pasid entry. So no need to pass in domain. This can make this helper cleaner when adding the missing dirty tracking for the parent domain, which needs to use the DID of nested domain. Signed-off-by: Yi Liu Reviewed-by: Joao Martins Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-7-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 7 +++---- drivers/iommu/intel/pasid.c | 3 +-- drivers/iommu/intel/pasid.h | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2eee83b5441b..d286b8554227 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4741,8 +4741,7 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, goto out_unlock; list_for_each_entry(info, &dmar_domain->devices, link) { - ret = intel_pasid_setup_dirty_tracking(info->iommu, - info->domain, info->dev, + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, IOMMU_NO_PASID, enable); if (ret) goto err_unwind; @@ -4756,8 +4755,8 @@ out_unlock: err_unwind: list_for_each_entry(info, &dmar_domain->devices, link) - intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain, - info->dev, IOMMU_NO_PASID, + intel_pasid_setup_dirty_tracking(info->iommu, info->dev, + IOMMU_NO_PASID, dmar_domain->dirty_tracking); spin_unlock(&dmar_domain->lock); return ret; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 3239cefa4c33..a32d7e509842 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -428,7 +428,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, * Set up dirty tracking on a second only or nested translation type. */ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, - struct dmar_domain *domain, struct device *dev, u32 pasid, bool enabled) { @@ -445,7 +444,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, return -ENODEV; } - did = domain_id_iommu(domain, iommu); + did = pasid_get_domain_id(pte); pgtt = pasid_pte_get_pgtt(pte); if (pgtt != PASID_ENTRY_PGTT_SL_ONLY && pgtt != PASID_ENTRY_PGTT_NESTED) { diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 8d40d4c66e31..487ede039bdd 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -307,7 +307,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, - struct dmar_domain *domain, struct device *dev, u32 pasid, bool enabled); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, From 0c7f2497b39da44253d7bcf2b41f52b0048859ad Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:58 +0800 Subject: [PATCH 227/327] iommu/vt-d: Wrap the dirty tracking loop to be a helper Add device_set_dirty_tracking() to loop all the devices and set the dirty tracking per the @enable parameter. Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Joao Martins Link: https://lore.kernel.org/r/20240208082307.15759-8-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d286b8554227..2ad8fbe6dc21 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4729,23 +4729,38 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) return vtd; } +/* + * Set dirty tracking for the device list of a domain. The caller must + * hold the domain->lock when calling it. + */ +static int device_set_dirty_tracking(struct list_head *devices, bool enable) +{ + struct device_domain_info *info; + int ret = 0; + + list_for_each_entry(info, devices, link) { + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, + IOMMU_NO_PASID, enable); + if (ret) + break; + } + + return ret; +} + static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct device_domain_info *info; int ret; spin_lock(&dmar_domain->lock); if (dmar_domain->dirty_tracking == enable) goto out_unlock; - list_for_each_entry(info, &dmar_domain->devices, link) { - ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, - IOMMU_NO_PASID, enable); - if (ret) - goto err_unwind; - } + ret = device_set_dirty_tracking(&dmar_domain->devices, enable); + if (ret) + goto err_unwind; dmar_domain->dirty_tracking = enable; out_unlock: @@ -4754,10 +4769,8 @@ out_unlock: return 0; err_unwind: - list_for_each_entry(info, &dmar_domain->devices, link) - intel_pasid_setup_dirty_tracking(info->iommu, info->dev, - IOMMU_NO_PASID, - dmar_domain->dirty_tracking); + device_set_dirty_tracking(&dmar_domain->devices, + dmar_domain->dirty_tracking); spin_unlock(&dmar_domain->lock); return ret; } From f1e1610950eac0af5e40f6ee02315952f78192f7 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:15:59 +0800 Subject: [PATCH 228/327] iommu/vt-d: Add missing dirty tracking set for parent domain Setting dirty tracking for a s2 domain requires to loop all the related devices and set the dirty tracking enable bit in the PASID table entry. This includes the devices that are attached to the nested domains of a s2 domain if this s2 domain is used as parent. However, the existing dirty tracking set only loops s2 domain's own devices. It will miss dirty page logs in the parent domain. Now, the parent domain tracks the nested domains, so it can loop the nested domains and the devices attached to the nested domains to ensure dirty tracking on the parent is set completely. Fixes: b41e38e22539 ("iommu/vt-d: Add nested domain allocation") Signed-off-by: Yi Sun Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240208082307.15759-9-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2ad8fbe6dc21..11652e0bcab3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4748,6 +4748,35 @@ static int device_set_dirty_tracking(struct list_head *devices, bool enable) return ret; } +static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, + bool enable) +{ + struct dmar_domain *s1_domain; + unsigned long flags; + int ret; + + spin_lock(&domain->s1_lock); + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + ret = device_set_dirty_tracking(&s1_domain->devices, enable); + spin_unlock_irqrestore(&s1_domain->lock, flags); + if (ret) + goto err_unwind; + } + spin_unlock(&domain->s1_lock); + return 0; + +err_unwind: + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + device_set_dirty_tracking(&s1_domain->devices, + domain->dirty_tracking); + spin_unlock_irqrestore(&s1_domain->lock, flags); + } + spin_unlock(&domain->s1_lock); + return ret; +} + static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable) { @@ -4762,6 +4791,12 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, if (ret) goto err_unwind; + if (dmar_domain->nested_parent) { + ret = parent_domain_set_dirty_tracking(dmar_domain, enable); + if (ret) + goto err_unwind; + } + dmar_domain->dirty_tracking = enable; out_unlock: spin_unlock(&dmar_domain->lock); From 1f0198fce68340e0da2d438f4ea9fc20d2c958da Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 19 Feb 2024 19:16:00 +0800 Subject: [PATCH 229/327] iommu/vt-d: Set SSADE when attaching to a parent with dirty tracking Should set the SSADE (Second Stage Access/Dirty bit Enable) bit of the pasid entry when attaching a device to a nested domain if its parent has already enabled dirty tracking. Fixes: 111bf85c68f6 ("iommu/vt-d: Add helper to setup pasid nested translation") Signed-off-by: Yi Liu Reviewed-by: Joao Martins Link: https://lore.kernel.org/r/20240208091414.28133-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index a32d7e509842..108158e2b907 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -657,6 +657,8 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, pasid_set_domain_id(pte, did); pasid_set_address_width(pte, s2_domain->agaw); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (s2_domain->dirty_tracking) + pasid_set_ssade(pte); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); pasid_set_present(pte); spin_unlock(&iommu->lock); From 4578f989ed6b77c14af86bb882cc5ff0a46a69eb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 19 Feb 2024 19:16:01 +0800 Subject: [PATCH 230/327] iommu/vt-d: Fix constant-out-of-range warning On 32-bit builds, the vt-d driver causes a warning with clang: drivers/iommu/intel/nested.c:112:13: error: result of comparison of constant 18446744073709551615 with expression of type 'unsigned long' is always false [-Werror,-Wtautological-constant-out-of-range-compare] 112 | if (npages == U64_MAX) | ~~~~~~ ^ ~~~~~~~ Make the variable a 64-bit type, which matches both the caller and the use anyway. Fixes: f6f3721244a8 ("iommu/vt-d: Add iotlb flush for nested domain") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240213095832.455245-1-arnd@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index d5af5925a31c..a7d68f3d518a 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -103,7 +103,7 @@ static void nested_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, } static void intel_nested_flush_cache(struct dmar_domain *domain, u64 addr, - unsigned long npages, bool ih) + u64 npages, bool ih) { struct iommu_domain_info *info; unsigned int mask; From ecfac05f962f3aa567ae1796b2586a64fb97fe24 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 19 Feb 2024 13:19:40 -0800 Subject: [PATCH 231/327] drm/xe: Fix xe_vma_set_pte_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xe_vma_set_pte_size had a return value and did not set the 4k VMA flag. Both of these were incorrect. Fix these. Fixes: c47794bdd63d ("drm/xe: Set max pte size when skipping rebinds") Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20240219211942.3633795-2-matthew.brost@intel.com (cherry picked from commit 19adaccef8b246182dc89a7470aa7758245efd5d) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 7b00faa67287..0007448aa189 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2198,7 +2198,7 @@ static u64 xe_vma_max_pte_size(struct xe_vma *vma) return SZ_1G; /* Uninitialized, used max size */ } -static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size) +static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size) { switch (size) { case SZ_1G: @@ -2207,9 +2207,10 @@ static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size) case SZ_2M: vma->gpuva.flags |= XE_VMA_PTE_2M; break; + case SZ_4K: + vma->gpuva.flags |= XE_VMA_PTE_4K; + break; } - - return SZ_4K; } static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op) From 4cf8ffeb6625b7afd97b8d6698f1887071335c32 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 19 Feb 2024 13:19:41 -0800 Subject: [PATCH 232/327] drm/xe: Add XE_VMA_PTE_64K VMA flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add XE_VMA_PTE_64K VMA flag to ensure skipping rebinds does not cross 64k page boundaries. Fixes: 8f33b4f054fc ("drm/xe: Avoid doing rebinds") Fixes: c47794bdd63d ("drm/xe: Set max pte size when skipping rebinds") Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20240219211942.3633795-3-matthew.brost@intel.com (cherry picked from commit 15f0e0c2c46dddd8ee56d9b3db679fd302cc4b91) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_pt.c | 6 ++++-- drivers/gpu/drm/xe/xe_vm.c | 5 +++++ drivers/gpu/drm/xe/xe_vm_types.h | 1 + 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index ac19bfa3f798..d237eeb3513d 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -499,10 +499,12 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset, * this device *requires* 64K PTE size for VRAM, fail. */ if (level == 0 && !xe_parent->is_compact) { - if (xe_pt_is_pte_ps64K(addr, next, xe_walk)) + if (xe_pt_is_pte_ps64K(addr, next, xe_walk)) { + xe_walk->vma->gpuva.flags |= XE_VMA_PTE_64K; pte |= XE_PTE_PS64; - else if (XE_WARN_ON(xe_walk->needs_64K)) + } else if (XE_WARN_ON(xe_walk->needs_64K)) { return -EINVAL; + } } ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 0007448aa189..a0fc3640d3fa 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2192,6 +2192,8 @@ static u64 xe_vma_max_pte_size(struct xe_vma *vma) return SZ_1G; else if (vma->gpuva.flags & XE_VMA_PTE_2M) return SZ_2M; + else if (vma->gpuva.flags & XE_VMA_PTE_64K) + return SZ_64K; else if (vma->gpuva.flags & XE_VMA_PTE_4K) return SZ_4K; @@ -2207,6 +2209,9 @@ static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size) case SZ_2M: vma->gpuva.flags |= XE_VMA_PTE_2M; break; + case SZ_64K: + vma->gpuva.flags |= XE_VMA_PTE_64K; + break; case SZ_4K: vma->gpuva.flags |= XE_VMA_PTE_4K; break; diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 5ac9c5bebabc..91800ce70845 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -29,6 +29,7 @@ struct xe_vm; #define XE_VMA_PTE_4K (DRM_GPUVA_USERBITS << 5) #define XE_VMA_PTE_2M (DRM_GPUVA_USERBITS << 6) #define XE_VMA_PTE_1G (DRM_GPUVA_USERBITS << 7) +#define XE_VMA_PTE_64K (DRM_GPUVA_USERBITS << 8) /** struct xe_userptr - User pointer */ struct xe_userptr { From 5b672ec3f5e15062b76d280f8a4df15e763f6abe Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 19 Feb 2024 13:19:42 -0800 Subject: [PATCH 233/327] drm/xe: Return 2MB page size for compact 64k PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compact 64k PTEs are only intended to be used within a single VMA which covers the entire 2MB range of the compact 64k PTEs. Add XE_VMA_PTE_COMPACT VMA flag to indicate compact 64k PTEs are used and update xe_vma_max_pte_size to return at least 2MB if set. v2: Include missing changes Fixes: 8f33b4f054fc ("drm/xe: Avoid doing rebinds") Fixes: c47794bdd63d ("drm/xe: Set max pte size when skipping rebinds") Reported-by: Paulo Zanoni Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/758 Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20240219211942.3633795-4-matthew.brost@intel.com (cherry picked from commit 0f688c0eb63a643ef0568b29b12cefbb23181e1a) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_pt.c | 5 ++++- drivers/gpu/drm/xe/xe_vm.c | 2 +- drivers/gpu/drm/xe/xe_vm_types.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index d237eeb3513d..6653c045f3c9 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -547,13 +547,16 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset, *child = &xe_child->base; /* - * Prefer the compact pagetable layout for L0 if possible. + * Prefer the compact pagetable layout for L0 if possible. Only + * possible if VMA covers entire 2MB region as compact 64k and + * 4k pages cannot be mixed within a 2MB region. * TODO: Suballocate the pt bo to avoid wasting a lot of * memory. */ if (GRAPHICS_VERx100(tile_to_xe(xe_walk->tile)) >= 1250 && level == 1 && covers && xe_pt_scan_64K(addr, next, xe_walk)) { walk->shifts = xe_compact_pt_shifts; + xe_walk->vma->gpuva.flags |= XE_VMA_PTE_COMPACT; flags |= XE_PDE_64K; xe_child->is_compact = true; } diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index a0fc3640d3fa..921ca28d49dd 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2190,7 +2190,7 @@ static u64 xe_vma_max_pte_size(struct xe_vma *vma) { if (vma->gpuva.flags & XE_VMA_PTE_1G) return SZ_1G; - else if (vma->gpuva.flags & XE_VMA_PTE_2M) + else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT)) return SZ_2M; else if (vma->gpuva.flags & XE_VMA_PTE_64K) return SZ_64K; diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 91800ce70845..a603cc2eb56b 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -30,6 +30,7 @@ struct xe_vm; #define XE_VMA_PTE_2M (DRM_GPUVA_USERBITS << 6) #define XE_VMA_PTE_1G (DRM_GPUVA_USERBITS << 7) #define XE_VMA_PTE_64K (DRM_GPUVA_USERBITS << 8) +#define XE_VMA_PTE_COMPACT (DRM_GPUVA_USERBITS << 9) /** struct xe_userptr - User pointer */ struct xe_userptr { From e2941a482a5de088b6dd75a985a76ff486383b7e Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Tue, 6 Feb 2024 11:27:31 -0800 Subject: [PATCH 234/327] drm/xe/xe_gt_idle: Drop redundant newline in name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newline in name is redunant and produces an unnecessary empty line during 'cat name'. Newline is added during sysfs_emit. See '27a1a1e2e47d ("drm/xe: stringify the argument to avoid potential vulnerability")'. v2: Add Fixes tag (Riana) Fixes: 7b076d14f21a ("drm/xe/mtl: Add support to get C6 residency/status of MTL") Reviewed-by: Riana Tauro Signed-off-by: Ashutosh Dixit (cherry picked from commit e5626eb80026c4b63f8682cdeca1456303c65791) Link: https://patchwork.freedesktop.org/patch/msgid/20240206192731.3533608-1-ashutosh.dixit@intel.com Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_idle.c b/drivers/gpu/drm/xe/xe_gt_idle.c index 9358f7336889..9fcae65b6469 100644 --- a/drivers/gpu/drm/xe/xe_gt_idle.c +++ b/drivers/gpu/drm/xe/xe_gt_idle.c @@ -145,10 +145,10 @@ void xe_gt_idle_sysfs_init(struct xe_gt_idle *gtidle) } if (xe_gt_is_media_type(gt)) { - sprintf(gtidle->name, "gt%d-mc\n", gt->info.id); + sprintf(gtidle->name, "gt%d-mc", gt->info.id); gtidle->idle_residency = xe_guc_pc_mc6_residency; } else { - sprintf(gtidle->name, "gt%d-rc\n", gt->info.id); + sprintf(gtidle->name, "gt%d-rc", gt->info.id); gtidle->idle_residency = xe_guc_pc_rc6_residency; } From 8d3a7dfb801d157ac423261d7cd62c33e95375f8 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 21 Feb 2024 09:27:31 +0000 Subject: [PATCH 235/327] KVM: arm64: vgic-its: Test for valid IRQ in its_sync_lpi_pending_table() vgic_get_irq() may not return a valid descriptor if there is no ITS that holds a valid translation for the specified INTID. If that is the case, it is safe to silently ignore it and continue processing the LPI pending table. Cc: stable@vger.kernel.org Fixes: 33d3bc9556a7 ("KVM: arm64: vgic-its: Read initial LPI pending table") Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240221092732.4126848-2-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-its.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index e2764d0ffa9f..082448de27ed 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -468,6 +468,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) } irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); + if (!irq) + continue; + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = pendmask & (1U << bit_nr); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); From 85a71ee9a0700f6c18862ef3b0011ed9dad99aca Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 21 Feb 2024 09:27:32 +0000 Subject: [PATCH 236/327] KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler It is possible that an LPI mapped in a different ITS gets unmapped while handling the MOVALL command. If that is the case, there is no state that can be migrated to the destination. Silently ignore it and continue migrating other LPIs. Cc: stable@vger.kernel.org Fixes: ff9c114394aa ("KVM: arm/arm64: GICv4: Handle MOVALL applied to a vPE") Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240221092732.4126848-3-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-its.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 082448de27ed..28a93074eca1 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -1435,6 +1435,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, for (i = 0; i < irq_count; i++) { irq = vgic_get_irq(kvm, NULL, intids[i]); + if (!irq) + continue; update_affinity(irq, vcpu2); From 6650d23f3e20ca00482a71a4ef900f0ea776fb15 Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Mon, 12 Feb 2024 19:35:48 -0800 Subject: [PATCH 237/327] drm/xe: Fix modpost warning on xe_mocs kunit module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit $ make W=1 -j100 M=drivers/gpu/drm/xe MODPOST drivers/gpu/drm/xe/Module.symvers WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/gpu/drm/xe/tests/xe_mocs_test.o Fix is identical to '1d425066f15f ("drm/xe: Fix modpost warning on kunit modules")'. Fixes: a6a4ea6d7d37 ("drm/xe: Add mocs kunit") Signed-off-by: Ashutosh Dixit Reviewed-by: Rodrigo Vivi (cherry picked from commit bb619d71224ea85ec94e0a83b2bb82ebe7df2a41) Link: https://patchwork.freedesktop.org/patch/msgid/20240213033548.76219-1-ashutosh.dixit@intel.com Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/tests/xe_mocs_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/xe/tests/xe_mocs_test.c b/drivers/gpu/drm/xe/tests/xe_mocs_test.c index ef56bd517b28..421b819fd4ba 100644 --- a/drivers/gpu/drm/xe/tests/xe_mocs_test.c +++ b/drivers/gpu/drm/xe/tests/xe_mocs_test.c @@ -21,4 +21,5 @@ kunit_test_suite(xe_mocs_test_suite); MODULE_AUTHOR("Intel Corporation"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xe_mocs kunit test"); MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); From 3b1ae9b71c2a97f848b00fb085a2bd29bddbe8d9 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Mon, 19 Feb 2024 18:25:14 +0530 Subject: [PATCH 238/327] octeontx2-af: Consider the action set by PF AF reserves MCAM entries for each PF, VF present in the system and populates the entry with DMAC and action with default RSS so that basic packet I/O works. Since PF/VF is not aware of the RSS action installed by AF, AF only fixup the actions of the rules installed by PF/VF with corresponding default RSS action. This worked well for rules installed by PF/VF for features like RX VLAN offload and DMAC filters but rules involving action like drop/forward to queue are also getting modified by AF. Hence fix it by setting the default RSS action only if requested by PF/VF. Fixes: 967db3529eca ("octeontx2-af: add support for multicast/promisc packet replication feature") Signed-off-by: Subbaraya Sundeep Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index e5d6156655ba..516adb50f9f6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -415,6 +415,10 @@ static void npc_fixup_vf_rule(struct rvu *rvu, struct npc_mcam *mcam, return; } + /* AF modifies given action iff PF/VF has requested for it */ + if ((entry->action & 0xFULL) != NIX_RX_ACTION_DEFAULT) + return; + /* copy VF default entry action to the VF mcam entry */ rx_action = npc_get_default_entry_action(rvu, mcam, blkaddr, target_func); From 56667da7399eb19af857e30f41bea89aa6fa812c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Feb 2024 14:12:20 +0000 Subject: [PATCH 239/327] net: implement lockless setsockopt(SO_PEEK_OFF) syzbot reported a lockdep violation [1] involving af_unix support of SO_PEEK_OFF. Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket sk_peek_off field), there is really no point to enforce a pointless thread safety in the kernel. After this patch : - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock. - skb_consume_udp() no longer has to acquire the socket lock. - af_unix no longer needs a special version of sk_set_peek_off(), because it does not lock u->iolock anymore. As a followup, we could replace prot->set_peek_off to be a boolean and avoid an indirect call, since we always use sk_set_peek_off(). [1] WARNING: possible circular locking dependency detected 6.8.0-rc4-syzkaller-00267-g0f1dd5e91e2b #0 Not tainted syz-executor.2/30025 is trying to acquire lock: ffff8880765e7d80 (&u->iolock){+.+.}-{3:3}, at: unix_set_peek_off+0x26/0xa0 net/unix/af_unix.c:789 but task is already holding lock: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1691 [inline] ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sockopt_lock_sock net/core/sock.c:1060 [inline] ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sk_setsockopt+0xe52/0x3360 net/core/sock.c:1193 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_UNIX){+.+.}-{0:0}: lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754 lock_sock_nested+0x48/0x100 net/core/sock.c:3524 lock_sock include/net/sock.h:1691 [inline] __unix_dgram_recvmsg+0x1275/0x12c0 net/unix/af_unix.c:2415 sock_recvmsg_nosec+0x18e/0x1d0 net/socket.c:1046 ____sys_recvmsg+0x3c0/0x470 net/socket.c:2801 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x474/0xae0 net/socket.c:2939 __sys_recvmmsg net/socket.c:3018 [inline] __do_sys_recvmmsg net/socket.c:3041 [inline] __se_sys_recvmmsg net/socket.c:3034 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3034 do_syscall_64+0xf9/0x240 entry_SYSCALL_64_after_hwframe+0x6f/0x77 -> #0 (&u->iolock){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain+0x18ca/0x58e0 kernel/locking/lockdep.c:3869 __lock_acquire+0x1345/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754 __mutex_lock_common kernel/locking/mutex.c:608 [inline] __mutex_lock+0x136/0xd70 kernel/locking/mutex.c:752 unix_set_peek_off+0x26/0xa0 net/unix/af_unix.c:789 sk_setsockopt+0x207e/0x3360 do_sock_setsockopt+0x2fb/0x720 net/socket.c:2307 __sys_setsockopt+0x1ad/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xf9/0x240 entry_SYSCALL_64_after_hwframe+0x6f/0x77 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_UNIX); lock(&u->iolock); lock(sk_lock-AF_UNIX); lock(&u->iolock); *** DEADLOCK *** 1 lock held by syz-executor.2/30025: #0: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1691 [inline] #0: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sockopt_lock_sock net/core/sock.c:1060 [inline] #0: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sk_setsockopt+0xe52/0x3360 net/core/sock.c:1193 stack backtrace: CPU: 0 PID: 30025 Comm: syz-executor.2 Not tainted 6.8.0-rc4-syzkaller-00267-g0f1dd5e91e2b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2e0 lib/dump_stack.c:106 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2187 check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain+0x18ca/0x58e0 kernel/locking/lockdep.c:3869 __lock_acquire+0x1345/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754 __mutex_lock_common kernel/locking/mutex.c:608 [inline] __mutex_lock+0x136/0xd70 kernel/locking/mutex.c:752 unix_set_peek_off+0x26/0xa0 net/unix/af_unix.c:789 sk_setsockopt+0x207e/0x3360 do_sock_setsockopt+0x2fb/0x720 net/socket.c:2307 __sys_setsockopt+0x1ad/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xf9/0x240 entry_SYSCALL_64_after_hwframe+0x6f/0x77 RIP: 0033:0x7f78a1c7dda9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f78a0fde0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007f78a1dac050 RCX: 00007f78a1c7dda9 RDX: 000000000000002a RSI: 0000000000000001 RDI: 0000000000000006 RBP: 00007f78a1cca47a R08: 0000000000000004 R09: 0000000000000000 R10: 0000000020000180 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000006e R14: 00007f78a1dac050 R15: 00007ffe5cd81ae8 Fixes: 859051dd165e ("bpf: Implement cgroup sockaddr hooks for unix sockets") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Daan De Meyer Cc: Kuniyuki Iwashima Cc: Martin KaFai Lau Cc: David Ahern Reviewed-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/sock.c | 23 +++++++++++------------ net/ipv4/udp.c | 7 +------ net/unix/af_unix.c | 19 +++---------------- 3 files changed, 15 insertions(+), 34 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 0a7f46c37f0c..5e78798456fd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1188,6 +1188,17 @@ int sk_setsockopt(struct sock *sk, int level, int optname, */ WRITE_ONCE(sk->sk_txrehash, (u8)val); return 0; + case SO_PEEK_OFF: + { + int (*set_peek_off)(struct sock *sk, int val); + + set_peek_off = READ_ONCE(sock->ops)->set_peek_off; + if (set_peek_off) + ret = set_peek_off(sk, val); + else + ret = -EOPNOTSUPP; + return ret; + } } sockopt_lock_sock(sk); @@ -1430,18 +1441,6 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; - case SO_PEEK_OFF: - { - int (*set_peek_off)(struct sock *sk, int val); - - set_peek_off = READ_ONCE(sock->ops)->set_peek_off; - if (set_peek_off) - ret = set_peek_off(sk, val); - else - ret = -EOPNOTSUPP; - break; - } - case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f631b0a21af4..e474b201900f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1589,12 +1589,7 @@ int udp_init_sock(struct sock *sk) void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { - if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) { - bool slow = lock_sock_fast(sk); - - sk_peek_offset_bwd(sk, len); - unlock_sock_fast(sk, slow); - } + sk_peek_offset_bwd(sk, len); if (!skb_unref(skb)) return; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 30b178ebba60..0748e7ea5210 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -782,19 +782,6 @@ static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, int); -static int unix_set_peek_off(struct sock *sk, int val) -{ - struct unix_sock *u = unix_sk(sk); - - if (mutex_lock_interruptible(&u->iolock)) - return -EINTR; - - WRITE_ONCE(sk->sk_peek_off, val); - mutex_unlock(&u->iolock); - - return 0; -} - #ifdef CONFIG_PROC_FS static int unix_count_nr_fds(struct sock *sk) { @@ -862,7 +849,7 @@ static const struct proto_ops unix_stream_ops = { .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, .splice_read = unix_stream_splice_read, - .set_peek_off = unix_set_peek_off, + .set_peek_off = sk_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; @@ -886,7 +873,7 @@ static const struct proto_ops unix_dgram_ops = { .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, - .set_peek_off = unix_set_peek_off, + .set_peek_off = sk_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; @@ -909,7 +896,7 @@ static const struct proto_ops unix_seqpacket_ops = { .sendmsg = unix_seqpacket_sendmsg, .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, - .set_peek_off = unix_set_peek_off, + .set_peek_off = sk_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; From d80f8e96d47d7374794a30fbed69be43f3388afc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 19 Feb 2024 08:40:15 -0600 Subject: [PATCH 240/327] net: ipa: don't overrun IPA suspend interrupt registers In newer hardware, IPA supports more than 32 endpoints. Some registers--such as IPA interrupt registers--represent endpoints as bits in a 4-byte register, and such registers are repeated as needed to represent endpoints beyond the first 32. In ipa_interrupt_suspend_clear_all(), we clear all pending IPA suspend interrupts by reading all status register(s) and writing corresponding registers to clear interrupt conditions. Unfortunately the number of registers to read/write is calculated incorrectly, and as a result we access *many* more registers than intended. This bug occurs only when the IPA hardware signals a SUSPEND interrupt, which happens when a packet is received for an endpoint (or its underlying GSI channel) that is suspended. This situation is difficult to reproduce, but possible. Fix this by correctly computing the number of interrupt registers to read and write. This is the only place in the code where registers that map endpoints or channels this way perform this calculation. Fixes: f298ba785e2d ("net: ipa: add a parameter to suspend registers") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_interrupt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipa/ipa_interrupt.c b/drivers/net/ipa/ipa_interrupt.c index 4bc05948f772..a78c692f2d3c 100644 --- a/drivers/net/ipa/ipa_interrupt.c +++ b/drivers/net/ipa/ipa_interrupt.c @@ -212,7 +212,7 @@ void ipa_interrupt_suspend_clear_all(struct ipa_interrupt *interrupt) u32 unit_count; u32 unit; - unit_count = roundup(ipa->endpoint_count, 32); + unit_count = DIV_ROUND_UP(ipa->endpoint_count, 32); for (unit = 0; unit < unit_count; unit++) { const struct reg *reg; u32 val; From aa82ac51d63328714645c827775d64dbfd9941f3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 Feb 2024 09:46:57 -0800 Subject: [PATCH 241/327] af_unix: Drop oob_skb ref before purging queue in GC. syzbot reported another task hung in __unix_gc(). [0] The current while loop assumes that all of the left candidates have oob_skb and calling kfree_skb(oob_skb) releases the remaining candidates. However, I missed a case that oob_skb has self-referencing fd and another fd and the latter sk is placed before the former in the candidate list. Then, the while loop never proceeds, resulting the task hung. __unix_gc() has the same loop just before purging the collected skb, so we can call kfree_skb(oob_skb) there and let __skb_queue_purge() release all inflight sockets. [0]: Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 2784 Comm: kworker/u4:8 Not tainted 6.8.0-rc4-syzkaller-01028-g71b605d32017 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 Workqueue: events_unbound __unix_gc RIP: 0010:__sanitizer_cov_trace_pc+0x0/0x70 kernel/kcov.c:200 Code: 89 fb e8 23 00 00 00 48 8b 3d 84 f5 1a 0c 48 89 de 5b e9 43 26 57 00 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1e fa 48 8b 04 24 65 48 8b 0d 90 52 70 7e 65 8b 15 91 52 70 RSP: 0018:ffffc9000a17fa78 EFLAGS: 00000287 RAX: ffffffff8a0a6108 RBX: ffff88802b6c2640 RCX: ffff88802c0b3b80 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000 RBP: ffffc9000a17fbf0 R08: ffffffff89383f1d R09: 1ffff1100ee5ff84 R10: dffffc0000000000 R11: ffffed100ee5ff85 R12: 1ffff110056d84ee R13: ffffc9000a17fae0 R14: 0000000000000000 R15: ffffffff8f47b840 FS: 0000000000000000(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffef5687ff8 CR3: 0000000029b34000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __unix_gc+0xe69/0xf40 net/unix/garbage.c:343 process_one_work kernel/workqueue.c:2633 [inline] process_scheduled_works+0x913/0x1420 kernel/workqueue.c:2706 worker_thread+0xa5f/0x1000 kernel/workqueue.c:2787 kthread+0x2ef/0x390 kernel/kthread.c:388 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242 Reported-and-tested-by: syzbot+ecab4d36f920c3574bf9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ecab4d36f920c3574bf9 Fixes: 25236c91b5ab ("af_unix: Fix task hung while purging oob_skb in GC.") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/garbage.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2ff7ddbaa782..2a81880dac7b 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -284,9 +284,17 @@ void unix_gc(void) * which are creating the cycle(s). */ skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) + list_for_each_entry(u, &gc_candidates, link) { scan_children(&u->sk, inc_inflight, &hitlist); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif + } + /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ @@ -314,18 +322,6 @@ void unix_gc(void) /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - while (!list_empty(&gc_candidates)) { - u = list_entry(gc_candidates.next, struct unix_sock, link); - if (u->oob_skb) { - struct sk_buff *skb = u->oob_skb; - - u->oob_skb = NULL; - kfree_skb(skb); - } - } -#endif - spin_lock(&unix_gc_lock); /* There could be io_uring registered files, just push them back to From 14dec56fdd4c70a0ebe40077368e367421ea6fef Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 19 Feb 2024 17:55:31 +0000 Subject: [PATCH 242/327] MAINTAINERS: Add framer headers to NETWORKING [GENERAL] The cited commit [1] added framer support under drivers/net/wan, which is covered by NETWORKING [GENERAL]. And it is implied that framer-provider.h and framer.h, which were also added buy the same patch, are also maintained as part of NETWORKING [GENERAL]. Make this explicit by adding these files to the corresponding section in MAINTAINERS. [1] 82c944d05b1a ("net: wan: Add framer framework support") Signed-off-by: Simon Horman Reviewed-by: Herve Codina Signed-off-by: David S. Miller --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a0697e2fb8e8..466a0fc46f76 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15237,6 +15237,8 @@ F: Documentation/networking/ F: Documentation/networking/net_cachelines/ F: Documentation/process/maintainer-netdev.rst F: Documentation/userspace-api/netlink/ +F: include/linux/framer/framer-provider.h +F: include/linux/framer/framer.h F: include/linux/in.h F: include/linux/indirect_call_wrapper.h F: include/linux/net.h From ed683b9bb91fc274383e222ba5873a9ee9033462 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Tue, 20 Feb 2024 10:54:12 +0100 Subject: [PATCH 243/327] sparc: Fix undefined reference to fb_is_primary_device Commit 55bffc8170bb ("fbdev: Split frame buffer support in FB and FB_CORE symbols") added a new FB_CORE Kconfig symbol, that can be enabled to only have fbcon/VT and DRM fbdev emulation, but without support for any legacy fbdev driver. Unfortunately, it missed to change the CONFIG_FB in arch/sparc makefiles, which leads to the following linking error in some sparc64 configurations: sparc64-linux-ld: drivers/video/fbdev/core/fbcon.o: in function `fbcon_fb_registered': >> fbcon.c:(.text+0x4f60): undefined reference to `fb_is_primary_device' Fixes: 55bffc8170bb ("fbdev: Split frame buffer support in FB and FB_CORE symbols") Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202401290306.IV8rhJ02-lkp@intel.com/ Signed-off-by: Javier Martinez Canillas Reviewed-by: Thomas Zimmermann Acked-by: Arnd Bergmann Cc: # v6.6+ Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20240220095428.3341195-1-javierm@redhat.com --- arch/sparc/Makefile | 2 +- arch/sparc/video/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile index 5f6035936131..2a03daa68f28 100644 --- a/arch/sparc/Makefile +++ b/arch/sparc/Makefile @@ -60,7 +60,7 @@ libs-y += arch/sparc/prom/ libs-y += arch/sparc/lib/ drivers-$(CONFIG_PM) += arch/sparc/power/ -drivers-$(CONFIG_FB) += arch/sparc/video/ +drivers-$(CONFIG_FB_CORE) += arch/sparc/video/ boot := arch/sparc/boot diff --git a/arch/sparc/video/Makefile b/arch/sparc/video/Makefile index 6baddbd58e4d..d4d83f1702c6 100644 --- a/arch/sparc/video/Makefile +++ b/arch/sparc/video/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_FB) += fbdev.o +obj-$(CONFIG_FB_CORE) += fbdev.o From 7adc0c1cfa7732b81bf7bf2ed16ffb99719ceebf Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 20 Feb 2024 14:43:54 -0400 Subject: [PATCH 244/327] iommufd: Reject non-zero data_type if no data_len is provided Since the current design doesn't forward the data_type to the driver to check unless there is a data_len/uptr for a driver specific struct we should check and ensure that data_type is 0 if data_len is 0. Otherwise any value is permitted. Fixes: bd529dbb661d ("iommufd: Add a nested HW pagetable object") Link: https://lore.kernel.org/r/0-v1-9b1ea6869554+110c60-iommufd_ck_data_type_jgg@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 3f3f1fa1a0a9..33d142f8057d 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -263,7 +263,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) if (cmd->__reserved) return -EOPNOTSUPP; - if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) + if ((cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) || + (cmd->data_type != IOMMU_HWPT_DATA_NONE && !cmd->data_len)) return -EINVAL; idev = iommufd_get_device(ucmd, cmd->dev_id); From 723a2cc8d69d4342b47dfddbfe6c19f1b135f09b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 16 Feb 2024 20:48:14 -0400 Subject: [PATCH 245/327] s390: use the correct count for __iowrite64_copy() The signature for __iowrite64_copy() requires the number of 64 bit quantities, not bytes. Multiple by 8 to get to a byte length before invoking zpci_memcpy_toio() Fixes: 87bc359b9822 ("s390/pci: speed up __iowrite64_copy by using pci store block insn") Acked-by: Niklas Schnelle Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-9223d11a7662+1d7785-s390_iowrite64_jgg@nvidia.com Signed-off-by: Heiko Carstens --- arch/s390/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 676ac74026a8..52a44e353796 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -252,7 +252,7 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, /* combine single writes by using store-block insn */ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) { - zpci_memcpy_toio(to, from, count); + zpci_memcpy_toio(to, from, count * 8); } void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, From e78fb4eac817308027da88d02e5d0213462a7562 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 20 Feb 2024 09:51:12 -0500 Subject: [PATCH 246/327] ring-buffer: Do not let subbuf be bigger than write mask The data on the subbuffer is measured by a write variable that also contains status flags. The counter is just 20 bits in length. If the subbuffer is bigger than then counter, it will fail. Make sure that the subbuffer can not be set to greater than the counter that keeps track of the data on the subbuffer. Link: https://lore.kernel.org/linux-trace-kernel/20240220095112.77e9cb81@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 2808e31ec12e5 ("ring-buffer: Add interface for configuring trace sub buffer size") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fd4bfe3ecf01..0699027b4f4c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5877,6 +5877,10 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) if (psize <= BUF_PAGE_HDR_SIZE) return -EINVAL; + /* Size of a subbuf cannot be greater than the write counter */ + if (psize > RB_WRITE_MASK + 1) + return -EINVAL; + old_order = buffer->subbuf_order; old_size = buffer->subbuf_size; From b820de741ae48ccf50dd95e297889c286ff4f760 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 15 Feb 2024 12:47:38 -0800 Subject: [PATCH 247/327] fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via libaio If kiocb_set_cancel_fn() is called for I/O submitted via io_uring, the following kernel warning appears: WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8 Call trace: kiocb_set_cancel_fn+0x9c/0xa8 ffs_epfile_read_iter+0x144/0x1d0 io_read+0x19c/0x498 io_issue_sqe+0x118/0x27c io_submit_sqes+0x25c/0x5fc __arm64_sys_io_uring_enter+0x104/0xab0 invoke_syscall+0x58/0x11c el0_svc_common+0xb4/0xf4 do_el0_svc+0x2c/0xb0 el0_svc+0x2c/0xa4 el0t_64_sync_handler+0x68/0xb4 el0t_64_sync+0x1a4/0x1a8 Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is submitted by libaio. Suggested-by: Jens Axboe Cc: Christoph Hellwig Cc: Avi Kivity Cc: Sandeep Dhavale Cc: Jens Axboe Cc: Greg Kroah-Hartman Cc: Kent Overstreet Cc: stable@vger.kernel.org Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240215204739.2677806-2-bvanassche@acm.org Signed-off-by: Christian Brauner --- fs/aio.c | 9 ++++++++- include/linux/fs.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index bb2ff48991f3..da18dbcfcb22 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -593,6 +593,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) struct kioctx *ctx = req->ki_ctx; unsigned long flags; + /* + * kiocb didn't come from aio or is neither a read nor a write, hence + * ignore it. + */ + if (!(iocb->ki_flags & IOCB_AIO_RW)) + return; + if (WARN_ON_ONCE(!list_empty(&req->ki_list))) return; @@ -1509,7 +1516,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; - req->ki_flags = req->ki_filp->f_iocb_flags; + req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..c2dcc98cb4c8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -352,6 +352,8 @@ enum rw_hint { * unrelated IO (like cache flushing, new IO generation, etc). */ #define IOCB_DIO_CALLER_COMP (1 << 22) +/* kiocb is a read or write operation submitted by fs/aio.c. */ +#define IOCB_AIO_RW (1 << 23) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ From 4cd12c6065dfcdeba10f49949bffcf383b3952d8 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Mon, 19 Feb 2024 00:09:33 +0900 Subject: [PATCH 248/327] bpf, sockmap: Fix NULL pointer dereference in sk_psock_verdict_data_ready() syzbot reported the following NULL pointer dereference issue [1]: BUG: kernel NULL pointer dereference, address: 0000000000000000 [...] RIP: 0010:0x0 [...] Call Trace: sk_psock_verdict_data_ready+0x232/0x340 net/core/skmsg.c:1230 unix_stream_sendmsg+0x9b4/0x1230 net/unix/af_unix.c:2293 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2584 ___sys_sendmsg net/socket.c:2638 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2667 do_syscall_64+0xf9/0x240 entry_SYSCALL_64_after_hwframe+0x6f/0x77 If sk_psock_verdict_data_ready() and sk_psock_stop_verdict() are called concurrently, psock->saved_data_ready can be NULL, causing the above issue. This patch fixes this issue by calling the appropriate data ready function using the sk_psock_data_ready() helper and protecting it from concurrency with sk->sk_callback_lock. Fixes: 6df7f764cd3c ("bpf, sockmap: Wake up polling after data copy") Reported-by: syzbot+fd7b34375c1c8ce29c93@syzkaller.appspotmail.com Signed-off-by: Shigeru Yoshida Signed-off-by: Daniel Borkmann Tested-by: syzbot+fd7b34375c1c8ce29c93@syzkaller.appspotmail.com Acked-by: John Fastabend Closes: https://syzkaller.appspot.com/bug?extid=fd7b34375c1c8ce29c93 [1] Link: https://lore.kernel.org/bpf/20240218150933.6004-1-syoshida@redhat.com --- net/core/skmsg.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 93ecfceac1bc..4d75ef9d24bf 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1226,8 +1226,11 @@ static void sk_psock_verdict_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); - if (psock) - psock->saved_data_ready(sk); + if (psock) { + read_lock_bh(&sk->sk_callback_lock); + sk_psock_data_ready(sk, psock); + read_unlock_bh(&sk->sk_callback_lock); + } rcu_read_unlock(); } } From 9bd405c48b0ac4de087c0c4440fd79597201b8a7 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Sat, 3 Feb 2024 21:26:40 +0000 Subject: [PATCH 249/327] cache: ax45mp_cache: Align end size to cache boundary in ax45mp_dma_cache_wback() Align the end size to cache boundary size in ax45mp_dma_cache_wback() callback likewise done in ax45mp_dma_cache_inv() callback. Additionally return early in case of start == end. Fixes: d34599bcd2e4 ("cache: Add L2 cache management for Andes AX45MP RISC-V core") Reported-by: Pavel Machek Link: https://lore.kernel.org/cip-dev/ZYsdKDiw7G+kxQ3m@duo.ucw.cz/ Signed-off-by: Lad Prabhakar Signed-off-by: Conor Dooley --- drivers/cache/ax45mp_cache.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cache/ax45mp_cache.c b/drivers/cache/ax45mp_cache.c index 57186c58dc84..1d7dd3d2c101 100644 --- a/drivers/cache/ax45mp_cache.c +++ b/drivers/cache/ax45mp_cache.c @@ -129,8 +129,12 @@ static void ax45mp_dma_cache_wback(phys_addr_t paddr, size_t size) unsigned long line_size; unsigned long flags; + if (unlikely(start == end)) + return; + line_size = ax45mp_priv.ax45mp_cache_line_size; start = start & (~(line_size - 1)); + end = ((end + line_size - 1) & (~(line_size - 1))); local_irq_save(flags); ax45mp_cpu_dcache_wb_range(start, end); local_irq_restore(flags); From fb33a46cd75e18773dd5a414744507d84ae90870 Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Tue, 20 Feb 2024 19:14:29 +0800 Subject: [PATCH 250/327] irqchip/mbigen: Don't use bus_get_dev_root() to find the parent bus_get_dev_root() returns sp->dev_root which is set in subsys_register(), but subsys_register() is not called by platform_bus_init(). Therefor for the platform_bus_type, bus_get_dev_root() always returns NULL. This makes mbigen_of_create_domain() always return -ENODEV. Don't try to retrieve the parent via bus_get_dev_root() and unconditionally hand a NULL pointer to of_platform_device_create() to fix this. Fixes: fea087fc291b ("irqchip/mbigen: move to use bus_get_dev_root()") Signed-off-by: Chen Jun Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240220111429.110666-1-chenjun102@huawei.com --- drivers/irqchip/irq-mbigen.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/irqchip/irq-mbigen.c b/drivers/irqchip/irq-mbigen.c index 5101a3fb11df..58881d313979 100644 --- a/drivers/irqchip/irq-mbigen.c +++ b/drivers/irqchip/irq-mbigen.c @@ -235,22 +235,17 @@ static const struct irq_domain_ops mbigen_domain_ops = { static int mbigen_of_create_domain(struct platform_device *pdev, struct mbigen_device *mgn_chip) { - struct device *parent; struct platform_device *child; struct irq_domain *domain; struct device_node *np; u32 num_pins; int ret = 0; - parent = bus_get_dev_root(&platform_bus_type); - if (!parent) - return -ENODEV; - for_each_child_of_node(pdev->dev.of_node, np) { if (!of_property_read_bool(np, "interrupt-controller")) continue; - child = of_platform_device_create(np, NULL, parent); + child = of_platform_device_create(np, NULL, NULL); if (!child) { ret = -ENOMEM; break; @@ -273,7 +268,6 @@ static int mbigen_of_create_domain(struct platform_device *pdev, } } - put_device(parent); if (ret) of_node_put(np); From 9cec467d0502b24660f413a0e8fc782903b46d5b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 19 Feb 2024 16:44:30 +0100 Subject: [PATCH 251/327] ata: libata-core: Do not call ata_dev_power_set_standby() twice For regular system shutdown, ata_dev_power_set_standby() will be executed twice: once the scsi device is removed and another when ata_pci_shutdown_one() executes and EH completes unloading the devices. Make the second call to ata_dev_power_set_standby() do nothing by using ata_dev_power_is_active() and return if the device is already in standby. Fixes: 2da4c5e24e86 ("ata: libata-core: Improve ata_dev_power_set_active()") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 91 ++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index d9f80f4f70f5..be3412cdb22e 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2001,51 +2001,6 @@ bool ata_dev_power_init_tf(struct ata_device *dev, struct ata_taskfile *tf, return true; } -/** - * ata_dev_power_set_standby - Set a device power mode to standby - * @dev: target device - * - * Issue a STANDBY IMMEDIATE command to set a device power mode to standby. - * For an HDD device, this spins down the disks. - * - * LOCKING: - * Kernel thread context (may sleep). - */ -void ata_dev_power_set_standby(struct ata_device *dev) -{ - unsigned long ap_flags = dev->link->ap->flags; - struct ata_taskfile tf; - unsigned int err_mask; - - /* If the device is already sleeping, do nothing. */ - if (dev->flags & ATA_DFLAG_SLEEPING) - return; - - /* - * Some odd clown BIOSes issue spindown on power off (ACPI S4 or S5) - * causing some drives to spin up and down again. For these, do nothing - * if we are being called on shutdown. - */ - if ((ap_flags & ATA_FLAG_NO_POWEROFF_SPINDOWN) && - system_state == SYSTEM_POWER_OFF) - return; - - if ((ap_flags & ATA_FLAG_NO_HIBERNATE_SPINDOWN) && - system_entering_hibernation()) - return; - - /* Issue STANDBY IMMEDIATE command only if supported by the device */ - if (!ata_dev_power_init_tf(dev, &tf, false)) - return; - - ata_dev_notice(dev, "Entering standby power mode\n"); - - err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); - if (err_mask) - ata_dev_err(dev, "STANDBY IMMEDIATE failed (err_mask=0x%x)\n", - err_mask); -} - static bool ata_dev_power_is_active(struct ata_device *dev) { struct ata_taskfile tf; @@ -2073,6 +2028,52 @@ static bool ata_dev_power_is_active(struct ata_device *dev) return tf.nsect == 0xff; } +/** + * ata_dev_power_set_standby - Set a device power mode to standby + * @dev: target device + * + * Issue a STANDBY IMMEDIATE command to set a device power mode to standby. + * For an HDD device, this spins down the disks. + * + * LOCKING: + * Kernel thread context (may sleep). + */ +void ata_dev_power_set_standby(struct ata_device *dev) +{ + unsigned long ap_flags = dev->link->ap->flags; + struct ata_taskfile tf; + unsigned int err_mask; + + /* If the device is already sleeping or in standby, do nothing. */ + if ((dev->flags & ATA_DFLAG_SLEEPING) || + !ata_dev_power_is_active(dev)) + return; + + /* + * Some odd clown BIOSes issue spindown on power off (ACPI S4 or S5) + * causing some drives to spin up and down again. For these, do nothing + * if we are being called on shutdown. + */ + if ((ap_flags & ATA_FLAG_NO_POWEROFF_SPINDOWN) && + system_state == SYSTEM_POWER_OFF) + return; + + if ((ap_flags & ATA_FLAG_NO_HIBERNATE_SPINDOWN) && + system_entering_hibernation()) + return; + + /* Issue STANDBY IMMEDIATE command only if supported by the device */ + if (!ata_dev_power_init_tf(dev, &tf, false)) + return; + + ata_dev_notice(dev, "Entering standby power mode\n"); + + err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); + if (err_mask) + ata_dev_err(dev, "STANDBY IMMEDIATE failed (err_mask=0x%x)\n", + err_mask); +} + /** * ata_dev_power_set_active - Set a device power mode to active * @dev: target device From ec4308ecfc887128a468f03fb66b767559c57c23 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 19 Feb 2024 18:58:06 +0000 Subject: [PATCH 252/327] irqchip/gic-v3-its: Do not assume vPE tables are preallocated The GIC/ITS code is designed to ensure to pick up any preallocated LPI tables on the redistributors, as enabling LPIs is a one-way switch. There is no such restriction for vLPIs, and for GICv4.1 it is expected to allocate a new vPE table at boot. This works as intended when initializing an ITS, however when setting up a redistributor in cpu_init_lpis() the early return for preallocated RD tables skips straight past the GICv4 setup. This all comes to a head when trying to kexec() into a new kernel, as the new kernel silently fails to set up GICv4, leading to a complete loss of SGIs and LPIs for KVM VMs. Slap a band-aid on the problem by ensuring its_cpu_init_lpis() always initializes GICv4 on the way out, even if the other RD tables were preallocated. Fixes: 6479450f72c1 ("irqchip/gic-v4: Fix occasional VLPI drop") Reported-by: George Cherian Co-developed-by: Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Oliver Upton Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240219185809.286724-2-oliver.upton@linux.dev --- drivers/irqchip/irq-gic-v3-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 53abd4779914..b822752c4261 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3181,6 +3181,7 @@ static void its_cpu_init_lpis(void) val |= GICR_CTLR_ENABLE_LPIS; writel_relaxed(val, rbase + GICR_CTLR); +out: if (gic_rdists->has_vlpis && !gic_rdists->has_rvpeid) { void __iomem *vlpi_base = gic_data_rdist_vlpi_base(); @@ -3216,7 +3217,6 @@ static void its_cpu_init_lpis(void) /* Make sure the GIC has seen the above */ dsb(sy); -out: gic_data_rdist()->flags |= RD_LOCAL_LPI_ENABLED; pr_info("GICv3: CPU%d: using %s LPI pending table @%pa\n", smp_processor_id(), From b7b2ffc3ca59b06397550f96febe95f3f153eb1e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 15 Feb 2024 07:41:09 +0100 Subject: [PATCH 253/327] docs: translations: use attribute to store current language Akira Yokosawa reported [1] that the "translations" extension we added in commit 7418ec5b151f ("docs: translations: add translations links when they exist") broke the build on Sphinx versions v6.1.3 through 7.1.2 (possibly others) with the following error: Exception occurred: File "/usr/lib/python3.12/site-packages/sphinx/util/nodes.py", line 624, in _copy_except__document newnode = self.__class__(rawsource=self.rawsource, **self.attributes) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ TypeError: LanguagesNode.__init__() missing 1 required positional argument: 'current_language' The full traceback has been saved in /tmp/sphinx-err-7xmwytuu.log, if you want to report the issue to the developers. Solve this problem by making 'current_language' a true element attribute of the LanguagesNode element, which is probably the more correct way to do it anyway. Tested on Sphinx 2.x, 3.x, 6.x, and 7.x. [1]: https://lore.kernel.org/all/54a56c2e-a27c-45a0-b712-02a7bc7d2673@gmail.com/ Fixes: 7418ec5b151f ("docs: translations: add translations links when they exist") Reported-by: Akira Yokosawa Signed-off-by: Vegard Nossum Closes: https://lore.kernel.org/all/54a56c2e-a27c-45a0-b712-02a7bc7d2673@gmail.com/ Tested-by: Akira Yokosawa # Sphinx 4.3.2, 5.3.0 and 6.2.1 Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240215064109.1193556-1-vegard.nossum@oracle.com --- Documentation/sphinx/translations.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Documentation/sphinx/translations.py b/Documentation/sphinx/translations.py index 47161e6eba99..32c2b32b2b5e 100644 --- a/Documentation/sphinx/translations.py +++ b/Documentation/sphinx/translations.py @@ -29,10 +29,7 @@ all_languages = { } class LanguagesNode(nodes.Element): - def __init__(self, current_language, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.current_language = current_language + pass class TranslationsTransform(Transform): default_priority = 900 @@ -49,7 +46,8 @@ class TranslationsTransform(Transform): # normalize docname to be the untranslated one docname = os.path.join(*components[2:]) - new_nodes = LanguagesNode(all_languages[this_lang_code]) + new_nodes = LanguagesNode() + new_nodes['current_language'] = all_languages[this_lang_code] for lang_code, lang_name in all_languages.items(): if lang_code == this_lang_code: @@ -84,7 +82,7 @@ def process_languages(app, doctree, docname): html_content = app.builder.templates.render('translations.html', context={ - 'current_language': node.current_language, + 'current_language': node['current_language'], 'languages': languages, }) From d56e460e19ea8382f813eb489730248ec8d7eb73 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 21 Feb 2024 06:01:20 -0800 Subject: [PATCH 254/327] hwmon: (nct6775) Fix access to temperature configuration registers The number of temperature configuration registers does not always match the total number of temperature registers. This can result in access errors reported if KASAN is enabled. BUG: KASAN: global-out-of-bounds in nct6775_probe+0x5654/0x6fe9 nct6775_core Reported-by: Erhard Furtner Closes: https://lore.kernel.org/linux-hwmon/d51181d1-d26b-42b2-b002-3f5a4037721f@roeck-us.net/ Fixes: b7f1f7b2523a ("hwmon: (nct6775) Additional TEMP registers for nct6799") Cc: Ahmad Khalifa Tested-by: Ahmad Khalifa Signed-off-by: Guenter Roeck --- drivers/hwmon/nct6775-core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/nct6775-core.c b/drivers/hwmon/nct6775-core.c index 8d2ef3145bca..9fbab8f02334 100644 --- a/drivers/hwmon/nct6775-core.c +++ b/drivers/hwmon/nct6775-core.c @@ -3512,6 +3512,7 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, const u16 *reg_temp_mon, *reg_temp_alternate, *reg_temp_crit; const u16 *reg_temp_crit_l = NULL, *reg_temp_crit_h = NULL; int num_reg_temp, num_reg_temp_mon, num_reg_tsi_temp; + int num_reg_temp_config; struct device *hwmon_dev; struct sensor_template_group tsi_temp_tg; @@ -3594,6 +3595,7 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, reg_temp_over = NCT6106_REG_TEMP_OVER; reg_temp_hyst = NCT6106_REG_TEMP_HYST; reg_temp_config = NCT6106_REG_TEMP_CONFIG; + num_reg_temp_config = ARRAY_SIZE(NCT6106_REG_TEMP_CONFIG); reg_temp_alternate = NCT6106_REG_TEMP_ALTERNATE; reg_temp_crit = NCT6106_REG_TEMP_CRIT; reg_temp_crit_l = NCT6106_REG_TEMP_CRIT_L; @@ -3669,6 +3671,7 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, reg_temp_over = NCT6106_REG_TEMP_OVER; reg_temp_hyst = NCT6106_REG_TEMP_HYST; reg_temp_config = NCT6106_REG_TEMP_CONFIG; + num_reg_temp_config = ARRAY_SIZE(NCT6106_REG_TEMP_CONFIG); reg_temp_alternate = NCT6106_REG_TEMP_ALTERNATE; reg_temp_crit = NCT6106_REG_TEMP_CRIT; reg_temp_crit_l = NCT6106_REG_TEMP_CRIT_L; @@ -3746,6 +3749,7 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, reg_temp_over = NCT6775_REG_TEMP_OVER; reg_temp_hyst = NCT6775_REG_TEMP_HYST; reg_temp_config = NCT6775_REG_TEMP_CONFIG; + num_reg_temp_config = ARRAY_SIZE(NCT6775_REG_TEMP_CONFIG); reg_temp_alternate = NCT6775_REG_TEMP_ALTERNATE; reg_temp_crit = NCT6775_REG_TEMP_CRIT; @@ -3821,6 +3825,7 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, reg_temp_over = NCT6775_REG_TEMP_OVER; reg_temp_hyst = NCT6775_REG_TEMP_HYST; reg_temp_config = NCT6776_REG_TEMP_CONFIG; + num_reg_temp_config = ARRAY_SIZE(NCT6776_REG_TEMP_CONFIG); reg_temp_alternate = NCT6776_REG_TEMP_ALTERNATE; reg_temp_crit = NCT6776_REG_TEMP_CRIT; @@ -3900,6 +3905,7 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, reg_temp_over = NCT6779_REG_TEMP_OVER; reg_temp_hyst = NCT6779_REG_TEMP_HYST; reg_temp_config = NCT6779_REG_TEMP_CONFIG; + num_reg_temp_config = ARRAY_SIZE(NCT6779_REG_TEMP_CONFIG); reg_temp_alternate = NCT6779_REG_TEMP_ALTERNATE; reg_temp_crit = NCT6779_REG_TEMP_CRIT; @@ -4034,6 +4040,7 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, reg_temp_over = NCT6779_REG_TEMP_OVER; reg_temp_hyst = NCT6779_REG_TEMP_HYST; reg_temp_config = NCT6779_REG_TEMP_CONFIG; + num_reg_temp_config = ARRAY_SIZE(NCT6779_REG_TEMP_CONFIG); reg_temp_alternate = NCT6779_REG_TEMP_ALTERNATE; reg_temp_crit = NCT6779_REG_TEMP_CRIT; @@ -4123,6 +4130,7 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, reg_temp_over = NCT6798_REG_TEMP_OVER; reg_temp_hyst = NCT6798_REG_TEMP_HYST; reg_temp_config = NCT6779_REG_TEMP_CONFIG; + num_reg_temp_config = ARRAY_SIZE(NCT6779_REG_TEMP_CONFIG); reg_temp_alternate = NCT6798_REG_TEMP_ALTERNATE; reg_temp_crit = NCT6798_REG_TEMP_CRIT; @@ -4204,7 +4212,8 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, = reg_temp_crit[src - 1]; if (reg_temp_crit_l && reg_temp_crit_l[i]) data->reg_temp[4][src - 1] = reg_temp_crit_l[i]; - data->reg_temp_config[src - 1] = reg_temp_config[i]; + if (i < num_reg_temp_config) + data->reg_temp_config[src - 1] = reg_temp_config[i]; data->temp_src[src - 1] = src; continue; } @@ -4217,7 +4226,8 @@ int nct6775_probe(struct device *dev, struct nct6775_data *data, data->reg_temp[0][s] = reg_temp[i]; data->reg_temp[1][s] = reg_temp_over[i]; data->reg_temp[2][s] = reg_temp_hyst[i]; - data->reg_temp_config[s] = reg_temp_config[i]; + if (i < num_reg_temp_config) + data->reg_temp_config[s] = reg_temp_config[i]; if (reg_temp_crit_h && reg_temp_crit_h[i]) data->reg_temp[3][s] = reg_temp_crit_h[i]; else if (reg_temp_crit[src - 1]) From 136cfaca22567a03bbb3bf53a43d8cb5748b80ec Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Wed, 14 Feb 2024 19:27:33 +0300 Subject: [PATCH 255/327] gtp: fix use-after-free and null-ptr-deref in gtp_genl_dump_pdp() The gtp_net_ops pernet operations structure for the subsystem must be registered before registering the generic netlink family. Syzkaller hit 'general protection fault in gtp_genl_dump_pdp' bug: general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 1 PID: 5826 Comm: gtp Not tainted 6.8.0-rc3-std-def-alt1 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-alt1 04/01/2014 RIP: 0010:gtp_genl_dump_pdp+0x1be/0x800 [gtp] Code: c6 89 c6 e8 64 e9 86 df 58 45 85 f6 0f 85 4e 04 00 00 e8 c5 ee 86 df 48 8b 54 24 18 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 de 05 00 00 48 8b 44 24 18 4c 8b 30 4c 39 f0 74 RSP: 0018:ffff888014107220 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000002 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff88800fcda588 R14: 0000000000000001 R15: 0000000000000000 FS: 00007f1be4eb05c0(0000) GS:ffff88806ce80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1be4e766cf CR3: 000000000c33e000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: ? show_regs+0x90/0xa0 ? die_addr+0x50/0xd0 ? exc_general_protection+0x148/0x220 ? asm_exc_general_protection+0x22/0x30 ? gtp_genl_dump_pdp+0x1be/0x800 [gtp] ? __alloc_skb+0x1dd/0x350 ? __pfx___alloc_skb+0x10/0x10 genl_dumpit+0x11d/0x230 netlink_dump+0x5b9/0xce0 ? lockdep_hardirqs_on_prepare+0x253/0x430 ? __pfx_netlink_dump+0x10/0x10 ? kasan_save_track+0x10/0x40 ? __kasan_kmalloc+0x9b/0xa0 ? genl_start+0x675/0x970 __netlink_dump_start+0x6fc/0x9f0 genl_family_rcv_msg_dumpit+0x1bb/0x2d0 ? __pfx_genl_family_rcv_msg_dumpit+0x10/0x10 ? genl_op_from_small+0x2a/0x440 ? cap_capable+0x1d0/0x240 ? __pfx_genl_start+0x10/0x10 ? __pfx_genl_dumpit+0x10/0x10 ? __pfx_genl_done+0x10/0x10 ? security_capable+0x9d/0xe0 Cc: stable@vger.kernel.org Signed-off-by: Vasiliy Kovalev Fixes: 459aa660eb1d ("gtp: add initial driver for datapath of GPRS Tunneling Protocol (GTP-U)") Link: https://lore.kernel.org/r/20240214162733.34214-1-kovalev@altlinux.org Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index b1919278e931..2129ae42c703 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1907,20 +1907,20 @@ static int __init gtp_init(void) if (err < 0) goto error_out; - err = genl_register_family(>p_genl_family); + err = register_pernet_subsys(>p_net_ops); if (err < 0) goto unreg_rtnl_link; - err = register_pernet_subsys(>p_net_ops); + err = genl_register_family(>p_genl_family); if (err < 0) - goto unreg_genl_family; + goto unreg_pernet_subsys; pr_info("GTP module loaded (pdp ctx size %zd bytes)\n", sizeof(struct pdp_ctx)); return 0; -unreg_genl_family: - genl_unregister_family(>p_genl_family); +unreg_pernet_subsys: + unregister_pernet_subsys(>p_net_ops); unreg_rtnl_link: rtnl_link_unregister(>p_link_ops); error_out: From 10f41d0710fc81b7af93fa6106678d57b1ff24a7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 15 Feb 2024 17:17:29 +0100 Subject: [PATCH 256/327] tls: break out of main loop when PEEK gets a non-data record PEEK needs to leave decrypted records on the rx_list so that we can receive them later on, so it jumps back into the async code that queues the skb. Unfortunately that makes us skip the TLS_RECORD_TYPE_DATA check at the bottom of the main loop, so if two records of the same (non-DATA) type are queued, we end up merging them. Add the same record type check, and make it unlikely to not penalize the async fastpath. Async decrypt only applies to data record, so this check is only needed for PEEK. process_rx_list also has similar issues. Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/3df2eef4fdae720c55e69472b5bea668772b45a2.1708007371.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9fbc70200cd0..78aedfc682ba 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2064,6 +2064,8 @@ put_on_rx_list: decrypted += chunk; len -= chunk; __skb_queue_tail(&ctx->rx_list, skb); + if (unlikely(control != TLS_RECORD_TYPE_DATA)) + break; continue; } From fdfbaec5923d9359698cbb286bc0deadbb717504 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 15 Feb 2024 17:17:30 +0100 Subject: [PATCH 257/327] tls: stop recv() if initial process_rx_list gave us non-DATA If we have a non-DATA record on the rx_list and another record of the same type still on the queue, we will end up merging them: - process_rx_list copies the non-DATA record - we start the loop and process the first available record since it's of the same type - we break out of the loop since the record was not DATA Just check the record type and jump to the end in case process_rx_list did some work. Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/bd31449e43bd4b6ff546f5c51cf958c31c511deb.1708007371.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 78aedfc682ba..43dd0d82b6ed 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1971,7 +1971,7 @@ int tls_sw_recvmsg(struct sock *sk, goto end; copied = err; - if (len <= copied) + if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA)) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); From ec823bf3a479d42c589dc0f28ef4951c49cd2d2a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 15 Feb 2024 17:17:31 +0100 Subject: [PATCH 258/327] tls: don't skip over different type records from the rx_list If we queue 3 records: - record 1, type DATA - record 2, some other type - record 3, type DATA and do a recv(PEEK), the rx_list will contain the first two records. The next large recv will walk through the rx_list and copy data from record 1, then stop because record 2 is a different type. Since we haven't filled up our buffer, we will process the next available record. It's also DATA, so we can merge it with the current read. We shouldn't do that, since there was a record in between that we ignored. Add a flag to let process_rx_list inform tls_sw_recvmsg that it had more data available. Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/f00c0c0afa080c60f016df1471158c1caf983c34.1708007371.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 43dd0d82b6ed..de96959336c4 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1772,7 +1772,8 @@ static int process_rx_list(struct tls_sw_context_rx *ctx, u8 *control, size_t skip, size_t len, - bool is_peek) + bool is_peek, + bool *more) { struct sk_buff *skb = skb_peek(&ctx->rx_list); struct tls_msg *tlm; @@ -1785,7 +1786,7 @@ static int process_rx_list(struct tls_sw_context_rx *ctx, err = tls_record_content_type(msg, tlm, control); if (err <= 0) - goto out; + goto more; if (skip < rxm->full_len) break; @@ -1803,12 +1804,12 @@ static int process_rx_list(struct tls_sw_context_rx *ctx, err = tls_record_content_type(msg, tlm, control); if (err <= 0) - goto out; + goto more; err = skb_copy_datagram_msg(skb, rxm->offset + skip, msg, chunk); if (err < 0) - goto out; + goto more; len = len - chunk; copied = copied + chunk; @@ -1844,6 +1845,10 @@ static int process_rx_list(struct tls_sw_context_rx *ctx, out: return copied ? : err; +more: + if (more) + *more = true; + goto out; } static bool @@ -1947,6 +1952,7 @@ int tls_sw_recvmsg(struct sock *sk, int target, err; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; + bool rx_more = false; bool released = true; bool bpf_strp_enabled; bool zc_capable; @@ -1966,12 +1972,12 @@ int tls_sw_recvmsg(struct sock *sk, goto end; /* Process pending decrypted records. It must be non-zero-copy */ - err = process_rx_list(ctx, msg, &control, 0, len, is_peek); + err = process_rx_list(ctx, msg, &control, 0, len, is_peek, &rx_more); if (err < 0) goto end; copied = err; - if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA)) + if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA) || rx_more) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); @@ -2130,10 +2136,10 @@ recv_end: /* Drain records from the rx_list & copy if required */ if (is_peek || is_kvec) err = process_rx_list(ctx, msg, &control, copied, - decrypted, is_peek); + decrypted, is_peek, NULL); else err = process_rx_list(ctx, msg, &control, 0, - async_copy_bytes, is_peek); + async_copy_bytes, is_peek, NULL); } copied += decrypted; From 7b2a4c2a623a9f9c5fd9cff499bb9457fa2192ec Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 15 Feb 2024 17:17:32 +0100 Subject: [PATCH 259/327] selftests: tls: add test for merging of same-type control messages Two consecutive control messages of the same type should never be merged into one large received blob of data. Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/018f1633d5471684c65def5fe390de3b15c3d683.1708007371.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 49c84602707f..2714c230a0f9 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -1485,6 +1485,32 @@ TEST_F(tls, control_msg) EXPECT_EQ(memcmp(buf, test_str, send_len), 0); } +TEST_F(tls, control_msg_nomerge) +{ + char *rec1 = "1111"; + char *rec2 = "2222"; + int send_len = 5; + char buf[15]; + + if (self->notls) + SKIP(return, "no TLS support"); + + EXPECT_EQ(tls_send_cmsg(self->fd, 100, rec1, send_len, 0), send_len); + EXPECT_EQ(tls_send_cmsg(self->fd, 100, rec2, send_len, 0), send_len); + + EXPECT_EQ(tls_recv_cmsg(_metadata, self->cfd, 100, buf, sizeof(buf), MSG_PEEK), send_len); + EXPECT_EQ(memcmp(buf, rec1, send_len), 0); + + EXPECT_EQ(tls_recv_cmsg(_metadata, self->cfd, 100, buf, sizeof(buf), MSG_PEEK), send_len); + EXPECT_EQ(memcmp(buf, rec1, send_len), 0); + + EXPECT_EQ(tls_recv_cmsg(_metadata, self->cfd, 100, buf, sizeof(buf), 0), send_len); + EXPECT_EQ(memcmp(buf, rec1, send_len), 0); + + EXPECT_EQ(tls_recv_cmsg(_metadata, self->cfd, 100, buf, sizeof(buf), 0), send_len); + EXPECT_EQ(memcmp(buf, rec2, send_len), 0); +} + TEST_F(tls, shutdown) { char const *test_str = "test_read"; From 2bf6172632e1d4a6ec7ee7e734d53a43a145f5c6 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 15 Feb 2024 17:17:33 +0100 Subject: [PATCH 260/327] selftests: tls: add test for peeking past a record of a different type If we queue 3 records: - record 1, type DATA - record 2, some other type - record 3, type DATA the current code can look past the 2nd record and merge the 2 data records. Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/4623550f8617c239581030c13402d3262f2bd14f.1708007371.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 2714c230a0f9..b95c249f81c2 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -1511,6 +1511,25 @@ TEST_F(tls, control_msg_nomerge) EXPECT_EQ(memcmp(buf, rec2, send_len), 0); } +TEST_F(tls, data_control_data) +{ + char *rec1 = "1111"; + char *rec2 = "2222"; + char *rec3 = "3333"; + int send_len = 5; + char buf[15]; + + if (self->notls) + SKIP(return, "no TLS support"); + + EXPECT_EQ(send(self->fd, rec1, send_len, 0), send_len); + EXPECT_EQ(tls_send_cmsg(self->fd, 100, rec2, send_len, 0), send_len); + EXPECT_EQ(send(self->fd, rec3, send_len, 0), send_len); + + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len); + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len); +} + TEST_F(tls, shutdown) { char const *test_str = "test_read"; From bccebf64701735533c8db37773eeacc6566cc8ec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 19 Feb 2024 16:58:04 +0100 Subject: [PATCH 261/327] netfilter: nf_tables: set dormant flag on hook register failure We need to set the dormant flag again if we fail to register the hooks. During memory pressure hook registration can fail and we end up with a table marked as active but no registered hooks. On table/base chain deletion, nf_tables will attempt to unregister the hook again which yields a warn splat from the nftables core. Reported-and-tested-by: syzbot+de4025c006ec68ac56fc@syzkaller.appspotmail.com Fixes: 179d9ba5559a ("netfilter: nf_tables: fix table flag updates") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f8e3f70c35bd..90038d778f37 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1251,6 +1251,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) return 0; err_register_hooks: + ctx->table->flags |= NFT_TABLE_F_DORMANT; nft_trans_destroy(trans); return ret; } From 9e0f0430389be7696396c62f037be4bf72cf93e3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 21 Feb 2024 12:32:58 +0100 Subject: [PATCH 262/327] netfilter: nft_flow_offload: reset dst in route object after setting up flow dst is transferred to the flow object, route object does not own it anymore. Reset dst in route object, otherwise if flow_offload_add() fails, error path releases dst twice, leading to a refcount underflow. Fixes: a3c90f7a2323 ("netfilter: nf_tables: flow offload expression") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 +- net/netfilter/nf_flow_table_core.c | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 956c752ceb31..a763dd327c6e 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -276,7 +276,7 @@ nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, } void flow_offload_route_init(struct flow_offload *flow, - const struct nf_flow_route *route); + struct nf_flow_route *route); int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); void flow_offload_refresh(struct nf_flowtable *flow_table, diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 920a5a29ae1d..7502d6d73a60 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -87,12 +87,22 @@ static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) return 0; } +static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct dst_entry *dst = route->tuple[dir].dst; + + route->tuple[dir].dst = NULL; + + return dst; +} + static int flow_offload_fill_route(struct flow_offload *flow, - const struct nf_flow_route *route, + struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; - struct dst_entry *dst = route->tuple[dir].dst; + struct dst_entry *dst = nft_route_dst_fetch(route, dir); int i, j = 0; switch (flow_tuple->l3proto) { @@ -146,7 +156,7 @@ static void nft_flow_dst_release(struct flow_offload *flow, } void flow_offload_route_init(struct flow_offload *flow, - const struct nf_flow_route *route) + struct nf_flow_route *route) { flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); From 8762785f459be1cfe6fcf7285c123aad6a3703f0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 20 Feb 2024 21:36:39 +0100 Subject: [PATCH 263/327] netfilter: nft_flow_offload: release dst in case direct xmit path is used Direct xmit does not use it since it calls dev_queue_xmit() to send packets, hence it calls dst_release(). kmemleak reports: unreferenced object 0xffff88814f440900 (size 184): comm "softirq", pid 0, jiffies 4294951896 hex dump (first 32 bytes): 00 60 5b 04 81 88 ff ff 00 e6 e8 82 ff ff ff ff .`[............. 21 0b 50 82 ff ff ff ff 00 00 00 00 00 00 00 00 !.P............. backtrace (crc cb2bf5d6): [<000000003ee17107>] kmem_cache_alloc+0x286/0x340 [<0000000021a5de2c>] dst_alloc+0x43/0xb0 [<00000000f0671159>] rt_dst_alloc+0x2e/0x190 [<00000000fe5092c9>] __mkroute_output+0x244/0x980 [<000000005fb96fb0>] ip_route_output_flow+0xc0/0x160 [<0000000045367433>] nf_ip_route+0xf/0x30 [<0000000085da1d8e>] nf_route+0x2d/0x60 [<00000000d1ecd1cb>] nft_flow_route+0x171/0x6a0 [nft_flow_offload] [<00000000d9b2fb60>] nft_flow_offload_eval+0x4e8/0x700 [nft_flow_offload] [<000000009f447dbb>] expr_call_ops_eval+0x53/0x330 [nf_tables] [<00000000072e1be6>] nft_do_chain+0x17c/0x840 [nf_tables] [<00000000d0551029>] nft_do_chain_inet+0xa1/0x210 [nf_tables] [<0000000097c9d5c6>] nf_hook_slow+0x5b/0x160 [<0000000005eccab1>] ip_forward+0x8b6/0x9b0 [<00000000553a269b>] ip_rcv+0x221/0x230 [<00000000412872e5>] __netif_receive_skb_one_core+0xfe/0x110 Fixes: fa502c865666 ("netfilter: flowtable: simplify route logic") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 7502d6d73a60..a0571339239c 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -132,6 +132,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; + dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: From d472e9853d7b46a6b094224d131d09ccd3a03daf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Feb 2024 19:43:53 +0100 Subject: [PATCH 264/327] netfilter: nf_tables: register hooks last when adding new chain/flowtable Register hooks last when adding chain/flowtable to ensure that packets do not walk over datastructure that is being released in the error path without waiting for the rcu grace period. Fixes: 91c7b38dc9f0 ("netfilter: nf_tables: use new transaction infrastructure to handle chain") Fixes: 3b49e2e94e6e ("netfilter: nf_tables: add flow table netlink frontend") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 78 ++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 90038d778f37..485e047d5f98 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -684,15 +684,16 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) return err; } -static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, - struct nft_flowtable *flowtable) +static struct nft_trans * +nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, + struct nft_flowtable *flowtable) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_flowtable)); if (trans == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); if (msg_type == NFT_MSG_NEWFLOWTABLE) nft_activate_next(ctx->net, flowtable); @@ -701,22 +702,22 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, nft_trans_flowtable(trans) = flowtable; nft_trans_commit_list_add_tail(ctx->net, trans); - return 0; + return trans; } static int nft_delflowtable(struct nft_ctx *ctx, struct nft_flowtable *flowtable) { - int err; + struct nft_trans *trans; - err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); - if (err < 0) - return err; + trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); + if (IS_ERR(trans)) + return PTR_ERR(trans); nft_deactivate_next(ctx->net, flowtable); nft_use_dec(&ctx->table->use); - return err; + return 0; } static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg) @@ -2504,19 +2505,15 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, RCU_INIT_POINTER(chain->blob_gen_0, blob); RCU_INIT_POINTER(chain->blob_gen_1, blob); - err = nf_tables_register_hook(net, table, chain); - if (err < 0) - goto err_destroy_chain; - if (!nft_use_inc(&table->use)) { err = -EMFILE; - goto err_use; + goto err_destroy_chain; } trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto err_unregister_hook; + goto err_trans; } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; @@ -2524,17 +2521,22 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, nft_trans_chain_policy(trans) = policy; err = nft_chain_add(table, chain); - if (err < 0) { - nft_trans_destroy(trans); - goto err_unregister_hook; - } + if (err < 0) + goto err_chain_add; + + /* This must be LAST to ensure no packets are walking over this chain. */ + err = nf_tables_register_hook(net, table, chain); + if (err < 0) + goto err_register_hook; return 0; -err_unregister_hook: +err_register_hook: + nft_chain_del(chain); +err_chain_add: + nft_trans_destroy(trans); +err_trans: nft_use_dec_restore(&table->use); -err_use: - nf_tables_unregister_hook(net, table, chain); err_destroy_chain: nf_tables_chain_destroy(ctx); @@ -8456,9 +8458,9 @@ static int nf_tables_newflowtable(struct sk_buff *skb, u8 family = info->nfmsg->nfgen_family; const struct nf_flowtable_type *type; struct nft_flowtable *flowtable; - struct nft_hook *hook, *next; struct net *net = info->net; struct nft_table *table; + struct nft_trans *trans; struct nft_ctx ctx; int err; @@ -8538,34 +8540,34 @@ static int nf_tables_newflowtable(struct sk_buff *skb, err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable, extack, true); if (err < 0) - goto err4; + goto err_flowtable_parse_hooks; list_splice(&flowtable_hook.list, &flowtable->hook_list); flowtable->data.priority = flowtable_hook.priority; flowtable->hooknum = flowtable_hook.num; + trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto err_flowtable_trans; + } + + /* This must be LAST to ensure no packets are walking over this flowtable. */ err = nft_register_flowtable_net_hooks(ctx.net, table, &flowtable->hook_list, flowtable); - if (err < 0) { - nft_hooks_destroy(&flowtable->hook_list); - goto err4; - } - - err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err5; + goto err_flowtable_hooks; list_add_tail_rcu(&flowtable->list, &table->flowtables); return 0; -err5: - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - nft_unregister_flowtable_hook(net, flowtable, hook); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); - } -err4: + +err_flowtable_hooks: + nft_trans_destroy(trans); +err_flowtable_trans: + nft_hooks_destroy(&flowtable->hook_list); +err_flowtable_parse_hooks: flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); From 195e5f88c2e48330ba5483e0bad2de3b3fad484f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Feb 2024 18:38:45 +0100 Subject: [PATCH 265/327] netfilter: nf_tables: use kzalloc for hook allocation KMSAN reports unitialized variable when registering the hook, reg->hook_ops_type == NF_HOOK_OP_BPF) ~~~~~~~~~~~ undefined This is a small structure, just use kzalloc to make sure this won't happen again when new fields get added to nf_hook_ops. Fixes: 7b4b2fa37587 ("netfilter: annotate nf_tables base hook ops") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 485e047d5f98..7e938c7397dd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2082,7 +2082,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, struct nft_hook *hook; int err; - hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); + hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); if (!hook) { err = -ENOMEM; goto err_hook_alloc; From 9990889be14288d4f1743e4768222d5032a79c27 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 15 Feb 2024 15:53:08 +0800 Subject: [PATCH 266/327] net: mctp: put sock on tag allocation failure We may hold an extra reference on a socket if a tag allocation fails: we optimistically allocate the sk_key, and take a ref there, but do not drop if we end up not using the allocated key. Ensure we're dropping the sock on this failure by doing a proper unref rather than directly kfree()ing. Fixes: de8a6b15d965 ("net: mctp: add an explicit reference from a mctp_sk_key to sock") Signed-off-by: Jeremy Kerr Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/ce9b61e44d1cdae7797be0c5e3141baf582d23a0.1707983487.git.jk@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mctp/route.c b/net/mctp/route.c index 7a47a58aa54b..6218dcd07e18 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -663,7 +663,7 @@ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, spin_unlock_irqrestore(&mns->keys_lock, flags); if (!tagbits) { - kfree(key); + mctp_key_unref(key); return ERR_PTR(-EBUSY); } From e4fe082c38cd74a8fa384bc7542cf3edf1cb7318 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Feb 2024 08:11:11 -0800 Subject: [PATCH 267/327] tools: ynl: make sure we always pass yarg to mnl_cb_run There is one common error handler in ynl - ynl_cb_error(). It expects priv to be a pointer to struct ynl_parse_arg AKA yarg. To avoid potential crashes if we encounter a stray NLMSG_ERROR always pass yarg as priv (or a struct which has it as the first member). ynl_cb_null() has a similar problem directly - it expects yarg but priv passed by the caller is ys. Found by code inspection. Fixes: 86878f14d71a ("tools: ynl: user space helpers") Acked-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20240220161112.2735195-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index c82a7f41b31c..9e41c8c0cc99 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -466,6 +466,8 @@ ynl_gemsg_start_dump(struct ynl_sock *ys, __u32 id, __u8 cmd, __u8 version) int ynl_recv_ack(struct ynl_sock *ys, int ret) { + struct ynl_parse_arg yarg = { .ys = ys, }; + if (!ret) { yerr(ys, YNL_ERROR_EXPECT_ACK, "Expecting an ACK but nothing received"); @@ -478,7 +480,7 @@ int ynl_recv_ack(struct ynl_sock *ys, int ret) return ret; } return mnl_cb_run(ys->rx_buf, ret, ys->seq, ys->portid, - ynl_cb_null, ys); + ynl_cb_null, &yarg); } int ynl_cb_null(const struct nlmsghdr *nlh, void *data) @@ -741,11 +743,14 @@ err_free: static int ynl_ntf_trampoline(const struct nlmsghdr *nlh, void *data) { - return ynl_ntf_parse((struct ynl_sock *)data, nlh); + struct ynl_parse_arg *yarg = data; + + return ynl_ntf_parse(yarg->ys, nlh); } int ynl_ntf_check(struct ynl_sock *ys) { + struct ynl_parse_arg yarg = { .ys = ys, }; ssize_t len; int err; @@ -767,7 +772,7 @@ int ynl_ntf_check(struct ynl_sock *ys) return len; err = mnl_cb_run2(ys->rx_buf, len, ys->seq, ys->portid, - ynl_ntf_trampoline, ys, + ynl_ntf_trampoline, &yarg, ynl_cb_array, NLMSG_MIN_TYPE); if (err < 0) return err; From 5d78b73e851455d525a064f3b042b29fdc0c1a4a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Feb 2024 08:11:12 -0800 Subject: [PATCH 268/327] tools: ynl: don't leak mcast_groups on init error Make sure to free the already-parsed mcast_groups if we don't get an ack from the kernel when reading family info. This is part of the ynl_sock_create() error path, so we won't get a call to ynl_sock_destroy() to free them later. Fixes: 86878f14d71a ("tools: ynl: user space helpers") Acked-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20240220161112.2735195-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index 9e41c8c0cc99..6e6d474c8366 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -588,7 +588,13 @@ static int ynl_sock_read_family(struct ynl_sock *ys, const char *family_name) return err; } - return ynl_recv_ack(ys, err); + err = ynl_recv_ack(ys, err); + if (err < 0) { + free(ys->mcast_groups); + return err; + } + + return 0; } struct ynl_sock * From 90d07e36d40079a22ca99edd2412e7e9c2382968 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 20 Feb 2024 09:22:46 +0100 Subject: [PATCH 269/327] net: stmmac: Fix EST offset for dwmac 5.10 Fix EST offset for dwmac 5.10. Currently configuring Qbv doesn't work as expected. The schedule is configured, but never confirmed: |[ 128.250219] imx-dwmac 428a0000.ethernet eth1: configured EST The reason seems to be the refactoring of the EST code which set the wrong EST offset for the dwmac 5.10. After fixing this it works as before: |[ 106.359577] imx-dwmac 428a0000.ethernet eth1: configured EST |[ 128.430715] imx-dwmac 428a0000.ethernet eth1: EST: SWOL has been switched Tested on imx93. Fixes: c3f3b97238f6 ("net: stmmac: Refactor EST implementation") Signed-off-by: Kurt Kanzenbach Reviewed-by: Serge Semin Link: https://lore.kernel.org/r/20240220-stmmac_est-v1-1-c41f9ae2e7b7@linutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/hwif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index 1bd34b2a47e8..29367105df54 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -224,7 +224,7 @@ static const struct stmmac_hwif_entry { .regs = { .ptp_off = PTP_GMAC4_OFFSET, .mmc_off = MMC_GMAC4_OFFSET, - .est_off = EST_XGMAC_OFFSET, + .est_off = EST_GMAC4_OFFSET, }, .desc = &dwmac4_desc_ops, .dma = &dwmac410_dma_ops, From 61c43780e9444123410cd48c2483e01d2b8f75e8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 20 Feb 2024 08:52:45 +0100 Subject: [PATCH 270/327] devlink: fix port dump cmd type Unlike other commands, due to a c&p error, port dump fills-up cmd with wrong value, different from port-get request cmd, port-get doit reply and port notification. Fix it by filling cmd with value DEVLINK_CMD_PORT_NEW. Skimmed through devlink userspace implementations, none of them cares about this cmd value. Only ynl, for which, this is actually a fix, as it expects doit and dumpit ops rsp_value to be the same. Omit the fixes tag, even thought this is fix, better to target this for next release. Fixes: bfcd3a466172 ("Introduce devlink infrastructure") Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240220075245.75416-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/devlink/port.c b/net/devlink/port.c index 78592912f657..4b2d46ccfe48 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -583,7 +583,7 @@ devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { err = devlink_nl_port_fill(msg, devlink_port, - DEVLINK_CMD_NEW, + DEVLINK_CMD_PORT_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); From 1fde0ca3a0de7e9f917668941156959dd5e9108b Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 20 Feb 2024 08:59:28 +0000 Subject: [PATCH 271/327] net/sched: flower: Add lock protection when remove filter handle As IDR can't protect itself from the concurrent modification, place idr_remove() under the protection of tp->lock. Fixes: 08a0063df3ae ("net/sched: flower: Move filter handle initialization earlier") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Gal Pressman Reviewed-by: Jiri Pirko Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240220085928.9161-1-jianbol@nvidia.com Signed-off-by: Jakub Kicinski --- net/sched/cls_flower.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index efb9d2811b73..6ee7064c82fc 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2460,8 +2460,11 @@ unbind_filter: } errout_idr: - if (!fold) + if (!fold) { + spin_lock(&tp->lock); idr_remove(&head->handle_idr, fnew->handle); + spin_unlock(&tp->lock); + } __fl_put(fnew); errout_tb: kfree(tb); From 603ead96582d85903baec2d55f021b8dac5c25d2 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 19 Feb 2024 09:00:43 +0100 Subject: [PATCH 272/327] net: sparx5: Add spinlock for frame transmission from CPU Both registers used when doing manual injection or fdma injection are shared between all the net devices of the switch. It was noticed that when having two process which each of them trying to inject frames on different ethernet ports, that the HW started to behave strange, by sending out more frames then expected. When doing fdma injection it is required to set the frame in the DCB and then make sure that the next pointer of the last DCB is invalid. But because there is no locks for this, then easily this pointer between the DCB can be broken and then it would create a loop of DCBs. And that means that the HW will continuously transmit these frames in a loop. Until the SW will break this loop. Therefore to fix this issue, add a spin lock for when accessing the registers for manual or fdma injection. Signed-off-by: Horatiu Vultur Reviewed-by: Daniel Machon Fixes: f3cad2611a77 ("net: sparx5: add hostmode with phylink support") Link: https://lore.kernel.org/r/20240219080043.1561014-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_main.c | 1 + drivers/net/ethernet/microchip/sparx5/sparx5_main.h | 1 + drivers/net/ethernet/microchip/sparx5/sparx5_packet.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c index d1f7fc8b1b71..3c066b62e689 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c @@ -757,6 +757,7 @@ static int mchp_sparx5_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sparx5); sparx5->pdev = pdev; sparx5->dev = &pdev->dev; + spin_lock_init(&sparx5->tx_lock); /* Do switch core reset if available */ reset = devm_reset_control_get_optional_shared(&pdev->dev, "switch"); diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h index 6f565c0c0c3d..316fed5f2735 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h @@ -280,6 +280,7 @@ struct sparx5 { int xtr_irq; /* Frame DMA */ int fdma_irq; + spinlock_t tx_lock; /* lock for frame transmission */ struct sparx5_rx rx; struct sparx5_tx tx; /* PTP */ diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c index 6db6ac6a3bbc..ac7e1cffbcec 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c @@ -244,10 +244,12 @@ netdev_tx_t sparx5_port_xmit_impl(struct sk_buff *skb, struct net_device *dev) } skb_tx_timestamp(skb); + spin_lock(&sparx5->tx_lock); if (sparx5->fdma_irq > 0) ret = sparx5_fdma_xmit(sparx5, ifh, skb); else ret = sparx5_inject(sparx5, ifh, skb, dev); + spin_unlock(&sparx5->tx_lock); if (ret == -EBUSY) goto busy; From 56ee7db31187dc36d501622cb5f1415e88e01c2a Mon Sep 17 00:00:00 2001 From: Sandeep Dhavale Date: Wed, 21 Feb 2024 13:03:47 -0800 Subject: [PATCH 273/327] erofs: fix refcount on the metabuf used for inode lookup In erofs_find_target_block() when erofs_dirnamecmp() returns 0, we do not assign the target metabuf. This causes the caller erofs_namei()'s erofs_put_metabuf() at the end to be not effective leaving the refcount on the page. As the page from metabuf (buf->page) is never put, such page cannot be migrated or reclaimed. Fix it now by putting the metabuf from previous loop and assigning the current metabuf to target before returning so caller erofs_namei() can do the final put as it was intended. Fixes: 500edd095648 ("erofs: use meta buffers for inode lookup") Cc: # 5.18+ Signed-off-by: Sandeep Dhavale Reviewed-by: Gao Xiang Reviewed-by: Jingbo Xu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20240221210348.3667795-1-dhavale@google.com Signed-off-by: Gao Xiang --- fs/erofs/namei.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index d4f631d39f0f..f0110a78acb2 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -130,24 +130,24 @@ static void *erofs_find_target_block(struct erofs_buf *target, /* string comparison without already matched prefix */ diff = erofs_dirnamecmp(name, &dname, &matched); - if (!diff) { - *_ndirents = 0; - goto out; - } else if (diff > 0) { - head = mid + 1; - startprfx = matched; - - if (!IS_ERR(candidate)) - erofs_put_metabuf(target); - *target = buf; - candidate = de; - *_ndirents = ndirents; - } else { + if (diff < 0) { erofs_put_metabuf(&buf); - back = mid - 1; endprfx = matched; + continue; } + + if (!IS_ERR(candidate)) + erofs_put_metabuf(target); + *target = buf; + if (!diff) { + *_ndirents = 0; + return de; + } + head = mid + 1; + startprfx = matched; + candidate = de; + *_ndirents = ndirents; continue; } out: /* free if the candidate is valid */ From 3b2d9bc4d4acdf15a876eae2c0d83149250e85ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Sun, 18 Feb 2024 10:12:13 +0200 Subject: [PATCH 274/327] phonet: take correct lock to peek at the RX queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The receive queue is protected by its embedded spin-lock, not the socket lock, so we need the former lock here (and only that one). Fixes: 107d0d9b8d9a ("Phonet: Phonet datagram transport protocol") Reported-by: Luosili Signed-off-by: Rémi Denis-Courmont Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240218081214.4806-1-remi@remlab.net Signed-off-by: Paolo Abeni --- net/phonet/datagram.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 3aa50dc7535b..976fe250b509 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -34,10 +34,10 @@ static int pn_ioctl(struct sock *sk, int cmd, int *karg) switch (cmd) { case SIOCINQ: - lock_sock(sk); + spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); *karg = skb ? skb->len : 0; - release_sock(sk); + spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; case SIOCPNADDRESOURCE: From 7d2a894d7f487dcb894df023e9d3014cf5b93fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Sun, 18 Feb 2024 10:12:14 +0200 Subject: [PATCH 275/327] phonet/pep: fix racy skb_queue_empty() use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The receive queues are protected by their respective spin-lock, not the socket lock. This could lead to skb_peek() unexpectedly returning NULL or a pointer to an already dequeued socket buffer. Fixes: 9641458d3ec4 ("Phonet: Pipe End Point for Phonet Pipes protocol") Signed-off-by: Rémi Denis-Courmont Link: https://lore.kernel.org/r/20240218081214.4806-2-remi@remlab.net Signed-off-by: Paolo Abeni --- net/phonet/pep.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/net/phonet/pep.c b/net/phonet/pep.c index faba31f2eff2..3dd5f52bc1b5 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -917,6 +917,37 @@ static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len) return 0; } +static unsigned int pep_first_packet_length(struct sock *sk) +{ + struct pep_sock *pn = pep_sk(sk); + struct sk_buff_head *q; + struct sk_buff *skb; + unsigned int len = 0; + bool found = false; + + if (sock_flag(sk, SOCK_URGINLINE)) { + q = &pn->ctrlreq_queue; + spin_lock_bh(&q->lock); + skb = skb_peek(q); + if (skb) { + len = skb->len; + found = true; + } + spin_unlock_bh(&q->lock); + } + + if (likely(!found)) { + q = &sk->sk_receive_queue; + spin_lock_bh(&q->lock); + skb = skb_peek(q); + if (skb) + len = skb->len; + spin_unlock_bh(&q->lock); + } + + return len; +} + static int pep_ioctl(struct sock *sk, int cmd, int *karg) { struct pep_sock *pn = pep_sk(sk); @@ -929,15 +960,7 @@ static int pep_ioctl(struct sock *sk, int cmd, int *karg) break; } - lock_sock(sk); - if (sock_flag(sk, SOCK_URGINLINE) && - !skb_queue_empty(&pn->ctrlreq_queue)) - *karg = skb_peek(&pn->ctrlreq_queue)->len; - else if (!skb_queue_empty(&sk->sk_receive_queue)) - *karg = skb_peek(&sk->sk_receive_queue)->len; - else - *karg = 0; - release_sock(sk); + *karg = pep_first_packet_length(sk); ret = 0; break; From f198d933c2e4f8f89e0620fbaf1ea7eac384a0eb Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Mon, 19 Feb 2024 14:52:54 +0100 Subject: [PATCH 276/327] Fix write to cloned skb in ipv6_hop_ioam() ioam6_fill_trace_data() writes inside the skb payload without ensuring it's writeable (e.g., not cloned). This function is called both from the input and output path. The output path (ioam6_iptunnel) already does the check. This commit provides a fix for the input path, inside ipv6_hop_ioam(). It also updates ip6_parse_tlv() to refresh the network header pointer ("nh") when returning from ipv6_hop_ioam(). Fixes: 9ee11f0fff20 ("ipv6: ioam: Data plane support for Pre-allocated Trace") Reported-by: Paolo Abeni Signed-off-by: Justin Iurman Signed-off-by: Paolo Abeni --- net/ipv6/exthdrs.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 4952ae792450..02e9ffb63af1 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -177,6 +177,8 @@ static bool ip6_parse_tlv(bool hopbyhop, case IPV6_TLV_IOAM: if (!ipv6_hop_ioam(skb, off)) return false; + + nh = skb_network_header(skb); break; case IPV6_TLV_JUMBO: if (!ipv6_hop_jumbo(skb, off)) @@ -943,6 +945,14 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) if (!skb_valid_dst(skb)) ip6_route_input(skb); + /* About to mangle packet header */ + if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len)) + goto drop; + + /* Trace pointer may have changed */ + trace = (struct ioam6_trace_hdr *)(skb_network_header(skb) + + optoff + sizeof(*hdr)); + ioam6_fill_trace_data(skb, ns, trace, true); break; default: From 187bbb6968af5400e436c193765c5d845a07364f Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Mon, 19 Feb 2024 14:52:55 +0100 Subject: [PATCH 277/327] selftests: ioam: refactoring to align with the fix ioam6_parser uses a packet socket. After the fix to prevent writing to cloned skb's, the receiver does not see its IOAM data anymore, which makes input/forward ioam-selftests to fail. As a workaround, ioam6_parser now uses an IPv6 raw socket and leverages ancillary data to get hop-by-hop options. As a consequence, the hook is "after" the IOAM data insertion by the receiver and all tests are working again. Signed-off-by: Justin Iurman Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/ioam6.sh | 36 ++++---- tools/testing/selftests/net/ioam6_parser.c | 95 +++++++++++----------- 2 files changed, 65 insertions(+), 66 deletions(-) diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh index fe59ca3e5596..12491850ae98 100755 --- a/tools/testing/selftests/net/ioam6.sh +++ b/tools/testing/selftests/net/ioam6.sh @@ -367,14 +367,12 @@ run_test() local desc=$2 local node_src=$3 local node_dst=$4 - local ip6_src=$5 - local ip6_dst=$6 - local if_dst=$7 - local trace_type=$8 - local ioam_ns=$9 + local ip6_dst=$5 + local trace_type=$6 + local ioam_ns=$7 + local type=$8 - ip netns exec $node_dst ./ioam6_parser $if_dst $name $ip6_src $ip6_dst \ - $trace_type $ioam_ns & + ip netns exec $node_dst ./ioam6_parser $name $trace_type $ioam_ns $type & local spid=$! sleep 0.1 @@ -489,7 +487,7 @@ out_undef_ns() trace prealloc type 0x800000 ns 0 size 4 dev veth0 run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \ - db01::2 db01::1 veth0 0x800000 0 + db01::1 0x800000 0 $1 [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down } @@ -509,7 +507,7 @@ out_no_room() trace prealloc type 0xc00000 ns 123 size 4 dev veth0 run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \ - db01::2 db01::1 veth0 0xc00000 123 + db01::1 0xc00000 123 $1 [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down } @@ -543,14 +541,14 @@ out_bits() if [ $cmd_res != 0 ] then npassed=$((npassed+1)) - log_test_passed "$descr" + log_test_passed "$descr ($1 mode)" else nfailed=$((nfailed+1)) - log_test_failed "$descr" + log_test_failed "$descr ($1 mode)" fi else run_test "out_bit$i" "$descr ($1 mode)" $ioam_node_alpha \ - $ioam_node_beta db01::2 db01::1 veth0 ${bit2type[$i]} 123 + $ioam_node_beta db01::1 ${bit2type[$i]} 123 $1 fi done @@ -574,7 +572,7 @@ out_full_supp_trace() trace prealloc type 0xfff002 ns 123 size 100 dev veth0 run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \ - db01::2 db01::1 veth0 0xfff002 123 + db01::1 0xfff002 123 $1 [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down } @@ -604,7 +602,7 @@ in_undef_ns() trace prealloc type 0x800000 ns 0 size 4 dev veth0 run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \ - db01::2 db01::1 veth0 0x800000 0 + db01::1 0x800000 0 $1 [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down } @@ -624,7 +622,7 @@ in_no_room() trace prealloc type 0xc00000 ns 123 size 4 dev veth0 run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \ - db01::2 db01::1 veth0 0xc00000 123 + db01::1 0xc00000 123 $1 [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down } @@ -651,7 +649,7 @@ in_bits() dev veth0 run_test "in_bit$i" "${desc//$i} ($1 mode)" $ioam_node_alpha \ - $ioam_node_beta db01::2 db01::1 veth0 ${bit2type[$i]} 123 + $ioam_node_beta db01::1 ${bit2type[$i]} 123 $1 done [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down @@ -679,7 +677,7 @@ in_oflag() trace prealloc type 0xc00000 ns 123 size 4 dev veth0 run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \ - db01::2 db01::1 veth0 0xc00000 123 + db01::1 0xc00000 123 $1 [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down @@ -703,7 +701,7 @@ in_full_supp_trace() trace prealloc type 0xfff002 ns 123 size 80 dev veth0 run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \ - db01::2 db01::1 veth0 0xfff002 123 + db01::1 0xfff002 123 $1 [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down } @@ -731,7 +729,7 @@ fwd_full_supp_trace() trace prealloc type 0xfff002 ns 123 size 244 via db01::1 dev veth0 run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_gamma \ - db01::2 db02::2 veth0 0xfff002 123 + db02::2 0xfff002 123 $1 [ "$1" = "encap" ] && ip -netns $ioam_node_gamma link set ip6tnl0 down } diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c index d9d1d4190126..895e5bb5044b 100644 --- a/tools/testing/selftests/net/ioam6_parser.c +++ b/tools/testing/selftests/net/ioam6_parser.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -512,14 +511,6 @@ static int str2id(const char *tname) return -1; } -static int ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2) -{ - return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | - (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | - (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | - (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; -} - static int get_u32(__u32 *val, const char *arg, int base) { unsigned long res; @@ -603,70 +594,80 @@ static int (*func[__TEST_MAX])(int, struct ioam6_trace_hdr *, __u32, __u16) = { int main(int argc, char **argv) { - int fd, size, hoplen, tid, ret = 1; - struct in6_addr src, dst; + int fd, size, hoplen, tid, ret = 1, on = 1; struct ioam6_hdr *opt; - struct ipv6hdr *ip6h; - __u8 buffer[400], *p; - __u16 ioam_ns; + struct cmsghdr *cmsg; + struct msghdr msg; + struct iovec iov; + __u8 buffer[512]; __u32 tr_type; + __u16 ioam_ns; + __u8 *ptr; - if (argc != 7) + if (argc != 5) goto out; - tid = str2id(argv[2]); + tid = str2id(argv[1]); if (tid < 0 || !func[tid]) goto out; - if (inet_pton(AF_INET6, argv[3], &src) != 1 || - inet_pton(AF_INET6, argv[4], &dst) != 1) + if (get_u32(&tr_type, argv[2], 16) || + get_u16(&ioam_ns, argv[3], 0)) goto out; - if (get_u32(&tr_type, argv[5], 16) || - get_u16(&ioam_ns, argv[6], 0)) + fd = socket(PF_INET6, SOCK_RAW, + !strcmp(argv[4], "encap") ? IPPROTO_IPV6 : IPPROTO_ICMPV6); + if (fd < 0) goto out; - fd = socket(AF_PACKET, SOCK_DGRAM, __cpu_to_be16(ETH_P_IPV6)); - if (!fd) - goto out; + setsockopt(fd, IPPROTO_IPV6, IPV6_RECVHOPOPTS, &on, sizeof(on)); - if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, - argv[1], strlen(argv[1]))) + iov.iov_len = 1; + iov.iov_base = malloc(CMSG_SPACE(sizeof(buffer))); + if (!iov.iov_base) goto close; - recv: - size = recv(fd, buffer, sizeof(buffer), 0); + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = buffer; + msg.msg_controllen = CMSG_SPACE(sizeof(buffer)); + + size = recvmsg(fd, &msg, 0); if (size <= 0) goto close; - ip6h = (struct ipv6hdr *)buffer; + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level != IPPROTO_IPV6 || + cmsg->cmsg_type != IPV6_HOPOPTS || + cmsg->cmsg_len < sizeof(struct ipv6_hopopt_hdr)) + continue; - if (!ipv6_addr_equal(&ip6h->saddr, &src) || - !ipv6_addr_equal(&ip6h->daddr, &dst)) - goto recv; + ptr = (__u8 *)CMSG_DATA(cmsg); - if (ip6h->nexthdr != IPPROTO_HOPOPTS) - goto close; + hoplen = (ptr[1] + 1) << 3; + ptr += sizeof(struct ipv6_hopopt_hdr); - p = buffer + sizeof(*ip6h); - hoplen = (p[1] + 1) << 3; - p += sizeof(struct ipv6_hopopt_hdr); + while (hoplen > 0) { + opt = (struct ioam6_hdr *)ptr; - while (hoplen > 0) { - opt = (struct ioam6_hdr *)p; + if (opt->opt_type == IPV6_TLV_IOAM && + opt->type == IOAM6_TYPE_PREALLOC) { + ptr += sizeof(*opt); + ret = func[tid](tid, + (struct ioam6_trace_hdr *)ptr, + tr_type, ioam_ns); + goto close; + } - if (opt->opt_type == IPV6_TLV_IOAM && - opt->type == IOAM6_TYPE_PREALLOC) { - p += sizeof(*opt); - ret = func[tid](tid, (struct ioam6_trace_hdr *)p, - tr_type, ioam_ns); - break; + ptr += opt->opt_len + 2; + hoplen -= opt->opt_len + 2; } - - p += opt->opt_len + 2; - hoplen -= opt->opt_len + 2; } + + goto recv; close: + free(iov.iov_base); close(fd); out: return ret; From 3489182b11d35f1944c1245fc9c4867cf622c50f Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Tue, 20 Feb 2024 12:30:07 +0530 Subject: [PATCH 278/327] net: phy: realtek: Fix rtl8211f_config_init() for RTL8211F(D)(I)-VD-CG PHY Commit bb726b753f75 ("net: phy: realtek: add support for RTL8211F(D)(I)-VD-CG") extended support of the driver from the existing support for RTL8211F(D)(I)-CG PHY to the newer RTL8211F(D)(I)-VD-CG PHY. While that commit indicated that the RTL8211F_PHYCR2 register is not supported by the "VD-CG" PHY model and therefore updated the corresponding section in rtl8211f_config_init() to be invoked conditionally, the call to "genphy_soft_reset()" was left as-is, when it should have also been invoked conditionally. This is because the call to "genphy_soft_reset()" was first introduced by the commit 0a4355c2b7f8 ("net: phy: realtek: add dt property to disable CLKOUT clock") since the RTL8211F guide indicates that a PHY reset should be issued after setting bits in the PHYCR2 register. As the PHYCR2 register is not applicable to the "VD-CG" PHY model, fix the rtl8211f_config_init() function by invoking "genphy_soft_reset()" conditionally based on the presence of the "PHYCR2" register. Fixes: bb726b753f75 ("net: phy: realtek: add support for RTL8211F(D)(I)-VD-CG") Signed-off-by: Siddharth Vadapalli Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240220070007.968762-1-s-vadapalli@ti.com Signed-off-by: Paolo Abeni --- drivers/net/phy/realtek.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index 894172a3e15f..337899c69738 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -421,9 +421,11 @@ static int rtl8211f_config_init(struct phy_device *phydev) ERR_PTR(ret)); return ret; } + + return genphy_soft_reset(phydev); } - return genphy_soft_reset(phydev); + return 0; } static int rtl821x_suspend(struct phy_device *phydev) From 359e54a93ab43d32ee1bff3c2f9f10cb9f6b6e79 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 20 Feb 2024 12:21:56 +0000 Subject: [PATCH 279/327] l2tp: pass correct message length to ip6_append_data l2tp_ip6_sendmsg needs to avoid accounting for the transport header twice when splicing more data into an already partially-occupied skbuff. To manage this, we check whether the skbuff contains data using skb_queue_empty when deciding how much data to append using ip6_append_data. However, the code which performed the calculation was incorrect: ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0; ...due to C operator precedence, this ends up setting ulen to transhdrlen for messages with a non-zero length, which results in corrupted packets on the wire. Add parentheses to correct the calculation in line with the original intent. Fixes: 9d4c75800f61 ("ipv4, ipv6: Fix handling of transhdrlen in __ip{,6}_append_data()") Cc: David Howells Cc: stable@vger.kernel.org Signed-off-by: Tom Parkin Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240220122156.43131-1-tparkin@katalix.com Signed-off-by: Paolo Abeni --- net/l2tp/l2tp_ip6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index dd3153966173..7bf14cf9ffaa 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -627,7 +627,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: lock_sock(sk); - ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0; + ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0); err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, From 8c987693dc2d292d777f1be63cb35233049ae25e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Feb 2024 15:57:42 +0100 Subject: [PATCH 280/327] ARM: dts: renesas: rcar-gen2: Add missing #interrupt-cells to DA9063 nodes make dtbs_check W=2: arch/arm/boot/dts/renesas/r8a7790-lager.dts:444.11-458.5: Warning (interrupt_provider): /i2c-mux4/pmic@58: Missing '#interrupt-cells' in interrupt provider ... Fix this by adding the missing #interrupt-cells properties. Reported-by: Rob Herring Signed-off-by: Geert Uytterhoeven Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/a351e503ea97fb1af68395843f513925ff1bdf26.1707922460.git.geert+renesas@glider.be --- arch/arm/boot/dts/renesas/r8a7790-lager.dts | 1 + arch/arm/boot/dts/renesas/r8a7790-stout.dts | 1 + arch/arm/boot/dts/renesas/r8a7791-koelsch.dts | 1 + arch/arm/boot/dts/renesas/r8a7791-porter.dts | 1 + arch/arm/boot/dts/renesas/r8a7792-blanche.dts | 1 + arch/arm/boot/dts/renesas/r8a7793-gose.dts | 1 + arch/arm/boot/dts/renesas/r8a7794-alt.dts | 1 + arch/arm/boot/dts/renesas/r8a7794-silk.dts | 1 + 8 files changed, 8 insertions(+) diff --git a/arch/arm/boot/dts/renesas/r8a7790-lager.dts b/arch/arm/boot/dts/renesas/r8a7790-lager.dts index 2fba4d084001..8590981245a6 100644 --- a/arch/arm/boot/dts/renesas/r8a7790-lager.dts +++ b/arch/arm/boot/dts/renesas/r8a7790-lager.dts @@ -447,6 +447,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7790-stout.dts b/arch/arm/boot/dts/renesas/r8a7790-stout.dts index f9bc5b4f019d..683f7395fab0 100644 --- a/arch/arm/boot/dts/renesas/r8a7790-stout.dts +++ b/arch/arm/boot/dts/renesas/r8a7790-stout.dts @@ -347,6 +347,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; onkey { compatible = "dlg,da9063-onkey"; diff --git a/arch/arm/boot/dts/renesas/r8a7791-koelsch.dts b/arch/arm/boot/dts/renesas/r8a7791-koelsch.dts index e9c13bb03772..0efd9f98c75a 100644 --- a/arch/arm/boot/dts/renesas/r8a7791-koelsch.dts +++ b/arch/arm/boot/dts/renesas/r8a7791-koelsch.dts @@ -819,6 +819,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7791-porter.dts b/arch/arm/boot/dts/renesas/r8a7791-porter.dts index 7e8bc06715f6..93c86e921645 100644 --- a/arch/arm/boot/dts/renesas/r8a7791-porter.dts +++ b/arch/arm/boot/dts/renesas/r8a7791-porter.dts @@ -413,6 +413,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; watchdog { compatible = "dlg,da9063-watchdog"; diff --git a/arch/arm/boot/dts/renesas/r8a7792-blanche.dts b/arch/arm/boot/dts/renesas/r8a7792-blanche.dts index 4f9838cf97ee..540a9ad28f28 100644 --- a/arch/arm/boot/dts/renesas/r8a7792-blanche.dts +++ b/arch/arm/boot/dts/renesas/r8a7792-blanche.dts @@ -381,6 +381,7 @@ interrupt-parent = <&irqc>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7793-gose.dts b/arch/arm/boot/dts/renesas/r8a7793-gose.dts index 1744fdbf9e0c..1ea6c757893b 100644 --- a/arch/arm/boot/dts/renesas/r8a7793-gose.dts +++ b/arch/arm/boot/dts/renesas/r8a7793-gose.dts @@ -759,6 +759,7 @@ interrupt-parent = <&irqc0>; interrupts = <2 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7794-alt.dts b/arch/arm/boot/dts/renesas/r8a7794-alt.dts index c0d067df22a0..b5ecafbb2e4d 100644 --- a/arch/arm/boot/dts/renesas/r8a7794-alt.dts +++ b/arch/arm/boot/dts/renesas/r8a7794-alt.dts @@ -453,6 +453,7 @@ interrupt-parent = <&gpio3>; interrupts = <31 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; rtc { compatible = "dlg,da9063-rtc"; diff --git a/arch/arm/boot/dts/renesas/r8a7794-silk.dts b/arch/arm/boot/dts/renesas/r8a7794-silk.dts index 43d480a7f3ea..595e074085eb 100644 --- a/arch/arm/boot/dts/renesas/r8a7794-silk.dts +++ b/arch/arm/boot/dts/renesas/r8a7794-silk.dts @@ -439,6 +439,7 @@ interrupt-parent = <&gpio3>; interrupts = <31 IRQ_TYPE_LEVEL_LOW>; interrupt-controller; + #interrupt-cells = <2>; onkey { compatible = "dlg,da9063-onkey"; From 40510a941d27d405a82dc3320823d875f94625df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 21 Feb 2024 08:33:24 +0100 Subject: [PATCH 281/327] drm/ttm: Fix an invalid freeing on already freed page in error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If caching mode change fails due to, for example, OOM we free the allocated pages in a two-step process. First the pages for which the caching change has already succeeded. Secondly the pages for which a caching change did not succeed. However the second step was incorrectly freeing the pages already freed in the first step. Fix. Signed-off-by: Thomas Hellström Fixes: 379989e7cbdc ("drm/ttm/pool: Fix ttm_pool_alloc error path") Cc: Christian König Cc: Dave Airlie Cc: Christian Koenig Cc: Huang Rui Cc: dri-devel@lists.freedesktop.org Cc: # v6.4+ Reviewed-by: Matthew Auld Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20240221073324.3303-1-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/ttm/ttm_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index b62f420a9f96..112438d965ff 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -387,7 +387,7 @@ static void ttm_pool_free_range(struct ttm_pool *pool, struct ttm_tt *tt, enum ttm_caching caching, pgoff_t start_page, pgoff_t end_page) { - struct page **pages = tt->pages; + struct page **pages = &tt->pages[start_page]; unsigned int order; pgoff_t i, nr; From 3c43177ffb54ea5be97505eb8e2690e99ac96bc9 Mon Sep 17 00:00:00 2001 From: Erik Kurzinger Date: Fri, 19 Jan 2024 08:32:06 -0800 Subject: [PATCH 282/327] drm/syncobj: call drm_syncobj_fence_add_wait when WAIT_AVAILABLE flag is set When waiting for a syncobj timeline point whose fence has not yet been submitted with the WAIT_FOR_SUBMIT flag, a callback is registered using drm_syncobj_fence_add_wait and the thread is put to sleep until the timeout expires. If the fence is submitted before then, drm_syncobj_add_point will wake up the sleeping thread immediately which will proceed to wait for the fence to be signaled. However, if the WAIT_AVAILABLE flag is used instead, drm_syncobj_fence_add_wait won't get called, meaning the waiting thread will always sleep for the full timeout duration, even if the fence gets submitted earlier. If it turns out that the fence *has* been submitted by the time it eventually wakes up, it will still indicate to userspace that the wait completed successfully (it won't return -ETIME), but it will have taken much longer than it should have. To fix this, we must call drm_syncobj_fence_add_wait if *either* the WAIT_FOR_SUBMIT flag or the WAIT_AVAILABLE flag is set. The only difference being that with WAIT_FOR_SUBMIT we will also wait for the fence to be signaled after it has been submitted while with WAIT_AVAILABLE we will return immediately. IGT test patch: https://lists.freedesktop.org/archives/igt-dev/2024-January/067537.html v1 -> v2: adjust lockdep_assert_none_held_once condition (cherry picked from commit 8c44ea81634a4a337df70a32621a5f3791be23df) Fixes: 01d6c3578379 ("drm/syncobj: add support for timeline point wait v8") Signed-off-by: Erik Kurzinger Signed-off-by: Simon Ser Reviewed-by: Daniel Vetter Reviewed-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20240119163208.3723457-1-ekurzinger@nvidia.com --- drivers/gpu/drm/drm_syncobj.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 84101baeecc6..56b0677acfa3 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -1040,7 +1040,8 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, uint64_t *points; uint32_t signaled_count, i; - if (flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT) + if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) lockdep_assert_none_held_once(); points = kmalloc_array(count, sizeof(*points), GFP_KERNEL); @@ -1109,7 +1110,8 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, * fallthough and try a 0 timeout wait! */ - if (flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT) { + if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) { for (i = 0; i < count; ++i) drm_syncobj_fence_add_wait(syncobjs[i], &entries[i]); } From b5bf7778b722105d7a04b1d51e884497b542638b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 21 Feb 2024 20:27:02 -0400 Subject: [PATCH 283/327] iommu/arm-smmu-v3: Do not use GFP_KERNEL under as spinlock If the SMMU is configured to use a two level CD table then arm_smmu_write_ctx_desc() allocates a CD table leaf internally using GFP_KERNEL. Due to recent changes this is being done under a spinlock to iterate over the device list - thus it will trigger a sleeping while atomic warning: arm_smmu_sva_set_dev_pasid() mutex_lock(&sva_lock); __arm_smmu_sva_bind() arm_smmu_mmu_notifier_get() spin_lock_irqsave() arm_smmu_write_ctx_desc() arm_smmu_get_cd_ptr() arm_smmu_alloc_cd_leaf_table() dmam_alloc_coherent(GFP_KERNEL) This is a 64K high order allocation and really should not be done atomically. At the moment the rework of the SVA to follow the new API is half finished. Recently the CD table memory was moved from the domain to the master, however we have the confusing situation where the SVA code is wrongly using the RID domains device's list to track which CD tables the SVA is installed in. Remove the logic to replicate the CD across all the domain's masters during attach. We know which master and which CD table the PASID should be installed in. Right now SVA only works when dma-iommu.c is in control of the RID translation, which means we have a single iommu_domain shared across the entire group and that iommu_domain is not shared outside the group. Critically this means that the iommu_group->devices list and RID's smmu_domain->devices list describe the same set of masters. For PCI cases the core code also insists on singleton groups so there is only one entry in the smmu_domain->devices list that is equal to the master being passed in to arm_smmu_sva_set_dev_pasid(). Only non-PCI cases may have multi-device groups. However, the core code will repeat the calls to arm_smmu_sva_set_dev_pasid() across the entire iommu_group->devices list. Instead of having arm_smmu_mmu_notifier_get() indirectly loop over all the devices in the group via the RID's smmu_domain, rely on __arm_smmu_sva_bind() to be called for each device in the group and install the repeated CD entry that way. This avoids taking the spinlock to access the devices list and permits the arm_smmu_write_ctx_desc() to use a sleeping allocation. Leave the arm_smmu_mm_release() as a confusing situation, this requires tracking attached masters inside the SVA domain. Removing the loop allows arm_smmu_write_ctx_desc() to be called outside the spinlock and thus is safe to use GFP_KERNEL. Move the clearing of the CD into arm_smmu_sva_remove_dev_pasid() so that arm_smmu_mmu_notifier_get/put() remain paired functions. Fixes: 24503148c545 ("iommu/arm-smmu-v3: Refactor write_ctx_desc") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/4e25d161-0cf8-4050-9aa3-dfa21cd63e56@moroto.mountain/ Signed-off-by: Jason Gunthorpe Reviewed-by: Michael Shavit Link: https://lore.kernel.org/r/0-v3-11978fc67151+112-smmu_cd_atomic_jgg@nvidia.com Signed-off-by: Will Deacon --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 38 ++++++------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 05722121f00e..4a27fbdb2d84 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -292,10 +292,8 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, struct mm_struct *mm) { int ret; - unsigned long flags; struct arm_smmu_ctx_desc *cd; struct arm_smmu_mmu_notifier *smmu_mn; - struct arm_smmu_master *master; list_for_each_entry(smmu_mn, &smmu_domain->mmu_notifiers, list) { if (smmu_mn->mn.mm == mm) { @@ -325,28 +323,9 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, goto err_free_cd; } - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { - ret = arm_smmu_write_ctx_desc(master, mm_get_enqcmd_pasid(mm), - cd); - if (ret) { - list_for_each_entry_from_reverse( - master, &smmu_domain->devices, domain_head) - arm_smmu_write_ctx_desc( - master, mm_get_enqcmd_pasid(mm), NULL); - break; - } - } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - if (ret) - goto err_put_notifier; - list_add(&smmu_mn->list, &smmu_domain->mmu_notifiers); return smmu_mn; -err_put_notifier: - /* Frees smmu_mn */ - mmu_notifier_put(&smmu_mn->mn); err_free_cd: arm_smmu_free_shared_cd(cd); return ERR_PTR(ret); @@ -363,9 +342,6 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) list_del(&smmu_mn->list); - arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), - NULL); - /* * If we went through clear(), we've already invalidated, and no * new TLB entry can have been formed. @@ -381,7 +357,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) arm_smmu_free_shared_cd(cd); } -static int __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) +static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, + struct mm_struct *mm) { int ret; struct arm_smmu_bond *bond; @@ -404,9 +381,15 @@ static int __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) goto err_free_bond; } + ret = arm_smmu_write_ctx_desc(master, pasid, bond->smmu_mn->cd); + if (ret) + goto err_put_notifier; + list_add(&bond->list, &master->bonds); return 0; +err_put_notifier: + arm_smmu_mmu_notifier_put(bond->smmu_mn); err_free_bond: kfree(bond); return ret; @@ -568,6 +551,9 @@ void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); mutex_lock(&sva_lock); + + arm_smmu_write_ctx_desc(master, id, NULL); + list_for_each_entry(t, &master->bonds, list) { if (t->mm == mm) { bond = t; @@ -590,7 +576,7 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, struct mm_struct *mm = domain->mm; mutex_lock(&sva_lock); - ret = __arm_smmu_sva_bind(dev, mm); + ret = __arm_smmu_sva_bind(dev, id, mm); mutex_unlock(&sva_lock); return ret; From 2aa6f5b0fd052e363bb9d4b547189f0bf6b3d6d3 Mon Sep 17 00:00:00 2001 From: Erik Kurzinger Date: Wed, 21 Feb 2024 10:44:28 -0800 Subject: [PATCH 284/327] drm/syncobj: handle NULL fence in syncobj_eventfd_entry_func During syncobj_eventfd_entry_func, dma_fence_chain_find_seqno may set the fence to NULL if the given seqno is signaled and a later seqno has already been submitted. In that case, the eventfd should be signaled immediately which currently does not happen. This is a similar issue to the one addressed by commit b19926d4f3a6 ("drm/syncobj: Deal with signalled fences in drm_syncobj_find_fence."). As a fix, if the return value of dma_fence_chain_find_seqno indicates success but it sets the fence to NULL, we will assign a stub fence to ensure the following code still signals the eventfd. v1 -> v2: assign a stub fence instead of signaling the eventfd Signed-off-by: Erik Kurzinger Fixes: c7a472297169 ("drm/syncobj: add IOCTL to register an eventfd") Signed-off-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20240221184527.37667-1-ekurzinger@nvidia.com --- drivers/gpu/drm/drm_syncobj.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 56b0677acfa3..a6c19de46292 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -1418,10 +1418,21 @@ syncobj_eventfd_entry_func(struct drm_syncobj *syncobj, /* This happens inside the syncobj lock */ fence = dma_fence_get(rcu_dereference_protected(syncobj->fence, 1)); + if (!fence) + return; + ret = dma_fence_chain_find_seqno(&fence, entry->point); - if (ret != 0 || !fence) { + if (ret != 0) { + /* The given seqno has not been submitted yet. */ dma_fence_put(fence); return; + } else if (!fence) { + /* If dma_fence_chain_find_seqno returns 0 but sets the fence + * to NULL, it implies that the given seqno is signaled and a + * later seqno has already been submitted. Assign a stub fence + * so that the eventfd still gets signaled below. + */ + fence = dma_fence_get_stub(); } list_del_init(&entry->node); From 510325e5ac5f45c1180189d3bfc108c54bf64544 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 22 Feb 2024 12:49:33 +0500 Subject: [PATCH 285/327] selftests/iommu: fix the config fragment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The config fragment doesn't follow the correct format to enable those config options which make the config options getting missed while merging with other configs. ➜ merge_config.sh -m .config tools/testing/selftests/iommu/config Using .config as base Merging tools/testing/selftests/iommu/config ➜ make olddefconfig .config:5295:warning: unexpected data: CONFIG_IOMMUFD .config:5296:warning: unexpected data: CONFIG_IOMMUFD_TEST While at it, add CONFIG_FAULT_INJECTION as well which is needed for CONFIG_IOMMUFD_TEST. If CONFIG_FAULT_INJECTION isn't present in base config (such as x86 defconfig), CONFIG_IOMMUFD_TEST doesn't get enabled. Fixes: 57f0988706fe ("iommufd: Add a selftest") Link: https://lore.kernel.org/r/20240222074934.71380-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/config | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/iommu/config b/tools/testing/selftests/iommu/config index 6c4f901d6fed..110d73917615 100644 --- a/tools/testing/selftests/iommu/config +++ b/tools/testing/selftests/iommu/config @@ -1,2 +1,3 @@ -CONFIG_IOMMUFD -CONFIG_IOMMUFD_TEST +CONFIG_IOMMUFD=y +CONFIG_FAULT_INJECTION=y +CONFIG_IOMMUFD_TEST=y From 5ef1dc40ffa6a6cb968b0fdc43c3a61727a9e950 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 14 Feb 2024 16:06:28 +0100 Subject: [PATCH 286/327] s390/cio: fix invalid -EBUSY on ccw_device_start The s390 common I/O layer (CIO) returns an unexpected -EBUSY return code when drivers try to start I/O while a path-verification (PV) process is pending. This can lead to failed device initialization attempts with symptoms like broken network connectivity after boot. Fix this by replacing the -EBUSY return code with a deferred condition code 1 reply to make path-verification handling consistent from a driver's point of view. The problem can be reproduced semi-regularly using the following process, while repeating steps 2-3 as necessary (example assumes an OSA device with bus-IDs 0.0.a000-0.0.a002 on CHPID 0.02): 1. echo 0.0.a000,0.0.a001,0.0.a002 >/sys/bus/ccwgroup/drivers/qeth/group 2. echo 0 > /sys/bus/ccwgroup/devices/0.0.a000/online 3. echo 1 > /sys/bus/ccwgroup/devices/0.0.a000/online ; \ echo on > /sys/devices/css0/chp0.02/status Background information: The common I/O layer starts path-verification I/Os when it receives indications about changes in a device path's availability. This occurs for example when hardware events indicate a change in channel-path status, or when a manual operation such as a CHPID vary or configure operation is performed. If a driver attempts to start I/O while a PV is running, CIO reports a successful I/O start (ccw_device_start() return code 0). Then, after completion of PV, CIO synthesizes an interrupt response that indicates an asynchronous status condition that prevented the start of the I/O (deferred condition code 1). If a PV indication arrives while a device is busy with driver-owned I/O, PV is delayed until after I/O completion was reported to the driver's interrupt handler. To ensure that PV can be started eventually, CIO reports a device busy condition (ccw_device_start() return code -EBUSY) if a driver tries to start another I/O while PV is pending. In some cases this -EBUSY return code causes device drivers to consider a device not operational, resulting in failed device initialization. Note: The code that introduced the problem was added in 2003. Symptoms started appearing with the following CIO commit that causes a PV indication when a device is removed from the cio_ignore list after the associated parent subchannel device was probed, but before online processing of the CCW device has started: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") During boot, the cio_ignore list is modified by the cio_ignore dracut module [1] as well as Linux vendor-specific systemd service scripts[2]. When combined, this commit and boot scripts cause a frequent occurrence of the problem during boot. [1] https://github.com/dracutdevs/dracut/tree/master/modules.d/81cio_ignore [2] https://github.com/SUSE/s390-tools/blob/master/cio_ignore.service Cc: stable@vger.kernel.org # v5.15+ Fixes: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") Tested-By: Thorsten Winkler Reviewed-by: Thorsten Winkler Signed-off-by: Peter Oberparleiter Signed-off-by: Heiko Carstens --- drivers/s390/cio/device_ops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index c533d1dadc6b..a5dba3829769 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -202,7 +202,8 @@ int ccw_device_start_timeout_key(struct ccw_device *cdev, struct ccw1 *cpa, return -EINVAL; if (cdev->private->state == DEV_STATE_NOT_OPER) return -ENODEV; - if (cdev->private->state == DEV_STATE_VERIFY) { + if (cdev->private->state == DEV_STATE_VERIFY || + cdev->private->flags.doverify) { /* Remember to fake irb when finished. */ if (!cdev->private->flags.fake_irb) { cdev->private->flags.fake_irb = FAKE_CMD_IRB; @@ -214,8 +215,7 @@ int ccw_device_start_timeout_key(struct ccw_device *cdev, struct ccw1 *cpa, } if (cdev->private->state != DEV_STATE_ONLINE || ((sch->schib.scsw.cmd.stctl & SCSW_STCTL_PRIM_STATUS) && - !(sch->schib.scsw.cmd.stctl & SCSW_STCTL_SEC_STATUS)) || - cdev->private->flags.doverify) + !(sch->schib.scsw.cmd.stctl & SCSW_STCTL_SEC_STATUS))) return -EBUSY; ret = cio_set_options (sch, flags); if (ret) From 22e1dc4b2fec17af70f297a4295c5f19a0f3fbeb Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Fri, 2 Feb 2024 17:34:11 +0800 Subject: [PATCH 287/327] drm/amd/display: adjust few initialization order in dm [Why] Observe error message "Can't retrieve aconnector in hpd_rx_irq_offload_work" when boot up with a mst tbt4 dock connected. After analyzing, there are few parts needed to be adjusted: 1. hpd_rx_offload_wq[].aconnector is not initialzed before the dmub outbox hpd_irq handler get registered which causes the error message. 2. registeration of hpd and hpd_rx_irq event for usb4 dp tunneling is not aligned with legacy interface sequence [How] Put DMUB_NOTIFICATION_HPD and DMUB_NOTIFICATION_HPD_IRQ handler registration into register_hpd_handlers() to align other interfaces and get hpd_rx_offload_wq[].aconnector initialized earlier than that. Leave DMUB_NOTIFICATION_AUX_REPLY registered as it was since we need that while calling dc_link_detect(). USB4 connection status will be proactively detected by dc_link_detect_connection_type() in amdgpu_dm_initialize_drm_device() Cc: Stable Reviewed-by: Aurabindo Pillai Acked-by: Rodrigo Siqueira Tested-by: Daniel Wheeler Signed-off-by: Wayne Lin Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index cf875751971f..377d532ca761 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1843,21 +1843,12 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) DRM_ERROR("amdgpu: fail to register dmub aux callback"); goto error; } - if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD, dmub_hpd_callback, true)) { - DRM_ERROR("amdgpu: fail to register dmub hpd callback"); - goto error; - } - if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD_IRQ, dmub_hpd_callback, true)) { - DRM_ERROR("amdgpu: fail to register dmub hpd callback"); - goto error; - } - } - - /* Enable outbox notification only after IRQ handlers are registered and DMUB is alive. - * It is expected that DMUB will resend any pending notifications at this point, for - * example HPD from DPIA. - */ - if (dc_is_dmub_outbox_supported(adev->dm.dc)) { + /* Enable outbox notification only after IRQ handlers are registered and DMUB is alive. + * It is expected that DMUB will resend any pending notifications at this point. Note + * that hpd and hpd_irq handler registration are deferred to register_hpd_handlers() to + * align legacy interface initialization sequence. Connection status will be proactivly + * detected once in the amdgpu_dm_initialize_drm_device. + */ dc_enable_dmub_outbox(adev->dm.dc); /* DPIA trace goes to dmesg logs only if outbox is enabled */ @@ -3536,6 +3527,14 @@ static void register_hpd_handlers(struct amdgpu_device *adev) int_params.requested_polarity = INTERRUPT_POLARITY_DEFAULT; int_params.current_polarity = INTERRUPT_POLARITY_DEFAULT; + if (dc_is_dmub_outbox_supported(adev->dm.dc)) { + if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD, dmub_hpd_callback, true)) + DRM_ERROR("amdgpu: fail to register dmub hpd callback"); + + if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD_IRQ, dmub_hpd_callback, true)) + DRM_ERROR("amdgpu: fail to register dmub hpd callback"); + } + list_for_each_entry(connector, &dev->mode_config.connector_list, head) { @@ -3564,10 +3563,6 @@ static void register_hpd_handlers(struct amdgpu_device *adev) handle_hpd_rx_irq, (void *) aconnector); } - - if (adev->dm.hpd_rx_offload_wq) - adev->dm.hpd_rx_offload_wq[connector->index].aconnector = - aconnector; } } @@ -4561,6 +4556,10 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev) goto fail; } + if (dm->hpd_rx_offload_wq) + dm->hpd_rx_offload_wq[aconnector->base.index].aconnector = + aconnector; + if (!dc_link_detect_connection_type(link, &new_connection_type)) DRM_ERROR("KMS: Failed to detect connector\n"); From 4e73826089ce899357580bbf6e0afe4e6f9900b7 Mon Sep 17 00:00:00 2001 From: Lewis Huang Date: Wed, 31 Jan 2024 17:20:17 +0800 Subject: [PATCH 288/327] drm/amd/display: Only allow dig mapping to pwrseq in new asic [Why] The old asic only have 1 pwrseq hw. We don't need to map the diginst to pwrseq inst in old asic. [How] 1. Only mapping dig to pwrseq for new asic. 2. Move mapping function into dcn specific panel control component Cc: Stable # v6.6+ Cc: Mario Limonciello Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3122 Reviewed-by: Anthony Koo Acked-by: Rodrigo Siqueira Tested-by: Daniel Wheeler Signed-off-by: Lewis Huang Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/dce/dce_panel_cntl.c | 1 + .../amd/display/dc/dcn301/dcn301_panel_cntl.c | 1 + .../amd/display/dc/dcn31/dcn31_panel_cntl.c | 18 ++++++++++++- .../drm/amd/display/dc/inc/hw/panel_cntl.h | 2 +- .../drm/amd/display/dc/link/link_factory.c | 26 +------------------ 5 files changed, 21 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_panel_cntl.c b/drivers/gpu/drm/amd/display/dc/dce/dce_panel_cntl.c index e8570060d007..5bca67407c5b 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_panel_cntl.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_panel_cntl.c @@ -290,4 +290,5 @@ void dce_panel_cntl_construct( dce_panel_cntl->base.funcs = &dce_link_panel_cntl_funcs; dce_panel_cntl->base.ctx = init_data->ctx; dce_panel_cntl->base.inst = init_data->inst; + dce_panel_cntl->base.pwrseq_inst = 0; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_panel_cntl.c b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_panel_cntl.c index ad0df1a72a90..9e96a3ace207 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_panel_cntl.c +++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_panel_cntl.c @@ -215,4 +215,5 @@ void dcn301_panel_cntl_construct( dcn301_panel_cntl->base.funcs = &dcn301_link_panel_cntl_funcs; dcn301_panel_cntl->base.ctx = init_data->ctx; dcn301_panel_cntl->base.inst = init_data->inst; + dcn301_panel_cntl->base.pwrseq_inst = 0; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c index 03248422d6ff..281be20b1a10 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c @@ -154,8 +154,24 @@ void dcn31_panel_cntl_construct( struct dcn31_panel_cntl *dcn31_panel_cntl, const struct panel_cntl_init_data *init_data) { + uint8_t pwrseq_inst = 0xF; + dcn31_panel_cntl->base.funcs = &dcn31_link_panel_cntl_funcs; dcn31_panel_cntl->base.ctx = init_data->ctx; dcn31_panel_cntl->base.inst = init_data->inst; - dcn31_panel_cntl->base.pwrseq_inst = init_data->pwrseq_inst; + + switch (init_data->eng_id) { + case ENGINE_ID_DIGA: + pwrseq_inst = 0; + break; + case ENGINE_ID_DIGB: + pwrseq_inst = 1; + break; + default: + DC_LOG_WARNING("Unsupported pwrseq engine id: %d!\n", init_data->eng_id); + ASSERT(false); + break; + } + + dcn31_panel_cntl->base.pwrseq_inst = pwrseq_inst; } diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h b/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h index 5dcbaa2db964..e97d964a1791 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h @@ -57,7 +57,7 @@ struct panel_cntl_funcs { struct panel_cntl_init_data { struct dc_context *ctx; uint32_t inst; - uint32_t pwrseq_inst; + uint32_t eng_id; }; struct panel_cntl { diff --git a/drivers/gpu/drm/amd/display/dc/link/link_factory.c b/drivers/gpu/drm/amd/display/dc/link/link_factory.c index 37d3027c32dc..cf22b8f28ba6 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_factory.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_factory.c @@ -370,30 +370,6 @@ static enum transmitter translate_encoder_to_transmitter( } } -static uint8_t translate_dig_inst_to_pwrseq_inst(struct dc_link *link) -{ - uint8_t pwrseq_inst = 0xF; - struct dc_context *dc_ctx = link->dc->ctx; - - DC_LOGGER_INIT(dc_ctx->logger); - - switch (link->eng_id) { - case ENGINE_ID_DIGA: - pwrseq_inst = 0; - break; - case ENGINE_ID_DIGB: - pwrseq_inst = 1; - break; - default: - DC_LOG_WARNING("Unsupported pwrseq engine id: %d!\n", link->eng_id); - ASSERT(false); - break; - } - - return pwrseq_inst; -} - - static void link_destruct(struct dc_link *link) { int i; @@ -657,7 +633,7 @@ static bool construct_phy(struct dc_link *link, link->link_id.id == CONNECTOR_ID_LVDS)) { panel_cntl_init_data.ctx = dc_ctx; panel_cntl_init_data.inst = panel_cntl_init_data.ctx->dc_edp_id_count; - panel_cntl_init_data.pwrseq_inst = translate_dig_inst_to_pwrseq_inst(link); + panel_cntl_init_data.eng_id = link->eng_id; link->panel_cntl = link->dc->res_pool->funcs->panel_cntl_create( &panel_cntl_init_data); From d2b48f340d9e4a8fbeb1cdc84cd8da6ad143a907 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Mon, 19 Feb 2024 11:43:16 +0530 Subject: [PATCH 289/327] drm/amd/display: Fix potential null pointer dereference in dc_dmub_srv Fixes potential null pointer dereference warnings in the dc_dmub_srv_cmd_list_queue_execute() and dc_dmub_srv_is_hw_pwr_up() functions. In both functions, the 'dc_dmub_srv' variable was being dereferenced before it was checked for null. This could lead to a null pointer dereference if 'dc_dmub_srv' is null. The fix is to check if 'dc_dmub_srv' is null before dereferencing it. Thus moving the null checks for 'dc_dmub_srv' to the beginning of the functions to ensure that 'dc_dmub_srv' is not null when it is dereferenced. Found by smatch & thus fixing the below: drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:133 dc_dmub_srv_cmd_list_queue_execute() warn: variable dereferenced before check 'dc_dmub_srv' (see line 128) drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:1167 dc_dmub_srv_is_hw_pwr_up() warn: variable dereferenced before check 'dc_dmub_srv' (see line 1164) Fixes: 028bac583449 ("drm/amd/display: decouple dmcub execution to reduce lock granularity") Fixes: 65138eb72e1f ("drm/amd/display: Add DCN35 DMUB") Cc: JinZe.Xu Cc: Hersen Wu Cc: Josip Pavic Cc: Roman Li Cc: Qingqing Zhuo Cc: Harry Wentland Cc: Rodrigo Siqueira Cc: Aurabindo Pillai Cc: Tom Chung Signed-off-by: Srinivasan Shanmugam Reviewed-by: Tom Chung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c index 2b79a0e5638e..363d522603a2 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c @@ -125,7 +125,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_srv, unsigned int count, union dmub_rb_cmd *cmd_list) { - struct dc_context *dc_ctx = dc_dmub_srv->ctx; + struct dc_context *dc_ctx; struct dmub_srv *dmub; enum dmub_status status; int i; @@ -133,6 +133,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_srv, if (!dc_dmub_srv || !dc_dmub_srv->dmub) return false; + dc_ctx = dc_dmub_srv->ctx; dmub = dc_dmub_srv->dmub; for (i = 0 ; i < count; i++) { @@ -1161,7 +1162,7 @@ void dc_dmub_srv_subvp_save_surf_addr(const struct dc_dmub_srv *dc_dmub_srv, con bool dc_dmub_srv_is_hw_pwr_up(struct dc_dmub_srv *dc_dmub_srv, bool wait) { - struct dc_context *dc_ctx = dc_dmub_srv->ctx; + struct dc_context *dc_ctx; enum dmub_status status; if (!dc_dmub_srv || !dc_dmub_srv->dmub) @@ -1170,6 +1171,8 @@ bool dc_dmub_srv_is_hw_pwr_up(struct dc_dmub_srv *dc_dmub_srv, bool wait) if (dc_dmub_srv->ctx->dc->debug.dmcub_emulation) return true; + dc_ctx = dc_dmub_srv->ctx; + if (wait) { if (dc_dmub_srv->ctx->dc->debug.disable_timeout) { do { From 27a6c49394b1a203beeb94752c9a1d6318f24ddf Mon Sep 17 00:00:00 2001 From: Swapnil Patel Date: Tue, 6 Feb 2024 11:40:20 -0500 Subject: [PATCH 290/327] drm/amd/display: fix input states translation error for dcn35 & dcn351 [Why] Currently there is an error while translating input clock sates into output clock states. The highest fclk setting from output sates is being dropped because of this error. [How] For dcn35 and dcn351, make output_states equal to input states. Reviewed-by: Charlene Liu Acked-by: Rodrigo Siqueira Tested-by: Daniel Wheeler Signed-off-by: Swapnil Patel Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/dml2/dml2_translation_helper.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c index 23a608274096..1ba6933d2b36 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c @@ -398,7 +398,6 @@ void dml2_init_soc_states(struct dml2_context *dml2, const struct dc *in_dc, /* Copy clocks tables entries, if available */ if (dml2->config.bbox_overrides.clks_table.num_states) { p->in_states->num_states = dml2->config.bbox_overrides.clks_table.num_states; - for (i = 0; i < dml2->config.bbox_overrides.clks_table.num_entries_per_clk.num_dcfclk_levels; i++) { p->in_states->state_array[i].dcfclk_mhz = dml2->config.bbox_overrides.clks_table.clk_entries[i].dcfclk_mhz; } @@ -437,6 +436,14 @@ void dml2_init_soc_states(struct dml2_context *dml2, const struct dc *in_dc, } dml2_policy_build_synthetic_soc_states(s, p); + if (dml2->v20.dml_core_ctx.project == dml_project_dcn35 || + dml2->v20.dml_core_ctx.project == dml_project_dcn351) { + // Override last out_state with data from last in_state + // This will ensure that out_state contains max fclk + memcpy(&p->out_states->state_array[p->out_states->num_states - 1], + &p->in_states->state_array[p->in_states->num_states - 1], + sizeof(struct soc_state_bounding_box_st)); + } } void dml2_translate_ip_params(const struct dc *in, struct ip_params_st *out) From bae67893578d608e35691dcdfa90c4957debf1d3 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Tue, 13 Feb 2024 01:50:50 +0100 Subject: [PATCH 291/327] drm/amd/display: Fix memory leak in dm_sw_fini() After destroying dmub_srv, the memory associated with it is not freed, causing a memory leak: unreferenced object 0xffff896302b45800 (size 1024): comm "(udev-worker)", pid 222, jiffies 4294894636 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 6265fd77): [] kmalloc_trace+0x29d/0x340 [] dm_dmub_sw_init+0xb4/0x450 [amdgpu] [] dm_sw_init+0x15/0x2b0 [amdgpu] [] amdgpu_device_init+0x1417/0x24e0 [amdgpu] [] amdgpu_driver_load_kms+0x15/0x190 [amdgpu] [] amdgpu_pci_probe+0x187/0x4e0 [amdgpu] [] local_pci_probe+0x3e/0x90 [] pci_device_probe+0xc3/0x230 [] really_probe+0xe2/0x480 [] __driver_probe_device+0x78/0x160 [] driver_probe_device+0x1f/0x90 [] __driver_attach+0xce/0x1c0 [] bus_for_each_dev+0x70/0xc0 [] bus_add_driver+0x112/0x210 [] driver_register+0x55/0x100 [] do_one_initcall+0x41/0x300 Fix this by freeing dmub_srv after destroying it. Fixes: 743b9786b14a ("drm/amd/display: Hook up the DMUB service in DM") Signed-off-by: Armin Wolf Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 377d532ca761..ef27dd71cbbc 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2278,6 +2278,7 @@ static int dm_sw_fini(void *handle) if (adev->dm.dmub_srv) { dmub_srv_destroy(adev->dm.dmub_srv); + kfree(adev->dm.dmub_srv); adev->dm.dmub_srv = NULL; } From 9671761792156f2339627918bafcd713a8a6f777 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Fri, 16 Feb 2024 09:23:19 -0300 Subject: [PATCH 292/327] drm/amd/display: fix null-pointer dereference on edid reading Use i2c adapter when there isn't aux_mode in dc_link to fix a null-pointer derefence that happens when running igt@kms_force_connector_basic in a system with DCN2.1 and HDMI connector detected as below: [ +0.178146] BUG: kernel NULL pointer dereference, address: 00000000000004c0 [ +0.000010] #PF: supervisor read access in kernel mode [ +0.000005] #PF: error_code(0x0000) - not-present page [ +0.000004] PGD 0 P4D 0 [ +0.000006] Oops: 0000 [#1] PREEMPT SMP NOPTI [ +0.000006] CPU: 15 PID: 2368 Comm: kms_force_conne Not tainted 6.5.0-asdn+ #152 [ +0.000005] Hardware name: HP HP ENVY x360 Convertible 13-ay1xxx/8929, BIOS F.01 07/14/2021 [ +0.000004] RIP: 0010:i2c_transfer+0xd/0x100 [ +0.000011] Code: ea fc ff ff 66 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 41 54 55 53 <48> 8b 47 10 48 89 fb 48 83 38 00 0f 84 b3 00 00 00 83 3d 2f 80 16 [ +0.000004] RSP: 0018:ffff9c4f89c0fad0 EFLAGS: 00010246 [ +0.000005] RAX: 0000000000000000 RBX: 0000000000000005 RCX: 0000000000000080 [ +0.000003] RDX: 0000000000000002 RSI: ffff9c4f89c0fb20 RDI: 00000000000004b0 [ +0.000003] RBP: ffff9c4f89c0fb80 R08: 0000000000000080 R09: ffff8d8e0b15b980 [ +0.000003] R10: 00000000000380e0 R11: 0000000000000000 R12: 0000000000000080 [ +0.000002] R13: 0000000000000002 R14: ffff9c4f89c0fb0e R15: ffff9c4f89c0fb0f [ +0.000004] FS: 00007f9ad2176c40(0000) GS:ffff8d90fe9c0000(0000) knlGS:0000000000000000 [ +0.000003] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000004] CR2: 00000000000004c0 CR3: 0000000121bc4000 CR4: 0000000000750ee0 [ +0.000003] PKRU: 55555554 [ +0.000003] Call Trace: [ +0.000006] [ +0.000006] ? __die+0x23/0x70 [ +0.000011] ? page_fault_oops+0x17d/0x4c0 [ +0.000008] ? preempt_count_add+0x6e/0xa0 [ +0.000008] ? srso_alias_return_thunk+0x5/0x7f [ +0.000011] ? exc_page_fault+0x7f/0x180 [ +0.000009] ? asm_exc_page_fault+0x26/0x30 [ +0.000013] ? i2c_transfer+0xd/0x100 [ +0.000010] drm_do_probe_ddc_edid+0xc2/0x140 [drm] [ +0.000067] ? srso_alias_return_thunk+0x5/0x7f [ +0.000006] ? _drm_do_get_edid+0x97/0x3c0 [drm] [ +0.000043] ? __pfx_drm_do_probe_ddc_edid+0x10/0x10 [drm] [ +0.000042] edid_block_read+0x3b/0xd0 [drm] [ +0.000043] _drm_do_get_edid+0xb6/0x3c0 [drm] [ +0.000041] ? __pfx_drm_do_probe_ddc_edid+0x10/0x10 [drm] [ +0.000043] drm_edid_read_custom+0x37/0xd0 [drm] [ +0.000044] amdgpu_dm_connector_mode_valid+0x129/0x1d0 [amdgpu] [ +0.000153] drm_connector_mode_valid+0x3b/0x60 [drm_kms_helper] [ +0.000000] __drm_helper_update_and_validate+0xfe/0x3c0 [drm_kms_helper] [ +0.000000] ? amdgpu_dm_connector_get_modes+0xb6/0x520 [amdgpu] [ +0.000000] ? srso_alias_return_thunk+0x5/0x7f [ +0.000000] drm_helper_probe_single_connector_modes+0x2ab/0x540 [drm_kms_helper] [ +0.000000] status_store+0xb2/0x1f0 [drm] [ +0.000000] kernfs_fop_write_iter+0x136/0x1d0 [ +0.000000] vfs_write+0x24d/0x440 [ +0.000000] ksys_write+0x6f/0xf0 [ +0.000000] do_syscall_64+0x60/0xc0 [ +0.000000] ? srso_alias_return_thunk+0x5/0x7f [ +0.000000] ? syscall_exit_to_user_mode+0x2b/0x40 [ +0.000000] ? srso_alias_return_thunk+0x5/0x7f [ +0.000000] ? do_syscall_64+0x6c/0xc0 [ +0.000000] ? do_syscall_64+0x6c/0xc0 [ +0.000000] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ +0.000000] RIP: 0033:0x7f9ad46b4b00 [ +0.000000] Code: 40 00 48 8b 15 19 b3 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 80 3d e1 3a 0e 00 00 74 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 48 89 [ +0.000000] RSP: 002b:00007ffcbd3bd6d8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [ +0.000000] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9ad46b4b00 [ +0.000000] RDX: 0000000000000002 RSI: 00007f9ad48a7417 RDI: 0000000000000009 [ +0.000000] RBP: 0000000000000002 R08: 0000000000000064 R09: 0000000000000000 [ +0.000000] R10: 0000000000000000 R11: 0000000000000202 R12: 00007f9ad48a7417 [ +0.000000] R13: 0000000000000009 R14: 00007ffcbd3bd760 R15: 0000000000000001 [ +0.000000] [ +0.000000] Modules linked in: ctr ccm rfcomm snd_seq_dummy snd_hrtimer snd_seq snd_seq_device cmac algif_hash algif_skcipher af_alg bnep btusb btrtl btbcm btintel btmtk bluetooth uvcvideo videobuf2_vmalloc sha3_generic videobuf2_memops uvc jitterentropy_rng videobuf2_v4l2 videodev drbg videobuf2_common ansi_cprng mc ecdh_generic ecc qrtr binfmt_misc hid_sensor_accel_3d hid_sensor_magn_3d hid_sensor_gyro_3d hid_sensor_trigger industrialio_triggered_buffer kfifo_buf industrialio snd_ctl_led joydev hid_sensor_iio_common rtw89_8852ae rtw89_8852a rtw89_pci snd_hda_codec_realtek rtw89_core snd_hda_codec_generic intel_rapl_msr ledtrig_audio intel_rapl_common snd_hda_codec_hdmi mac80211 snd_hda_intel snd_intel_dspcfg kvm_amd snd_hda_codec snd_soc_dmic snd_acp3x_rn snd_acp3x_pdm_dma libarc4 snd_hwdep snd_soc_core kvm snd_hda_core cfg80211 snd_pci_acp6x snd_pcm nls_ascii snd_timer hp_wmi snd_pci_acp5x nls_cp437 snd_rn_pci_acp3x ucsi_acpi sparse_keymap ccp snd platform_profile snd_acp_config typec_ucsi irqbypass vfat sp5100_tco [ +0.000000] snd_soc_acpi fat rapl pcspkr wmi_bmof roles rfkill rng_core snd_pci_acp3x soundcore k10temp watchdog typec battery ac amd_pmc acpi_tad button hid_sensor_hub hid_multitouch evdev serio_raw msr parport_pc ppdev lp parport fuse loop efi_pstore configfs ip_tables x_tables autofs4 ext4 crc16 mbcache jbd2 btrfs blake2b_generic dm_crypt dm_mod efivarfs raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx libcrc32c crc32c_generic xor raid6_pq raid1 raid0 multipath linear md_mod amdgpu amdxcp i2c_algo_bit drm_ttm_helper ttm crc32_pclmul crc32c_intel drm_exec gpu_sched drm_suballoc_helper nvme ghash_clmulni_intel drm_buddy drm_display_helper sha512_ssse3 nvme_core ahci xhci_pci sha512_generic hid_generic xhci_hcd libahci rtsx_pci_sdmmc t10_pi i2c_hid_acpi drm_kms_helper i2c_hid mmc_core libata aesni_intel crc64_rocksoft_generic crypto_simd amd_sfh crc64_rocksoft scsi_mod usbcore cryptd crc_t10dif cec drm crct10dif_generic hid rtsx_pci crct10dif_pclmul scsi_common rc_core crc64 i2c_piix4 [ +0.000000] usb_common crct10dif_common video wmi [ +0.000000] CR2: 00000000000004c0 [ +0.000000] ---[ end trace 0000000000000000 ]--- Fixes: 0e859faf8670 ("drm/amd/display: Remove unwanted drm edid references") Signed-off-by: Melissa Wen Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index ef27dd71cbbc..5853cf022917 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6534,10 +6534,15 @@ amdgpu_dm_connector_late_register(struct drm_connector *connector) static void amdgpu_dm_connector_funcs_force(struct drm_connector *connector) { struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector); - struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); struct dc_link *dc_link = aconnector->dc_link; struct dc_sink *dc_em_sink = aconnector->dc_em_sink; struct edid *edid; + struct i2c_adapter *ddc; + + if (dc_link->aux_mode) + ddc = &aconnector->dm_dp_aux.aux.ddc; + else + ddc = &aconnector->i2c->base; /* * Note: drm_get_edid gets edid in the following order: @@ -6545,7 +6550,7 @@ static void amdgpu_dm_connector_funcs_force(struct drm_connector *connector) * 2) firmware EDID if set via edid_firmware module parameter * 3) regular DDC read. */ - edid = drm_get_edid(connector, &amdgpu_connector->ddc_bus->aux.ddc); + edid = drm_get_edid(connector, ddc); if (!edid) { DRM_ERROR("No EDID found on connector: %s.\n", connector->name); return; @@ -6586,12 +6591,18 @@ static int get_modes(struct drm_connector *connector) static void create_eml_sink(struct amdgpu_dm_connector *aconnector) { struct drm_connector *connector = &aconnector->base; - struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(&aconnector->base); + struct dc_link *dc_link = aconnector->dc_link; struct dc_sink_init_data init_params = { .link = aconnector->dc_link, .sink_signal = SIGNAL_TYPE_VIRTUAL }; struct edid *edid; + struct i2c_adapter *ddc; + + if (dc_link->aux_mode) + ddc = &aconnector->dm_dp_aux.aux.ddc; + else + ddc = &aconnector->i2c->base; /* * Note: drm_get_edid gets edid in the following order: @@ -6599,7 +6610,7 @@ static void create_eml_sink(struct amdgpu_dm_connector *aconnector) * 2) firmware EDID if set via edid_firmware module parameter * 3) regular DDC read. */ - edid = drm_get_edid(connector, &amdgpu_connector->ddc_bus->aux.ddc); + edid = drm_get_edid(connector, ddc); if (!edid) { DRM_ERROR("No EDID found on connector: %s.\n", connector->name); return; From bbfaf2aea7164db59739728d62d9cc91d64ff856 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Wed, 21 Feb 2024 17:16:49 +0800 Subject: [PATCH 293/327] drm/amdgpu: Fix the runtime resume failure issue Don't set power state flag when system enter runtime suspend, or it may cause runtime resume failure issue. Fixes: 3a9626c816db ("drm/amd: Stop evicting resources on APUs in suspend") Signed-off-by: Ma Jun Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index cc21ed67a330..7099ff9cf8c5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1528,6 +1528,9 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) */ void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { + if (adev->in_runpm) + return; + if (amdgpu_acpi_is_s0ix_active(adev)) adev->in_s0ix = true; else if (amdgpu_acpi_is_s3_active(adev)) From 1d492944d3d06047793fa2e7606868f6d7480f87 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 14 Feb 2024 14:06:32 +1000 Subject: [PATCH 294/327] nouveau/gsp: add kconfig option to enable GSP paths by default Turing and Ampere will continue to use the old paths by default, but we should allow distros to decide what the policy is. Signed-off-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240214040632.661069-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/Kconfig | 8 ++++++++ drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 6 +++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index 1e6aaf95ff7c..ceef470c9fbf 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -100,3 +100,11 @@ config DRM_NOUVEAU_SVM help Say Y here if you want to enable experimental support for Shared Virtual Memory (SVM). + +config DRM_NOUVEAU_GSP_DEFAULT + bool "Use GSP firmware for Turing/Ampere (needs firmware installed)" + depends on DRM_NOUVEAU + default n + help + Say Y here if you want to use the GSP codepaths by default on + Turing and Ampere GPUs. diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index a41735ab6068..a64c81385682 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -2312,8 +2312,12 @@ r535_gsp_load(struct nvkm_gsp *gsp, int ver, const struct nvkm_gsp_fwif *fwif) { struct nvkm_subdev *subdev = &gsp->subdev; int ret; + bool enable_gsp = fwif->enable; - if (!nvkm_boolopt(subdev->device->cfgopt, "NvGspRm", fwif->enable)) +#if IS_ENABLED(CONFIG_DRM_NOUVEAU_GSP_DEFAULT) + enable_gsp = true; +#endif + if (!nvkm_boolopt(subdev->device->cfgopt, "NvGspRm", enable_gsp)) return -EINVAL; if ((ret = r535_gsp_load_fw(gsp, "gsp", fwif->ver, &gsp->fws.rm)) || From 3f4d8aac6e768c2215ce68275256971c2f54f0c8 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 24 Jan 2024 13:50:58 +1000 Subject: [PATCH 295/327] nouveau: add an ioctl to return vram bar size. This returns the BAR resources size so userspace can make decisions based on rebar support. userspace using this has been proposed for nvk, but it's a rather trivial uapi addition. Reviewed-by: Faith Ekstrand Signed-off-by: Dave Airlie --- drivers/gpu/drm/nouveau/nouveau_abi16.c | 4 ++++ include/uapi/drm/nouveau_drm.h | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c index d1bb8151a1df..4cb323bc3233 100644 --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c @@ -199,6 +199,7 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS) struct nouveau_cli *cli = nouveau_cli(file_priv); struct nouveau_drm *drm = nouveau_drm(dev); struct nvif_device *device = &drm->client.device; + struct nvkm_device *nvkm_device = nvxx_device(&drm->client.device); struct nvkm_gr *gr = nvxx_gr(device); struct drm_nouveau_getparam *getparam = data; struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -263,6 +264,9 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS) getparam->value = nouveau_exec_push_max_from_ib_max(ib_max); break; } + case NOUVEAU_GETPARAM_VRAM_BAR_SIZE: + getparam->value = nvkm_device->func->resource_size(nvkm_device, 1); + break; default: NV_PRINTK(dbg, cli, "unknown parameter %lld\n", getparam->param); return -EINVAL; diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h index 0bade1592f34..10a917639d8d 100644 --- a/include/uapi/drm/nouveau_drm.h +++ b/include/uapi/drm/nouveau_drm.h @@ -54,6 +54,13 @@ extern "C" { */ #define NOUVEAU_GETPARAM_EXEC_PUSH_MAX 17 +/* + * NOUVEAU_GETPARAM_VRAM_BAR_SIZE - query bar size + * + * Query the VRAM BAR size. + */ +#define NOUVEAU_GETPARAM_VRAM_BAR_SIZE 18 + struct drm_nouveau_getparam { __u64 param; __u64 value; From 72fa02fdf83306c52bc1eede28359e3fa32a151a Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 24 Jan 2024 14:24:25 +1000 Subject: [PATCH 296/327] nouveau: add an ioctl to report vram usage This reports the currently used vram allocations. userspace using this has been proposed for nvk, but it's a rather trivial uapi addition. Reviewed-by: Faith Ekstrand Signed-off-by: Dave Airlie --- drivers/gpu/drm/nouveau/nouveau_abi16.c | 5 +++++ include/uapi/drm/nouveau_drm.h | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c index 4cb323bc3233..cd14f993bdd1 100644 --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c @@ -267,6 +267,11 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS) case NOUVEAU_GETPARAM_VRAM_BAR_SIZE: getparam->value = nvkm_device->func->resource_size(nvkm_device, 1); break; + case NOUVEAU_GETPARAM_VRAM_USED: { + struct ttm_resource_manager *vram_mgr = ttm_manager_type(&drm->ttm.bdev, TTM_PL_VRAM); + getparam->value = (u64)ttm_resource_manager_usage(vram_mgr) << PAGE_SHIFT; + break; + } default: NV_PRINTK(dbg, cli, "unknown parameter %lld\n", getparam->param); return -EINVAL; diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h index 10a917639d8d..77d7ff0d5b11 100644 --- a/include/uapi/drm/nouveau_drm.h +++ b/include/uapi/drm/nouveau_drm.h @@ -61,6 +61,13 @@ extern "C" { */ #define NOUVEAU_GETPARAM_VRAM_BAR_SIZE 18 +/* + * NOUVEAU_GETPARAM_VRAM_USED + * + * Get remaining VRAM size. + */ +#define NOUVEAU_GETPARAM_VRAM_USED 19 + struct drm_nouveau_getparam { __u64 param; __u64 value; From 1001db6c42e4012b55e5ee19405490f23e033b5a Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 23 Feb 2024 14:36:31 +0800 Subject: [PATCH 297/327] LoongArch: Disable IRQ before init_fn() for nonboot CPUs Disable IRQ before init_fn() for nonboot CPUs when hotplug, in order to silence such warnings (and also avoid potential errors due to unexpected interrupts): WARNING: CPU: 1 PID: 0 at kernel/rcu/tree.c:4503 rcu_cpu_starting+0x214/0x280 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.6.17+ #1198 pc 90000000048e3334 ra 90000000047bd56c tp 900000010039c000 sp 900000010039fdd0 a0 0000000000000001 a1 0000000000000006 a2 900000000802c040 a3 0000000000000000 a4 0000000000000001 a5 0000000000000004 a6 0000000000000000 a7 90000000048e3f4c t0 0000000000000001 t1 9000000005c70968 t2 0000000004000000 t3 000000000005e56e t4 00000000000002e4 t5 0000000000001000 t6 ffffffff80000000 t7 0000000000040000 t8 9000000007931638 u0 0000000000000006 s9 0000000000000004 s0 0000000000000001 s1 9000000006356ac0 s2 9000000007244000 s3 0000000000000001 s4 0000000000000001 s5 900000000636f000 s6 7fffffffffffffff s7 9000000002123940 s8 9000000001ca55f8 ra: 90000000047bd56c tlb_init+0x24c/0x528 ERA: 90000000048e3334 rcu_cpu_starting+0x214/0x280 CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) PRMD: 00000000 (PPLV0 -PIE -PWE) EUEN: 00000000 (-FPE -SXE -ASXE -BTE) ECFG: 00071000 (LIE=12 VS=7) ESTAT: 000c0000 [BRK] (IS= ECode=12 EsubCode=0) PRID: 0014c010 (Loongson-64bit, Loongson-3A5000) CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.6.17+ #1198 Stack : 0000000000000000 9000000006375000 9000000005b61878 900000010039c000 900000010039fa30 0000000000000000 900000010039fa38 900000000619a140 9000000006456888 9000000006456880 900000010039f950 0000000000000001 0000000000000001 cb0cb028ec7e52e1 0000000002b90000 9000000100348700 0000000000000000 0000000000000001 ffffffff916d12f1 0000000000000003 0000000000040000 9000000007930370 0000000002b90000 0000000000000004 9000000006366000 900000000619a140 0000000000000000 0000000000000004 0000000000000000 0000000000000009 ffffffffffc681f2 9000000002123940 9000000001ca55f8 9000000006366000 90000000047a4828 00007ffff057ded8 00000000000000b0 0000000000000000 0000000000000000 0000000000071000 ... Call Trace: [<90000000047a4828>] show_stack+0x48/0x1a0 [<9000000005b61874>] dump_stack_lvl+0x84/0xcc [<90000000047f60ac>] __warn+0x8c/0x1e0 [<9000000005b0ab34>] report_bug+0x1b4/0x280 [<9000000005b63110>] do_bp+0x2d0/0x480 [<90000000047a2e20>] handle_bp+0x120/0x1c0 [<90000000048e3334>] rcu_cpu_starting+0x214/0x280 [<90000000047bd568>] tlb_init+0x248/0x528 [<90000000047a4c44>] per_cpu_trap_init+0x124/0x160 [<90000000047a19f4>] cpu_probe+0x494/0xa00 [<90000000047b551c>] start_secondary+0x3c/0xc0 [<9000000005b66134>] smpboot_entry+0x50/0x58 Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen --- arch/loongarch/kernel/smp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 2b49d30eb7c0..87b7190fe48e 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -337,6 +337,7 @@ void __noreturn arch_cpu_idle_dead(void) addr = iocsr_read64(LOONGARCH_IOCSR_MBUF0); } while (addr == 0); + local_irq_disable(); init_fn = (void *)TO_CACHE(addr); iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_CLEAR); From 752cd08da320a667a833803a8fd6bb266114cce5 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 23 Feb 2024 14:36:31 +0800 Subject: [PATCH 298/327] LoongArch: Update cpu_sibling_map when disabling nonboot CPUs Update cpu_sibling_map when disabling nonboot CPUs by defining & calling clear_cpu_sibling_map(), otherwise we get such errors on SMT systems: jump label: negative count! WARNING: CPU: 6 PID: 45 at kernel/jump_label.c:263 __static_key_slow_dec_cpuslocked+0xec/0x100 CPU: 6 PID: 45 Comm: cpuhp/6 Not tainted 6.8.0-rc5+ #1340 pc 90000000004c302c ra 90000000004c302c tp 90000001005bc000 sp 90000001005bfd20 a0 000000000000001b a1 900000000224c278 a2 90000001005bfb58 a3 900000000224c280 a4 900000000224c278 a5 90000001005bfb50 a6 0000000000000001 a7 0000000000000001 t0 ce87a4763eb5234a t1 ce87a4763eb5234a t2 0000000000000000 t3 0000000000000000 t4 0000000000000006 t5 0000000000000000 t6 0000000000000064 t7 0000000000001964 t8 000000000009ebf6 u0 9000000001f2a068 s9 0000000000000000 s0 900000000246a2d8 s1 ffffffffffffffff s2 ffffffffffffffff s3 90000000021518c0 s4 0000000000000040 s5 9000000002151058 s6 9000000009828e40 s7 00000000000000b4 s8 0000000000000006 ra: 90000000004c302c __static_key_slow_dec_cpuslocked+0xec/0x100 ERA: 90000000004c302c __static_key_slow_dec_cpuslocked+0xec/0x100 CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) PRMD: 00000004 (PPLV0 +PIE -PWE) EUEN: 00000000 (-FPE -SXE -ASXE -BTE) ECFG: 00071c1c (LIE=2-4,10-12 VS=7) ESTAT: 000c0000 [BRK] (IS= ECode=12 EsubCode=0) PRID: 0014d000 (Loongson-64bit, Loongson-3A6000-HV) CPU: 6 PID: 45 Comm: cpuhp/6 Not tainted 6.8.0-rc5+ #1340 Stack : 0000000000000000 900000000203f258 900000000179afc8 90000001005bc000 90000001005bf980 0000000000000000 90000001005bf988 9000000001fe0be0 900000000224c280 900000000224c278 90000001005bf8c0 0000000000000001 0000000000000001 ce87a4763eb5234a 0000000007f38000 90000001003f8cc0 0000000000000000 0000000000000006 0000000000000000 4c206e6f73676e6f 6f4c203a656d616e 000000000009ec99 0000000007f38000 0000000000000000 900000000214b000 9000000001fe0be0 0000000000000004 0000000000000000 0000000000000107 0000000000000009 ffffffffffafdabe 00000000000000b4 0000000000000006 90000000004c302c 9000000000224528 00005555939a0c7c 00000000000000b0 0000000000000004 0000000000000000 0000000000071c1c ... Call Trace: [<9000000000224528>] show_stack+0x48/0x1a0 [<900000000179afc8>] dump_stack_lvl+0x78/0xa0 [<9000000000263ed0>] __warn+0x90/0x1a0 [<90000000017419b8>] report_bug+0x1b8/0x280 [<900000000179c564>] do_bp+0x264/0x420 [<90000000004c302c>] __static_key_slow_dec_cpuslocked+0xec/0x100 [<90000000002b4d7c>] sched_cpu_deactivate+0x2fc/0x300 [<9000000000266498>] cpuhp_invoke_callback+0x178/0x8a0 [<9000000000267f70>] cpuhp_thread_fun+0xf0/0x240 [<90000000002a117c>] smpboot_thread_fn+0x1dc/0x2e0 [<900000000029a720>] kthread+0x140/0x160 [<9000000000222288>] ret_from_kernel_thread+0xc/0xa4 Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen --- arch/loongarch/kernel/smp.c | 121 ++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 53 deletions(-) diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 87b7190fe48e..aabee0b280fe 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -88,6 +88,73 @@ void show_ipi_list(struct seq_file *p, int prec) } } +static inline void set_cpu_core_map(int cpu) +{ + int i; + + cpumask_set_cpu(cpu, &cpu_core_setup_map); + + for_each_cpu(i, &cpu_core_setup_map) { + if (cpu_data[cpu].package == cpu_data[i].package) { + cpumask_set_cpu(i, &cpu_core_map[cpu]); + cpumask_set_cpu(cpu, &cpu_core_map[i]); + } + } +} + +static inline void set_cpu_sibling_map(int cpu) +{ + int i; + + cpumask_set_cpu(cpu, &cpu_sibling_setup_map); + + for_each_cpu(i, &cpu_sibling_setup_map) { + if (cpus_are_siblings(cpu, i)) { + cpumask_set_cpu(i, &cpu_sibling_map[cpu]); + cpumask_set_cpu(cpu, &cpu_sibling_map[i]); + } + } +} + +static inline void clear_cpu_sibling_map(int cpu) +{ + int i; + + for_each_cpu(i, &cpu_sibling_setup_map) { + if (cpus_are_siblings(cpu, i)) { + cpumask_clear_cpu(i, &cpu_sibling_map[cpu]); + cpumask_clear_cpu(cpu, &cpu_sibling_map[i]); + } + } + + cpumask_clear_cpu(cpu, &cpu_sibling_setup_map); +} + +/* + * Calculate a new cpu_foreign_map mask whenever a + * new cpu appears or disappears. + */ +void calculate_cpu_foreign_map(void) +{ + int i, k, core_present; + cpumask_t temp_foreign_map; + + /* Re-calculate the mask */ + cpumask_clear(&temp_foreign_map); + for_each_online_cpu(i) { + core_present = 0; + for_each_cpu(k, &temp_foreign_map) + if (cpus_are_siblings(i, k)) + core_present = 1; + if (!core_present) + cpumask_set_cpu(i, &temp_foreign_map); + } + + for_each_online_cpu(i) + cpumask_andnot(&cpu_foreign_map[i], + &temp_foreign_map, &cpu_sibling_map[i]); +} + /* Send mailbox buffer via Mail_Send */ static void csr_mail_send(uint64_t data, int cpu, int mailbox) { @@ -303,6 +370,7 @@ int loongson_cpu_disable(void) numa_remove_cpu(cpu); #endif set_cpu_online(cpu, false); + clear_cpu_sibling_map(cpu); calculate_cpu_foreign_map(); local_irq_save(flags); irq_migrate_all_off_this_cpu(); @@ -380,59 +448,6 @@ static int __init ipi_pm_init(void) core_initcall(ipi_pm_init); #endif -static inline void set_cpu_sibling_map(int cpu) -{ - int i; - - cpumask_set_cpu(cpu, &cpu_sibling_setup_map); - - for_each_cpu(i, &cpu_sibling_setup_map) { - if (cpus_are_siblings(cpu, i)) { - cpumask_set_cpu(i, &cpu_sibling_map[cpu]); - cpumask_set_cpu(cpu, &cpu_sibling_map[i]); - } - } -} - -static inline void set_cpu_core_map(int cpu) -{ - int i; - - cpumask_set_cpu(cpu, &cpu_core_setup_map); - - for_each_cpu(i, &cpu_core_setup_map) { - if (cpu_data[cpu].package == cpu_data[i].package) { - cpumask_set_cpu(i, &cpu_core_map[cpu]); - cpumask_set_cpu(cpu, &cpu_core_map[i]); - } - } -} - -/* - * Calculate a new cpu_foreign_map mask whenever a - * new cpu appears or disappears. - */ -void calculate_cpu_foreign_map(void) -{ - int i, k, core_present; - cpumask_t temp_foreign_map; - - /* Re-calculate the mask */ - cpumask_clear(&temp_foreign_map); - for_each_online_cpu(i) { - core_present = 0; - for_each_cpu(k, &temp_foreign_map) - if (cpus_are_siblings(i, k)) - core_present = 1; - if (!core_present) - cpumask_set_cpu(i, &temp_foreign_map); - } - - for_each_online_cpu(i) - cpumask_andnot(&cpu_foreign_map[i], - &temp_foreign_map, &cpu_sibling_map[i]); -} - /* Preload SMP state for boot cpu */ void smp_prepare_boot_cpu(void) { From 9fa304b9f8ec440e614af6d35826110c633c4074 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 23 Feb 2024 14:36:31 +0800 Subject: [PATCH 299/327] LoongArch: Call early_init_fdt_scan_reserved_mem() earlier The unflatten_and_copy_device_tree() function contains a call to memblock_alloc(). This means that memblock is allocating memory before any of the reserved memory regions are set aside in the arch_mem_init() function which calls early_init_fdt_scan_reserved_mem(). Therefore, there is a possibility for memblock to allocate from any of the reserved memory regions. Hence, move the call to early_init_fdt_scan_reserved_mem() to be earlier in the init sequence, so that the reserved memory regions are set aside before any allocations are done using memblock. Cc: stable@vger.kernel.org Fixes: 88d4d957edc707e ("LoongArch: Add FDT booting support from efi system table") Signed-off-by: Oreoluwa Babatunde Signed-off-by: Huacai Chen --- arch/loongarch/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index edf2bba80130..634ef17fd38b 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -357,6 +357,8 @@ void __init platform_init(void) acpi_gbl_use_default_register_widths = false; acpi_boot_table_init(); #endif + + early_init_fdt_scan_reserved_mem(); unflatten_and_copy_device_tree(); #ifdef CONFIG_NUMA @@ -390,8 +392,6 @@ static void __init arch_mem_init(char **cmdline_p) check_kernel_sections_mem(); - early_init_fdt_scan_reserved_mem(); - /* * In order to reduce the possibility of kernel panic when failed to * get IO TLB memory under CONFIG_SWIOTLB, it is better to allocate From f661ca40787396a3b7f324ae2a215bd67cc92955 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 23 Feb 2024 14:36:31 +0800 Subject: [PATCH 300/327] LoongArch: dts: Minor whitespace cleanup The DTS code coding style expects exactly one space before '{' character. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k0500-ref.dts | 2 +- arch/loongarch/boot/dts/loongson-2k1000-ref.dts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/boot/dts/loongson-2k0500-ref.dts b/arch/loongarch/boot/dts/loongson-2k0500-ref.dts index b38071a4d0b0..8aefb0c12672 100644 --- a/arch/loongarch/boot/dts/loongson-2k0500-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k0500-ref.dts @@ -60,7 +60,7 @@ #address-cells = <1>; #size-cells = <0>; - eeprom@57{ + eeprom@57 { compatible = "atmel,24c16"; reg = <0x57>; pagesize = <16>; diff --git a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts index 132a2d1ea8bc..ed4d32434041 100644 --- a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts @@ -78,7 +78,7 @@ #address-cells = <1>; #size-cells = <0>; - eeprom@57{ + eeprom@57 { compatible = "atmel,24c16"; reg = <0x57>; pagesize = <16>; From 179af5751af59100305358ee0ee51eec9a7f3953 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Fri, 23 Feb 2024 14:36:31 +0800 Subject: [PATCH 301/327] LoongArch: KVM: Fix input validation of _kvm_get_cpucfg() & kvm_check_cpucfg() The range check for the CPUCFG ID is wrong (should have been a || instead of &&) and useless in effect, so fix the obvious mistake. Furthermore, the juggling of the temp return value is unnecessary, because it is semantically equivalent and more readable to just return at every switch case's end. This is done too to avoid potential bugs in the future related to the unwanted complexity. Also, the return value of _kvm_get_cpucfg is meant to be checked, but this was not done, so bad CPUCFG IDs wrongly fall back to the default case and 0 is incorrectly returned; check the return value to fix the UAPI behavior. While at it, also remove the redundant range check in kvm_check_cpucfg, because out-of-range CPUCFG IDs are already rejected by the -EINVAL as returned by _kvm_get_cpucfg(). Fixes: db1ecca22edf ("LoongArch: KVM: Add LSX (128bit SIMD) support") Fixes: 118e10cd893d ("LoongArch: KVM: Add LASX (256bit SIMD) support") Reviewed-by: Bibo Mao Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 27701991886d..c8452aa5c11a 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -300,9 +300,7 @@ static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val) static int _kvm_get_cpucfg(int id, u64 *v) { - int ret = 0; - - if (id < 0 && id >= KVM_MAX_CPUCFG_REGS) + if (id < 0 || id >= KVM_MAX_CPUCFG_REGS) return -EINVAL; switch (id) { @@ -324,32 +322,35 @@ static int _kvm_get_cpucfg(int id, u64 *v) if (cpu_has_lasx) *v |= CPUCFG2_LASX; - break; + return 0; default: - ret = -EINVAL; - break; + /* + * No restrictions on other valid CPUCFG IDs' values, but + * CPUCFG data is limited to 32 bits as the LoongArch ISA + * manual says (Volume 1, Section 2.2.10.5 "CPUCFG"). + */ + *v = U32_MAX; + return 0; } - return ret; } static int kvm_check_cpucfg(int id, u64 val) { - u64 mask; - int ret = 0; + int ret; + u64 mask = 0; - if (id < 0 && id >= KVM_MAX_CPUCFG_REGS) - return -EINVAL; - - if (_kvm_get_cpucfg(id, &mask)) + ret = _kvm_get_cpucfg(id, &mask); + if (ret) return ret; + if (val & ~mask) + /* Unsupported features and/or the higher 32 bits should not be set */ + return -EINVAL; + switch (id) { case 2: /* CPUCFG2 features checking */ - if (val & ~mask) - /* The unsupported features should not be set */ - ret = -EINVAL; - else if (!(val & CPUCFG2_LLFTP)) + if (!(val & CPUCFG2_LLFTP)) /* The LLFTP must be set, as guest must has a constant timer */ ret = -EINVAL; else if ((val & CPUCFG2_FP) && (!(val & CPUCFG2_FPSP) || !(val & CPUCFG2_FPDP))) From ec83f39d2b078d6dd029bbde601835b5368fc886 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Fri, 23 Feb 2024 14:36:31 +0800 Subject: [PATCH 302/327] LoongArch: KVM: Rename _kvm_get_cpucfg() to _kvm_get_cpucfg_mask() The function is not actually a getter of guest CPUCFG, but rather validation of the input CPUCFG ID plus information about the supported bit flags of that CPUCFG leaf. So rename it to avoid confusion. Reviewed-by: Bibo Mao Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index c8452aa5c11a..98c4290af9c4 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -298,7 +298,7 @@ static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val) return ret; } -static int _kvm_get_cpucfg(int id, u64 *v) +static int _kvm_get_cpucfg_mask(int id, u64 *v) { if (id < 0 || id >= KVM_MAX_CPUCFG_REGS) return -EINVAL; @@ -339,7 +339,7 @@ static int kvm_check_cpucfg(int id, u64 val) int ret; u64 mask = 0; - ret = _kvm_get_cpucfg(id, &mask); + ret = _kvm_get_cpucfg_mask(id, &mask); if (ret) return ret; @@ -567,7 +567,7 @@ static int kvm_loongarch_get_cpucfg_attr(struct kvm_vcpu *vcpu, uint64_t val; uint64_t __user *uaddr = (uint64_t __user *)attr->addr; - ret = _kvm_get_cpucfg(attr->attr, &val); + ret = _kvm_get_cpucfg_mask(attr->attr, &val); if (ret) return ret; From f0f5c4894f89bac9074b45bccc447c3659a0fa6f Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Fri, 23 Feb 2024 14:36:31 +0800 Subject: [PATCH 303/327] LoongArch: KVM: Streamline kvm_check_cpucfg() and improve comments All the checks currently done in kvm_check_cpucfg can be realized with early returns, so just do that to avoid extra cognitive burden related to the return value handling. While at it, clean up comments of _kvm_get_cpucfg_mask() and kvm_check_cpucfg(), by removing comments that are merely restatement of the code nearby, and paraphrasing the rest so they read more natural for English speakers (that likely are not familiar with the actual Chinese- influenced grammar). No functional changes intended. Reviewed-by: Bibo Mao Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 42 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 98c4290af9c4..36106922b5d7 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -305,20 +305,16 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v) switch (id) { case 2: - /* Return CPUCFG2 features which have been supported by KVM */ + /* CPUCFG2 features unconditionally supported by KVM */ *v = CPUCFG2_FP | CPUCFG2_FPSP | CPUCFG2_FPDP | CPUCFG2_FPVERS | CPUCFG2_LLFTP | CPUCFG2_LLFTPREV | CPUCFG2_LAM; /* - * If LSX is supported by CPU, it is also supported by KVM, - * as we implement it. + * For the ISA extensions listed below, if one is supported + * by the host, then it is also supported by KVM. */ if (cpu_has_lsx) *v |= CPUCFG2_LSX; - /* - * if LASX is supported by CPU, it is also supported by KVM, - * as we implement it. - */ if (cpu_has_lasx) *v |= CPUCFG2_LASX; @@ -349,24 +345,26 @@ static int kvm_check_cpucfg(int id, u64 val) switch (id) { case 2: - /* CPUCFG2 features checking */ if (!(val & CPUCFG2_LLFTP)) - /* The LLFTP must be set, as guest must has a constant timer */ - ret = -EINVAL; - else if ((val & CPUCFG2_FP) && (!(val & CPUCFG2_FPSP) || !(val & CPUCFG2_FPDP))) - /* Single and double float point must both be set when enable FP */ - ret = -EINVAL; - else if ((val & CPUCFG2_LSX) && !(val & CPUCFG2_FP)) - /* FP should be set when enable LSX */ - ret = -EINVAL; - else if ((val & CPUCFG2_LASX) && !(val & CPUCFG2_LSX)) - /* LSX, FP should be set when enable LASX, and FP has been checked before. */ - ret = -EINVAL; - break; + /* Guests must have a constant timer */ + return -EINVAL; + if ((val & CPUCFG2_FP) && (!(val & CPUCFG2_FPSP) || !(val & CPUCFG2_FPDP))) + /* Single and double float point must both be set when FP is enabled */ + return -EINVAL; + if ((val & CPUCFG2_LSX) && !(val & CPUCFG2_FP)) + /* LSX architecturally implies FP but val does not satisfy that */ + return -EINVAL; + if ((val & CPUCFG2_LASX) && !(val & CPUCFG2_LSX)) + /* LASX architecturally implies LSX and FP but val does not satisfy that */ + return -EINVAL; + return 0; default: - break; + /* + * Values for the other CPUCFG IDs are not being further validated + * besides the mask check above. + */ + return 0; } - return ret; } static int kvm_get_one_reg(struct kvm_vcpu *vcpu, From 65d4418c5002ec5b0e529455bf4152fd43459079 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Feb 2024 10:07:41 -0400 Subject: [PATCH 304/327] iommu/sva: Restore SVA handle sharing Prior to commit 092edaddb660 ("iommu: Support mm PASID 1:n with sva domains") the code allowed a SVA handle to be bound multiple times to the same (mm, device) pair. This was alluded to in the kdoc comment, but we had understood this to be more a remark about allowing multiple devices, not a literal same-driver re-opening the same SVA. It turns out uacce and idxd were both relying on the core code to handle reference counting for same-device same-mm scenarios. As this looks hard to resolve in the drivers bring it back to the core code. The new design has changed the meaning of the domain->users refcount to refer to the number of devices that are sharing that domain for the same mm. This is part of the design to lift the SVA domain de-duplication out of the drivers. Return the old behavior by explicitly de-duplicating the struct iommu_sva handle. The same (mm, device) will return the same handle pointer and the core code will handle tracking this. The last unbind of the handle will destroy it. Fixes: 092edaddb660 ("iommu: Support mm PASID 1:n with sva domains") Reported-by: Zhangfei Gao Closes: https://lore.kernel.org/all/20240221110658.529-1-zhangfei.gao@linaro.org/ Tested-by: Zhangfei Gao Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/0-v1-9455fc497a6f+3b4-iommu_sva_sharing_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-sva.c | 17 +++++++++++++++++ include/linux/iommu.h | 3 +++ 2 files changed, 20 insertions(+) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index c3fc9201d0be..7f91c8d0064b 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -41,6 +41,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de } iommu_mm->pasid = pasid; INIT_LIST_HEAD(&iommu_mm->sva_domains); + INIT_LIST_HEAD(&iommu_mm->sva_handles); /* * Make sure the write to mm->iommu_mm is not reordered in front of * initialization to iommu_mm fields. If it does, readers may see a @@ -82,6 +83,14 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm goto out_unlock; } + list_for_each_entry(handle, &mm->iommu_mm->sva_handles, handle_item) { + if (handle->dev == dev) { + refcount_inc(&handle->users); + mutex_unlock(&iommu_sva_lock); + return handle; + } + } + handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) { ret = -ENOMEM; @@ -108,7 +117,9 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1; + refcount_set(&handle->users, 1); list_add(&domain->next, &mm->iommu_mm->sva_domains); + list_add(&handle->handle_item, &mm->iommu_mm->sva_handles); out: mutex_unlock(&iommu_sva_lock); @@ -141,6 +152,12 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) struct device *dev = handle->dev; mutex_lock(&iommu_sva_lock); + if (!refcount_dec_and_test(&handle->users)) { + mutex_unlock(&iommu_sva_lock); + return; + } + list_del(&handle->handle_item); + iommu_detach_device_pasid(domain, dev, iommu_mm->pasid); if (--domain->users == 0) { list_del(&domain->next); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1ea2a820e1eb..5e27cb3a3be9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -892,11 +892,14 @@ struct iommu_fwspec { struct iommu_sva { struct device *dev; struct iommu_domain *domain; + struct list_head handle_item; + refcount_t users; }; struct iommu_mm_data { u32 pasid; struct list_head sva_domains; + struct list_head sva_handles; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, From 87aec499368d488c20292952d6d4be7cb9e49c5e Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 21 Feb 2024 20:27:13 +0100 Subject: [PATCH 305/327] i2c: imx: when being a target, mark the last read as processed When being a target, NAK from the controller means that all bytes have been transferred. So, the last byte needs also to be marked as 'processed'. Otherwise index registers of backends may not increase. Fixes: f7414cd6923f ("i2c: imx: support slave mode for imx I2C driver") Signed-off-by: Corey Minyard Tested-by: Andrew Manley Reviewed-by: Andrew Manley Reviewed-by: Oleksij Rempel [wsa: fixed comment and commit message to properly describe the case] Signed-off-by: Wolfram Sang Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-imx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 88a053987403..60e813137f84 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -803,6 +803,11 @@ static irqreturn_t i2c_imx_slave_handle(struct imx_i2c_struct *i2c_imx, ctl &= ~I2CR_MTX; imx_i2c_write_reg(ctl, i2c_imx, IMX_I2C_I2CR); imx_i2c_read_reg(i2c_imx, IMX_I2C_I2DR); + + /* flag the last byte as processed */ + i2c_imx_slave_event(i2c_imx, + I2C_SLAVE_READ_PROCESSED, &value); + i2c_imx_slave_finish_op(i2c_imx); return IRQ_HANDLED; } From 66ad2fbcdbeab0edfd40c5d94f32f053b98c2320 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 24 Feb 2024 14:48:03 +0100 Subject: [PATCH 306/327] dm-integrity, dm-verity: reduce stack usage for recheck The newly added integrity_recheck() function has another larger stack allocation, just like its caller integrity_metadata(). When it gets inlined, the combination of the two exceeds the warning limit for 32-bit architectures and possibly risks an overflow when this is called from a deep call chain through a file system: drivers/md/dm-integrity.c:1767:13: error: stack frame size (1048) exceeds limit (1024) in 'integrity_metadata' [-Werror,-Wframe-larger-than] 1767 | static void integrity_metadata(struct work_struct *w) Since the caller at this point is done using its checksum buffer, just reuse the same buffer in the new function to avoid the double allocation. [Mikulas: add "noinline" to integrity_recheck and verity_recheck. These functions are only called on error, so they shouldn't bloat the stack frame or code size of the caller.] Fixes: c88f5e553fe3 ("dm-integrity: recheck the integrity tag after a failure") Fixes: 9177f3c0dea6 ("dm-verity: recheck the hash after a failure") Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 10 ++++------ drivers/md/dm-verity-target.c | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 44a1dcc06dd7..1fc901df84eb 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1691,14 +1691,13 @@ failed: get_random_bytes(result, ic->tag_size); } -static void integrity_recheck(struct dm_integrity_io *dio) +static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum) { struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); struct dm_integrity_c *ic = dio->ic; struct bvec_iter iter; struct bio_vec bv; sector_t sector, logical_sector, area, offset; - char checksum_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; struct page *page; void *buffer; @@ -1734,9 +1733,8 @@ static void integrity_recheck(struct dm_integrity_io *dio) goto free_ret; } - integrity_sector_checksum(ic, logical_sector, buffer, - checksum_onstack); - r = dm_integrity_rw_tag(ic, checksum_onstack, &dio->metadata_block, + integrity_sector_checksum(ic, logical_sector, buffer, checksum); + r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block, &dio->metadata_offset, ic->tag_size, TAG_CMP); if (r) { if (r > 0) { @@ -1851,7 +1849,7 @@ again: checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE); if (unlikely(r)) { if (r > 0) { - integrity_recheck(dio); + integrity_recheck(dio, checksums); goto skip_io; } if (likely(checksums != checksums_onstack)) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 06c8b637849c..1b591bfa90d5 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -491,8 +491,8 @@ static int verity_recheck_copy(struct dm_verity *v, struct dm_verity_io *io, return 0; } -static int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter start, sector_t cur_block) +static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter start, sector_t cur_block) { struct page *page; void *buffer; From 1f626223a0c8753f8f5c651bf7bfc9e3cfdef7f7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Feb 2024 22:16:00 -0500 Subject: [PATCH 307/327] bcachefs: fix backpointer_to_text() when dev does not exist Fixes: Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index b4dc319bcb2b..569b97904da4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -68,9 +68,11 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - prt_str(out, "bucket="); - bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); - prt_str(out, " "); + if (bch2_dev_exists2(c, k.k->p.inode)) { + prt_str(out, "bucket="); + bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + prt_str(out, " "); + } bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); } From 04fee68dd99a53dbf0716e99270b66da26519daf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Feb 2024 21:39:13 -0500 Subject: [PATCH 308/327] bcachefs: Kill __GFP_NOFAIL in buffered read path Recently, we fixed our __GFP_NOFAIL usage in the readahead path, but the easy one in read_single_folio() (where wa can return an error) was missed - oops. Fixes: Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 73c12e565af5..27710cdd5710 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -303,18 +303,6 @@ void bch2_readahead(struct readahead_control *ractl) darray_exit(&readpages_iter.folios); } -static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum, struct folio *folio) -{ - bch2_folio_create(folio, __GFP_NOFAIL); - - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0)); -} - static void bch2_read_single_folio_end_io(struct bio *bio) { complete(bio->bi_private); @@ -329,6 +317,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) int ret; DECLARE_COMPLETION_ONSTACK(done); + if (!bch2_folio_create(folio, GFP_KERNEL)) + return -ENOMEM; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), @@ -336,7 +327,11 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - __bchfs_readfolio(c, rbio, inode_inum(inode), folio); + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); From 204f45140faa0772d2ca1b3de96d1c0fb3db8e77 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Feb 2024 19:14:36 -0500 Subject: [PATCH 309/327] bcachefs: Fix BTREE_ITER_FILTER_SNAPSHOTS on inodes btree If we're in FILTER_SNAPSHOTS mode and we start scanning a range of the keyspace where no keys are visible in the current snapshot, we have a problem - we'll scan for a very long time before scanning terminates. Awhile back, this was fixed for most cases with peek_upto() (and assertions that enforce that it's being used). But the fix missed the fact that the inodes btree is different - every key offset is in a different snapshot tree, not just the inode field. Fixes: Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5467a8635be1..3ef338df82f5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2156,7 +2156,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * isn't monotonically increasing before FILTER_SNAPSHOTS, and * that's what we check against in extents mode: */ - if (k.k->p.inode > end.inode) + if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_gt(k.k->p, end) + : k.k->p.inode > end.inode)) goto end; if (iter->update_path && From b58b1b883b9b702e25204dbe2b221eecc8ecd159 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 15 Feb 2024 12:16:05 -0500 Subject: [PATCH 310/327] bcachefs: fix iov_iter count underflow on sub-block dio read bch2_direct_IO_read() checks the request offset and size for sector alignment and then falls through to a couple calculations to shrink the size of the request based on the inode size. The problem is that these checks round up to the fs block size, which runs the risk of underflowing iter->count if the block size happens to be large enough. This is triggered by fstest generic/361 with a 4k block size, which subsequently leads to a crash. To avoid this crash, check that the shorten length doesn't exceed the overall length of the iter. Fixes: Signed-off-by: Brian Foster Reviewed-by: Su Yue Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-direct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index e3b219e19e10..33cb6da3a5ad 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -88,6 +88,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) return ret; shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + if (shorten >= iter->count) + shorten = 0; iter->count -= shorten; bio = bio_alloc_bioset(NULL, From 097471f9e458dbbe41e25394c1fb1ccd751f0bee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 20:38:47 -0500 Subject: [PATCH 311/327] bcachefs: Fix bch2_journal_flush_device_pins() If a journal write errored, the list of devices it was written to could be empty - we're not supposed to mark an empty replicas list. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 2cf626315652..c33dca641575 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -892,9 +892,11 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) journal_seq_pin(j, seq)->devs); seq++; - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); + if (replicas.e.nr_devs) { + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, &replicas.e); + spin_lock(&j->lock); + } } spin_unlock(&j->lock); err: From c4333eb541b92d91be57f757dccf6d4368516746 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Feb 2024 01:18:45 -0500 Subject: [PATCH 312/327] bcachefs: Fix check_snapshot() memcpy check_snapshot() copies the bch_snapshot to a temporary to easily handle older versions that don't have all the fields of the current version, but it lacked a min() to correctly handle keys newer and larger than the current version. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 45f67e8b29eb..ac6ba04d5521 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -728,7 +728,7 @@ static int check_snapshot(struct btree_trans *trans, return 0; memset(&s, 0, sizeof(s)); - memcpy(&s, k.v, bkey_val_bytes(k.k)); + memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); id = le32_to_cpu(s.parent); if (id) { From 583340de1d8b2d6a474eccd5e7d9f7f42f061e1b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 Feb 2024 21:10:01 -0500 Subject: [PATCH 313/327] fs/super.c: don't drop ->s_user_ns until we free struct super_block itself Avoids fun races in RCU pathwalk... Same goes for freeing LSM shite hanging off super_block's arse. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/super.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/super.c b/fs/super.c index d35e85295489..d6efeba0d0ce 100644 --- a/fs/super.c +++ b/fs/super.c @@ -274,9 +274,10 @@ static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); - int i; - - for (i = 0; i < SB_FREEZE_LEVELS; i++) + security_sb_free(s); + put_user_ns(s->s_user_ns); + kfree(s->s_subtype); + for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } @@ -296,9 +297,6 @@ static void destroy_unused_super(struct super_block *s) super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - security_sb_free(s); - put_user_ns(s->s_user_ns); - kfree(s->s_subtype); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); @@ -409,9 +407,6 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); - security_sb_free(s); - put_user_ns(s->s_user_ns); - kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); } } From cdb67fdeed72248475b1c849699495ef290a1634 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Sep 2023 21:11:41 -0400 Subject: [PATCH 314/327] rcu pathwalk: prevent bogus hard errors from may_lookup() If lazy call of ->permission() returns a hard error, check that try_to_unlazy() succeeds before returning it. That both makes life easier for ->permission() instances and closes the race in ENOTDIR handling - it is possible that positive d_can_lookup() seen in link_path_walk() applies to the state *after* unlink() + mkdir(), while nd->inode matches the state prior to that. Normally seeing e.g. EACCES from permission check in rcu pathwalk means that with some timings non-rcu pathwalk would've run into the same; however, running into a non-executable regular file in the middle of a pathname would not get to permission check - it would fail with ENOTDIR instead. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namei.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 4e0de939fea1..9342fa6a38c2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1717,7 +1717,11 @@ static inline int may_lookup(struct mnt_idmap *idmap, { if (nd->flags & LOOKUP_RCU) { int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); - if (err != -ECHILD || !try_to_unlazy(nd)) + if (!err) // success, keep going + return 0; + if (!try_to_unlazy(nd)) + return -ECHILD; // redo it all non-lazy + if (err != -ECHILD) // hard error return err; } return inode_permission(idmap, nd->inode, MAY_EXEC); From 529f89a9e4531e80c44871d7d0c30df6540c20e5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Sep 2023 19:36:07 -0400 Subject: [PATCH 315/327] affs: free affs_sb_info with kfree_rcu() one of the flags in it is used by ->d_hash()/->d_compare() Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/affs/affs.h | 1 + fs/affs/super.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 60685ec76d98..2e612834329a 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -105,6 +105,7 @@ struct affs_sb_info { int work_queued; /* non-zero delayed work is queued */ struct delayed_work sb_work; /* superblock flush delayed work */ spinlock_t work_lock; /* protects sb_work and work_queued */ + struct rcu_head rcu; }; #define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */ diff --git a/fs/affs/super.c b/fs/affs/super.c index 58b391446ae1..b56a95cf414a 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -640,7 +640,7 @@ static void affs_kill_sb(struct super_block *sb) affs_brelse(sbi->s_root_bh); kfree(sbi->s_prefix); mutex_destroy(&sbi->s_bmlock); - kfree(sbi); + kfree_rcu(sbi, rcu); } } From a13d1a4de3b0fe3c41d818697d691c886c5585fa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Sep 2023 15:53:32 -0400 Subject: [PATCH 316/327] exfat: move freeing sbi, upcase table and dropping nls into rcu-delayed helper That stuff can be accessed by ->d_hash()/->d_compare(); as it is, we have a hard-to-hit UAF if rcu pathwalk manages to get into ->d_hash() on a filesystem that is in process of getting shut down. Besides, having nls and upcase table cleanup moved from ->put_super() towards the place where sbi is freed makes for simpler failure exits. Acked-by: Christian Brauner Signed-off-by: Al Viro --- fs/exfat/exfat_fs.h | 1 + fs/exfat/nls.c | 14 ++++---------- fs/exfat/super.c | 20 +++++++++++--------- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 9474cd50da6d..361595433480 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -275,6 +275,7 @@ struct exfat_sb_info { spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; + struct rcu_head rcu; }; #define EXFAT_CACHE_VALID 0 diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 705710f93e2d..afdf13c34ff5 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -655,7 +655,6 @@ static int exfat_load_upcase_table(struct super_block *sb, unsigned int sect_size = sb->s_blocksize; unsigned int i, index = 0; u32 chksum = 0; - int ret; unsigned char skip = false; unsigned short *upcase_table; @@ -673,8 +672,7 @@ static int exfat_load_upcase_table(struct super_block *sb, if (!bh) { exfat_err(sb, "failed to read sector(0x%llx)", (unsigned long long)sector); - ret = -EIO; - goto free_table; + return -EIO; } sector++; for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) { @@ -701,15 +699,12 @@ static int exfat_load_upcase_table(struct super_block *sb, exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)", index, chksum, utbl_checksum); - ret = -EINVAL; -free_table: - exfat_free_upcase_table(sbi); - return ret; + return -EINVAL; } static int exfat_load_default_upcase_table(struct super_block *sb) { - int i, ret = -EIO; + int i; struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned char skip = false; unsigned short uni = 0, *upcase_table; @@ -740,8 +735,7 @@ static int exfat_load_default_upcase_table(struct super_block *sb) return 0; /* FATAL error: default upcase table has error */ - exfat_free_upcase_table(sbi); - return ret; + return -EIO; } int exfat_create_upcase_table(struct super_block *sb) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index d9d4fa91010b..fcb658267765 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -39,9 +39,6 @@ static void exfat_put_super(struct super_block *sb) exfat_free_bitmap(sbi); brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); - - unload_nls(sbi->nls_io); - exfat_free_upcase_table(sbi); } static int exfat_sync_fs(struct super_block *sb, int wait) @@ -600,7 +597,7 @@ static int __exfat_fill_super(struct super_block *sb) ret = exfat_load_bitmap(sb); if (ret) { exfat_err(sb, "failed to load alloc-bitmap"); - goto free_upcase_table; + goto free_bh; } ret = exfat_count_used_clusters(sb, &sbi->used_clusters); @@ -613,8 +610,6 @@ static int __exfat_fill_super(struct super_block *sb) free_alloc_bitmap: exfat_free_bitmap(sbi); -free_upcase_table: - exfat_free_upcase_table(sbi); free_bh: brelse(sbi->boot_bh); return ret; @@ -701,12 +696,10 @@ put_inode: sb->s_root = NULL; free_table: - exfat_free_upcase_table(sbi); exfat_free_bitmap(sbi); brelse(sbi->boot_bh); check_nls_io: - unload_nls(sbi->nls_io); return err; } @@ -771,13 +764,22 @@ static int exfat_init_fs_context(struct fs_context *fc) return 0; } +static void delayed_free(struct rcu_head *p) +{ + struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu); + + unload_nls(sbi->nls_io); + exfat_free_upcase_table(sbi); + exfat_free_sbi(sbi); +} + static void exfat_kill_sb(struct super_block *sb) { struct exfat_sb_info *sbi = sb->s_fs_info; kill_block_super(sb); if (sbi) - exfat_free_sbi(sbi); + call_rcu(&sbi->rcu, delayed_free); } static struct file_system_type exfat_fs_type = { From af072cf683acd2307e02378cfcf2502c49d2e127 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Sep 2023 20:18:59 -0400 Subject: [PATCH 317/327] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info ->d_hash() and ->d_compare() use those, so we need to delay freeing them. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/hfsplus/hfsplus_fs.h | 1 + fs/hfsplus/super.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 7ededcb720c1..012a3d003fbe 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -190,6 +190,7 @@ struct hfsplus_sb_info { int work_queued; /* non-zero delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ + struct rcu_head rcu; }; #define HFSPLUS_SB_WRITEBACKUP 0 diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 1986b4f18a90..97920202790f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -277,6 +277,14 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb) spin_unlock(&sbi->work_lock); } +static void delayed_free(struct rcu_head *p) +{ + struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu); + + unload_nls(sbi->nls); + kfree(sbi); +} + static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); @@ -302,9 +310,7 @@ static void hfsplus_put_super(struct super_block *sb) hfs_btree_close(sbi->ext_tree); kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); - unload_nls(sbi->nls); - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; + call_rcu(&sbi->rcu, delayed_free); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) From 275655d3207b9e65d1561bf21c06a622d9ec1d43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Sep 2023 20:24:34 -0400 Subject: [PATCH 318/327] afs: fix __afs_break_callback() / afs_drop_open_mmap() race In __afs_break_callback() we might check ->cb_nr_mmap and if it's non-zero do queue_work(&vnode->cb_work). In afs_drop_open_mmap() we decrement ->cb_nr_mmap and do flush_work(&vnode->cb_work) if it reaches zero. The trouble is, there's nothing to prevent __afs_break_callback() from seeing ->cb_nr_mmap before the decrement and do queue_work() after both the decrement and flush_work(). If that happens, we might be in trouble - vnode might get freed before the queued work runs. __afs_break_callback() is always done under ->cb_lock, so let's make sure that ->cb_nr_mmap can change from non-zero to zero while holding ->cb_lock (the spinlock component of it - it's a seqlock and we don't need to mess with the counter). Acked-by: Christian Brauner Signed-off-by: Al Viro --- fs/afs/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index 3d33b221d9ca..ef2cc8f565d2 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -417,13 +417,17 @@ static void afs_add_open_mmap(struct afs_vnode *vnode) static void afs_drop_open_mmap(struct afs_vnode *vnode) { - if (!atomic_dec_and_test(&vnode->cb_nr_mmap)) + if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1)) return; down_write(&vnode->volume->open_mmaps_lock); - if (atomic_read(&vnode->cb_nr_mmap) == 0) + read_seqlock_excl(&vnode->cb_lock); + // the only place where ->cb_nr_mmap may hit 0 + // see __afs_break_callback() for the other side... + if (atomic_dec_and_test(&vnode->cb_nr_mmap)) list_del_init(&vnode->cb_mmap_link); + read_sequnlock_excl(&vnode->cb_lock); up_write(&vnode->volume->open_mmaps_lock); flush_work(&vnode->cb_work); From 10a973fc4fb22390a8d362dd3265ec2c9a81d84c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Sep 2023 21:50:25 -0400 Subject: [PATCH 319/327] nfs: make nfs_set_verifier() safe for use in RCU pathwalk nfs_set_verifier() relies upon dentry being pinned; if that's the case, grabbing ->d_lock stabilizes ->d_parent and guarantees that ->d_parent points to a positive dentry. For something we'd run into in RCU mode that is *not* true - dentry might've been through dentry_kill() just as we grabbed ->d_lock, with its parent going through the same just as we get to into nfs_set_verifier_locked(). It might get to detaching inode (and zeroing ->d_inode) before nfs_set_verifier_locked() gets to fetching that; we get an oops as the result. That can happen in nfs{,4} ->d_revalidate(); the call chain in question is nfs_set_verifier_locked() <- nfs_set_verifier() <- nfs_lookup_revalidate_delegated() <- nfs{,4}_do_lookup_revalidate(). We have checked that the parent had been positive, but that's done before we get to nfs_set_verifier() and it's possible for memory pressure to pick our dentry as eviction candidate by that time. If that happens, back-to-back attempts to kill dentry and its parent are quite normal. Sure, in case of eviction we'll fail the ->d_seq check in the caller, but we need to survive until we return there... Acked-by: Christian Brauner Signed-off-by: Al Viro --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c8ecbe999059..ac505671efbd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1431,9 +1431,9 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry) static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) { struct inode *inode = d_inode(dentry); - struct inode *dir = d_inode(dentry->d_parent); + struct inode *dir = d_inode_rcu(dentry->d_parent); - if (!nfs_verify_change_attribute(dir, verf)) + if (!dir || !nfs_verify_change_attribute(dir, verf)) return; if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) nfs_set_verifier_delegated(&verf); From c1b967d03c5d570ed7b90a88031fa2af34bf5b20 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Sep 2023 22:11:26 -0400 Subject: [PATCH 320/327] nfs: fix UAF on pathwalk running into umount NFS ->d_revalidate(), ->permission() and ->get_link() need to access some parts of nfs_server when called in RCU mode: server->flags server->caps *(server->io_stats) and, worst of all, call server->nfs_client->rpc_ops->have_delegation (the last one - as NFS_PROTO(inode)->have_delegation()). We really don't want to RCU-delay the entire nfs_free_server() (it would have to be done with schedule_work() from RCU callback, since it can't be made to run from interrupt context), but actual freeing of nfs_server and ->io_stats can be done via call_rcu() just fine. nfs_client part is handled simply by making nfs_free_client() use kfree_rcu(). Acked-by: Christian Brauner Signed-off-by: Al Viro --- fs/nfs/client.c | 13 ++++++++++--- include/linux/nfs_fs_sb.h | 2 ++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 44eca51b2808..fbdc9ca80f71 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -246,7 +246,7 @@ void nfs_free_client(struct nfs_client *clp) put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); - kfree(clp); + kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -1006,6 +1006,14 @@ struct nfs_server *nfs_alloc_server(void) } EXPORT_SYMBOL_GPL(nfs_alloc_server); +static void delayed_free(struct rcu_head *p) +{ + struct nfs_server *server = container_of(p, struct nfs_server, rcu); + + nfs_free_iostats(server->io_stats); + kfree(server); +} + /* * Free up a server record */ @@ -1031,10 +1039,9 @@ void nfs_free_server(struct nfs_server *server) ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); - nfs_free_iostats(server->io_stats); put_cred(server->cred); - kfree(server); nfs_release_automount_timer(); + call_rcu(&server->rcu, delayed_free); } EXPORT_SYMBOL_GPL(nfs_free_server); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index cd797e00fe35..92de074e63b9 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -124,6 +124,7 @@ struct nfs_client { char cl_ipaddr[48]; struct net *cl_net; struct list_head pending_cb_stateids; + struct rcu_head rcu; }; /* @@ -265,6 +266,7 @@ struct nfs_server { const struct cred *cred; bool has_sec_mnt_opts; struct kobject kobj; + struct rcu_head rcu; }; /* Server capabilities */ From 47458802f6606f652cd0f6dc38cd52ce60ec0145 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Sep 2023 23:52:58 -0400 Subject: [PATCH 321/327] procfs: move dropping pde and pid from ->evict_inode() to ->free_inode() that keeps both around until struct inode is freed, making access to them safe from rcu-pathwalk Acked-by: Christian Brauner Signed-off-by: Al Viro --- fs/proc/base.c | 2 -- fs/proc/inode.c | 19 ++++++++----------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 98a031ac2648..18550c071d71 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1878,8 +1878,6 @@ void proc_pid_evict_inode(struct proc_inode *ei) hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } - - put_pid(pid); } struct inode *proc_pid_make_inode(struct super_block *sb, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b33e490e3fd9..05350f3c2812 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -30,7 +30,6 @@ static void proc_evict_inode(struct inode *inode) { - struct proc_dir_entry *de; struct ctl_table_header *head; struct proc_inode *ei = PROC_I(inode); @@ -38,17 +37,8 @@ static void proc_evict_inode(struct inode *inode) clear_inode(inode); /* Stop tracking associated processes */ - if (ei->pid) { + if (ei->pid) proc_pid_evict_inode(ei); - ei->pid = NULL; - } - - /* Let go of any associated proc directory entry */ - de = ei->pde; - if (de) { - pde_put(de); - ei->pde = NULL; - } head = ei->sysctl; if (head) { @@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb) static void proc_free_inode(struct inode *inode) { + struct proc_inode *ei = PROC_I(inode); + + if (ei->pid) + put_pid(ei->pid); + /* Let go of any associated proc directory entry */ + if (ei->pde) + pde_put(ei->pde); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } From e31f0a57ae1ab2f6e17adb8e602bc120ad722232 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Sep 2023 00:12:00 -0400 Subject: [PATCH 322/327] procfs: make freeing proc_fs_info rcu-delayed makes proc_pid_ns() safe from rcu pathwalk (put_pid_ns() is still synchronous, but that's not a problem - it does rcu-delay everything that needs to be) Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/proc/root.c | 2 +- include/linux/proc_fs.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/root.c b/fs/proc/root.c index b55dbc70287b..06a297a27ba3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -271,7 +271,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); put_pid_ns(fs_info->pid_ns); - kfree(fs_info); + kfree_rcu(fs_info, rcu); } static struct file_system_type proc_fs_type = { diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index de407e7c3b55..0b2a89854440 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -65,6 +65,7 @@ struct proc_fs_info { kgid_t pid_gid; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; + struct rcu_head rcu; }; static inline struct proc_fs_info *proc_sb_info(struct super_block *sb) From 053fc4f755ad43cf35210677bcba798ccdc48d0c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 28 Sep 2023 00:19:39 -0400 Subject: [PATCH 323/327] fuse: fix UAF in rcu pathwalks ->permission(), ->get_link() and ->inode_get_acl() might dereference ->s_fs_info (and, in case of ->permission(), ->s_fs_info->fc->user_ns as well) when called from rcu pathwalk. Freeing ->s_fs_info->fc is rcu-delayed; we need to make freeing ->s_fs_info and dropping ->user_ns rcu-delayed too. Signed-off-by: Al Viro --- fs/fuse/cuse.c | 3 +-- fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 15 +++++++++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 91e89e68177e..b6cad106c37e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -474,8 +474,7 @@ err: static void cuse_fc_release(struct fuse_conn *fc) { - struct cuse_conn *cc = fc_to_cc(fc); - kfree_rcu(cc, fc.rcu); + kfree(fc_to_cc(fc)); } /** diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1df83eebda92..bcbe34488862 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -888,6 +888,7 @@ struct fuse_mount { /* Entry on fc->mounts */ struct list_head fc_entry; + struct rcu_head rcu; }; static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2a6d44f91729..516ea2979a90 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -930,6 +930,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, } EXPORT_SYMBOL_GPL(fuse_conn_init); +static void delayed_release(struct rcu_head *p) +{ + struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + + put_user_ns(fc->user_ns); + fc->release(fc); +} + void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { @@ -941,13 +949,12 @@ void fuse_conn_put(struct fuse_conn *fc) if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); - put_user_ns(fc->user_ns); bucket = rcu_dereference_protected(fc->curr_bucket, 1); if (bucket) { WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } - fc->release(fc); + call_rcu(&fc->rcu, delayed_release); } } EXPORT_SYMBOL_GPL(fuse_conn_put); @@ -1366,7 +1373,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init); void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); - kfree_rcu(fc, rcu); + kfree(fc); } EXPORT_SYMBOL_GPL(fuse_free_conn); @@ -1902,7 +1909,7 @@ static void fuse_sb_destroy(struct super_block *sb) void fuse_mount_destroy(struct fuse_mount *fm) { fuse_conn_put(fm->fc); - kfree(fm); + kfree_rcu(fm, rcu); } EXPORT_SYMBOL(fuse_mount_destroy); From 0511fdb4a378183ca18a9678d3d9044c8ec592c2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Sep 2023 22:28:16 -0400 Subject: [PATCH 324/327] cifs_get_link(): bail out in unsafe case ->d_revalidate() bails out there, anyway. It's not enough to prevent getting into ->get_link() in RCU mode, but that could happen only in a very contrieved setup. Not worth trying to do anything fancy here unless ->d_revalidate() stops kicking out of RCU mode at least in some cases. Reviewed-by: Christian Brauner Acked-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/smb/client/cifsfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index e902de4e475a..630e74628dfe 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1172,6 +1172,9 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode, { char *target_path; + if (!dentry) + return ERR_PTR(-ECHILD); + target_path = kmalloc(PATH_MAX, GFP_KERNEL); if (!target_path) return ERR_PTR(-ENOMEM); From 9fa8e282c2bfe93338e81a620a49f5903a745231 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Feb 2024 01:17:34 -0500 Subject: [PATCH 325/327] ext4_get_link(): fix breakage in RCU mode 1) errors from ext4_getblk() should not be propagated to caller unless we are really sure that we would've gotten the same error in non-RCU pathwalk. 2) we leak buffer_heads if ext4_getblk() is successful, but bh is not uptodate. Signed-off-by: Al Viro --- fs/ext4/symlink.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 75bf1f88843c..645240cc0229 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); - if (IS_ERR(bh)) - return ERR_CAST(bh); - if (!bh || !ext4_buffer_uptodate(bh)) + if (IS_ERR(bh) || !bh) return ERR_PTR(-ECHILD); + if (!ext4_buffer_uptodate(bh)) { + brelse(bh); + return ERR_PTR(-ECHILD); + } } else { bh = ext4_bread(NULL, inode, 0, 0); if (IS_ERR(bh)) From 5197728f8182a93a07e5bf860726456322d3a908 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Feb 2024 15:45:34 -0500 Subject: [PATCH 326/327] bcachefs: fix bch2_save_backtrace() Missed a call in the previous fix. Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 231003b405ef..3a32faa86b5c 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -289,7 +289,7 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne do { nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); } while (nr_entries == stack->size && - !(ret = darray_make_room(stack, stack->size * 2))); + !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp))); stack->nr = nr_entries; up_read(&task->signal->exec_update_lock); From d206a76d7d2726f3b096037f2079ce0bd3ba329b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Feb 2024 15:46:06 -0800 Subject: [PATCH 327/327] Linux 6.8-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 41fa8a2565f5..6cdb5717bfe0 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 8 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION*