From 78b1beb0998437107ed144b341fbe1252188916b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 24 Sep 2017 21:46:29 +0300 Subject: [PATCH 01/34] IB/core: Fix typo in the name of the tag-matching cap struct The tag matching functionality is implemented by mlx5 driver by extending XRQ, however this internal kernel information was exposed to user space applications with *xrq* name instead of *tm*. This patch renames *xrq* to *tm* to handle that. Fixes: 8d50505ada72 ("IB/uverbs: Expose XRQ capabilities") Signed-off-by: Leon Romanovsky Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 14 +++++++------- drivers/infiniband/hw/mlx5/main.c | 10 +++++----- include/rdma/ib_verbs.h | 4 ++-- include/uapi/rdma/ib_user_verbs.h | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4ab30d832ac5b..52a2cf2d83aaf 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3869,15 +3869,15 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.raw_packet_caps = attr.raw_packet_caps; resp.response_length += sizeof(resp.raw_packet_caps); - if (ucore->outlen < resp.response_length + sizeof(resp.xrq_caps)) + if (ucore->outlen < resp.response_length + sizeof(resp.tm_caps)) goto end; - resp.xrq_caps.max_rndv_hdr_size = attr.xrq_caps.max_rndv_hdr_size; - resp.xrq_caps.max_num_tags = attr.xrq_caps.max_num_tags; - resp.xrq_caps.max_ops = attr.xrq_caps.max_ops; - resp.xrq_caps.max_sge = attr.xrq_caps.max_sge; - resp.xrq_caps.flags = attr.xrq_caps.flags; - resp.response_length += sizeof(resp.xrq_caps); + resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size; + resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags; + resp.tm_caps.max_ops = attr.tm_caps.max_ops; + resp.tm_caps.max_sge = attr.tm_caps.max_sge; + resp.tm_caps.flags = attr.tm_caps.flags; + resp.response_length += sizeof(resp.tm_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 05fb4bdff6a0a..d6fbad8f34aa7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -778,13 +778,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(mdev, tag_matching)) { - props->xrq_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; - props->xrq_caps.max_num_tags = + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + props->tm_caps.max_num_tags = (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; - props->xrq_caps.flags = IB_TM_CAP_RC; - props->xrq_caps.max_ops = + props->tm_caps.flags = IB_TM_CAP_RC; + props->tm_caps.max_ops = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); - props->xrq_caps.max_sge = MLX5_TM_MAX_SGE; + props->tm_caps.max_sge = MLX5_TM_MAX_SGE; } if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bdb1279a415b3..bbb5f54db8820 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -285,7 +285,7 @@ enum ib_tm_cap_flags { IB_TM_CAP_RC = 1 << 0, }; -struct ib_xrq_caps { +struct ib_tm_caps { /* Max size of RNDV header */ u32 max_rndv_hdr_size; /* Max number of entries in tag matching list */ @@ -358,7 +358,7 @@ struct ib_device_attr { struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ - struct ib_xrq_caps xrq_caps; + struct ib_tm_caps tm_caps; }; enum ib_mtu { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 9a0b6479fe0c6..d4e0b53bfc75c 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -261,7 +261,7 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_rss_caps rss_caps; __u32 max_wq_type_rq; __u32 raw_packet_caps; - struct ib_uverbs_tm_caps xrq_caps; + struct ib_uverbs_tm_caps tm_caps; }; struct ib_uverbs_query_port { From 73827a605bbd7cebef4cfd1261e497246a82a0e7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 24 Sep 2017 21:46:30 +0300 Subject: [PATCH 02/34] IB/core: Fix qp_sec use after free access When security_ib_alloc_security fails, qp->qp_sec memory is freed. However ib_destroy_qp still tries to access this memory which result in kernel crash. So its initialized to NULL to avoid such access. Fixes: d291f1a65232 ("IB/core: Enforce PKey security on QPs") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/security.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 70ad19c4c73e7..88bdafb297f5f 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -432,8 +432,10 @@ int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) atomic_set(&qp->qp_sec->error_list_count, 0); init_completion(&qp->qp_sec->error_complete); ret = security_ib_alloc_security(&qp->qp_sec->security); - if (ret) + if (ret) { kfree(qp->qp_sec); + qp->qp_sec = NULL; + } return ret; } From edd31551148c09608feee6b8756ad148d550ee3b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 24 Sep 2017 21:46:31 +0300 Subject: [PATCH 03/34] IB: Correct MR length field to be 64-bit The ib_mr->length represents the length of the MR in bytes as per the IBTA spec 1.3 section 11.2.10.3 (REGISTER PHYSICAL MEMORY REGION). Currently ib_mr->length field is defined as only 32-bits field. This might result into truncation and failed WRs of consumers who registers more than 4GB bytes memory regions and whose WRs accessing such MRs. This patch makes the length 64-bit to avoid such truncation. Cc: Sagi Grimberg Cc: Chuck Lever Cc: Faisal Latif Fixes: 4c67e2bfc8b7 ("IB/core: Introduce new fast registration API") Signed-off-by: Ilya Lesokhin Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_verbs.c | 4 ++-- drivers/infiniband/ulp/iser/iser_memory.c | 2 +- include/rdma/ib_verbs.h | 2 +- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index f0dc5f4aa177e..442b9bdc0f03b 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3232,7 +3232,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->ibmr.iova); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, - mr->ibmr.length); + lower_32_bits(mr->ibmr.length)); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0); set_wqe_32bit_value(wqe->wqe_words, @@ -3274,7 +3274,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->npages * 8); nes_debug(NES_DBG_IW_TX, "SQ_REG_MR: iova_start: %llx, " - "length: %d, rkey: %0x, pgl_paddr: %llx, " + "length: %lld, rkey: %0x, pgl_paddr: %llx, " "page_list_len: %u, wqe_misc: %x\n", (unsigned long long) mr->ibmr.iova, mr->ibmr.length, diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 9c3e9ab53a415..322209d5ff582 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -154,7 +154,7 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec) { int i; - iser_err("page vec npages %d data length %d\n", + iser_err("page vec npages %d data length %lld\n", page_vec->npages, page_vec->fake_mr.length); for (i = 0; i < page_vec->npages; i++) iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bbb5f54db8820..e8608b2dc844f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1739,7 +1739,7 @@ struct ib_mr { u32 lkey; u32 rkey; u64 iova; - u32 length; + u64 length; unsigned int page_size; bool need_inval; union { diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 5a936a6a31a32..df062e086bdbb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -401,7 +401,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mw->mw_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", + dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); From 9c6f42e9254150d2772242d9f8bd8d0b7b7431ff Mon Sep 17 00:00:00 2001 From: Shalom Lagziel Date: Sun, 24 Sep 2017 21:46:32 +0300 Subject: [PATCH 04/34] IB/ipoib: Fix sysfs Pkey create<->remove possible deadlock A possible ABBA lock can happen with RTNL and vlan_rwsem. For example: Flow A: Device Flush __ipoib_ib_dev_flush down_read(vlan_rwsem) // Lock A ipoib_flush_ah flush_workqueue(priv->wq) // Wait for completion A work on shared WQ (Mcast carrier) ipoib_mcast_carrier_on_task while (!rtnl_trylock()) // Wait for lock B Flow B: Sysfs PKEY delete ipoib_vlan_delete lock(RTNL) // Lock B down_write(vlan_rwsem) // Wait for lock A This can happen with PKEY creates as well. The solution is to release the RTNL lock in sysfs functions in case it is not possible to lock VLAN RW semaphore and reset the SYS call. Fixes: 69956d83267e ("IB/ipoib: Sync between remove_one to sysfs calls that use rtnl_lock") Signed-off-by: Shalom Lagziel Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 9927cd6b7082b..e01c58edca15c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -141,14 +141,17 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) return restart_syscall(); } - priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); - if (!priv) { + if (!down_write_trylock(&ppriv->vlan_rwsem)) { rtnl_unlock(); mutex_unlock(&ppriv->sysfs_mutex); - return -ENOMEM; + return restart_syscall(); } - down_write(&ppriv->vlan_rwsem); + priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); + if (!priv) { + result = -ENOMEM; + goto out; + } /* * First ensure this isn't a duplicate. We check the parent device and @@ -175,7 +178,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) rtnl_unlock(); mutex_unlock(&ppriv->sysfs_mutex); - if (result) { + if (result && priv) { free_netdev(priv->dev); kfree(priv); } @@ -204,7 +207,12 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) return restart_syscall(); } - down_write(&ppriv->vlan_rwsem); + if (!down_write_trylock(&ppriv->vlan_rwsem)) { + rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); + return restart_syscall(); + } + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { From 7c9d9662103ae1c11acc7bfc47d988466cff23cf Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Sun, 24 Sep 2017 21:46:33 +0300 Subject: [PATCH 05/34] IB/ipoib: Fix inconsistency with free_netdev and free_rdma_netdev Call free_rdma_netdev instead of free_netdev each time we want to release a netdevice. This call is also relevant for future freeing of offloaded child interfaces. This patch also adds a missing call for free netdevice when releasing a parent interface that has child interfaces using ipoib_remove_one. Fixes: cd565b4b51e5 ('IB/IPoIB: Support acceleration options callbacks') Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 15 +++++++++++---- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 10 ++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index bac95b509a9b2..dcc77014018db 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2180,6 +2180,7 @@ static struct net_device *ipoib_add_port(const char *format, { struct ipoib_dev_priv *priv; struct ib_port_attr attr; + struct rdma_netdev *rn; int result = -ENOMEM; priv = ipoib_intf_alloc(hca, port, format); @@ -2279,7 +2280,8 @@ static struct net_device *ipoib_add_port(const char *format, ipoib_dev_cleanup(priv->dev); device_init_failed: - free_netdev(priv->dev); + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); alloc_mem_failed: @@ -2328,7 +2330,7 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { - struct rdma_netdev *rn = netdev_priv(priv->dev); + struct rdma_netdev *parent_rn = netdev_priv(priv->dev); ib_unregister_event_handler(&priv->event_handler); flush_workqueue(ipoib_workqueue); @@ -2350,10 +2352,15 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) unregister_netdev(priv->dev); mutex_unlock(&priv->sysfs_mutex); - rn->free_rdma_netdev(priv->dev); + parent_rn->free_rdma_netdev(priv->dev); + + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + struct rdma_netdev *child_rn; - list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) + child_rn = netdev_priv(cpriv->dev); + child_rn->free_rdma_netdev(cpriv->dev); kfree(cpriv); + } kfree(priv); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index e01c58edca15c..55a9b71ed05a7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -179,7 +179,10 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) mutex_unlock(&ppriv->sysfs_mutex); if (result && priv) { - free_netdev(priv->dev); + struct rdma_netdev *rn; + + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); } @@ -232,7 +235,10 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) mutex_unlock(&ppriv->sysfs_mutex); if (dev) { - free_netdev(dev); + struct rdma_netdev *rn; + + rn = netdev_priv(dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); return 0; } From d67bc5d4e3e100d762c0f57ea67f28bc219698a6 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Sun, 24 Sep 2017 21:46:34 +0300 Subject: [PATCH 06/34] IB/mlx5: Simplify mlx5_ib_cont_pages The patch simplifies mlx5_ib_cont_pages and fixes the following issues in the original implementation: First issues is related to alignment of the PFNs. After the check base + p != PFN, the alignment of the PFN wasn't checked. So the PFN sequence 0, 1, 1, 2 would result in a page_shift of 13 even though the 3rd PFN is not 8KB aligned. This wasn't actually a bug because it was supported by all the existing mlx5 compatible device, but we don't want to require this support in all future devices. Another issue is because the inner loop didn't advance PFN so the test "if (base + p != pfn)" always failed for SGE with len > (1< Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mem.c | 47 ++++++++++++-------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 914f212e7ef60..f3dbd75a0a968 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -50,13 +50,9 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, { unsigned long tmp; unsigned long m; - int i, k; - u64 base = 0; - int p = 0; - int skip; - int mask; - u64 len; - u64 pfn; + u64 base = ~0, p = 0; + u64 len, pfn; + int i = 0; struct scatterlist *sg; int entry; unsigned long page_shift = umem->page_shift; @@ -76,33 +72,24 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, m = find_first_bit(&tmp, BITS_PER_LONG); if (max_page_shift) m = min_t(unsigned long, max_page_shift - page_shift, m); - skip = 1 << m; - mask = skip - 1; - i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { len = sg_dma_len(sg) >> page_shift; pfn = sg_dma_address(sg) >> page_shift; - for (k = 0; k < len; k++) { - if (!(i & mask)) { - tmp = (unsigned long)pfn; - m = min_t(unsigned long, m, find_first_bit(&tmp, BITS_PER_LONG)); - skip = 1 << m; - mask = skip - 1; - base = pfn; - p = 0; - } else { - if (base + p != pfn) { - tmp = (unsigned long)p; - m = find_first_bit(&tmp, BITS_PER_LONG); - skip = 1 << m; - mask = skip - 1; - base = pfn; - p = 0; - } - } - p++; - i++; + if (base + p != pfn) { + /* If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; } + + p += len; + i += len; } if (i) { From fbcd49838d9094ca45772356e7b33afe4b7c93e7 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Sun, 24 Sep 2017 21:46:35 +0300 Subject: [PATCH 07/34] IB/mlx5: Fix NULL deference on mlx5_ib_update_xlt failure mlx5_ib_reg_user_mr called mlx5_ib_dereg_mr in case of MR population failure. This resulted in a NULL dereference as ibmr->device wasn't initialized yet. We address this by adding an internal dereg_mr function that can handle partially initialized MRs, and fixing clean_mr to work on partially initialized MRs. Fixes: ff740aefecb9 ("IB/mlx5: Decouple MR allocation and population flows") Signed-off-by: Ilya Lesokhin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 0e2789d9bb4d0..37bbc543847a5 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -47,7 +47,8 @@ enum { #define MLX5_UMR_ALIGN 2048 -static int clean_mr(struct mlx5_ib_mr *mr); +static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); @@ -1270,8 +1271,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift, update_xlt_flags); + if (err) { - mlx5_ib_dereg_mr(&mr->ibmr); + dereg_mr(dev, mr); return ERR_PTR(err); } } @@ -1356,7 +1358,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, &npages, &page_shift, &ncont, &order); if (err < 0) { - clean_mr(mr); + clean_mr(dev, mr); return err; } } @@ -1410,7 +1412,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, if (err) { mlx5_ib_warn(dev, "Failed to rereg UMR\n"); ib_umem_release(mr->umem); - clean_mr(mr); + clean_mr(dev, mr); return err; } } @@ -1469,9 +1471,8 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) } } -static int clean_mr(struct mlx5_ib_mr *mr) +static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); int allocated_from_cache = mr->allocated_from_cache; int err; @@ -1507,10 +1508,8 @@ static int clean_mr(struct mlx5_ib_mr *mr) return 0; } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(ibmr->device); - struct mlx5_ib_mr *mr = to_mmr(ibmr); int npages = mr->npages; struct ib_umem *umem = mr->umem; @@ -1539,7 +1538,7 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) } #endif - clean_mr(mr); + clean_mr(dev, mr); if (umem) { ib_umem_release(umem); @@ -1549,6 +1548,14 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) return 0; } +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + + return dereg_mr(dev, mr); +} + struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg) From e4d8ae00169f7686e1da5a62e5cf797d12bf8822 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Sep 2017 10:44:36 -0700 Subject: [PATCH 08/34] PM / OPP: Call notifier without holding opp_table->lock The notifier callbacks may want to call some OPP helper routines which may try to take the same opp_table->lock again and cause a deadlock. One such usecase was reported by Chanwoo Choi, where calling dev_pm_opp_disable() leads us to the devfreq's OPP notifier handler, which further calls dev_pm_opp_find_freq_floor() and it deadlocks. We don't really need the opp_table->lock to be held across the notifier call though, all we want to make sure is that the 'opp' doesn't get freed while being used from within the notifier chain. We can do it with help of dev_pm_opp_get/put() as well. Let's do it. Cc: 4.11+ # 4.11+ Fixes: 5b650b388844 "PM / OPP: Take kref from _find_opp_table()" Reported-by: Chanwoo Choi Tested-by: Chanwoo Choi Reviewed-by: Stephen Boyd Reviewed-by: Chanwoo Choi Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index a8cc14fd8ae49..a6de325306933 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -1581,6 +1581,9 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, opp->available = availability_req; + dev_pm_opp_get(opp); + mutex_unlock(&opp_table->lock); + /* Notify the change of the OPP availability */ if (availability_req) blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE, @@ -1589,8 +1592,12 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_DISABLE, opp); + dev_pm_opp_put(opp); + goto put_table; + unlock: mutex_unlock(&opp_table->lock); +put_table: dev_pm_opp_put_opp_table(opp_table); return r; } From d477bf3af1e88fe27c893f84136647fe11963198 Mon Sep 17 00:00:00 2001 From: Suniel Mahesh Date: Thu, 21 Sep 2017 19:09:03 +0530 Subject: [PATCH 09/34] cpufreq: dt: Fix sysfs duplicate filename creation for platform-device ti-cpufreq and cpufreq-dt-platdev drivers are registering platform-device with same name "cpufreq-dt" using platform_device_register_*() routines. This is leading to build warnings appended below. Providing hardware information to OPP framework along with the platform- device creation should be done by ti-cpufreq driver before cpufreq-dt driver comes into place. This patch add's TI am33xx, am43 and dra7 platforms (which use opp-v2 property) to the blacklist of devices in cpufreq-dt-platform driver to avoid creating platform-device twice and remove build warnings. [ 2.370167] ------------[ cut here ]------------ [ 2.375087] WARNING: CPU: 0 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x58/0x78 [ 2.383112] sysfs: cannot create duplicate filename '/devices/platform/cpufreq-dt' [ 2.391219] Modules linked in: [ 2.394506] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-next-20170912 #1 [ 2.402006] Hardware name: Generic AM33XX (Flattened Device Tree) [ 2.408437] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 2.416568] [] (show_stack) from [] (dump_stack+0xac/0xe0) [ 2.424165] [] (dump_stack) from [] (__warn+0xd8/0x104) [ 2.431488] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [ 2.439351] [] (warn_slowpath_fmt) from [] (sysfs_warn_dup+0x58/0x78) [ 2.447938] [] (sysfs_warn_dup) from [] (sysfs_create_dir_ns+0x80/0x98) [ 2.456719] [] (sysfs_create_dir_ns) from [] (kobject_add_internal+0x9c/0x2d4) [ 2.466124] [] (kobject_add_internal) from [] (kobject_add+0x4c/0x9c) [ 2.474712] [] (kobject_add) from [] (device_add+0xcc/0x57c) [ 2.482489] [] (device_add) from [] (platform_device_add+0x100/0x220) [ 2.491085] [] (platform_device_add) from [] (platform_device_register_full+0xf4/0x118) [ 2.501305] [] (platform_device_register_full) from [] (ti_cpufreq_init+0x150/0x22c) [ 2.511253] [] (ti_cpufreq_init) from [] (do_one_initcall+0x3c/0x170) [ 2.519838] [] (do_one_initcall) from [] (kernel_init_freeable+0x1fc/0x2c4) [ 2.528974] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x110) [ 2.537565] [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) [ 2.545981] ---[ end trace 2fc00e213c13ab20 ]--- [ 2.551051] ------------[ cut here ]------------ [ 2.555931] WARNING: CPU: 0 PID: 1 at lib/kobject.c:240 kobject_add_internal+0x254/0x2d4 [ 2.564578] kobject_add_internal failed for cpufreq-dt with -EEXIST, don't try to register things with the same name in the same directory. [ 2.577977] Modules linked in: [ 2.581261] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.13.0-next-20170912 #1 [ 2.590013] Hardware name: Generic AM33XX (Flattened Device Tree) [ 2.596437] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 2.604573] [] (show_stack) from [] (dump_stack+0xac/0xe0) [ 2.612172] [] (dump_stack) from [] (__warn+0xd8/0x104) [ 2.619494] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [ 2.627362] [] (warn_slowpath_fmt) from [] (kobject_add_internal+0x254/0x2d4) [ 2.636666] [] (kobject_add_internal) from [] (kobject_add+0x4c/0x9c) [ 2.645255] [] (kobject_add) from [] (device_add+0xcc/0x57c) [ 2.653027] [] (device_add) from [] (platform_device_add+0x100/0x220) [ 2.661615] [] (platform_device_add) from [] (platform_device_register_full+0xf4/0x118) [ 2.671833] [] (platform_device_register_full) from [] (ti_cpufreq_init+0x150/0x22c) [ 2.681779] [] (ti_cpufreq_init) from [] (do_one_initcall+0x3c/0x170) [ 2.690377] [] (do_one_initcall) from [] (kernel_init_freeable+0x1fc/0x2c4) [ 2.699510] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x110) [ 2.708106] [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) [ 2.716217] ---[ end trace 2fc00e213c13ab21 ]--- Fixes: edeec420de24 (cpufreq: dt-cpufreq: platdev Automatically create device with OPP v2) Signed-off-by: Suniel Mahesh Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt-platdev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 430edadca527a..a753c50e9e412 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -118,6 +118,10 @@ static const struct of_device_id blacklist[] __initconst = { { .compatible = "sigma,tango4", }, + { .compatible = "ti,am33xx", }, + { .compatible = "ti,am43", }, + { .compatible = "ti,dra7", }, + { } }; From cc6f77710a6de6210f9feda7cd53e2f5ee7a7e69 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:16 -0700 Subject: [PATCH 10/34] xfs: don't unconditionally clear the reflink flag on zero-block files If we have speculative cow preallocations hanging around in the cow fork, don't let a truncate operation clear the reflink flag because if we do then there's a chance we'll forget to free those extents when we destroy the incore inode. Reported-by: Amir Goldstein Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5599dda4727af..4ec5b7f454013 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1624,10 +1624,12 @@ xfs_itruncate_extents( goto out; /* - * Clear the reflink flag if we truncated everything. + * Clear the reflink flag if there are no data fork blocks and + * there are no extents staged in the cow fork. */ - if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { + if (ip->i_d.di_nblocks == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); } From 3af423b03435c81036fa710623d3ae92fbe346a3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:17 -0700 Subject: [PATCH 11/34] xfs: evict CoW fork extents when performing finsert/fcollapse When we perform an finsert/fcollapse operation, cancel all the CoW extents for the affected file offset range so that they don't end up pointing to the wrong blocks. Reported-by: Amir Goldstein Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index cd9a5400ba4fe..bc6c6e10a9699 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1459,7 +1459,19 @@ xfs_shift_file_space( return error; /* - * The extent shiting code works on extent granularity. So, if + * Clean out anything hanging around in the cow fork now that + * we've flushed all the dirty data out to disk to avoid having + * CoW extents at the wrong offsets. + */ + if (xfs_is_reflink_inode(ip)) { + error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF, + true); + if (error) + return error; + } + + /* + * The extent shifting code works on extent granularity. So, if * stop_fsb is not the starting block of extent, we need to split * the extent at stop_fsb. */ From e150dcd459e1b441eaf08f341a986f04e61bf3b8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 18 Sep 2017 11:34:16 -0700 Subject: [PATCH 12/34] fs/xfs: Use %pS printk format for direct addresses Use the %pS instead of the %pF printk format specifier for printing symbols from direct addresses. This is needed for the ia64, ppc64 and parisc64 architectures. Signed-off-by: Helge Deller Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index bd786a9ac2c38..eaf86f55b7f21 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -347,7 +347,7 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", __return_address, bp->b_ops->name, bp->b_bn); From 64671bafbdd984535aa382bccadd91fbe7be0e80 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 18 Sep 2017 11:38:58 -0700 Subject: [PATCH 13/34] xfs: kill meaningless variable 'zero' In xfs_file_aio_write_checks(), variable 'zero' is there only to satisfy xfs_zero_eof(), the result of it is ignored. Now, with iomap_zero_range() based xfs_zero_eof(), we can safely pass NULL as the last param of it and kill 'zero'. Signed-off-by: Eryu Guan Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebdd0bd2b2616..261d83f1db767 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -377,8 +377,6 @@ xfs_file_aio_write_checks( */ spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { - bool zero = false; - spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { @@ -399,7 +397,7 @@ xfs_file_aio_write_checks( drained_dio = true; goto restart; } - error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); + error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL); if (error) return error; } else From d20a5e3851969fa685f118a80e4df670255a4e8d Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 18 Sep 2017 11:39:23 -0700 Subject: [PATCH 14/34] xfs: report zeroed or not correctly in xfs_zero_range() The 'did_zero' param of xfs_zero_range() was not passed to iomap_zero_range() correctly. This was introduced by commit 7bb41db3ea16 ("xfs: handle 64-bit length in xfs_iozero"), and found by code inspection. Signed-off-by: Eryu Guan Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 261d83f1db767..350b6d43ba23b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -58,7 +58,7 @@ xfs_zero_range( xfs_off_t count, bool *did_zero) { - return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); + return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops); } int From 1e6fa688bffc0ff419a4c3e78dbaf7aabfb55183 Mon Sep 17 00:00:00 2001 From: Kenjiro Nakayama Date: Mon, 18 Sep 2017 12:03:56 -0700 Subject: [PATCH 15/34] xfs: Output warning message when discard option was enabled even though the device does not support discard In order to using discard function, it is necessary that not only xfs is mounted with discard option, but also the discard function is supported by the device. Current code doesn't output any message when users mount with discard option on unsupported device, so it is difficult to notice that it was not enabled actually. This patch adds the warning message to notice that discard option is not enabled due to unsupported device when the filesystem is mounted. Changes in v2 (Suggested by Brian Foster): - Move the unsupported device check into xfs_fs_fill_super(). - Clear the discard flag when device is unsupported. Signed-off-by: Kenjiro Nakayama Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c996f4ae4a5f2..584cf2d573bab 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1654,6 +1654,16 @@ xfs_fs_fill_super( "DAX and reflink have not been tested together!"); } + if (mp->m_flags & XFS_MOUNT_DISCARD) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + + if (!blk_queue_discard(q)) { + xfs_warn(mp, "mounting with \"discard\" option, but " + "the device does not support discard"); + mp->m_flags &= ~XFS_MOUNT_DISCARD; + } + } + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, From 60915f83cd1e021a66fc1503a446aef5c772553a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 13:38:46 -0700 Subject: [PATCH 16/34] xfs: remove redundant re-initialization of total_nr_pages Variable total_nr_pages is being initialized and then updated with the same value, this latter assignment is redundant and can be removed. Cleans up clang build warning: Value stored to 'total_nr_pages' during its initialization is never read Signed-off-by: Colin Ian King Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index da14658da3103..2f97c12ca75e4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1258,8 +1258,6 @@ xfs_buf_ioapply_map( int size; int offset; - total_nr_pages = bp->b_page_count; - /* skip the pages in the buffer before the start offset */ page_index = 0; offset = *buf_offset; From 6851a3db7e224bbb85e23b3c64a506c9e0904382 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 18 Sep 2017 14:46:03 -0700 Subject: [PATCH 17/34] xfs: validate bdev support for DAX inode flag Currently only the blocksize is checked, but we should really be calling bdev_dax_supported() which also tests to make sure we can get a struct dax_device and that the dax_direct_access() path is working. This is the same check that we do for the "-o dax" mount option in xfs_fs_fill_super(). This does not fix the race issues that caused the XFS DAX inode option to be disabled, so that option will still be disabled. If/when we re-enable it, though, I think we will want this issue to have been fixed. I also do think that we want to fix this in stable kernels. Signed-off-by: Ross Zwisler CC: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5049e8ab6e302..aa75389be8cfa 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1088,6 +1088,7 @@ xfs_ioctl_setattr_dax_invalidate( int *join_flags) { struct inode *inode = VFS_I(ip); + struct super_block *sb = inode->i_sb; int error; *join_flags = 0; @@ -1100,7 +1101,7 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) + if (bdev_dax_supported(sb, sb->s_blocksize) < 0) return -EINVAL; } From 546e7be8244dc050effef0555df5b8d94d10dafc Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Fri, 22 Sep 2017 11:47:33 -0700 Subject: [PATCH 18/34] iomap_dio_rw: Allocate AIO completion queue before submitting dio Executing xfs/104 test in a loop on Linux-v4.13 kernel on a ppc64 machine can cause the following NULL pointer dereference, .queue_work_on+0x4c/0x80 .iomap_dio_bio_end_io+0xbc/0x1f0 .bio_endio+0x118/0x1f0 .blk_update_request+0xd0/0x470 .blk_mq_end_request+0x24/0xc0 .lo_complete_rq+0x40/0xe0 .__blk_mq_complete_request_remote+0x28/0x40 .flush_smp_call_function_queue+0xc4/0x1e0 .smp_ipi_demux_relaxed+0x8c/0x100 .icp_hv_ipi_action+0x54/0xa0 .__handle_irq_event_percpu+0x84/0x2c0 .handle_irq_event_percpu+0x28/0x80 .handle_percpu_irq+0x78/0xc0 .generic_handle_irq+0x40/0x70 .__do_irq+0x88/0x200 .call_do_irq+0x14/0x24 .do_IRQ+0x84/0x130 This occurs due to the following sequence of events, 1. Allocate dio for Direct I/O write. 2. Invoke iomap_apply() until iov_iter_count() bytes have been submitted. - Assume that we have submitted atleast one bio. Hence iomap_dio->ref value will be >= 2. - If during the second iteration, iomap_apply() ends up returning -ENOSPC, we would break out of the loop and since the 'ret' value is a negative number we end up not allocating memory for super_block->s_dio_done_wq. 3. Meanwhile, iomap_dio_bio_end_io() is invoked for bios that have been submitted and here the code ends up dereferencing the NULL pointer stored at super_block->s_dio_done_wq. This commit fixes the bug by allocating memory for super_block->s_dio_done_wq before iomap_apply() is invoked. Reported-by: Eryu Guan Reviewed-by: Christoph Hellwig Tested-by: Eryu Guan Signed-off-by: Chandan Rajendra Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/iomap.c b/fs/iomap.c index 269b24a01f321..d4f526a3f5b2f 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -993,6 +993,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, WARN_ON_ONCE(ret); ret = 0; + if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + goto out_free_dio; + } + inode_dio_begin(inode); blk_start_plug(&plug); @@ -1015,13 +1022,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret < 0) iomap_dio_set_error(dio, ret); - if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && - !inode->i_sb->s_dio_done_wq) { - ret = sb_init_dio_done_wq(inode->i_sb); - if (ret < 0) - iomap_dio_set_error(dio, ret); - } - if (!atomic_dec_and_test(&dio->ref)) { if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; From ee70daaba82d70766d0723b743d9fdeb3b06102a Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 21 Sep 2017 11:26:18 -0700 Subject: [PATCH 19/34] xfs: update i_size after unwritten conversion in dio completion Since commit d531d91d6990 ("xfs: always use unwritten extents for direct I/O writes"), we start allocating unwritten extents for all direct writes to allow appending aio in XFS. But for dio writes that could extend file size we update the in-core inode size first, then convert the unwritten extents to real allocations at dio completion time in xfs_dio_write_end_io(). Thus a racing direct read could see the new i_size and find the unwritten extents first and read zeros instead of actual data, if the direct writer also takes a shared iolock. Fix it by updating the in-core inode size after the unwritten extent conversion. To do this, introduce a new boolean argument to xfs_iomap_write_unwritten() to tell if we want to update in-core i_size or not. Suggested-by: Brian Foster Reviewed-by: Brian Foster Signed-off-by: Eryu Guan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 3 ++- fs/xfs/xfs_file.c | 33 +++++++++++++++++++-------------- fs/xfs/xfs_iomap.c | 7 +++++-- fs/xfs/xfs_iomap.h | 2 +- fs/xfs/xfs_pnfs.c | 2 +- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 29172609f2a31..f18e5932aec4b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -343,7 +343,8 @@ xfs_end_io( error = xfs_reflink_end_cow(ip, offset, size); break; case XFS_IO_UNWRITTEN: - error = xfs_iomap_write_unwritten(ip, offset, size); + /* writeback should never update isize */ + error = xfs_iomap_write_unwritten(ip, offset, size, false); break; default: ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 350b6d43ba23b..309e26c9dddb4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -434,7 +434,6 @@ xfs_dio_write_end_io( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; - bool update_size = false; int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); @@ -445,6 +444,21 @@ xfs_dio_write_end_io( if (size <= 0) return size; + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + /* + * Unwritten conversion updates the in-core isize after extent + * conversion but before updating the on-disk size. Updating isize any + * earlier allows a racing dio read to find unwritten extents before + * they are converted. + */ + if (flags & IOMAP_DIO_UNWRITTEN) + return xfs_iomap_write_unwritten(ip, offset, size, true); + /* * We need to update the in-core inode size here so that we don't end up * with the on-disk inode size being outside the in-core inode size. We @@ -459,20 +473,11 @@ xfs_dio_write_end_io( spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) { i_size_write(inode, offset + size); - update_size = true; - } - spin_unlock(&ip->i_flags_lock); - - if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); - if (error) - return error; - } - - if (flags & IOMAP_DIO_UNWRITTEN) - error = xfs_iomap_write_unwritten(ip, offset, size); - else if (update_size) + spin_unlock(&ip->i_flags_lock); error = xfs_setfilesize(ip, offset, size); + } else { + spin_unlock(&ip->i_flags_lock); + } return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a1909bc064e9e..f179bdf1644dc 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -829,7 +829,8 @@ int xfs_iomap_write_unwritten( xfs_inode_t *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool update_isize) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -840,6 +841,7 @@ xfs_iomap_write_unwritten( xfs_trans_t *tp; xfs_bmbt_irec_t imap; struct xfs_defer_ops dfops; + struct inode *inode = VFS_I(ip); xfs_fsize_t i_size; uint resblks; int error; @@ -899,7 +901,8 @@ xfs_iomap_write_unwritten( i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); if (i_size > offset + count) i_size = offset + count; - + if (update_isize && i_size > i_size_read(inode)) + i_size_write(inode, i_size); i_size = xfs_new_eof(ip, i_size); if (i_size) { ip->i_d.di_size = i_size; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 00db3ecea0840..ee535065c5d0e 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *); -int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 2f2dc3c09ad00..4246876df7b75 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -274,7 +274,7 @@ xfs_fs_commit_blocks( (end - 1) >> PAGE_SHIFT); WARN_ON_ONCE(error); - error = xfs_iomap_write_unwritten(ip, start, length); + error = xfs_iomap_write_unwritten(ip, start, length, false); if (error) goto out_drop_iolock; } From 9789dd9e1d939232e8ff4c50ef8e75aa6781b3fb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:42:09 -0700 Subject: [PATCH 20/34] xfs: perag initialization should only touch m_ag_max_usable for AG 0 We call __xfs_ag_resv_init to make a per-AG reservation for each AG. This makes the reservation per-AG, not per-filesystem. Therefore, it is incorrect to adjust m_ag_max_usable for each AG. Adjust it only when we're reserving AG 0's blocks so that we only do it once per fs. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_ag_resv.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index b008ff3250eba..df3e600835e8d 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -156,7 +156,8 @@ __xfs_ag_resv_free( trace_xfs_ag_resv_free(pag, type, 0); resv = xfs_perag_resv(pag, type); - pag->pag_mount->m_ag_max_usable += resv->ar_asked; + if (pag->pag_agno == 0) + pag->pag_mount->m_ag_max_usable += resv->ar_asked; /* * AGFL blocks are always considered "free", so whatever * was reserved at mount time must be given back at umount. @@ -216,7 +217,14 @@ __xfs_ag_resv_init( return error; } - mp->m_ag_max_usable -= ask; + /* + * Reduce the maximum per-AG allocation length by however much we're + * trying to reserve for an AG. Since this is a filesystem-wide + * counter, we only make the adjustment for AG 0. This assumes that + * there aren't any AGs hungrier for per-AG reservation than AG 0. + */ + if (pag->pag_agno == 0) + mp->m_ag_max_usable -= ask; resv = xfs_perag_resv(pag, type); resv->ar_asked = ask; From 842f6e9f786226c58fcbd5ef80eadca72fdfe652 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Fri, 22 Sep 2017 11:47:46 -0700 Subject: [PATCH 21/34] xfs: Capture state of the right inode in xfs_iflush_done My previous patch: d3a304b6292168b83b45d624784f973fdc1ca674 check for XFS_LI_FAILED flag xfs_iflush done, so the failed item can be properly resubmitted. In the loop scanning other inodes being completed, it should check the current item for the XFS_LI_FAILED, and not the initial one. The state of the initial inode is checked after the loop ends Kudos to Eric for catching this. Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6d0f74ec31e89..a705f34b58fad 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -745,7 +745,7 @@ xfs_iflush_done( */ iip = INODE_ITEM(blip); if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || - lip->li_flags & XFS_LI_FAILED) + (blip->li_flags & XFS_LI_FAILED)) need_ail++; blip = next; From 5e5c943c1f257c2b3424fc3f8a7b18570152dab3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:17 -0700 Subject: [PATCH 22/34] xfs: revert "xfs: factor rmap btree size into the indlen calculations" In commit fd26a88093ba we added a worst case estimate for rmapbt blocks needed to satisfy the block mapping request. Since then, we added the ability to reserve enough space in each AG such that we should never run out of blocks to grow the rmapbt, which makes this calculation unnecessary. Revert the commit because it makes the extra delalloc indlen accounting unnecessary and incorrect. Reported-by: Eryu Guan Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 459f4b4f08fe5..044a363119bea 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -49,7 +49,6 @@ #include "xfs_rmap.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_rmap_btree.h" #include "xfs_icache.h" @@ -192,12 +191,8 @@ xfs_bmap_worst_indlen( int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ - xfs_filblks_t orig_len; mp = ip->i_mount; - - /* Calculate the worst-case size of the bmbt. */ - orig_len = len; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); @@ -205,20 +200,12 @@ xfs_bmap_worst_indlen( len += maxrecs - 1; do_div(len, maxrecs); rval += len; - if (len == 1) { - rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; - break; - } if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } - - /* Calculate the worst-case size of the rmapbt. */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) + - mp->m_rmap_maxlevels; - return rval; } From df5efdd97029f2cff7e5c91ea1c9f2b94d009b0f Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Tue, 26 Sep 2017 06:05:57 -0700 Subject: [PATCH 23/34] IB/hfi1: Turn off AOC TX after offline substates Offline.quietDuration was added in the 8051 firmware, and the driver only turns off the AOC transmitters when offline.quiet is reached. However, the AOC transmitters need to be turned off at the new state. Therefore, turn off the AOC transmitters at any offline substates including offline.quiet and offline.quietDuration, then recheck we reached offline.quiet to support backwards compatibility. Reviewed-by: Jakub Byczkowski Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 85 +++++++++++++++++++++++-------- drivers/infiniband/hw/hfi1/chip.h | 1 + 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index b2ed4b9cda6ee..1c810d65721a0 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1066,6 +1066,8 @@ static int read_idle_sma(struct hfi1_devdata *dd, u64 *data); static int thermal_init(struct hfi1_devdata *dd); static void update_statusp(struct hfi1_pportdata *ppd, u32 state); +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs); static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs); static void log_state_transition(struct hfi1_pportdata *ppd, u32 state); @@ -10305,6 +10307,7 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) { struct hfi1_devdata *dd = ppd->dd; u32 previous_state; + int offline_state_ret; int ret; update_lcb_cache(dd); @@ -10326,28 +10329,11 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); - /* - * Wait for offline transition. It can take a while for - * the link to go down. - */ - ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 10000); - if (ret < 0) - return ret; - - /* - * Now in charge of LCB - must be after the physical state is - * offline.quiet and before host_link_state is changed. - */ - set_host_lcb_access(dd); - write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ - - /* make sure the logical state is also down */ - ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); - if (ret) - force_logical_link_state_down(ppd); - - ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + offline_state_ret = wait_phys_link_offline_substates(ppd, 10000); + if (offline_state_ret < 0) + return offline_state_ret; + /* Disabling AOC transmitters */ if (ppd->port_type == PORT_TYPE_QSFP && ppd->qsfp_info.limiting_active && qsfp_mod_present(ppd)) { @@ -10364,6 +10350,30 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) } } + /* + * Wait for the offline.Quiet transition if it hasn't happened yet. It + * can take a while for the link to go down. + */ + if (offline_state_ret != PLS_OFFLINE_QUIET) { + ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 30000); + if (ret < 0) + return ret; + } + + /* + * Now in charge of LCB - must be after the physical state is + * offline.quiet and before host_link_state is changed. + */ + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + /* make sure the logical state is also down */ + ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); + if (ret) + force_logical_link_state_down(ppd); + + ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + /* * The LNI has a mandatory wait time after the physical state * moves to Offline.Quiet. The wait time may be different @@ -12804,6 +12814,39 @@ static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, return 0; } +/* + * wait_phys_link_offline_quiet_substates - wait for any offline substate + * @ppd: port device + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for any offline physical link + * state change to occur. + * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT. + */ +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if ((read_state & 0xF0) == PLS_OFFLINE) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link offline.quiet substates. Read state 0x%x, %dms\n", + read_state, msecs); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, read_state); + return read_state; +} + #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index b8345a60a0fbc..461f937fe110d 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -204,6 +204,7 @@ #define PLS_OFFLINE_READY_TO_QUIET_LT 0x92 #define PLS_OFFLINE_REPORT_FAILURE 0x93 #define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94 +#define PLS_OFFLINE_QUIET_DURATION 0x95 #define PLS_POLLING 0x20 #define PLS_POLLING_QUIET 0x20 #define PLS_POLLING_ACTIVE 0x21 From 30e10527bcce376114e627abb7fabfbe9bfee91e Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Tue, 26 Sep 2017 06:06:03 -0700 Subject: [PATCH 24/34] IB/hfi1: Only reset QSFP after link up and turn off AOC TX QSFP reset enables AOC transmitters by default. They should be off before moving to high power mode to complete the setup. There is no need to reset the QSFP during LNI failure as it was reset at link down. Reviewed-by: Mike Marciniszyn Reviewed-by: Jakub Byczkowski Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 12 +++++++++++- drivers/infiniband/hw/hfi1/chip.h | 2 +- drivers/infiniband/hw/hfi1/platform.c | 4 +++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 1c810d65721a0..27b75a8f5097e 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -9415,7 +9415,7 @@ static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); } -void reset_qsfp(struct hfi1_pportdata *ppd) +int reset_qsfp(struct hfi1_pportdata *ppd) { struct hfi1_devdata *dd = ppd->dd; u64 mask, qsfp_mask; @@ -9445,6 +9445,13 @@ void reset_qsfp(struct hfi1_pportdata *ppd) * for alarms and warnings */ set_qsfp_int_n(ppd, 1); + + /* + * After the reset, AOC transmitters are enabled by default. They need + * to be turned off to complete the QSFP setup before they can be + * enabled again. + */ + return set_qsfp_tx(ppd, 0); } static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, @@ -10406,6 +10413,9 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { /* went down while attempting link up */ check_lni_states(ppd); + + /* The QSFP doesn't need to be reset on LNI failure */ + ppd->qsfp_info.reset_needed = 0; } /* the active link width (downgrade) is 0 on link down */ diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 461f937fe110d..50b8645d0b876 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -723,7 +723,7 @@ void handle_link_downgrade(struct work_struct *work); void handle_link_bounce(struct work_struct *work); void handle_start_link(struct work_struct *work); void handle_sma_message(struct work_struct *work); -void reset_qsfp(struct hfi1_pportdata *ppd); +int reset_qsfp(struct hfi1_pportdata *ppd); void qsfp_event(struct work_struct *work); void start_freeze_handling(struct hfi1_pportdata *ppd, int flags); int send_idle_sma(struct hfi1_devdata *dd, u64 message); diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c index a8af96d2b1b0a..d486355880cb0 100644 --- a/drivers/infiniband/hw/hfi1/platform.c +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -790,7 +790,9 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, * reuse of stale settings established in our previous pass through. */ if (ppd->qsfp_info.reset_needed) { - reset_qsfp(ppd); + ret = reset_qsfp(ppd); + if (ret) + return ret; refresh_qsfp_cache(ppd, &ppd->qsfp_info); } else { ppd->qsfp_info.reset_needed = 1; From 753b19afb19dd97d0767df5e8afb13faff605315 Mon Sep 17 00:00:00 2001 From: Jan Sokolowski Date: Tue, 26 Sep 2017 06:06:09 -0700 Subject: [PATCH 25/34] IB/hfi1: Check eeprom config partition validity Relying on a trailing magic value is incorrect. There are instances where this is not present as trailing magic value has a specific purpose which is not partition validation. Instead use the header magic value which is present in all variants of the platform configuration and is intended for validation. This is also used in other locations in the driver. Fixes: bc5214ee2922 (IB/hfi1: Handle missing magic values in config file) Reviewed-by: Jakub Byczkowski Signed-off-by: Jan Sokolowski Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/eprom.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c index d46b171079010..1613af1c58d9d 100644 --- a/drivers/infiniband/hw/hfi1/eprom.c +++ b/drivers/infiniband/hw/hfi1/eprom.c @@ -204,7 +204,10 @@ int eprom_init(struct hfi1_devdata *dd) return ret; } -/* magic character sequence that trails an image */ +/* magic character sequence that begins an image */ +#define IMAGE_START_MAGIC "APO=" + +/* magic character sequence that might trail an image */ #define IMAGE_TRAIL_MAGIC "egamiAPO" /* EPROM file types */ @@ -250,6 +253,7 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, { void *buffer; void *p; + u32 length; int ret; buffer = kmalloc(P1_SIZE, GFP_KERNEL); @@ -262,15 +266,21 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, return ret; } - /* scan for image magic that may trail the actual data */ - p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); - if (!p) { + /* config partition is valid only if it starts with IMAGE_START_MAGIC */ + if (memcmp(buffer, IMAGE_START_MAGIC, strlen(IMAGE_START_MAGIC))) { kfree(buffer); return -ENOENT; } + /* scan for image magic that may trail the actual data */ + p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); + if (p) + length = p - buffer; + else + length = P1_SIZE; + *data = buffer; - *size = p - buffer; + *size = length; return 0; } From 09592af5fdd722615ebe435fb34308de26c74bcf Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Tue, 26 Sep 2017 06:06:15 -0700 Subject: [PATCH 26/34] IB/hfi1: Return correct value in general interrupt handler The general interrupt handler returns IRQ_HANDLED whether an IRQ was handled or not. Determine if an IRQ was handled and return the correct value. Reviewed-by: Dennis Dalessandro Reviewed-by: Michael J. Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 27b75a8f5097e..0be42787759fa 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -8240,6 +8240,7 @@ static irqreturn_t general_interrupt(int irq, void *data) u64 regs[CCE_NUM_INT_CSRS]; u32 bit; int i; + irqreturn_t handled = IRQ_NONE; this_cpu_inc(*dd->int_counter); @@ -8260,9 +8261,10 @@ static irqreturn_t general_interrupt(int irq, void *data) for_each_set_bit(bit, (unsigned long *)®s[0], CCE_NUM_INT_CSRS * 64) { is_interrupt(dd, bit); + handled = IRQ_HANDLED; } - return IRQ_HANDLED; + return handled; } static irqreturn_t sdma_interrupt(int irq, void *data) From 612601d0013f03de9dc134809f242ba6da9ca252 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Tue, 26 Sep 2017 06:06:22 -0700 Subject: [PATCH 27/34] Revert "IB/ipoib: Update broadcast object if PKey value was changed in index 0" commit 9a9b8112699d will cause core to fail UD QP from being destroyed on ipoib unload, therefore cause resources leakage. On pkey change event above patch modifies mgid before calling underlying driver to detach it from QP. Drivers' detach_mcast() will fail to find modified mgid it was never given to attach in a first place. Core qp->usecnt will never go down, so ib_destroy_qp() will fail. IPoIB driver actually does take care of new broadcast mgid based on new pkey by destroying an old mcast object in ipoib_mcast_dev_flush()) .... if (priv->broadcast) { rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); list_add_tail(&priv->broadcast->list, &remove_list); priv->broadcast = NULL; } ... then in restarted ipoib_macst_join_task() creating a new broadcast mcast object, sending join request and on completion tells the driver to attach to reinitialized QP: ... if (!priv->broadcast) { ... broadcast = ipoib_mcast_alloc(dev, 0); ... memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; ... Fixes: 9a9b8112699d ("IB/ipoib: Update broadcast object if PKey value was changed in index 0") Cc: stable@vger.kernel.org Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Reviewed-by: Feras Daoud Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 2e075377242e2..6cd61638b4414 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -1000,19 +1000,6 @@ static inline int update_parent_pkey(struct ipoib_dev_priv *priv) */ priv->dev->broadcast[8] = priv->pkey >> 8; priv->dev->broadcast[9] = priv->pkey & 0xff; - - /* - * Update the broadcast address in the priv->broadcast object, - * in case it already exists, otherwise no one will do that. - */ - if (priv->broadcast) { - spin_lock_irq(&priv->lock); - memcpy(priv->broadcast->mcmember.mgid.raw, - priv->dev->broadcast + 4, - sizeof(union ib_gid)); - spin_unlock_irq(&priv->lock); - } - return 0; } From b8f42738acaddf67731c34935c0994e09a588ca7 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Tue, 26 Sep 2017 06:06:28 -0700 Subject: [PATCH 28/34] IB/hfi1: On error, fix use after free during user context setup During base context setup, if setup_base_ctxt() fails, the context is deallocated. This is incorrect because the context is referenced on return, to notify any waiting subcontext. If there are no subcontexts the pointer will be invalid. Reorganize the error path so that deallocate_ctxt() is called after all the possible subcontexts have been notified. Reviewed-by: Ira Weiny Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 41 ++++++++++++++------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 2bc89260235a1..d9a1e98931364 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -930,15 +930,8 @@ static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo) switch (ret) { case 0: ret = setup_base_ctxt(fd, uctxt); - if (uctxt->subctxt_cnt) { - /* - * Base context is done (successfully or not), notify - * anybody using a sub-context that is waiting for - * this completion. - */ - clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); - wake_up(&uctxt->wait); - } + if (ret) + deallocate_ctxt(uctxt); break; case 1: ret = complete_subctxt(fd); @@ -1305,25 +1298,25 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, /* Now allocate the RcvHdr queue and eager buffers. */ ret = hfi1_create_rcvhdrq(dd, uctxt); if (ret) - return ret; + goto done; ret = hfi1_setup_eagerbufs(uctxt); if (ret) - goto setup_failed; + goto done; /* If sub-contexts are enabled, do the appropriate setup */ if (uctxt->subctxt_cnt) ret = setup_subctxt(uctxt); if (ret) - goto setup_failed; + goto done; ret = hfi1_alloc_ctxt_rcv_groups(uctxt); if (ret) - goto setup_failed; + goto done; ret = init_user_ctxt(fd, uctxt); if (ret) - goto setup_failed; + goto done; user_init(uctxt); @@ -1331,12 +1324,22 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, fd->uctxt = uctxt; hfi1_rcd_get(uctxt); - return 0; +done: + if (uctxt->subctxt_cnt) { + /* + * On error, set the failed bit so sub-contexts will clean up + * correctly. + */ + if (ret) + set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); -setup_failed: - /* Set the failed bit so sub-context init can do the right thing */ - set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); - deallocate_ctxt(uctxt); + /* + * Base context is done (successfully or not), notify anybody + * using a sub-context that is waiting for this completion. + */ + clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); + } return ret; } From 828bcbdc975fbcfb27946c33d4b1d1bfab70789b Mon Sep 17 00:00:00 2001 From: Harish Chegondi Date: Tue, 26 Sep 2017 06:06:34 -0700 Subject: [PATCH 29/34] IB/hfi1: Unsuccessful PCIe caps tuning should not fail driver load Failure to tune PCIe capabilities should not fail driver load. This can cause the driver load to fail on systems with any of the following: 1. HFI's parent is not root. Example: HFI card is behind a PCIe bridge. 2. HFI's parent is not PCI Express capable. In these situations, failure to tune PCIe capabilities should be logged in the system message logs but not cause the driver load to fail. This patch also ensures pcie capability word DevCtl is written only after a successful read and the capability tuning process continues even if read/write of the pcie capability word DevCtl fails. Fixes: c53df62c7a9a ("IB/hfi1: Check return values from PCI config API calls") Fixes: bf70a7757736 ("staging/rdma/hfi1: Enable WFR PCIe extended tags from the driver") Reviewed-by: Michael J. Ruhl Reviewed-by: Mike Marciniszyn Reviewed-by: Jakub Byczkowski Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/pcie.c | 50 +++++++++++++------------------ 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 82447b7cdda1e..09e50fd2a08f0 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -68,7 +68,7 @@ /* * Code to adjust PCIe capabilities. */ -static int tune_pcie_caps(struct hfi1_devdata *); +static void tune_pcie_caps(struct hfi1_devdata *); /* * Do all the common PCIe setup and initialization. @@ -351,7 +351,7 @@ int pcie_speeds(struct hfi1_devdata *dd) */ int request_msix(struct hfi1_devdata *dd, u32 msireq) { - int nvec, ret; + int nvec; nvec = pci_alloc_irq_vectors(dd->pcidev, 1, msireq, PCI_IRQ_MSIX | PCI_IRQ_LEGACY); @@ -360,12 +360,7 @@ int request_msix(struct hfi1_devdata *dd, u32 msireq) return nvec; } - ret = tune_pcie_caps(dd); - if (ret) { - dd_dev_err(dd, "tune_pcie_caps() failed: %d\n", ret); - pci_free_irq_vectors(dd->pcidev); - return ret; - } + tune_pcie_caps(dd); /* check for legacy IRQ */ if (nvec == 1 && !dd->pcidev->msix_enabled) @@ -502,7 +497,7 @@ uint aspm_mode = ASPM_MODE_DISABLED; module_param_named(aspm, aspm_mode, uint, S_IRUGO); MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); -static int tune_pcie_caps(struct hfi1_devdata *dd) +static void tune_pcie_caps(struct hfi1_devdata *dd) { struct pci_dev *parent; u16 rc_mpss, rc_mps, ep_mpss, ep_mps; @@ -513,22 +508,14 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) * Turn on extended tags in DevCtl in case the BIOS has turned it off * to improve WFR SDMA bandwidth */ - ret = pcie_capability_read_word(dd->pcidev, - PCI_EXP_DEVCTL, &ectl); - if (ret) { - dd_dev_err(dd, "Unable to read from PCI config\n"); - return ret; - } - - if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + if ((!ret) && !(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { dd_dev_info(dd, "Enabling PCIe extended tags\n"); ectl |= PCI_EXP_DEVCTL_EXT_TAG; ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); - if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); - return ret; - } + if (ret) + dd_dev_info(dd, "Unable to write to PCI config\n"); } /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; @@ -536,15 +523,22 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) * The driver cannot perform the tuning if it does not have * access to the upstream component. */ - if (!parent) - return -EINVAL; + if (!parent) { + dd_dev_info(dd, "Parent not found\n"); + return; + } if (!pci_is_root_bus(parent->bus)) { dd_dev_info(dd, "Parent not root\n"); - return -EINVAL; + return; + } + if (!pci_is_pcie(parent)) { + dd_dev_info(dd, "Parent is not PCI Express capable\n"); + return; + } + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_info(dd, "PCI device is not PCI Express capable\n"); + return; } - - if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) - return -EINVAL; rc_mpss = parent->pcie_mpss; rc_mps = ffs(pcie_get_mps(parent)) - 8; /* Find out supported and configured values for endpoint (us) */ @@ -590,8 +584,6 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) ep_mrrs = max_mrrs; pcie_set_readrq(dd->pcidev, ep_mrrs); } - - return 0; } /* End of PCIe capability tuning */ From aaf2c2fb0f51f91c699039440862b6ae9c25c10e Mon Sep 17 00:00:00 2001 From: Tyler Baicar Date: Mon, 28 Aug 2017 10:53:41 -0600 Subject: [PATCH 30/34] ACPI / APEI: clear error status before acknowledging the error Currently we acknowledge errors before clearing the error status. This could cause a new error to be populated by firmware in-between the error acknowledgment and the error status clearing which would cause the second error's status to be cleared without being handled. So, clear the error status before acknowledging the errors. Also, make sure to acknowledge the error if the error status read fails. Signed-off-by: Tyler Baicar Reviewed-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 077f9bad6f44a..3c3a37b8503bd 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -743,17 +743,19 @@ static int ghes_proc(struct ghes *ghes) } ghes_do_proc(ghes, ghes->estatus); +out: + ghes_clear_estatus(ghes); + + if (rc == -ENOENT) + return rc; + /* * GHESv2 type HEST entries introduce support for error acknowledgment, * so only acknowledge the error if this support is present. */ - if (is_hest_type_generic_v2(ghes)) { - rc = ghes_ack_error(ghes->generic_v2); - if (rc) - return rc; - } -out: - ghes_clear_estatus(ghes); + if (is_hest_type_generic_v2(ghes)) + return ghes_ack_error(ghes->generic_v2); + return rc; } From 8aba2333904f9b1c1ea038df261bf7ae8fefb98e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Sep 2017 02:08:43 +0200 Subject: [PATCH 31/34] cpufreq: docs: Drop intel-pstate.txt from index.txt Commit 33fc30b47098 (cpufreq: intel_pstate: Document the current behavior and user interface) dropped the intel-pstate.txt file from Documentation/cpu-freq/, but it did not update the index.txt file in there accordingly, so do that now. Fixes: 33fc30b47098 (cpufreq: intel_pstate: Document the current behavior and user interface) Signed-off-by: Rafael J. Wysocki --- Documentation/cpu-freq/index.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/cpu-freq/index.txt b/Documentation/cpu-freq/index.txt index 03a7cee6ac73a..c15e75386a052 100644 --- a/Documentation/cpu-freq/index.txt +++ b/Documentation/cpu-freq/index.txt @@ -32,8 +32,6 @@ cpufreq-stats.txt - General description of sysfs cpufreq stats. index.txt - File index, Mailing list and Links (this document) -intel-pstate.txt - Intel pstate cpufreq driver specific file. - pcc-cpufreq.txt - PCC cpufreq driver specific file. From 66a733ea6b611aecf0119514d2dddab5f9d6c01e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Sep 2017 09:25:30 -0600 Subject: [PATCH 32/34] seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter() As Chris explains, get_seccomp_filter() and put_seccomp_filter() can end up using different filters. Once we drop ->siglock it is possible for task->seccomp.filter to have been replaced by SECCOMP_FILTER_FLAG_TSYNC. Fixes: f8e529ed941b ("seccomp, ptrace: add support for dumping seccomp filters") Reported-by: Chris Salls Cc: stable@vger.kernel.org # needs s/refcount_/atomic_/ for v4.12 and earlier Signed-off-by: Oleg Nesterov [tycho: add __get_seccomp_filter vs. open coding refcount_inc()] Signed-off-by: Tycho Andersen [kees: tweak commit log] Signed-off-by: Kees Cook --- kernel/seccomp.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index c24579dfa7a14..bb3a38005b9cc 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -473,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } +void __get_seccomp_filter(struct seccomp_filter *filter) +{ + /* Reference count is bounded by the number of total processes. */ + refcount_inc(&filter->usage); +} + /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; - /* Reference count is bounded by the number of total processes. */ - refcount_inc(&orig->usage); + __get_seccomp_filter(orig); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -491,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig) { - struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ while (orig && refcount_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; @@ -503,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk) } } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + __put_seccomp_filter(tsk->seccomp.filter); +} + static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) { memset(info, 0, sizeof(*info)); @@ -1025,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - get_seccomp_filter(task); + __get_seccomp_filter(filter); spin_unlock_irq(&task->sighand->siglock); if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - put_seccomp_filter(task); + __put_seccomp_filter(filter); return ret; out: From e49aa15ef6c179f69e5578a271801f31a09e9a3f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Sep 2017 13:20:32 -0700 Subject: [PATCH 33/34] Revert "Bluetooth: Add option for disabling legacy ioctl interfaces" This reverts commit dbbccdc4ced015cdd4051299bd87fbe0254ad351. It turns out that the "legacy" users aren't so legacy at all, and that turning off the legacy ioctl will break the current Qt bluetooth stack for bluetooth LE devices that were released just a couple of months ago. So it's simply not true that this was a legacy interface that hasn't been needed and is only limited to old legacy BT devices. Because I actually read Kconfig help messages, and actively try to turn off features that I don't need, I turned the option off. Then I spent _way_ too much time debugging BLE issues until I realized that it wasn't the Qt and subsurface development that had broken one of my dive computer BLE downloads, but simply my broken kernel config. Maybe in a decade it will be true that this is a legacy interface. And maybe with a better help-text and correct dependencies, this kind of legacy removal might be acceptable. But as things are right now both the commit message and the Kconfig help text were misleading, and the Kconfig option had the wrong dependenencies. There's no reason to keep that broken Kconfig option in the tree. Cc: Marcel Holtmann Cc: Johan Hedberg Signed-off-by: Linus Torvalds --- net/bluetooth/Kconfig | 10 ---------- net/bluetooth/hci_sock.c | 6 ------ 2 files changed, 16 deletions(-) diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index c18115d22f00b..db82a40875e8d 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -126,14 +126,4 @@ config BT_DEBUGFS Provide extensive information about internal Bluetooth states in debugfs. -config BT_LEGACY_IOCTL - bool "Enable legacy ioctl interfaces" - depends on BT && BT_BREDR - default y - help - Enable support for legacy ioctl interfaces. This is only needed - for old and deprecated applications using direct ioctl calls for - controller management. Since Linux 3.4 all configuration and - setup is done via mgmt interface and this is no longer needed. - source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0bad296fe0af9..65d734c165bd6 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,6 @@ static int hci_sock_release(struct socket *sock) return 0; } -#ifdef CONFIG_BT_LEGACY_IOCTL static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; @@ -1050,7 +1049,6 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, release_sock(sk); return err; } -#endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) @@ -1971,11 +1969,7 @@ static const struct proto_ops hci_sock_ops = { .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, -#ifdef CONFIG_BT_LEGACY_IOCTL .ioctl = hci_sock_ioctl, -#else - .ioctl = sock_no_ioctl, -#endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, From 441430eb54a00586f95f1aefc48e0801bbd6a923 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 6 Sep 2017 19:08:11 +0300 Subject: [PATCH 34/34] perf/aux: Only update ->aux_wakeup in non-overwrite mode The following commit: d9a50b0256 ("perf/aux: Ensure aux_wakeup represents most recent wakeup index") changed the AUX wakeup position calculation to rounddown(), which causes a division-by-zero in AUX overwrite mode (aka "snapshot mode"). The zero denominator results from the fact that perf record doesn't set aux_watermark to anything, in which case the kernel will set it to half the AUX buffer size, but only for non-overwrite mode. In the overwrite mode aux_watermark stays zero. The good news is that, AUX overwrite mode, wakeups don't happen and related bookkeeping is not relevant, so we can simply forego the whole wakeup updates. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/20170906160811.16510-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index af71a84e12eea..f684d8e5fa2be 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -412,6 +412,19 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, return NULL; } +static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +{ + if (rb->aux_overwrite) + return false; + + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + return true; + } + + return false; +} + /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the @@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) } rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) wakeup = true; - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); - } if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) @@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; }