Skip to content

Commit

Permalink
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/gi…
Browse files Browse the repository at this point in the history
…t/rdma/rdma

Pull rdma updates from Jason Gunthorpe:
 "Aside from the usual things this has an arch update for
  __iowrite64_copy() used by the RDMA drivers.

  This API was intended to generate large 64 byte MemWr TLPs on PCI.
  These days most processors had done this by just repeating writel() in
  a loop. S390 and some new ARM64 designs require a special helper to
  get this to generate.

   - Small improvements and fixes for erdma, efa, hfi1, bnxt_re

   - Fix a UAF crash after module unload on leaking restrack entry

   - Continue adding full RDMA support in mana with support for EQs,
     GID's and CQs

   - Improvements to the mkey cache in mlx5

   - DSCP traffic class support in hns and several bug fixes

   - Cap the maximum number of MADs in the receive queue to avoid OOM

   - Another batch of rxe bug fixes from large scale testing

   - __iowrite64_copy() optimizations for write combining MMIO memory

   - Remove NULL checks before dev_put/hold()

   - EFA support for receive with immediate

   - Fix a recent memleaking regression in a cma error path"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (70 commits)
  RDMA/cma: Fix kmemleak in rdma_core observed during blktests nvme/rdma use siw
  RDMA/IPoIB: Fix format truncation compilation errors
  bnxt_re: avoid shift undefined behavior in bnxt_qplib_alloc_init_hwq
  RDMA/efa: Support QP with unsolicited write w/ imm. receive
  IB/hfi1: Remove generic .ndo_get_stats64
  IB/hfi1: Do not use custom stat allocator
  RDMA/hfi1: Use RMW accessors for changing LNKCTL2
  RDMA/mana_ib: implement uapi for creation of rnic cq
  RDMA/mana_ib: boundary check before installing cq callbacks
  RDMA/mana_ib: introduce a helper to remove cq callbacks
  RDMA/mana_ib: create and destroy RNIC cqs
  RDMA/mana_ib: create EQs for RNIC CQs
  RDMA/core: Remove NULL check before dev_{put, hold}
  RDMA/ipoib: Remove NULL check before dev_{put, hold}
  RDMA/mlx5: Remove NULL check before dev_{put, hold}
  RDMA/mlx5: Track DCT, DCI and REG_UMR QPs as diver_detail resources.
  RDMA/core: Add an option to display driver-specific QPs in the rdmatool
  RDMA/efa: Add shutdown notifier
  RDMA/mana_ib: Fix missing ret value
  IB/mlx5: Use __iowrite64_copy() for write combining stores
  ...
  • Loading branch information
Linus Torvalds committed May 18, 2024
2 parents 56172ac + 9c07318 commit 25f4874
Show file tree
Hide file tree
Showing 77 changed files with 1,584 additions and 768 deletions.
132 changes: 132 additions & 0 deletions arch/arm64/include/asm/io.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,138 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
#define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l))
#define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l))

/*
* The ARM64 iowrite implementation is intended to support drivers that want to
* use write combining. For instance PCI drivers using write combining with a 64
* byte __iowrite64_copy() expect to get a 64 byte MemWr TLP on the PCIe bus.
*
* Newer ARM core have sensitive write combining buffers, it is important that
* the stores be contiguous blocks of store instructions. Normal memcpy
* approaches have a very low chance to generate write combining.
*
* Since this is the only API on ARM64 that should be used with write combining
* it also integrates the DGH hint which is supposed to lower the latency to
* emit the large TLP from the CPU.
*/

static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to,
const u32 *from, size_t count)
{
switch (count) {
case 8:
asm volatile("str %w0, [%8, #4 * 0]\n"
"str %w1, [%8, #4 * 1]\n"
"str %w2, [%8, #4 * 2]\n"
"str %w3, [%8, #4 * 3]\n"
"str %w4, [%8, #4 * 4]\n"
"str %w5, [%8, #4 * 5]\n"
"str %w6, [%8, #4 * 6]\n"
"str %w7, [%8, #4 * 7]\n"
:
: "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
"rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
"rZ"(from[6]), "rZ"(from[7]), "r"(to));
break;
case 4:
asm volatile("str %w0, [%4, #4 * 0]\n"
"str %w1, [%4, #4 * 1]\n"
"str %w2, [%4, #4 * 2]\n"
"str %w3, [%4, #4 * 3]\n"
:
: "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
"rZ"(from[3]), "r"(to));
break;
case 2:
asm volatile("str %w0, [%2, #4 * 0]\n"
"str %w1, [%2, #4 * 1]\n"
:
: "rZ"(from[0]), "rZ"(from[1]), "r"(to));
break;
case 1:
__raw_writel(*from, to);
break;
default:
BUILD_BUG();
}
}

void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count);

static inline void __const_iowrite32_copy(void __iomem *to, const void *from,
size_t count)
{
if (count == 8 || count == 4 || count == 2 || count == 1) {
__const_memcpy_toio_aligned32(to, from, count);
dgh();
} else {
__iowrite32_copy_full(to, from, count);
}
}

#define __iowrite32_copy(to, from, count) \
(__builtin_constant_p(count) ? \
__const_iowrite32_copy(to, from, count) : \
__iowrite32_copy_full(to, from, count))

static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to,
const u64 *from, size_t count)
{
switch (count) {
case 8:
asm volatile("str %x0, [%8, #8 * 0]\n"
"str %x1, [%8, #8 * 1]\n"
"str %x2, [%8, #8 * 2]\n"
"str %x3, [%8, #8 * 3]\n"
"str %x4, [%8, #8 * 4]\n"
"str %x5, [%8, #8 * 5]\n"
"str %x6, [%8, #8 * 6]\n"
"str %x7, [%8, #8 * 7]\n"
:
: "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
"rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
"rZ"(from[6]), "rZ"(from[7]), "r"(to));
break;
case 4:
asm volatile("str %x0, [%4, #8 * 0]\n"
"str %x1, [%4, #8 * 1]\n"
"str %x2, [%4, #8 * 2]\n"
"str %x3, [%4, #8 * 3]\n"
:
: "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
"rZ"(from[3]), "r"(to));
break;
case 2:
asm volatile("str %x0, [%2, #8 * 0]\n"
"str %x1, [%2, #8 * 1]\n"
:
: "rZ"(from[0]), "rZ"(from[1]), "r"(to));
break;
case 1:
__raw_writeq(*from, to);
break;
default:
BUILD_BUG();
}
}

void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count);

static inline void __const_iowrite64_copy(void __iomem *to, const void *from,
size_t count)
{
if (count == 8 || count == 4 || count == 2 || count == 1) {
__const_memcpy_toio_aligned64(to, from, count);
dgh();
} else {
__iowrite64_copy_full(to, from, count);
}
}

#define __iowrite64_copy(to, from, count) \
(__builtin_constant_p(count) ? \
__const_iowrite64_copy(to, from, count) : \
__iowrite64_copy_full(to, from, count))

/*
* I/O memory mapping functions.
*/
Expand Down
42 changes: 42 additions & 0 deletions arch/arm64/kernel/io.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,48 @@ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
}
EXPORT_SYMBOL(__memcpy_fromio);

/*
* This generates a memcpy that works on a from/to address which is aligned to
* bits. Count is in terms of the number of bits sized quantities to copy. It
* optimizes to use the STR groupings when possible so that it is WC friendly.
*/
#define memcpy_toio_aligned(to, from, count, bits) \
({ \
volatile u##bits __iomem *_to = to; \
const u##bits *_from = from; \
size_t _count = count; \
const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \
\
for (; _from < _end_from; _from += 8, _to += 8) \
__const_memcpy_toio_aligned##bits(_to, _from, 8); \
if ((_count % 8) >= 4) { \
__const_memcpy_toio_aligned##bits(_to, _from, 4); \
_from += 4; \
_to += 4; \
} \
if ((_count % 4) >= 2) { \
__const_memcpy_toio_aligned##bits(_to, _from, 2); \
_from += 2; \
_to += 2; \
} \
if (_count % 2) \
__const_memcpy_toio_aligned##bits(_to, _from, 1); \
})

void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count)
{
memcpy_toio_aligned(to, from, count, 64);
dgh();
}
EXPORT_SYMBOL(__iowrite64_copy_full);

void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count)
{
memcpy_toio_aligned(to, from, count, 32);
dgh();
}
EXPORT_SYMBOL(__iowrite32_copy_full);

/*
* Copy data from "real" memory space to IO memory space.
*/
Expand Down
15 changes: 15 additions & 0 deletions arch/s390/include/asm/io.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,21 @@ static inline void ioport_unmap(void __iomem *p)
#define __raw_writel zpci_write_u32
#define __raw_writeq zpci_write_u64

/* combine single writes by using store-block insn */
static inline void __iowrite32_copy(void __iomem *to, const void *from,
size_t count)
{
zpci_memcpy_toio(to, from, count * 4);
}
#define __iowrite32_copy __iowrite32_copy

static inline void __iowrite64_copy(void __iomem *to, const void *from,
size_t count)
{
zpci_memcpy_toio(to, from, count * 8);
}
#define __iowrite64_copy __iowrite64_copy

#endif /* CONFIG_PCI */

#include <asm-generic/io.h>
Expand Down
6 changes: 0 additions & 6 deletions arch/s390/pci/pci.c
Original file line number Diff line number Diff line change
Expand Up @@ -250,12 +250,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
return 0;
}

/* combine single writes by using store-block insn */
void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
{
zpci_memcpy_toio(to, from, count * 8);
}

void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
unsigned long prot)
{
Expand Down
17 changes: 17 additions & 0 deletions arch/x86/include/asm/io.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,23 @@ void memset_io(volatile void __iomem *, int, size_t);
#define memcpy_toio memcpy_toio
#define memset_io memset_io

#ifdef CONFIG_X86_64
/*
* Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for
* x86_64") says that circa 2006 rep movsl is noticeably faster than a copy
* loop.
*/
static inline void __iowrite32_copy(void __iomem *to, const void *from,
size_t count)
{
asm volatile("rep ; movsl"
: "=&c"(count), "=&D"(to), "=&S"(from)
: "0"(count), "1"(to), "2"(from)
: "memory");
}
#define __iowrite32_copy __iowrite32_copy
#endif

/*
* ISA space is 'always mapped' on a typical x86 system, no need to
* explicitly ioremap() it. The fact that the ISA IO space is mapped
Expand Down
1 change: 0 additions & 1 deletion arch/x86/lib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += atomic64_386_32.o
endif
else
obj-y += iomap_copy_64.o
ifneq ($(CONFIG_GENERIC_CSUM),y)
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
endif
Expand Down
15 changes: 0 additions & 15 deletions arch/x86/lib/iomap_copy_64.S

This file was deleted.

4 changes: 3 additions & 1 deletion drivers/infiniband/core/cma.c
Original file line number Diff line number Diff line change
Expand Up @@ -715,8 +715,10 @@ cma_validate_port(struct ib_device *device, u32 port,
rcu_read_lock();
ndev = rcu_dereference(sgid_attr->ndev);
if (!net_eq(dev_net(ndev), dev_addr->net) ||
ndev->ifindex != bound_if_index)
ndev->ifindex != bound_if_index) {
rdma_put_gid_attr(sgid_attr);
sgid_attr = ERR_PTR(-ENODEV);
}
rcu_read_unlock();
goto out;
}
Expand Down
10 changes: 3 additions & 7 deletions drivers/infiniband/core/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -2174,8 +2174,7 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
spin_unlock_irqrestore(&pdata->netdev_lock, flags);

add_ndev_hash(pdata);
if (old_ndev)
__dev_put(old_ndev);
__dev_put(old_ndev);

return 0;
}
Expand Down Expand Up @@ -2235,8 +2234,7 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
spin_lock(&pdata->netdev_lock);
res = rcu_dereference_protected(
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
if (res)
dev_hold(res);
dev_hold(res);
spin_unlock(&pdata->netdev_lock);
}

Expand Down Expand Up @@ -2311,9 +2309,7 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,

if (filter(ib_dev, port, idev, filter_cookie))
cb(ib_dev, port, idev, cookie);

if (idev)
dev_put(idev);
dev_put(idev);
}
}

Expand Down
3 changes: 1 addition & 2 deletions drivers/infiniband/core/lag.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,7 @@ static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device,
slave = netdev_get_xmit_slave(master, skb,
!!(device->lag_flags &
RDMA_LAG_FLAGS_HASH_ALL_SLAVES));
if (slave)
dev_hold(slave);
dev_hold(slave);
rcu_read_unlock();
kfree_skb(skb);
return slave;
Expand Down
Loading

0 comments on commit 25f4874

Please sign in to comment.