Skip to content

Commit

Permalink
Merge tag 'irq-urgent-2021-08-15' of git://git.kernel.org/pub/scm/lin…
Browse files Browse the repository at this point in the history
…ux/kernel/git/tip/tip

Pull irq fixes from Thomas Gleixner:
 "A set of fixes for PCI/MSI and x86 interrupt startup:

   - Mask all MSI-X entries when enabling MSI-X otherwise stale unmasked
     entries stay around e.g. when a crashkernel is booted.

   - Enforce masking of a MSI-X table entry when updating it, which
     mandatory according to speification

   - Ensure that writes to MSI[-X} tables are flushed.

   - Prevent invalid bits being set in the MSI mask register

   - Properly serialize modifications to the mask cache and the mask
     register for multi-MSI.

   - Cure the violation of the affinity setting rules on X86 during
     interrupt startup which can cause lost and stale interrupts. Move
     the initial affinity setting ahead of actualy enabling the
     interrupt.

   - Ensure that MSI interrupts are completely torn down before freeing
     them in the error handling case.

   - Prevent an array out of bounds access in the irq timings code"

* tag 'irq-urgent-2021-08-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  driver core: Add missing kernel doc for device::msi_lock
  genirq/msi: Ensure deactivation on teardown
  genirq/timings: Prevent potential array overflow in __irq_timings_store()
  x86/msi: Force affinity setup before startup
  x86/ioapic: Force affinity setup before startup
  genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP
  PCI/MSI: Protect msi_desc::masked for multi-MSI
  PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown()
  PCI/MSI: Correct misleading comments
  PCI/MSI: Do not set invalid bits in MSI mask
  PCI/MSI: Enforce MSI[X] entry updates to be visible
  PCI/MSI: Enforce that MSI-X table entry is masked for update
  PCI/MSI: Mask all unused MSI-X entries
  PCI/MSI: Enable and mask MSI-X early
  • Loading branch information
Linus Torvalds committed Aug 15, 2021
2 parents 839da25 + 7a3dc4f commit c4f14ea
Show file tree
Hide file tree
Showing 11 changed files with 113 additions and 61 deletions.
6 changes: 4 additions & 2 deletions arch/x86/kernel/apic/io_apic.c
Original file line number Diff line number Diff line change
Expand Up @@ -1986,7 +1986,8 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_get_irqchip_state = ioapic_irq_get_chip_state,
.flags = IRQCHIP_SKIP_SET_WAKE,
.flags = IRQCHIP_SKIP_SET_WAKE |
IRQCHIP_AFFINITY_PRE_STARTUP,
};

static struct irq_chip ioapic_ir_chip __read_mostly = {
Expand All @@ -1999,7 +2000,8 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_get_irqchip_state = ioapic_irq_get_chip_state,
.flags = IRQCHIP_SKIP_SET_WAKE,
.flags = IRQCHIP_SKIP_SET_WAKE |
IRQCHIP_AFFINITY_PRE_STARTUP,
};

static inline void init_IO_APIC_traps(void)
Expand Down
11 changes: 8 additions & 3 deletions arch/x86/kernel/apic/msi.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,13 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
* The quirk bit is not set in this case.
* - The new vector is the same as the old vector
* - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
* - The interrupt is not yet started up
* - The new destination CPU is the same as the old destination CPU
*/
if (!irqd_msi_nomask_quirk(irqd) ||
cfg->vector == old_cfg.vector ||
old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
!irqd_is_started(irqd) ||
cfg->dest_apicid == old_cfg.dest_apicid) {
irq_msi_update_msg(irqd, cfg);
return ret;
Expand Down Expand Up @@ -150,7 +152,8 @@ static struct irq_chip pci_msi_controller = {
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_set_affinity = msi_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
.flags = IRQCHIP_SKIP_SET_WAKE |
IRQCHIP_AFFINITY_PRE_STARTUP,
};

int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
Expand Down Expand Up @@ -219,7 +222,8 @@ static struct irq_chip pci_msi_ir_controller = {
.irq_mask = pci_msi_mask_irq,
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.flags = IRQCHIP_SKIP_SET_WAKE,
.flags = IRQCHIP_SKIP_SET_WAKE |
IRQCHIP_AFFINITY_PRE_STARTUP,
};

static struct msi_domain_info pci_msi_ir_domain_info = {
Expand Down Expand Up @@ -273,7 +277,8 @@ static struct irq_chip dmar_msi_controller = {
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = dmar_msi_compose_msg,
.irq_write_msi_msg = dmar_msi_write_msg,
.flags = IRQCHIP_SKIP_SET_WAKE,
.flags = IRQCHIP_SKIP_SET_WAKE |
IRQCHIP_AFFINITY_PRE_STARTUP,
};

static int dmar_msi_init(struct irq_domain *domain,
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/kernel/hpet.c
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ static struct irq_chip hpet_msi_controller __ro_after_init = {
.irq_set_affinity = msi_domain_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_write_msi_msg = hpet_msi_write_msg,
.flags = IRQCHIP_SKIP_SET_WAKE,
.flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP,
};

static int hpet_msi_init(struct irq_domain *domain,
Expand Down
1 change: 1 addition & 0 deletions drivers/base/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -2837,6 +2837,7 @@ void device_initialize(struct device *dev)
device_pm_init(dev);
set_dev_node(dev, -1);
#ifdef CONFIG_GENERIC_MSI_IRQ
raw_spin_lock_init(&dev->msi_lock);
INIT_LIST_HEAD(&dev->msi_list);
#endif
INIT_LIST_HEAD(&dev->links.consumers);
Expand Down
125 changes: 77 additions & 48 deletions drivers/pci/msi.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,24 +143,25 @@ static inline __attribute_const__ u32 msi_mask(unsigned x)
* reliably as devices without an INTx disable bit will then generate a
* level IRQ which will never be cleared.
*/
u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
{
u32 mask_bits = desc->masked;
raw_spinlock_t *lock = &desc->dev->msi_lock;
unsigned long flags;

if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit)
return 0;
return;

mask_bits &= ~mask;
mask_bits |= flag;
raw_spin_lock_irqsave(lock, flags);
desc->masked &= ~mask;
desc->masked |= flag;
pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos,
mask_bits);

return mask_bits;
desc->masked);
raw_spin_unlock_irqrestore(lock, flags);
}

static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
{
desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag);
__pci_msi_desc_mask_irq(desc, mask, flag);
}

static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
Expand Down Expand Up @@ -289,13 +290,31 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
/* Don't touch the hardware now */
} else if (entry->msi_attrib.is_msix) {
void __iomem *base = pci_msix_desc_addr(entry);
bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT);

if (!base)
goto skip;

/*
* The specification mandates that the entry is masked
* when the message is modified:
*
* "If software changes the Address or Data value of an
* entry while the entry is unmasked, the result is
* undefined."
*/
if (unmasked)
__pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);

writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
writel(msg->data, base + PCI_MSIX_ENTRY_DATA);

if (unmasked)
__pci_msix_desc_mask_irq(entry, 0);

/* Ensure that the writes are visible in the device */
readl(base + PCI_MSIX_ENTRY_DATA);
} else {
int pos = dev->msi_cap;
u16 msgctl;
Expand All @@ -316,6 +335,8 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
msg->data);
}
/* Ensure that the writes are visible in the device */
pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
}

skip:
Expand Down Expand Up @@ -636,21 +657,21 @@ static int msi_capability_init(struct pci_dev *dev, int nvec,
/* Configure MSI capability structure */
ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
if (ret) {
msi_mask_irq(entry, mask, ~mask);
msi_mask_irq(entry, mask, 0);
free_msi_irqs(dev);
return ret;
}

ret = msi_verify_entries(dev);
if (ret) {
msi_mask_irq(entry, mask, ~mask);
msi_mask_irq(entry, mask, 0);
free_msi_irqs(dev);
return ret;
}

ret = populate_msi_sysfs(dev);
if (ret) {
msi_mask_irq(entry, mask, ~mask);
msi_mask_irq(entry, mask, 0);
free_msi_irqs(dev);
return ret;
}
Expand Down Expand Up @@ -691,6 +712,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
{
struct irq_affinity_desc *curmsk, *masks = NULL;
struct msi_desc *entry;
void __iomem *addr;
int ret, i;
int vec_count = pci_msix_vec_count(dev);

Expand All @@ -711,6 +733,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,

entry->msi_attrib.is_msix = 1;
entry->msi_attrib.is_64 = 1;

if (entries)
entry->msi_attrib.entry_nr = entries[i].entry;
else
Expand All @@ -722,6 +745,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
entry->msi_attrib.default_irq = dev->irq;
entry->mask_base = base;

addr = pci_msix_desc_addr(entry);
if (addr)
entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);

list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
if (masks)
curmsk++;
Expand All @@ -732,26 +759,25 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
return ret;
}

static void msix_program_entries(struct pci_dev *dev,
struct msix_entry *entries)
static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries)
{
struct msi_desc *entry;
int i = 0;
void __iomem *desc_addr;

for_each_pci_msi_entry(entry, dev) {
if (entries)
entries[i++].vector = entry->irq;
if (entries) {
entries->vector = entry->irq;
entries++;
}
}
}

desc_addr = pci_msix_desc_addr(entry);
if (desc_addr)
entry->masked = readl(desc_addr +
PCI_MSIX_ENTRY_VECTOR_CTRL);
else
entry->masked = 0;
static void msix_mask_all(void __iomem *base, int tsize)
{
u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
int i;

msix_mask_irq(entry, 1);
}
for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
}

/**
Expand All @@ -768,22 +794,33 @@ static void msix_program_entries(struct pci_dev *dev,
static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
int nvec, struct irq_affinity *affd)
{
int ret;
u16 control;
void __iomem *base;
int ret, tsize;
u16 control;

/* Ensure MSI-X is disabled while it is set up */
pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
/*
* Some devices require MSI-X to be enabled before the MSI-X
* registers can be accessed. Mask all the vectors to prevent
* interrupts coming in before they're fully set up.
*/
pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
PCI_MSIX_FLAGS_ENABLE);

pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
/* Request & Map MSI-X table region */
base = msix_map_region(dev, msix_table_size(control));
if (!base)
return -ENOMEM;
tsize = msix_table_size(control);
base = msix_map_region(dev, tsize);
if (!base) {
ret = -ENOMEM;
goto out_disable;
}

/* Ensure that all table entries are masked. */
msix_mask_all(base, tsize);

ret = msix_setup_entries(dev, base, entries, nvec, affd);
if (ret)
return ret;
goto out_disable;

ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX);
if (ret)
Expand All @@ -794,15 +831,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
if (ret)
goto out_free;

/*
* Some devices require MSI-X to be enabled before we can touch the
* MSI-X registers. We need to mask all the vectors to prevent
* interrupts coming in before they're fully set up.
*/
pci_msix_clear_and_set_ctrl(dev, 0,
PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE);

msix_program_entries(dev, entries);
msix_update_entries(dev, entries);

ret = populate_msi_sysfs(dev);
if (ret)
Expand Down Expand Up @@ -836,6 +865,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
out_free:
free_msi_irqs(dev);

out_disable:
pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);

return ret;
}

Expand Down Expand Up @@ -930,8 +962,7 @@ static void pci_msi_shutdown(struct pci_dev *dev)

/* Return the device with MSI unmasked as initial states */
mask = msi_mask(desc->msi_attrib.multi_cap);
/* Keep cached state to be restored */
__pci_msi_desc_mask_irq(desc, mask, ~mask);
msi_mask_irq(desc, mask, 0);

/* Restore dev->irq to its default pin-assertion IRQ */
dev->irq = desc->msi_attrib.default_irq;
Expand Down Expand Up @@ -1016,10 +1047,8 @@ static void pci_msix_shutdown(struct pci_dev *dev)
}

/* Return the device with MSI-X masked as initial states */
for_each_pci_msi_entry(entry, dev) {
/* Keep cached states to be restored */
for_each_pci_msi_entry(entry, dev)
__pci_msix_desc_mask_irq(entry, 1);
}

pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
pci_intx_for_msi(dev, 1);
Expand Down
2 changes: 2 additions & 0 deletions include/linux/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,7 @@ struct dev_links_info {
* @em_pd: device's energy model performance domain
* @pins: For device pin management.
* See Documentation/driver-api/pin-control.rst for details.
* @msi_lock: Lock to protect MSI mask cache and mask register
* @msi_list: Hosts MSI descriptors
* @msi_domain: The generic MSI domain this device is using.
* @numa_node: NUMA node this device is close to.
Expand Down Expand Up @@ -506,6 +507,7 @@ struct device {
struct dev_pin_info *pins;
#endif
#ifdef CONFIG_GENERIC_MSI_IRQ
raw_spinlock_t msi_lock;
struct list_head msi_list;
#endif
#ifdef CONFIG_DMA_OPS
Expand Down
2 changes: 2 additions & 0 deletions include/linux/irq.h
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,7 @@ struct irq_chip {
* IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips
* IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs
* in the suspend path if they are in disabled state
* IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup
*/
enum {
IRQCHIP_SET_TYPE_MASKED = (1 << 0),
Expand All @@ -581,6 +582,7 @@ enum {
IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7),
IRQCHIP_SUPPORTS_NMI = (1 << 8),
IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9),
IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10),
};

#include <linux/irqdesc.h>
Expand Down
2 changes: 1 addition & 1 deletion include/linux/msi.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);

u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag);
u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
void pci_msi_mask_irq(struct irq_data *data);
void pci_msi_unmask_irq(struct irq_data *data);

Expand Down
5 changes: 4 additions & 1 deletion kernel/irq/chip.c
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
} else {
switch (__irq_startup_managed(desc, aff, force)) {
case IRQ_STARTUP_NORMAL:
if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
irq_setup_affinity(desc);
ret = __irq_startup(desc);
irq_setup_affinity(desc);
if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
irq_do_set_affinity(d, aff, false);
Expand Down
Loading

0 comments on commit c4f14ea

Please sign in to comment.