From 2053f7db7af8c5ab2aefe9759df3505e6b840379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pecio?= Date: Tue, 7 Jun 2016 12:34:45 +0200 Subject: [PATCH 001/118] USB: OHCI: Don't mark EDs as ED_OPER if scheduling fails commit c66f59ee5050447b3da92d36f5385a847990a894 upstream. Since ed_schedule begins with marking the ED as "operational", the ED may be left in such state even if scheduling actually fails. This allows future submission attempts to smuggle this ED to the hardware behind the scheduler's back and without linking it to the ohci->eds_in_use list. The former causes bandwidth saturation and data loss on isoc endpoints, the latter crashes the kernel when attempt is made to unlink such ED from this list. Fix ed_schedule to update ED state only on successful return. Signed-off-by: Michal Pecio Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ohci-q.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/ohci-q.c b/drivers/usb/host/ohci-q.c index d029bbe9eb36a..641fed6099115 100644 --- a/drivers/usb/host/ohci-q.c +++ b/drivers/usb/host/ohci-q.c @@ -183,7 +183,6 @@ static int ed_schedule (struct ohci_hcd *ohci, struct ed *ed) { int branch; - ed->state = ED_OPER; ed->ed_prev = NULL; ed->ed_next = NULL; ed->hwNextED = 0; @@ -259,6 +258,8 @@ static int ed_schedule (struct ohci_hcd *ohci, struct ed *ed) /* the HC may not see the schedule updates yet, but if it does * then they'll be properly ordered. */ + + ed->state = ED_OPER; return 0; } From bab5a36c1917216f0c94b521d008bdca393cc409 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: [PATCH 002/118] x86/quirks: Apply nvidia_bugs quirk only on root bus commit 447d29d1d3aed839e74c2401ef63387780ac51ed upstream. Since the following commit: 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") ... early quirks are only applied to devices on the root bus. 
The motivation was to prevent application of the nvidia_bugs quirk on secondary buses. We're about to reintroduce scanning of secondary buses for a quirk to reset the Broadcom 4331 wireless card on 2011/2012 Macs. To prevent regressions, open code the requirement to apply nvidia_bugs only on the root bus. Signed-off-by: Lukas Wunner Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/4d5477c1d76b2f0387a780f2142bbcdd9fee869b.1465690253.git.lukas@wunner.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early-quirks.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index db9a675e751b0..b714e6325e603 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -75,6 +75,13 @@ static void __init nvidia_bugs(int num, int slot, int func) { #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC + /* + * Only applies to Nvidia root ports (bus 0) and not to + * Nvidia graphics cards with PCI ports on secondary buses. + */ + if (num) + return; + /* * All timer overrides on Nvidia are * wrong unless HPET is enabled. From dd4eb74efbd22006c99ae7ff45ef9ef676bb5715 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: [PATCH 003/118] x86/quirks: Reintroduce scanning of secondary buses commit 850c321027c2e31d0afc71588974719a4b565550 upstream. We used to scan secondary buses until the following commit that was applied in 2009: 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") which commit constrained early quirks to the root bus only. Its motivation was to prevent application of the nvidia_bugs quirk on secondary buses. 
We're about to add a quirk to reset the Broadcom 4331 wireless card on 2011/2012 Macs, which is located on a secondary bus behind a PCIe root port. To facilitate that, reintroduce scanning of secondary buses. The commit message of 8659c406ade3 notes that scanning only the root bus "saves quite some unnecessary scanning work". The algorithm used prior to 8659c406ade3 was particularly time consuming because it scanned buses 0 to 31 brute force. To avoid lengthening boot time, employ a recursive strategy which only scans buses that are actually reachable from the root bus. Yinghai Lu pointed out that the secondary bus number read from a bridge's config space may be invalid, in particular a value of 0 would cause an infinite loop. The PCI core goes beyond that and recurses to a child bus only if its bus number is greater than the parent bus number (see pci_scan_bridge()). Since the root bus is numbered 0, this implies that secondary buses may not be 0. Do the same on early scanning. If this algorithm is found to significantly impact boot time or cause infinite loops on broken hardware, it would be possible to limit its recursion depth: The Broadcom 4331 quirk applies at depth 1, all others at depth 0, so the bus need not be scanned deeper than that for now. An alternative approach would be to revert to scanning only the root bus, and apply the Broadcom 4331 quirk to the root ports 8086:1c12, 8086:1e12 and 8086:1e16. Apple always positioned the card behind either of these three ports. The quirk would then check presence of the card in slot 0 below the root port and do its deed. Signed-off-by: Lukas Wunner Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Cc: linux-pci@vger.kernel.org Link: http://lkml.kernel.org/r/f0daa70dac1a9b2483abdb31887173eb6ab77bdf.1465690253.git.lukas@wunner.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early-quirks.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index b714e6325e603..ca82b2e1eabe5 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -609,12 +609,6 @@ struct chipset { void (*f)(int num, int slot, int func); }; -/* - * Only works for devices on the root bus. If you add any devices - * not on bus 0 readd another loop level in early_quirks(). But - * be careful because at least the Nvidia quirk here relies on - * only matching on bus 0. - */ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, @@ -647,6 +641,8 @@ static struct chipset early_qrk[] __initdata = { {} }; +static void __init early_pci_scan_bus(int bus); + /** * check_dev_quirk - apply early quirks to a given PCI device * @num: bus number @@ -655,7 +651,7 @@ static struct chipset early_qrk[] __initdata = { * * Check the vendor & device ID against the early quirks table. * - * If the device is single function, let early_quirks() know so we don't + * If the device is single function, let early_pci_scan_bus() know so we don't * poke at this device again. 
*/ static int __init check_dev_quirk(int num, int slot, int func) @@ -664,6 +660,7 @@ static int __init check_dev_quirk(int num, int slot, int func) u16 vendor; u16 device; u8 type; + u8 sec; int i; class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); @@ -691,25 +688,36 @@ static int __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); + + if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { + sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); + if (sec > num) + early_pci_scan_bus(sec); + } + if (!(type & 0x80)) return -1; return 0; } -void __init early_quirks(void) +static void __init early_pci_scan_bus(int bus) { int slot, func; - if (!early_pci_allowed()) - return; - /* Poor man's PCI discovery */ - /* Only scan the root bus */ for (slot = 0; slot < 32; slot++) for (func = 0; func < 8; func++) { /* Only probe function 0 on single fn devices */ - if (check_dev_quirk(0, slot, func)) + if (check_dev_quirk(bus, slot, func)) break; } } + +void __init early_quirks(void) +{ + if (!early_pci_allowed()) + return; + + early_pci_scan_bus(0); +} From ba1eebc72dc6cf8995562e534a337b965b66ef3b Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: [PATCH 004/118] x86/quirks: Add early quirk to reset Apple AirPort card MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit abb2bafd295fe962bbadc329dbfb2146457283ac upstream. The EFI firmware on Macs contains a full-fledged network stack for downloading OS X images from osrecovery.apple.com. Unfortunately on Macs introduced 2011 and 2012, EFI brings up the Broadcom 4331 wireless card on every boot and leaves it enabled even after ExitBootServices has been called. The card continues to assert its IRQ line, causing spurious interrupts if the IRQ is shared. It also corrupts memory by DMAing received packets, allowing for remote code execution over the air. 
This only stops when a driver is loaded for the wireless card, which may be never if the driver is not installed or blacklisted. The issue seems to be constrained to the Broadcom 4331. Chris Milsted has verified that the newer Broadcom 4360 built into the MacBookPro11,3 (2013/2014) does not exhibit this behaviour. The chances that Apple will ever supply a firmware fix for the older machines appear to be zero. The solution is to reset the card on boot by writing to a reset bit in its mmio space. This must be done as an early quirk and not as a plain vanilla PCI quirk to successfully combat memory corruption by DMAed packets: Matthew Garrett found out in 2012 that the packets are written to EfiBootServicesData memory (http://mjg59.dreamwidth.org/11235.html). This type of memory is made available to the page allocator by efi_free_boot_services(). Plain vanilla PCI quirks run much later, in subsys initcall level. In-between a time window would be open for memory corruption. Random crashes occurring in this time window and attributed to DMAed packets have indeed been observed in the wild by Chris Bainbridge. When Matthew Garrett analyzed the memory corruption issue in 2012, he sought to fix it with a grub quirk which transitions the card to D3hot: http://git.savannah.gnu.org/cgit/grub.git/commit/?id=9d34bb85da56 This approach does not help users with other bootloaders and while it may prevent DMAed packets, it does not cure the spurious interrupts emanating from the card. Unfortunately the card's mmio space is inaccessible in D3hot, so to reset it, we have to undo the effect of Matthew's grub patch and transition the card back to D0. Note that the quirk takes a few shortcuts to reduce the amount of code: The size of BAR 0 and the location of the PM capability is identical on all affected machines and therefore hardcoded. Only the address of BAR 0 differs between models. Also, it is assumed that the BCMA core currently mapped is the 802.11 core. 
The EFI driver seems to always take care of this. Michael Büsch, Bjorn Helgaas and Matt Fleming contributed feedback towards finding the best solution to this problem. The following should be a comprehensive list of affected models: iMac13,1 2012 21.5" [Root Port 00:1c.3 = 8086:1e16] iMac13,2 2012 27" [Root Port 00:1c.3 = 8086:1e16] Macmini5,1 2011 i5 2.3 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,2 2011 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,3 2011 i7 2.0 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini6,1 2012 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1e12] Macmini6,2 2012 i7 2.3 GHz [Root Port 00:1c.1 = 8086:1e12] MacBookPro8,1 2011 13" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,2 2011 15" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,3 2011 17" [Root Port 00:1c.1 = 8086:1c12] MacBookPro9,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro9,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] For posterity, spurious interrupts caused by the Broadcom 4331 wireless card resulted in splats like this (stacktrace omitted): irq 17: nobody cared (try booting with the "irqpoll" option) handlers: [] pcie_isr [] sdhci_irq [sdhci] threaded [] sdhci_thread_irq [sdhci] [] azx_interrupt [snd_hda_codec] Disabling IRQ #17 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79301 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111781 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=728916 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=895951#c16 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1009819 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1098621 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1149632#c5 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1279130 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1332732 Tested-by: Konstantin Simanov # [MacBookPro8,1] Tested-by: Lukas Wunner # [MacBookPro9,1] Tested-by: 
Bryan Paradis # [MacBookPro9,2] Tested-by: Andrew Worsley # [MacBookPro10,1] Tested-by: Chris Bainbridge # [MacBookPro10,2] Signed-off-by: Lukas Wunner Acked-by: Rafał Miłecki Acked-by: Matt Fleming Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris Milsted Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthew Garrett Cc: Michael Buesch Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Cc: b43-dev@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linux-wireless@vger.kernel.org Link: http://lkml.kernel.org/r/48d0972ac82a53d460e5fce77a07b2560db95203.1465690253.git.lukas@wunner.de [ Did minor readability edits. ] Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early-quirks.c | 64 ++++++++++++++++++++++++++++++++++ drivers/bcma/bcma_private.h | 2 -- include/linux/bcma/bcma.h | 1 + 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index ca82b2e1eabe5..9fdf1d3307270 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -11,7 +11,11 @@ #include #include +#include +#include #include +#include +#include #include #include #include @@ -21,6 +25,9 @@ #include #include #include +#include + +#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -596,6 +603,61 @@ static void __init force_disable_hpet(int num, int slot, int func) #endif } +#define BCM4331_MMIO_SIZE 16384 +#define BCM4331_PM_CAP 0x40 +#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) +#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) + +static void __init apple_airport_reset(int bus, int slot, int func) +{ + void __iomem *mmio; + u16 pmcsr; + u64 addr; + int i; + + if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + return; + + /* Card may have been put into 
PCI_D3hot by grub quirk */ + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr); + mdelay(10); + + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + dev_err("Cannot power up Apple AirPort card\n"); + return; + } + } + + addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32; + addr &= PCI_BASE_ADDRESS_MEM_MASK; + + mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); + if (!mmio) { + dev_err("Cannot iomap Apple AirPort card\n"); + return; + } + + pr_info("Resetting Apple AirPort card (left enabled by EFI)\n"); + + for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++) + udelay(10); + + bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET); + bcma_aread32(BCMA_RESET_CTL); + udelay(1); + + bcma_awrite32(BCMA_RESET_CTL, 0); + bcma_aread32(BCMA_RESET_CTL); + udelay(10); + + early_iounmap(mmio, BCM4331_MMIO_SIZE); +} #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 @@ -638,6 +700,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_BROADCOM, 0x4331, + PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} }; diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index 38f156745d533..71df8f2afc6cd 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -8,8 +8,6 @@ #include #include -#define BCMA_CORE_SIZE 0x1000 - #define bcma_err(bus, fmt, ...) \ pr_err("bus%d: " fmt, (bus)->num, ##__VA_ARGS__) #define bcma_warn(bus, fmt, ...) 
\ diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 3feb1b2d75d87..14cd6f77e284f 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -156,6 +156,7 @@ struct bcma_host_ops { #define BCMA_CORE_DEFAULT 0xFFF #define BCMA_MAX_NR_CORES 16 +#define BCMA_CORE_SIZE 0x1000 /* Chip IDs of PCIe devices */ #define BCMA_CHIP_ID_BCM4313 0x4313 From 1b60fcdcf9bc09d3f86084b17d75c050cfed13a8 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 12 May 2016 16:54:08 +0200 Subject: [PATCH 005/118] dmaengine: at_xdmac: align descriptors on 64 bits commit 4a9723e8df68cfce4048517ee32e37f78854b6fb upstream. Having descriptors aligned on 64 bits allows update CNDA and CUBC in an atomic way. Signed-off-by: Ludovic Desroches Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver") Reviewed-by: Nicolas Ferre Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_xdmac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 02f9aa4ebe05f..6251969d1cf9d 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -242,7 +242,7 @@ struct at_xdmac_lld { u32 mbr_dus; /* Destination Microblock Stride Register */ }; - +/* 64-bit alignment needed to update CNDA and CUBC registers in an atomic way. */ struct at_xdmac_desc { struct at_xdmac_lld lld; enum dma_transfer_direction direction; @@ -253,7 +253,7 @@ struct at_xdmac_desc { unsigned int xfer_size; struct list_head descs_list; struct list_head xfer_node; -}; +} __aligned(sizeof(u64)); static inline void __iomem *at_xdmac_chan_reg_base(struct at_xdmac *atxdmac, unsigned int chan_nb) { From 5ce7333f41a81e82d54537e5057224a3cdc78342 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 12 May 2016 16:54:09 +0200 Subject: [PATCH 006/118] dmaengine: at_xdmac: fix residue corruption commit 53398f488821c2b5b15291e3debec6ad33f75d3d upstream. 
An unexpected value of CUBC can lead to a corrupted residue. A more complex sequence is needed to detect an inaccurate value for NCA or CUBC. Signed-off-by: Ludovic Desroches Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver") Reviewed-by: Nicolas Ferre Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_xdmac.c | 54 +++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 6251969d1cf9d..6bbfbba66a554 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1388,6 +1388,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, u32 cur_nda, check_nda, cur_ubc, mask, value; u8 dwidth = 0; unsigned long flags; + bool initd; ret = dma_cookie_status(chan, cookie, txstate); if (ret == DMA_COMPLETE) @@ -1423,34 +1424,43 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, } /* - * When processing the residue, we need to read two registers but we - * can't do it in an atomic way. AT_XDMAC_CNDA is used to find where - * we stand in the descriptor list and AT_XDMAC_CUBC is used - * to know how many data are remaining for the current descriptor. - * Since the dma channel is not paused to not loose data, between the - * AT_XDMAC_CNDA and AT_XDMAC_CUBC read, we may have change of - * descriptor. - * For that reason, after reading AT_XDMAC_CUBC, we check if we are - * still using the same descriptor by reading a second time - * AT_XDMAC_CNDA. If AT_XDMAC_CNDA has changed, it means we have to - * read again AT_XDMAC_CUBC. + * The easiest way to compute the residue should be to pause the DMA + * but doing this can lead to miss some data as some devices don't + * have FIFO. 
+ * We need to read several registers because: + * - DMA is running therefore a descriptor change is possible while + * reading these registers + * - When the block transfer is done, the value of the CUBC register + * is set to its initial value until the fetch of the next descriptor. + * This value will corrupt the residue calculation so we have to skip + * it. + * + * INITD -------- ------------ + * |____________________| + * _______________________ _______________ + * NDA @desc2 \/ @desc3 + * _______________________/\_______________ + * __________ ___________ _______________ + * CUBC 0 \/ MAX desc1 \/ MAX desc2 + * __________/\___________/\_______________ + * + * Since descriptors are aligned on 64 bits, we can assume that + * the update of NDA and CUBC is atomic. * Memory barriers are used to ensure the read order of the registers. - * A max number of retries is set because unlikely it can never ends if - * we are transferring a lot of data with small buffers. + * A max number of retries is set because unlikely it could never ends. 
*/ - cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; - rmb(); - cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC); for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) { - rmb(); check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; - - if (likely(cur_nda == check_nda)) - break; - - cur_nda = check_nda; + rmb(); + initd = !!(at_xdmac_chan_read(atchan, AT_XDMAC_CC) & AT_XDMAC_CC_INITD); rmb(); cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC); + rmb(); + cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; + rmb(); + + if ((check_nda == cur_nda) && initd) + break; } if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) { From 6b373d53e8b99b7e1d4568914851a21bbd07d3bc Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 12 May 2016 16:54:10 +0200 Subject: [PATCH 007/118] dmaengine: at_xdmac: double FIFO flush needed to compute residue commit 9295c41d77ca93aac79cfca6fa09fa1ca5cab66f upstream. Due to the way CUBC register is updated, a double flush is needed to compute an accurate residue. First flush aim is to get data from the DMA FIFO and second one ensures that we won't report data which are not in memory. Signed-off-by: Ludovic Desroches Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver") Reviewed-by: Nicolas Ferre Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_xdmac.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 6bbfbba66a554..e44a1bfb02504 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1413,7 +1413,16 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, residue = desc->xfer_size; /* * Flush FIFO: only relevant when the transfer is source peripheral - * synchronized. + * synchronized. Flush is needed before reading CUBC because data in + * the FIFO are not reported by CUBC. 
Reporting a residue of the + * transfer length while we have data in FIFO can cause issue. + * Usecase: atmel USART has a timeout which means I have received + * characters but there is no more character received for a while. On + * timeout, it requests the residue. If the data are in the DMA FIFO, + * we will return a residue of the transfer length. It means no data + * received. If an application is waiting for these data, it will hang + * since we won't have another USART timeout without receiving new + * data. */ mask = AT_XDMAC_CC_TYPE | AT_XDMAC_CC_DSYNC; value = AT_XDMAC_CC_TYPE_PER_TRAN | AT_XDMAC_CC_DSYNC_PER2MEM; @@ -1468,6 +1477,19 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, goto spin_unlock; } + /* + * Flush FIFO: only relevant when the transfer is source peripheral + * synchronized. Another flush is needed here because CUBC is updated + * when the controller sends the data write command. It can lead to + * report data that are not written in the memory or the device. The + * FIFO flush ensures that data are really written. + */ + if ((desc->lld.mbr_cfg & mask) == value) { + at_xdmac_write(atxdmac, AT_XDMAC_GSWF, atchan->mask); + while (!(at_xdmac_chan_read(atchan, AT_XDMAC_CIS) & AT_XDMAC_CIS_FIS)) + cpu_relax(); + } + /* * Remove size of all microblocks already transferred and the current * one. Then add the remaining size to transfer of the current From 5b3114b2af2fe1cf6d465d594faefaea6c1f328b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2016 14:49:37 -0700 Subject: [PATCH 008/118] mm, sl[au]b: add __GFP_ATOMIC to the GFP reclaim mask commit e838a45f9392a5bd2be1cd3ab0b16ae85857461c upstream. Commit d0164adc89f6 ("mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd") modified __GFP_WAIT to explicitly identify the difference between atomic callers and those that were unwilling to sleep. Later the definition was removed entirely. 
The GFP_RECLAIM_MASK is the set of flags that affect watermark checking and reclaim behaviour but __GFP_ATOMIC was never added. Without it, atomic users of the slab allocator strip the __GFP_ATOMIC flag and cannot access the page allocator atomic reserves. This patch addresses the problem. The user-visible impact depends on the workload but potentially atomic allocations unnecessarily fail without this patch. Link: http://lkml.kernel.org/r/20160610093832.GK2527@techsingularity.net Signed-off-by: Mel Gorman Reported-by: Marcin Wojtas Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 38e24b89e4c40..6979b2bd3227a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -22,7 +22,8 @@ */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ - __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ + __GFP_ATOMIC) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) From 21e9f8977968f2adfbea1f91786d26b8080c80d5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 24 Jun 2016 14:50:10 -0700 Subject: [PATCH 009/118] mm, compaction: abort free scanner if split fails commit a4f04f2c6955aff5e2c08dcb40aca247ff4d7370 upstream. If the memory compaction free scanner cannot successfully split a free page (only possible due to per-zone low watermark), terminate the free scanner rather than continuing to scan memory needlessly. If the watermark is insufficient for a free page of order <= cc->order, then terminate the scanner since all future splits will also likely fail.
This prevents the compaction freeing scanner from scanning all memory on very large zones (very noticeable for zones > 128GB, for instance) when all splits will likely fail while holding zone->lock. compaction_alloc() iterating a 128GB zone has been benchmarked to take over 400ms on some systems whereas any free page isolated and ready to be split ends up failing in split_free_page() because of the low watermark check and thus the iteration continues. The next time compaction occurs, the freeing scanner will likely start at the end of the zone again since no success was made previously and we get the same lengthy iteration until the zone is brought above the low watermark. All thp page faults can take >400ms in such a state without this fix. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606211820350.97086@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Joonsoo Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/compaction.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 7881e072dc33b..3aed7ade34825 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -475,25 +475,23 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); + if (!isolated) + break; + total_isolated += isolated; + cc->nr_freepages += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); page++; } - - /* If a page was split, advance to the end of it */ - if (isolated) { - cc->nr_freepages += isolated; - if (!strict && - cc->nr_migratepages <= cc->nr_freepages) { - blockpfn += isolated; - break; - } - - blockpfn += isolated - 1; - cursor += isolated - 1; - continue; + if (!strict && cc->nr_migratepages <= cc->nr_freepages) { + blockpfn += 
isolated; + break; } + /* Advance to the end of split page */ + blockpfn += isolated - 1; + cursor += isolated - 1; + continue; isolate_fail: if (strict) @@ -503,6 +501,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, } + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + /* * There is a tiny chance that we have read bogus compound_order(), * so be careful to not go outside of the pageblock. @@ -524,9 +525,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (strict && blockpfn < end_pfn) total_isolated = 0; - if (locked) - spin_unlock_irqrestore(&cc->zone->lock, flags); - /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); @@ -966,6 +964,7 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { + unsigned long isolated; /* * This can iterate a massively long zone without finding any @@ -990,8 +989,12 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, false); + isolated = isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, freelist, false); + /* If isolation failed early, do not continue needlessly */ + if (!isolated && isolate_start_pfn < block_end_pfn && + cc->nr_migratepages > cc->nr_freepages) + break; /* * If we isolated enough freepages, or aborted due to async From d32978b8f50e2a1f14dd9a0dd64f563638dafb5a Mon Sep 17 00:00:00 2001 From: Torsten Hilbrich Date: Fri, 24 Jun 2016 14:50:18 -0700 Subject: [PATCH 010/118] fs/nilfs2: fix potential underflow in call to crc32_le commit 63d2f95d63396059200c391ca87161897b99e74a upstream. The value `bytes' comes from the filesystem which is about to be mounted. 
We cannot trust that the value is always in the range we expect it to be. Check its value before using it to calculate the length for the crc32_le call. Its value must be greater than or equal to sumoff + 4. This fixes a kernel bug when accidentally mounting an image file which had the nilfs2 magic value 0x3434 at the right offset 0x406 by chance. The bytes 0x01 0x00 were stored at 0x408 and were interpreted as an s_bytes value of 1. This caused an underflow when subtracting sumoff + 4 (20) in the call to crc32_le. BUG: unable to handle kernel paging request at ffff88021e600000 IP: crc32_le+0x36/0x100 ... Call Trace: nilfs_valid_sb.part.5+0x52/0x60 [nilfs2] nilfs_load_super_block+0x142/0x300 [nilfs2] init_nilfs+0x60/0x390 [nilfs2] nilfs_mount+0x302/0x520 [nilfs2] mount_fs+0x38/0x160 vfs_kern_mount+0x67/0x110 do_mount+0x269/0xe00 SyS_mount+0x9f/0x100 entry_SYSCALL_64_fastpath+0x16/0x71 Link: http://lkml.kernel.org/r/1466778587-5184-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Torsten Hilbrich Tested-by: Torsten Hilbrich Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/the_nilfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 69bd801afb53b..37e49cb2ac4c4 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -443,7 +443,7 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) return 0; bytes = le16_to_cpu(sbp->s_bytes); - if (bytes > BLOCK_SIZE) + if (bytes < sumoff + 4 || bytes > BLOCK_SIZE) return 0; crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, sumoff); From 41a3b3cbb6846247f36e09f96f3680f94791f8b8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 14 Jul 2016 12:06:50 -0700 Subject: [PATCH 011/118] mm, compaction: prevent VM_BUG_ON when terminating freeing scanner commit
a46cbf3bc53b6a93fb84a5ffb288c354fa807954 upstream. It's possible to isolate some freepages in a pageblock and then fail split_free_page() due to the low watermark check. In this case, we hit VM_BUG_ON() because the freeing scanner terminated early without a contended lock or enough freepages. This should never have been a VM_BUG_ON() since it's not a fatal condition. It should have been a VM_WARN_ON() at best, or even handled gracefully. Regardless, we need to terminate anytime the full pageblock scan was not done. The logic belongs in isolate_freepages_block(), so handle its state gracefully by terminating the pageblock loop and making a note to restart at the same pageblock next time since it was not possible to complete the scan this time. [rientjes@google.com: don't rescan pages in a pageblock] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1607111244150.83138@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606291436300.145590@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Minchan Kim Tested-by: Minchan Kim Cc: Joonsoo Kim Cc: Hugh Dickins Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/compaction.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3aed7ade34825..dba02dec71952 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -964,8 +964,6 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { - unsigned long isolated; - /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need @@ -989,36 +987,30 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. 
*/ - isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, false); - /* If isolation failed early, do not continue needlessly */ - if (!isolated && isolate_start_pfn < block_end_pfn && - cc->nr_migratepages > cc->nr_freepages) - break; + isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, + freelist, false); /* - * If we isolated enough freepages, or aborted due to async - * compaction being contended, terminate the loop. - * Remember where the free scanner should restart next time, - * which is where isolate_freepages_block() left off. - * But if it scanned the whole pageblock, isolate_start_pfn - * now points at block_end_pfn, which is the start of the next - * pageblock. - * In that case we will however want to restart at the start - * of the previous pageblock. + * If we isolated enough freepages, or aborted due to lock + * contention, terminate. */ if ((cc->nr_freepages >= cc->nr_migratepages) || cc->contended) { - if (isolate_start_pfn >= block_end_pfn) + if (isolate_start_pfn >= block_end_pfn) { + /* + * Restart at previous pageblock if more + * freepages can be isolated next time. + */ isolate_start_pfn = block_start_pfn - pageblock_nr_pages; + } break; - } else { + } else if (isolate_start_pfn < block_end_pfn) { /* - * isolate_freepages_block() should not terminate - * prematurely unless contended, or isolated enough + * If isolation failed early, do not continue + * needlessly. */ - VM_BUG_ON(isolate_start_pfn < block_end_pfn); + break; } } From e534d9261acee101807f838e495d43a9d7d83cb6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 14 Jul 2016 12:07:20 -0700 Subject: [PATCH 012/118] mm, meminit: always return a valid node from early_pfn_to_nid commit e4568d3803852d00effd41dcdd489e726b998879 upstream. early_pfn_to_nid can return node 0 if a PFN is invalid on machines that has no node 0. 
A machine with only node 1 was observed to crash with the following message: BUG: unable to handle kernel paging request at 000000000002a3c8 PGD 0 Modules linked in: Hardware name: Supermicro H8DSP-8/H8DSP-8, BIOS 080011 06/30/2006 task: ffffffff81c0d500 ti: ffffffff81c00000 task.ti: ffffffff81c00000 RIP: reserve_bootmem_region+0x6a/0xef CR2: 000000000002a3c8 CR3: 0000000001c06000 CR4: 00000000000006b0 Call Trace: free_all_bootmem+0x4b/0x12a mem_init+0x70/0xa3 start_kernel+0x25b/0x49b The problem is that early_page_uninitialised uses the early_pfn_to_nid helper which returns node 0 for invalid PFNs. No caller of early_pfn_to_nid cares except early_page_uninitialised. This patch has early_pfn_to_nid always return a valid node. Link: http://lkml.kernel.org/r/1468008031-3848-3-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 99c1738684ece..ce9d0d47ddc09 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1057,7 +1057,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid < 0) - nid = 0; + nid = first_online_node; spin_unlock(&early_pfn_lock); return nid; From becdfa32eeaf230253b20490179134d1bb898c34 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 14 Jul 2016 12:07:23 -0700 Subject: [PATCH 013/118] mm, meminit: ensure node is online before checking whether pages are uninitialised commit ef70b6f41cda6270165a6f27b2548ed31cfa3cb2 upstream. early_page_uninitialised looks up an arbitrary PFN. While a machine without node 0 will boot with "mm, page_alloc: Always return a valid node from early_pfn_to_nid", it works because it assumes that nodes are always in PFN order. 
This is not guaranteed so this patch adds robustness by always checking if the node being checked is online. Link: http://lkml.kernel.org/r/1468008031-3848-4-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce9d0d47ddc09..2bcdfbf8c36d9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -275,7 +275,9 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { - if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) + int nid = early_pfn_to_nid(pfn); + + if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return true; return false; From f2e7c1f79f13a6a073d98f4e69fe81aa230f2607 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 14 Jul 2016 12:07:29 -0700 Subject: [PATCH 014/118] vmlinux.lds: account for destructor sections commit e41f501d391265ff568f3e49d6128cc30856a36f upstream. If CONFIG_KASAN is enabled and gcc is configured with --disable-initfini-array and/or gold linker is used, gcc emits .ctors/.dtors and .text.startup/.text.exit sections instead of .init_array/.fini_array. .dtors section is not explicitly accounted in the linker script and messes vvar/percpu layout. 
We want: ffffffff822bfd80 D _edata ffffffff822c0000 D __vvar_beginning_hack ffffffff822c0000 A __vvar_page ffffffff822c0080 0000000000000098 D vsyscall_gtod_data ffffffff822c1000 A __init_begin ffffffff822c1000 D init_per_cpu__irq_stack_union ffffffff822c1000 A __per_cpu_load ffffffff822d3000 D init_per_cpu__gdt_page We got: ffffffff8279a600 D _edata ffffffff8279b000 A __vvar_page ffffffff8279c000 A __init_begin ffffffff8279c000 D init_per_cpu__irq_stack_union ffffffff8279c000 A __per_cpu_load ffffffff8279e000 D __vvar_beginning_hack ffffffff8279e080 0000000000000098 D vsyscall_gtod_data ffffffff827ae000 D init_per_cpu__gdt_page This happens because __vvar_page and .vvar get different addresses in arch/x86/kernel/vmlinux.lds.S: . = ALIGN(PAGE_SIZE); __vvar_page = .; .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { /* work around gold bug 13023 */ __vvar_beginning_hack = .; Discard .dtors/.fini_array/.text.exit, since we don't call dtors. Merge .text.startup into init text. Link: http://lkml.kernel.org/r/1467386363-120030-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/vmlinux.lds.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c4bd0e2c173c0..ef2e8c97e1832 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -531,15 +531,19 @@ #define INIT_TEXT \ *(.init.text) \ + *(.text.startup) \ MEM_DISCARD(init.text) #define EXIT_DATA \ *(.exit.data) \ + *(.fini_array) \ + *(.dtors) \ MEM_DISCARD(exit.data) \ MEM_DISCARD(exit.rodata) #define EXIT_TEXT \ *(.exit.text) \ + *(.text.exit) \ MEM_DISCARD(exit.text) #define EXIT_CALL \ From 78edebc495bbd8e3c2cced6a937467140a4fd52b Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 20 Jul 2016 15:45:08 -0700 Subject: [PATCH 015/118] pps: do not crash when failed 
to register commit 368301f2fe4b07e5fb71dba3cc566bc59eb6705f upstream. With this command sequence: modprobe plip modprobe pps_parport rmmod pps_parport the pps_parport module causes this crash: BUG: unable to handle kernel NULL pointer dereference at (null) IP: parport_detach+0x1d/0x60 [pps_parport] Oops: 0000 [#1] SMP ... Call Trace: parport_unregister_driver+0x65/0xc0 [parport] SyS_delete_module+0x187/0x210 The sequence that builds up to this is: 1) plip is loaded and takes the parport device for exclusive use: plip0: Parallel port at 0x378, using IRQ 7. 2) pps_parport then fails to grab the device: pps_parport: parallel port PPS client parport0: cannot grant exclusive access for device pps_parport pps_parport: couldn't register with parport0 3) rmmod of pps_parport is then killed because it tries to access pardev->name, but pardev (taken from port->cad) is NULL. So add a check for NULL in the test there too. Link: http://lkml.kernel.org/r/20160714115245.12651-1-jslaby@suse.cz Signed-off-by: Jiri Slaby Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/pps/clients/pps_parport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pps/clients/pps_parport.c b/drivers/pps/clients/pps_parport.c index 38a8bbe748100..83797d89c30fe 100644 --- a/drivers/pps/clients/pps_parport.c +++ b/drivers/pps/clients/pps_parport.c @@ -195,7 +195,7 @@ static void parport_detach(struct parport *port) struct pps_client_pp *device; /* FIXME: oooh, this is ugly! */ - if (strcmp(pardev->name, KBUILD_MODNAME)) + if (!pardev || strcmp(pardev->name, KBUILD_MODNAME)) /* not our port */ return; From dc20f3244ae920430d9d9f19939a13a0279380ca Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 9 Jun 2016 15:20:05 +0300 Subject: [PATCH 016/118] kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w commit 57675cb976eff977aefb428e68e4e0236d48a9ff upstream.
Lengthy output of sysrq-w may take a lot of time on slow serial console. Currently we reset NMI-watchdog on the current CPU to avoid spurious lockup messages. Sometimes this doesn't work since softlockup watchdog might trigger on another CPU which is waiting for an IPI to proceed. We reset softlockup watchdogs on all CPUs, but we do this only after listing all tasks, and this may be too late on a busy system. So, reset watchdogs CPUs earlier, in for_each_process_thread() loop. Signed-off-by: Andrey Ryabinin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c0cdb5a73f80..67d1e1597d9c1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4951,14 +4951,16 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif From e0bc4e7e1c876f8e58ba381bf5194a8b8c8d448f Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Thu, 23 Jun 2016 11:00:39 +0300 Subject: [PATCH 017/118] arc: unwind: warn only once if DW2_UNWIND is disabled commit 9bd54517ee86cb164c734f72ea95aeba4804f10b upstream. 
If CONFIG_ARC_DW2_UNWIND is disabled every time arc_unwind_core() gets called following message gets printed in debug console: ----------------->8--------------- CONFIG_ARC_DW2_UNWIND needs to be enabled ----------------->8--------------- That message makes sense if user indeed wants to see a backtrace or get nice function call-graphs in perf but what if user disabled unwinder for the purpose? Why pollute his debug console? So instead we'll warn user about possibly missing feature once and let him decide if that was what he or she really wanted. Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 001de4ce711ea..11b50959f20ed 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -142,7 +142,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, * prelogue is setup (callee regs saved and then fp set and not other * way around */ - pr_warn("CONFIG_ARC_DW2_UNWIND needs to be enabled\n"); + pr_warn_once("CONFIG_ARC_DW2_UNWIND needs to be enabled\n"); return 0; #endif From 6bce4d0eb37b1c4268b728985e98dbdcd9592632 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 28 Jun 2016 09:42:25 +0530 Subject: [PATCH 018/118] ARC: unwind: ensure that .debug_frame is generated (vs. .eh_frame) commit f52e126cc7476196f44f3c313b7d9f0699a881fc upstream. With recent binutils update to support dwarf CFI pseudo-ops in gas, we now get .eh_frame vs. .debug_frame. Although the call frame info is exactly the same in both, the CIE differs, which the current kernel unwinder can't cope with. This broke both the kernel unwinder as well as loadable modules (latter because of a new unhandled relo R_ARC_32_PCREL from .rela.eh_frame in the module loader) The ideal solution would be to switch unwinder to .eh_frame. 
For now however we can make do by just ensuring .debug_frame is generated by removing -fasynchronous-unwind-tables .eh_frame generated with -gdwarf-2 -fasynchronous-unwind-tables .debug_frame generated with -gdwarf-2 Fixes STAR 9001058196 Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index aeb19021099e3..209d8451e23d7 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -48,8 +48,6 @@ endif endif -cflags-$(CONFIG_ARC_DW2_UNWIND) += -fasynchronous-unwind-tables - # By default gcc 4.8 generates dwarf4 which kernel unwinder can't grok ifeq ($(atleast_gcc48),y) cflags-$(CONFIG_ARC_DW2_UNWIND) += -gdwarf-2 From 66af4230b41fe0fe4d88e011a849437f176e7732 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 21 Jun 2016 14:26:36 -0400 Subject: [PATCH 019/118] xen/pciback: Fix conf_space read/write overlap check. commit 02ef871ecac290919ea0c783d05da7eedeffc10e upstream. Current overlap check is evaluating to false a case where a filter field is fully contained (proper subset) of a r/w request. This change applies classical overlap check instead to include all the scenarios. More specifically, for (Hilscher GmbH CIFX 50E-DP(M/S)) device driver the logic is such that the entire confspace is read and written in 4 byte chunks. In this case as an example, CACHE_LINE_SIZE, LATENCY_TIMER and PCI_BIST are arriving together in one call to xen_pcibk_config_write() with offset == 0xc and size == 4. With the existing overlap check the LATENCY_TIMER field (offset == 0xd, length == 1) is fully contained in the write request and hence is excluded from write, which is incorrect.
Signed-off-by: Andrey Grodzovsky Reviewed-by: Boris Ostrovsky Reviewed-by: Jan Beulich Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xen-pciback/conf_space.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c index 9c234209d8b52..47a4177b16d20 100644 --- a/drivers/xen/xen-pciback/conf_space.c +++ b/drivers/xen/xen-pciback/conf_space.c @@ -183,8 +183,7 @@ int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, field_start = OFFSET(cfg_entry); field_end = OFFSET(cfg_entry) + field->size; - if ((req_start >= field_start && req_start < field_end) - || (req_end > field_start && req_end <= field_end)) { + if (req_end > field_start && field_end > req_start) { err = conf_space_read(dev, cfg_entry, field_start, &tmp_val); if (err) @@ -230,8 +229,7 @@ int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) field_start = OFFSET(cfg_entry); field_end = OFFSET(cfg_entry) + field->size; - if ((req_start >= field_start && req_start < field_end) - || (req_end > field_start && req_end <= field_end)) { + if (req_end > field_start && field_end > req_start) { tmp_val = 0; err = xen_pcibk_config_read(dev, field_start, From d1e6344e0b97f8c7d2e14a5bd892d0180274e0c5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Jul 2016 01:23:57 -0600 Subject: [PATCH 020/118] xenbus: don't BUG() on user mode induced condition commit 0beef634b86a1350c31da5fcc2992f0d7c8a622b upstream. Inability to locate a user mode specified transaction ID should not lead to a kernel crash. For other than XS_TRANSACTION_START also don't issue anything to xenbus if the specified ID doesn't match that of any active transaction. 
Signed-off-by: Jan Beulich Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xenbus/xenbus_dev_frontend.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 9433e46518c8d..531e764749838 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -316,11 +316,18 @@ static int xenbus_write_transaction(unsigned msg_type, rc = -ENOMEM; goto out; } + } else { + list_for_each_entry(trans, &u->transactions, list) + if (trans->handle.id == u->u.msg.tx_id) + break; + if (&trans->list == &u->transactions) + return -ESRCH; } reply = xenbus_dev_request_and_reply(&u->u.msg); if (IS_ERR(reply)) { - kfree(trans); + if (msg_type == XS_TRANSACTION_START) + kfree(trans); rc = PTR_ERR(reply); goto out; } @@ -333,12 +340,7 @@ static int xenbus_write_transaction(unsigned msg_type, list_add(&trans->list, &u->transactions); } } else if (u->u.msg.type == XS_TRANSACTION_END) { - list_for_each_entry(trans, &u->transactions, list) - if (trans->handle.id == u->u.msg.tx_id) - break; - BUG_ON(&trans->list == &u->transactions); list_del(&trans->list); - kfree(trans); } From ee8b7ff00d5b8fc931a63d31349e9a7189cc72d7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Jul 2016 01:32:04 -0600 Subject: [PATCH 021/118] xenbus: don't bail early from xenbus_dev_request_and_reply() commit 7469be95a487319514adce2304ad2af3553d2fc9 upstream. xenbus_dev_request_and_reply() needs to track whether a transaction is open. For XS_TRANSACTION_START messages it calls transaction_start() and for XS_TRANSACTION_END messages it calls transaction_end(). If sending an XS_TRANSACTION_START message fails or responds with an error, the transaction is not open and transaction_end() must be called.
If sending an XS_TRANSACTION_END message fails, the transaction is still open, but if an error response is returned the transaction is closed. Commit 027bd7e89906 ("xen/xenbus: Avoid synchronous wait on XenBus stalling shutdown/restart") introduced a regression where failed XS_TRANSACTION_START messages were leaving the transaction open. This can cause problems with suspend (and migration) as all transactions must be closed before suspending. It appears that the problematic change was added accidentally, so just remove it. Signed-off-by: Jan Beulich Cc: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xenbus/xenbus_xs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index ba804f3d8278d..ce65591b4168f 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -250,9 +250,6 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); - if (IS_ERR(ret)) - return ret; - if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) From 90bed827ea910f82ab17ee154f501b5ae71617e6 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:07 -0400 Subject: [PATCH 022/118] ALSA: timer: Fix leak in SNDRV_TIMER_IOCTL_PARAMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit cec8f96e49d9be372fdb0c3836dcf31ec71e457e upstream. The stack object “tread” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. 
Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 7c6155f5865b8..245c5f340ae5a 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1746,6 +1746,7 @@ static int snd_timer_user_params(struct file *file, if (tu->timeri->flags & SNDRV_TIMER_IFLG_EARLY_EVENT) { if (tu->tread) { struct snd_timer_tread tread; + memset(&tread, 0, sizeof(tread)); tread.event = SNDRV_TIMER_EVENT_EARLY; tread.tstamp.tv_sec = 0; tread.tstamp.tv_nsec = 0; From 3e6af33c73fb7ec7be8dedd01047162ef64a26a5 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:20 -0400 Subject: [PATCH 023/118] ALSA: timer: Fix leak in events via snd_timer_user_ccallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9a47e9cff994f37f7f0dbd9ae23740d0f64f9fe6 upstream. The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. 
Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 245c5f340ae5a..3a5e0dd9cebe0 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1247,6 +1247,7 @@ static void snd_timer_user_ccallback(struct snd_timer_instance *timeri, tu->tstamp = *tstamp; if ((tu->filter & (1 << event)) == 0 || !tu->tread) return; + memset(&r1, 0, sizeof(r1)); r1.event = event; r1.tstamp = *tstamp; r1.val = resolution; From 8fd58e050f90ed5d5161413c75a8a8271934566c Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:32 -0400 Subject: [PATCH 024/118] ALSA: timer: Fix leak in events via snd_timer_user_tinterrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e4ec8cc8039a7063e24204299b462bd1383184a5 upstream. The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 3a5e0dd9cebe0..637d034bb084f 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1282,6 +1282,7 @@ static void snd_timer_user_tinterrupt(struct snd_timer_instance *timeri, } if ((tu->filter & (1 << SNDRV_TIMER_EVENT_RESOLUTION)) && tu->last_resolution != resolution) { + memset(&r1, 0, sizeof(r1)); r1.event = SNDRV_TIMER_EVENT_RESOLUTION; r1.tstamp = tstamp; r1.val = resolution; From 12a83f6702402803190450218c0a80a1d5fb2b09 Mon Sep 17 00:00:00 2001 From: Sinclair Yeh Date: Thu, 23 Jun 2016 17:37:34 -0700 Subject: [PATCH 025/118] Input: vmmouse - remove port reservation commit 60842ef8128e7bf58c024814cd0dc14319232b6c upstream. 
The VMWare EFI BIOS will expose port 0x5658 as an ACPI resource. This causes the port to be reserved by the ACPI module as the system comes up, making it unavailable to be reserved again by other drivers, thus preserving this VMWare port for special use in a VMWare guest. This port is designed to be shared among multiple VMWare services, such as the VMMOUSE. Because of this, VMMOUSE should not try to reserve this port on its own. The VMWare non-EFI BIOS does not do this to preserve compatibility with existing/legacy VMs. It is known that there is small chance a VM may be configured such that these ports get reserved by other non-VMWare devices, and if this ever happens, the result is undefined. Signed-off-by: Sinclair Yeh Reviewed-by: Thomas Hellstrom Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/vmmouse.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/input/mouse/vmmouse.c b/drivers/input/mouse/vmmouse.c index a3f0f5a47490e..0f586780ceb4b 100644 --- a/drivers/input/mouse/vmmouse.c +++ b/drivers/input/mouse/vmmouse.c @@ -355,18 +355,11 @@ int vmmouse_detect(struct psmouse *psmouse, bool set_properties) return -ENXIO; } - if (!request_region(VMMOUSE_PROTO_PORT, 4, "vmmouse")) { - psmouse_dbg(psmouse, "VMMouse port in use.\n"); - return -EBUSY; - } - /* Check if the device is present */ response = ~VMMOUSE_PROTO_MAGIC; VMMOUSE_CMD(GETVERSION, 0, version, response, dummy1, dummy2); - if (response != VMMOUSE_PROTO_MAGIC || version == 0xffffffffU) { - release_region(VMMOUSE_PROTO_PORT, 4); + if (response != VMMOUSE_PROTO_MAGIC || version == 0xffffffffU) return -ENXIO; - } if (set_properties) { psmouse->vendor = VMMOUSE_VENDOR; @@ -374,8 +367,6 @@ int vmmouse_detect(struct psmouse *psmouse, bool set_properties) psmouse->model = version; } - release_region(VMMOUSE_PROTO_PORT, 4); - return 0; } @@ -394,7 +385,6 @@ static void vmmouse_disconnect(struct psmouse *psmouse)
psmouse_reset(psmouse); input_unregister_device(priv->abs_dev); kfree(priv); - release_region(VMMOUSE_PROTO_PORT, 4); } /** @@ -438,15 +428,10 @@ int vmmouse_init(struct psmouse *psmouse) struct input_dev *rel_dev = psmouse->dev, *abs_dev; int error; - if (!request_region(VMMOUSE_PROTO_PORT, 4, "vmmouse")) { - psmouse_dbg(psmouse, "VMMouse port in use.\n"); - return -EBUSY; - } - psmouse_reset(psmouse); error = vmmouse_enable(psmouse); if (error) - goto release_region; + return error; priv = kzalloc(sizeof(*priv), GFP_KERNEL); abs_dev = input_allocate_device(); @@ -502,8 +487,5 @@ int vmmouse_init(struct psmouse *psmouse) kfree(priv); psmouse->private = NULL; -release_region: - release_region(VMMOUSE_PROTO_PORT, 4); - return error; } From 4bc476735615a766b6f3014984b3d06378f8f26a Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 21 Jun 2016 16:09:00 -0700 Subject: [PATCH 026/118] Input: elantech - add more IC body types to the list commit 226ba707744a51acb4244724e09caacb1d96aed9 upstream. The touchpad in HP Pavilion 14-ab057ca reports it's version as 12 and according to Elan both 11 and 12 are valid IC types and should be identified as hw_version 4. Reported-by: Patrick Lessard Tested-by: Patrick Lessard Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/elantech.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 78f93cf68840d..be5b399da5d3e 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -1568,13 +1568,7 @@ static int elantech_set_properties(struct elantech_data *etd) case 5: etd->hw_version = 3; break; - case 6: - case 7: - case 8: - case 9: - case 10: - case 13: - case 14: + case 6 ... 
14: etd->hw_version = 4; break; default: From 526410bc85d3f9e30515b2086eaea6440231ba48 Mon Sep 17 00:00:00 2001 From: Cameron Gutman Date: Thu, 23 Jun 2016 10:24:42 -0700 Subject: [PATCH 027/118] Input: xpad - fix oops when attaching an unknown Xbox One gamepad commit c7f1429389ec1aa25e042bb13451385fbb596f8c upstream. Xbox One controllers have multiple interfaces which all have the same class, subclass, and protocol. One of the these interfaces has only a single endpoint. When Xpad attempts to bind to this interface, it causes an oops when trying initialize the output URB by trying to access the second endpoint's descriptor. This situation was avoided for known Xbox One devices by checking the XTYPE constant associated with the VID and PID tuple. However, this breaks when new or previously unknown Xbox One controllers are attached to the system. This change addresses the problem by deriving the XTYPE for Xbox One controllers based on the interface protocol before checking the interface number. Fixes: 1a48ff81b391 ("Input: xpad - add support for Xbox One controllers") Signed-off-by: Cameron Gutman Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index fd4100d56d8c5..35e444b4b8b05 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -1206,16 +1206,6 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id break; } - if (xpad_device[i].xtype == XTYPE_XBOXONE && - intf->cur_altsetting->desc.bInterfaceNumber != 0) { - /* - * The Xbox One controller lists three interfaces all with the - * same interface class, subclass and protocol. Differentiate by - * interface number. 
- */ - return -ENODEV; - } - xpad = kzalloc(sizeof(struct usb_xpad), GFP_KERNEL); if (!xpad) return -ENOMEM; @@ -1246,6 +1236,8 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id if (intf->cur_altsetting->desc.bInterfaceClass == USB_CLASS_VENDOR_SPEC) { if (intf->cur_altsetting->desc.bInterfaceProtocol == 129) xpad->xtype = XTYPE_XBOX360W; + else if (intf->cur_altsetting->desc.bInterfaceProtocol == 208) + xpad->xtype = XTYPE_XBOXONE; else xpad->xtype = XTYPE_XBOX360; } else { @@ -1260,6 +1252,17 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id xpad->mapping |= MAP_STICKS_TO_NULL; } + if (xpad->xtype == XTYPE_XBOXONE && + intf->cur_altsetting->desc.bInterfaceNumber != 0) { + /* + * The Xbox One controller lists three interfaces all with the + * same interface class, subclass and protocol. Differentiate by + * interface number. + */ + error = -ENODEV; + goto err_free_in_urb; + } + error = xpad_init_output(intf, xpad); if (error) goto err_free_in_urb; From 1dbdba62abb49c84981354d821b22e0b6562232a Mon Sep 17 00:00:00 2001 From: Ping Cheng Date: Thu, 23 Jun 2016 10:54:17 -0700 Subject: [PATCH 028/118] Input: wacom_w8001 - w8001_MAX_LENGTH should be 13 commit 12afb34400eb2b301f06b2aa3535497d14faee59 upstream. Somehow the patch that added two-finger touch support forgot to update W8001_MAX_LENGTH from 11 to 13. 
Signed-off-by: Ping Cheng Reviewed-by: Peter Hutterer Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/touchscreen/wacom_w8001.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/wacom_w8001.c b/drivers/input/touchscreen/wacom_w8001.c index 2792ca397dd08..3ed0ce1e4dcb1 100644 --- a/drivers/input/touchscreen/wacom_w8001.c +++ b/drivers/input/touchscreen/wacom_w8001.c @@ -27,7 +27,7 @@ MODULE_AUTHOR("Jaya Kumar "); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); -#define W8001_MAX_LENGTH 11 +#define W8001_MAX_LENGTH 13 #define W8001_LEAD_MASK 0x80 #define W8001_LEAD_BYTE 0x80 #define W8001_TAB_MASK 0x40 From c2e5023425c82caf1957e401a5183b9e62f43ebb Mon Sep 17 00:00:00 2001 From: Cameron Gutman Date: Wed, 29 Jun 2016 09:51:35 -0700 Subject: [PATCH 029/118] Input: xpad - validate USB endpoint count during probe commit caca925fca4fb30c67be88cacbe908eec6721e43 upstream. This prevents a malicious USB device from causing an oops. 
Signed-off-by: Cameron Gutman Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 35e444b4b8b05..2b2f9d66c2c76 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -1200,6 +1200,9 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id int ep_irq_in_idx; int i, error; + if (intf->cur_altsetting->desc.bNumEndpoints != 2) + return -ENODEV; + for (i = 0; xpad_device[i].idVendor; i++) { if ((le16_to_cpu(udev->descriptor.idVendor) == xpad_device[i].idVendor) && (le16_to_cpu(udev->descriptor.idProduct) == xpad_device[i].idProduct)) From aab045e9a95b139aa628498787ce132ade04ff47 Mon Sep 17 00:00:00 2001 From: Michael Welling Date: Wed, 20 Jul 2016 10:02:07 -0700 Subject: [PATCH 030/118] Input: tsc200x - report proper input_dev name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e9003c9cfaa17d26991688268b04244adb67ee2b upstream. Passes input_id struct to the common probe function for the tsc200x drivers instead of just the bustype. This allows for the use of the product variable to set the input_dev->name variable according to the type of touchscreen used. Note that when we introduced support for TSC2004 we started calling everything TSC200X, so let's keep this quirk. 
Signed-off-by: Michael Welling Acked-by: Pavel Machek Acked-by: Pali Rohár Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/touchscreen/tsc2004.c | 7 ++++++- drivers/input/touchscreen/tsc2005.c | 7 ++++++- drivers/input/touchscreen/tsc200x-core.c | 15 ++++++++++++--- drivers/input/touchscreen/tsc200x-core.h | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/drivers/input/touchscreen/tsc2004.c b/drivers/input/touchscreen/tsc2004.c index 7295c198aa086..6fe55d598faca 100644 --- a/drivers/input/touchscreen/tsc2004.c +++ b/drivers/input/touchscreen/tsc2004.c @@ -22,6 +22,11 @@ #include #include "tsc200x-core.h" +static const struct input_id tsc2004_input_id = { + .bustype = BUS_I2C, + .product = 2004, +}; + static int tsc2004_cmd(struct device *dev, u8 cmd) { u8 tx = TSC200X_CMD | TSC200X_CMD_12BIT | cmd; @@ -42,7 +47,7 @@ static int tsc2004_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { - return tsc200x_probe(&i2c->dev, i2c->irq, BUS_I2C, + return tsc200x_probe(&i2c->dev, i2c->irq, &tsc2004_input_id, devm_regmap_init_i2c(i2c, &tsc200x_regmap_config), tsc2004_cmd); } diff --git a/drivers/input/touchscreen/tsc2005.c b/drivers/input/touchscreen/tsc2005.c index b9f593dfd2ef8..f2c5f0e47f77d 100644 --- a/drivers/input/touchscreen/tsc2005.c +++ b/drivers/input/touchscreen/tsc2005.c @@ -24,6 +24,11 @@ #include #include "tsc200x-core.h" +static const struct input_id tsc2005_input_id = { + .bustype = BUS_SPI, + .product = 2005, +}; + static int tsc2005_cmd(struct device *dev, u8 cmd) { u8 tx = TSC200X_CMD | TSC200X_CMD_12BIT | cmd; @@ -62,7 +67,7 @@ static int tsc2005_probe(struct spi_device *spi) if (error) return error; - return tsc200x_probe(&spi->dev, spi->irq, BUS_SPI, + return tsc200x_probe(&spi->dev, spi->irq, &tsc2005_input_id, devm_regmap_init_spi(spi, &tsc200x_regmap_config), tsc2005_cmd); } diff --git a/drivers/input/touchscreen/tsc200x-core.c b/drivers/input/touchscreen/tsc200x-core.c index 
15240c1ee850a..dfa7f1c4f5453 100644 --- a/drivers/input/touchscreen/tsc200x-core.c +++ b/drivers/input/touchscreen/tsc200x-core.c @@ -450,7 +450,7 @@ static void tsc200x_close(struct input_dev *input) mutex_unlock(&ts->mutex); } -int tsc200x_probe(struct device *dev, int irq, __u16 bustype, +int tsc200x_probe(struct device *dev, int irq, const struct input_id *tsc_id, struct regmap *regmap, int (*tsc200x_cmd)(struct device *dev, u8 cmd)) { @@ -547,9 +547,18 @@ int tsc200x_probe(struct device *dev, int irq, __u16 bustype, snprintf(ts->phys, sizeof(ts->phys), "%s/input-ts", dev_name(dev)); - input_dev->name = "TSC200X touchscreen"; + if (tsc_id->product == 2004) { + input_dev->name = "TSC200X touchscreen"; + } else { + input_dev->name = devm_kasprintf(dev, GFP_KERNEL, + "TSC%04d touchscreen", + tsc_id->product); + if (!input_dev->name) + return -ENOMEM; + } + input_dev->phys = ts->phys; - input_dev->id.bustype = bustype; + input_dev->id = *tsc_id; input_dev->dev.parent = dev; input_dev->evbit[0] = BIT(EV_ABS) | BIT(EV_KEY); input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); diff --git a/drivers/input/touchscreen/tsc200x-core.h b/drivers/input/touchscreen/tsc200x-core.h index 7a482d1026148..49a63a3c68409 100644 --- a/drivers/input/touchscreen/tsc200x-core.h +++ b/drivers/input/touchscreen/tsc200x-core.h @@ -70,7 +70,7 @@ extern const struct regmap_config tsc200x_regmap_config; extern const struct dev_pm_ops tsc200x_pm_ops; -int tsc200x_probe(struct device *dev, int irq, __u16 bustype, +int tsc200x_probe(struct device *dev, int irq, const struct input_id *tsc_id, struct regmap *regmap, int (*tsc200x_cmd)(struct device *dev, u8 cmd)); int tsc200x_remove(struct device *dev); From ca3455867e4a98d34b65400ee82c7c3dacce510c Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Fri, 27 May 2016 14:17:10 +0800 Subject: [PATCH 031/118] pvclock: Add CPU barriers to get correct version value commit 749d088b8e7f4b9826ede02b9a043e417fa84aa1 upstream. 
Protocol for the "version" fields is: hypervisor raises it (making it uneven) before it starts updating the fields and raises it again (making it even) when it is done. Thus the guest can make sure the time values it got are consistent by checking the version before and after reading them. Add CPU barriers after getting version value just like what function vread_pvclock does, because all of the callees in this function are inline. Fixes: 502dfeff239e8313bfbe906ca0a1a6827ac8481b Signed-off-by: Minfei Huang Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pvclock.h | 2 ++ arch/x86/kernel/pvclock.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 7a6bed5c08bc3..baad72e4c1000 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -76,6 +76,8 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u8 ret_flags; version = src->version; + /* Make the latest version visible */ + smp_rmb(); offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d229a587..bf0ce75735b02 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -66,6 +66,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) do { version = __pvclock_read_cycles(src, &ret, &flags); + /* Make sure that the version double-check is last. */ + smp_rmb(); } while ((src->version & 1) || version != src->version); return flags & valid_flags; @@ -80,6 +82,8 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) do { version = __pvclock_read_cycles(src, &ret, &flags); + /* Make sure that the version double-check is last.
*/ + smp_rmb(); } while ((src->version & 1) || version != src->version); if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { From 6701df3c0a3672faef3e2cfbc4747254e603324a Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 31 May 2016 14:17:06 -0700 Subject: [PATCH 032/118] pinctrl: single: Fix missing flush of posted write for a wakeirq commit 0ac3c0a4025f41748a083bdd4970cb3ede802b15 upstream. With many repeated suspend resume cycles, the pin specific wakeirq may not always work on omaps. This is because the write to enable the pin interrupt may not have reached the device over the interconnect before suspend happens. Let's fix the issue with a flush of posted write with a readback. Reported-by: Nishanth Menon Signed-off-by: Tony Lindgren Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-single.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 23b6b8c29a99e..73d8d47ea465a 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -1576,6 +1576,9 @@ static inline void pcs_irq_set(struct pcs_soc_data *pcs_soc, else mask &= ~soc_mask; pcs->write(mask, pcswi->reg); + + /* flush posted write */ + mask = pcs->read(pcswi->reg); raw_spin_unlock(&pcs->lock); } From 8f808f122f445fe1b391d1ee047dadf16437c749 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Wed, 1 Jun 2016 22:21:53 +0300 Subject: [PATCH 033/118] pinctrl: imx: Do not treat a PIN without MUX register as an error commit ba562d5e54fd3136bfea0457add3675850247774 upstream. Some PINs do not have a MUX register, it is not an error. It is necessary to allow the continuation of the PINs configuration, otherwise the whole PIN-group will be configured incorrectly. 
Signed-off-by: Alexander Shiyan Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/freescale/pinctrl-imx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c index 1029aa7889b55..398ec45aadef9 100644 --- a/drivers/pinctrl/freescale/pinctrl-imx.c +++ b/drivers/pinctrl/freescale/pinctrl-imx.c @@ -207,9 +207,9 @@ static int imx_pmx_set(struct pinctrl_dev *pctldev, unsigned selector, pin_reg = &info->pin_regs[pin_id]; if (pin_reg->mux_reg == -1) { - dev_err(ipctl->dev, "Pin(%s) does not support mux function\n", + dev_dbg(ipctl->dev, "Pin(%s) does not support mux function\n", info->pins[pin_id].name); - return -EINVAL; + continue; } if (info->flags & SHARE_MUX_CONF_REG) { From 75d6026fd7d605a668ef532193c2bae707a4316c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 May 2016 15:42:13 -0400 Subject: [PATCH 034/118] cgroup: set css->id to -1 during init commit 8fa3b8d689a54d6d04ff7803c724fb7aca6ce98e upstream. If percpu_ref initialization fails during css_create(), the free path can end up trying to free css->id of zero. As ID 0 is unused, it doesn't cause a critical breakage but it does trigger a warning message. Fix it by setting css->id to -1 from init_and_link_css(). 
Signed-off-by: Tejun Heo Cc: Wenwei Tao Fixes: 01e586598b22 ("cgroup: release css->id after css_free") Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1c9d701f7a729..a3424f28aaf4f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4793,6 +4793,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; + css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; From acbda596b5cdf4784938fe6feca43b28c2dc8a53 Mon Sep 17 00:00:00 2001 From: Rhyland Klein Date: Thu, 9 Jun 2016 17:28:39 -0400 Subject: [PATCH 035/118] power_supply: power_supply_read_temp only if use_cnt > 0 commit 5bc28b93a36e3cb3acc2870fb75cb6ffb182fece upstream. Change power_supply_read_temp() to use power_supply_get_property() so that it will check the use_cnt and ensure it is > 0. The use_cnt will be incremented at the end of __power_supply_register, so this will block the case where get_property can be called before the supply is fully registered.
This fixes the issue shown in the stack below: [ 1.452598] power_supply_read_temp+0x78/0x80 [ 1.458680] thermal_zone_get_temp+0x5c/0x11c [ 1.464765] thermal_zone_device_update+0x34/0xb4 [ 1.471195] thermal_zone_device_register+0x87c/0x8cc [ 1.477974] __power_supply_register+0x364/0x424 [ 1.484317] power_supply_register_no_ws+0x10/0x18 [ 1.490833] bq27xxx_battery_setup+0x10c/0x164 [ 1.497003] bq27xxx_battery_i2c_probe+0xd0/0x1b0 [ 1.503435] i2c_device_probe+0x174/0x240 [ 1.509172] driver_probe_device+0x1fc/0x29c [ 1.515167] __driver_attach+0xa4/0xa8 [ 1.520643] bus_for_each_dev+0x58/0x98 [ 1.526204] driver_attach+0x20/0x28 [ 1.531505] bus_add_driver+0x1c8/0x22c [ 1.537067] driver_register+0x68/0x108 [ 1.542630] i2c_register_driver+0x38/0x7c [ 1.548457] bq27xxx_battery_i2c_driver_init+0x18/0x20 [ 1.555321] do_one_initcall+0x38/0x12c [ 1.560886] kernel_init_freeable+0x148/0x1ec [ 1.566972] kernel_init+0x10/0xfc [ 1.572101] ret_from_fork+0x10/0x40 Also make the same change to ps_get_max_charge_cntl_limit() and ps_get_cur_chrage_cntl_limit() to be safe. Lastly, change the return value of power_supply_get_property() to -EAGAIN from -ENODEV if use_cnt <= 0.
Fixes: 297d716f6260 ("power_supply: Change ownership from driver to core") Signed-off-by: Rhyland Klein Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel Signed-off-by: Greg Kroah-Hartman --- drivers/power/power_supply_core.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c index 456987c88baab..b13cd074c52af 100644 --- a/drivers/power/power_supply_core.c +++ b/drivers/power/power_supply_core.c @@ -565,11 +565,12 @@ static int power_supply_read_temp(struct thermal_zone_device *tzd, WARN_ON(tzd == NULL); psy = tzd->devdata; - ret = psy->desc->get_property(psy, POWER_SUPPLY_PROP_TEMP, &val); + ret = power_supply_get_property(psy, POWER_SUPPLY_PROP_TEMP, &val); + if (ret) + return ret; /* Convert tenths of degree Celsius to milli degree Celsius. */ - if (!ret) - *temp = val.intval * 100; + *temp = val.intval * 100; return ret; } @@ -612,10 +613,12 @@ static int ps_get_max_charge_cntl_limit(struct thermal_cooling_device *tcd, int ret; psy = tcd->devdata; - ret = psy->desc->get_property(psy, - POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX, &val); - if (!ret) - *state = val.intval; + ret = power_supply_get_property(psy, + POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX, &val); + if (ret) + return ret; + + *state = val.intval; return ret; } @@ -628,10 +631,12 @@ static int ps_get_cur_chrage_cntl_limit(struct thermal_cooling_device *tcd, int ret; psy = tcd->devdata; - ret = psy->desc->get_property(psy, - POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT, &val); - if (!ret) - *state = val.intval; + ret = power_supply_get_property(psy, + POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT, &val); + if (ret) + return ret; + + *state = val.intval; return ret; } From a3bdfa7b6185c677921df51cd2e44ab8aa656a2d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 1 Jul 2016 14:56:07 +0200 Subject: [PATCH 036/118] locks: use file_inode() commit 
6343a2120862f7023006c8091ad95c1f16a32077 upstream. (Another one for the f_path debacle.) ltp fcntl33 testcase caused an Oops in selinux_file_send_sigiotask. The reason is that generic_add_lease() used filp->f_path.dentry->inode while all the others use file_inode(). This makes a difference for files opened on overlayfs since the former will point to the overlay inode the latter to the underlying inode. So generic_add_lease() added the lease to the overlay inode and generic_delete_lease() removed it from the underlying inode. When the file was released the lease remained on the overlay inode's lock list, resulting in use after free. Reported-by: Eryu Guan Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay and f_inode to the underlay") Signed-off-by: Miklos Szeredi Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/locks.c b/fs/locks.c index 6333263b7bc86..8eddae23e10be 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1602,7 +1602,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr { struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; From ae159a027893a8bdeaab0c5f863ca6e57b4468d3 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:29 -0400 Subject: [PATCH 037/118] Revert "ecryptfs: forbid opening files without mmap handler" commit 78c4e172412de5d0456dc00d2b34050aa0b683b5 upstream. This reverts commit 2f36db71009304b3f0b95afacd8eba1f9f046b87. It fixed a local root exploit but also introduced a dependency on the lower file system implementing an mmap operation just to open a file, which is a bit of a heavy hammer. 
The right fix is to have mmap depend on the existence of the mmap handler instead. Signed-off-by: Jeff Mahoney Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/kthread.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index e818f5ac7a269..866bb18efefea 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -148,7 +147,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto have_file; + goto out; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -166,16 +165,8 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) { + if (IS_ERR(*lower_file)) rc = PTR_ERR(*lower_file); - goto out; - } -have_file: - if ((*lower_file)->f_op->mmap == NULL) { - fput(*lower_file); - *lower_file = NULL; - rc = -EMEDIUMTYPE; - } out: return rc; } From ed5c955e31ff07fa74738b6e0d94c5c17ebbf7c7 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:30 -0400 Subject: [PATCH 038/118] ecryptfs: don't allow mmap when the lower fs doesn't support it commit f0fe970df3838c202ef6c07a4c2b36838ef0a88b upstream. There are legitimate reasons to disallow mmap on certain files, notably in sysfs or procfs. We shouldn't emulate mmap support on file systems that don't offer support natively. 
CVE-2016-1583 Signed-off-by: Jeff Mahoney [tyhicks: clean up f_op check by using ecryptfs_file_to_lower()] Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7c..11309683d65fb 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -170,6 +170,19 @@ static int read_or_initialize_metadata(struct dentry *dentry) return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open * @inode: inode speciying file to open @@ -364,7 +377,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, From 26015f0ad252dd1ae397a6b6e9400ca868f4e584 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 15 Jul 2016 00:22:07 -0400 Subject: [PATCH 039/118] ext4: verify extent header depth commit 7bc9491645118c9461bd21099c31755ff6783593 upstream. Although the extent tree depth of 5 should be enough for the worst case of 2**32 extents of length 1, the extent tree code does not currently try to merge nodes which are less than half-full with a sibling node, or to shrink the tree depth if possible. So it's possible, at least in theory, for the tree depth to be greater than 5.
However, even in the worst case, a tree depth of 32 is highly unlikely, and if the file system is maliciously corrupted, an insanely large eh_depth can cause memory allocation failures that will trigger kernel warnings (here, eh_depth = 65280): JBD2: ext4.exe wants too many credits credits:195849 rsv_credits:0 max:256 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 50 at fs/jbd2/transaction.c:293 start_this_handle+0x569/0x580 CPU: 0 PID: 50 Comm: ext4.exe Not tainted 4.7.0-rc5+ #508 Stack: 604a8947 625badd8 0002fd09 00000000 60078643 00000000 62623910 601bf9bc 62623970 6002fc84 626239b0 900000125 Call Trace: [<6001c2dc>] show_stack+0xdc/0x1a0 [<601bf9bc>] dump_stack+0x2a/0x2e [<6002fc84>] __warn+0x114/0x140 [<6002fdff>] warn_slowpath_null+0x1f/0x30 [<60165829>] start_this_handle+0x569/0x580 [<60165d4e>] jbd2__journal_start+0x11e/0x220 [<60146690>] __ext4_journal_start_sb+0x60/0xa0 [<60120a81>] ext4_truncate+0x131/0x3a0 [<60123677>] ext4_setattr+0x757/0x840 [<600d5d0f>] notify_change+0x16f/0x2a0 [<600b2b16>] do_truncate+0x76/0xc0 [<600c3e56>] path_openat+0x806/0x1300 [<600c55c9>] do_filp_open+0x89/0xf0 [<600b4074>] do_sys_open+0x134/0x1e0 [<600b4140>] SyS_open+0x20/0x30 [<6001ea68>] handle_syscall+0x88/0x90 [<600295fd>] userspace+0x3fd/0x500 [<6001ac55>] fork_handler+0x85/0x90 ---[ end trace 08b0b88b6387a244 ]--- [ Commit message modified and the extent tree depth check changed from 5 to 32 -- tytso ] Cc: Darrick J.
Wong Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3578b25fccfd8..62880586ed85f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -469,6 +469,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid extent entries"; goto corrupted; } + if (unlikely(depth > 32)) { + error_msg = "too large eh_depth"; + goto corrupted; + } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { From 7d9f345ca6810010456e99c7cdea112b500b9547 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Jun 2016 10:54:23 +0200 Subject: [PATCH 040/118] 9p: use file_dentry() commit b403f0e37a11f84f7ceaf40b0075499e5bcfd220 upstream. v9fs may be used as lower layer of overlayfs and accessing f_path.dentry can lead to a crash. In this case it's a NULL pointer dereference in p9_fid_create(). Fix by replacing direct access of file->f_path.dentry with the file_dentry() accessor, which will always return a native object. 
Reported-by: Alessio Igor Bogani Signed-off-by: Miklos Szeredi Tested-by: Alessio Igor Bogani Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay and f_inode to the underlay") Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7bf835f85bc82..12ceaf52dae60 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -74,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { - fid = v9fs_fid_clone(file->f_path.dentry); + fid = v9fs_fid_clone(file_dentry(file)); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -100,7 +100,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) * because we want write after unlink usecase * to work. */ - fid = v9fs_writeback_fid(file->f_path.dentry); + fid = v9fs_writeback_fid(file_dentry(file)); if (IS_ERR(fid)) { err = PTR_ERR(fid); mutex_unlock(&v9inode->v_mutex); @@ -516,7 +516,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) * because we want write after unlink usecase * to work. */ - fid = v9fs_writeback_fid(filp->f_path.dentry); + fid = v9fs_writeback_fid(file_dentry(filp)); if (IS_ERR(fid)) { retval = PTR_ERR(fid); mutex_unlock(&v9inode->v_mutex); From df582d46960cc3151bc2dc564af9128b4c7f9b4b Mon Sep 17 00:00:00 2001 From: Andrey Ulanov Date: Fri, 15 Apr 2016 14:24:41 -0700 Subject: [PATCH 041/118] namespace: update event counter when umounting a deleted dentry commit e06b933e6ded42384164d28a2060b7f89243b895 upstream. - m_start() in fs/namespace.c expects that ns->event is incremented each time a mount is added or removed from ns->list. - umount_tree() removes items from the list but does not increment event counter, expecting that it's done before the function is called. - There are some codepaths that call umount_tree() without updating "event" counter. e.g.
from __detach_mounts(). - When this happens m_start may reuse a cached mount structure that no longer belongs to ns->list (i.e. use after free which usually leads to infinite loop). This change fixes the above problem by incrementing global event counter before invoking umount_tree(). Change-Id: I622c8e84dcb9fb63542372c5dbf0178ee86bb589 Signed-off-by: Andrey Ulanov Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/namespace.c b/fs/namespace.c index 33064fcbfff94..5be02a0635be0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1562,6 +1562,7 @@ void __detach_mounts(struct dentry *dentry) goto out_unlock; lock_mount_hash(); + event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { From 873b6e316a581c976a29a1d8c889c76394d7393d Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Mon, 13 Jun 2016 17:46:49 +0000 Subject: [PATCH 042/118] spi: sunxi: fix transfer timeout commit 719bd6542044efd9b338a53dba1bef45f40ca169 upstream. The transfer timeout is fixed at 1000 ms. Reading a 4Mbyte flash over 1MHz SPI bus takes way longer than that. Calculate the timeout from the actual time the transfer is supposed to take and multiply by 2 for good measure.
Signed-off-by: Michal Suchanek Acked-by: Maxime Ripard Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-sun4i.c | 10 +++++++++- drivers/spi/spi-sun6i.c | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c index fbb0a4d74e91c..8363287304576 100644 --- a/drivers/spi/spi-sun4i.c +++ b/drivers/spi/spi-sun4i.c @@ -170,6 +170,7 @@ static int sun4i_spi_transfer_one(struct spi_master *master, { struct sun4i_spi *sspi = spi_master_get_devdata(master); unsigned int mclk_rate, div, timeout; + unsigned int start, end, tx_time; unsigned int tx_len = 0; int ret = 0; u32 reg; @@ -279,9 +280,16 @@ static int sun4i_spi_transfer_one(struct spi_master *master, reg = sun4i_spi_read(sspi, SUN4I_CTL_REG); sun4i_spi_write(sspi, SUN4I_CTL_REG, reg | SUN4I_CTL_XCH); + tx_time = max(tfr->len * 8 * 2 / (tfr->speed_hz / 1000), 100U); + start = jiffies; timeout = wait_for_completion_timeout(&sspi->done, - msecs_to_jiffies(1000)); + msecs_to_jiffies(tx_time)); + end = jiffies; if (!timeout) { + dev_warn(&master->dev, + "%s: timeout transferring %u bytes@%iHz for %i(%i)ms", + dev_name(&spi->dev), tfr->len, tfr->speed_hz, + jiffies_to_msecs(end - start), tx_time); ret = -ETIMEDOUT; goto out; } diff --git a/drivers/spi/spi-sun6i.c b/drivers/spi/spi-sun6i.c index ac48f59705a87..e77add01b0e90 100644 --- a/drivers/spi/spi-sun6i.c +++ b/drivers/spi/spi-sun6i.c @@ -160,6 +160,7 @@ static int sun6i_spi_transfer_one(struct spi_master *master, { struct sun6i_spi *sspi = spi_master_get_devdata(master); unsigned int mclk_rate, div, timeout; + unsigned int start, end, tx_time; unsigned int tx_len = 0; int ret = 0; u32 reg; @@ -269,9 +270,16 @@ static int sun6i_spi_transfer_one(struct spi_master *master, reg = sun6i_spi_read(sspi, SUN6I_TFR_CTL_REG); sun6i_spi_write(sspi, SUN6I_TFR_CTL_REG, reg | SUN6I_TFR_CTL_XCH); + tx_time = max(tfr->len * 8 * 2 / (tfr->speed_hz / 1000), 100U); + start = jiffies; 
timeout = wait_for_completion_timeout(&sspi->done, - msecs_to_jiffies(1000)); + msecs_to_jiffies(tx_time)); + end = jiffies; if (!timeout) { + dev_warn(&master->dev, + "%s: timeout transferring %u bytes@%iHz for %i(%i)ms", + dev_name(&spi->dev), tfr->len, tfr->speed_hz, + jiffies_to_msecs(end - start), tx_time); ret = -ETIMEDOUT; goto out; } From 9162d29bc48a43a30217fc1fd939023a75ce604b Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Mon, 13 Jun 2016 17:46:49 +0000 Subject: [PATCH 043/118] spi: sun4i: fix FIFO limit commit 6d9fe44bd73d567d04d3a68a2d2fa521ab9532f2 upstream. When testing SPI without DMA I noticed that filling the FIFO on the spi controller causes timeout. Always leave room for one byte in the FIFO. Signed-off-by: Michal Suchanek Acked-by: Maxime Ripard Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-sun4i.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c index 8363287304576..39d7c7c701121 100644 --- a/drivers/spi/spi-sun4i.c +++ b/drivers/spi/spi-sun4i.c @@ -177,7 +177,10 @@ static int sun4i_spi_transfer_one(struct spi_master *master, /* We don't support transfer larger than the FIFO */ if (tfr->len > SUN4I_FIFO_DEPTH) - return -EINVAL; + return -EMSGSIZE; + + if (tfr->tx_buf && tfr->len >= SUN4I_FIFO_DEPTH) + return -EMSGSIZE; reinit_completion(&sspi->done); sspi->tx_buf = tfr->tx_buf; @@ -270,8 +273,12 @@ static int sun4i_spi_transfer_one(struct spi_master *master, sun4i_spi_write(sspi, SUN4I_BURST_CNT_REG, SUN4I_BURST_CNT(tfr->len)); sun4i_spi_write(sspi, SUN4I_XMIT_CNT_REG, SUN4I_XMIT_CNT(tx_len)); - /* Fill the TX FIFO */ - sun4i_spi_fill_fifo(sspi, SUN4I_FIFO_DEPTH); + /* + * Fill the TX FIFO + * Filling the FIFO fully causes timeout for some reason + * at least on spi2 on A10s + */ + sun4i_spi_fill_fifo(sspi, SUN4I_FIFO_DEPTH - 1); /* Enable the interrupts */ sun4i_spi_write(sspi, SUN4I_INT_CTL_REG, SUN4I_INT_CTL_TC); From 
68f99031897d63ae4937b0f945475dc6782afde4 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 17 May 2016 20:57:50 +0200 Subject: [PATCH 044/118] clk: rockchip: initialize flags of clk_init_data in mmc-phase clock commit 595144c1141c951a3c6bb9004ae6a2bc29aad66f upstream. The flags element of clk_init_data was never initialized for mmc- phase-clocks resulting in the element containing a random value and thus possibly enabling unwanted clock flags. Fixes: 89bf26cbc1a0 ("clk: rockchip: Add support for the mmc clock phases using the framework") Signed-off-by: Heiko Stuebner Signed-off-by: Greg Kroah-Hartman --- drivers/clk/rockchip/clk-mmc-phase.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clk/rockchip/clk-mmc-phase.c b/drivers/clk/rockchip/clk-mmc-phase.c index 2685644826a06..33c20c6b45afc 100644 --- a/drivers/clk/rockchip/clk-mmc-phase.c +++ b/drivers/clk/rockchip/clk-mmc-phase.c @@ -153,6 +153,7 @@ struct clk *rockchip_clk_register_mmc(const char *name, return NULL; init.name = name; + init.flags = 0; init.num_parents = num_parents; init.parent_names = parent_names; init.ops = &rockchip_mmc_clk_ops; From 69ca969a2626dc4b3bb83b953c053a01e3b9f7e6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Jun 2016 16:58:46 +0300 Subject: [PATCH 045/118] platform/chrome: cros_ec_dev - double fetch bug in ioctl commit 096cdc6f52225835ff503f987a0d68ef770bb78e upstream. We verify "u_cmd.outsize" and "u_cmd.insize" but we need to make sure that those values have not changed between the two copy_from_user() calls. Otherwise it could lead to a buffer overflow. Additionally, cros_ec_cmd_xfer() can set s_cmd->insize to a lower value. We should use the new smaller value so we don't copy too much data to the user. 
Reported-by: Pengfei Wang Fixes: a841178445bb ('mfd: cros_ec: Use a zero-length array for command data') Signed-off-by: Dan Carpenter Reviewed-by: Kees Cook Tested-by: Gwendal Grignou Signed-off-by: Olof Johansson Signed-off-by: Greg Kroah-Hartman --- drivers/platform/chrome/cros_ec_dev.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/platform/chrome/cros_ec_dev.c b/drivers/platform/chrome/cros_ec_dev.c index d45cd254ed1c8..2b331d5b9e799 100644 --- a/drivers/platform/chrome/cros_ec_dev.c +++ b/drivers/platform/chrome/cros_ec_dev.c @@ -147,13 +147,19 @@ static long ec_device_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg) goto exit; } + if (u_cmd.outsize != s_cmd->outsize || + u_cmd.insize != s_cmd->insize) { + ret = -EINVAL; + goto exit; + } + s_cmd->command += ec->cmd_offset; ret = cros_ec_cmd_xfer(ec->ec_dev, s_cmd); /* Only copy data to userland if data was received. */ if (ret < 0) goto exit; - if (copy_to_user(arg, s_cmd, sizeof(*s_cmd) + u_cmd.insize)) + if (copy_to_user(arg, s_cmd, sizeof(*s_cmd) + s_cmd->insize)) ret = -EFAULT; exit: kfree(s_cmd); From b782756a66d302dfc8f3b9786672c965eae35a17 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 4 Jul 2016 14:07:16 +0200 Subject: [PATCH 046/118] qeth: delete napi struct when removing a qeth device commit 7831b4ff0d926e0deeaabef9db8800ed069a2757 upstream. A qeth_card contains a napi_struct linked to the net_device during device probing. This struct must be deleted when removing the qeth device, otherwise Panic on oops can occur when qeth devices are repeatedly removed and added. Fixes: a1c3ed4c9ca ("qeth: NAPI support for l2 and l3 discipline") Signed-off-by: Ursula Braun Tested-by: Alexander Klein Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/s390/net/qeth_l2_main.c | 1 + drivers/s390/net/qeth_l3_main.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 8f1b091e17327..12b2cb7769f9c 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1051,6 +1051,7 @@ static void qeth_l2_remove_device(struct ccwgroup_device *cgdev) qeth_l2_set_offline(cgdev); if (card->dev) { + netif_napi_del(&card->napi); unregister_netdev(card->dev); card->dev = NULL; } diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 543960e96b42b..50cec6b13d27a 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3246,6 +3246,7 @@ static void qeth_l3_remove_device(struct ccwgroup_device *cgdev) qeth_l3_set_offline(cgdev); if (card->dev) { + netif_napi_del(&card->napi); unregister_netdev(card->dev); card->dev = NULL; } From 5161144c3a9d6ea775b293edbb8523deaeff4442 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 1 Jul 2016 00:39:35 -0700 Subject: [PATCH 047/118] block: fix use-after-free in sys_ioprio_get() commit 8ba8682107ee2ca3347354e018865d8e1967c5f4 upstream. get_task_ioprio() accesses the task->io_context without holding the task lock and thus can race with exit_io_context(), leading to a use-after-free. 
The reproducer below hits this within a few seconds on my 4-core QEMU VM: #define _GNU_SOURCE #include #include #include #include int main(int argc, char **argv) { pid_t pid, child; long nproc, i; /* ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); */ syscall(SYS_ioprio_set, 1, 0, 0x6000); nproc = sysconf(_SC_NPROCESSORS_ONLN); for (i = 0; i < nproc; i++) { pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { pid = fork(); assert(pid != -1); if (pid == 0) { _exit(0); } else { child = wait(NULL); assert(child == pid); } } } pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } } } for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } return 0; } This gets us KASAN dumps like this: [ 35.526914] ================================================================== [ 35.530009] BUG: KASAN: out-of-bounds in get_task_ioprio+0x7b/0x90 at addr ffff880066f34e6c [ 35.530009] Read of size 2 by task ioprio-gpf/363 [ 35.530009] ============================================================================= [ 35.530009] BUG blkdev_ioc (Not tainted): kasan: bad access detected [ 35.530009] ----------------------------------------------------------------------------- [ 35.530009] Disabling lock debugging due to kernel taint [ 35.530009] INFO: Allocated in create_task_io_context+0x2b/0x370 age=0 cpu=0 pid=360 [ 35.530009] ___slab_alloc+0x55d/0x5a0 [ 35.530009] __slab_alloc.isra.20+0x2b/0x40 [ 35.530009] kmem_cache_alloc_node+0x84/0x200 [ 35.530009] create_task_io_context+0x2b/0x370 [ 35.530009] get_task_io_context+0x92/0xb0 [ 35.530009] copy_process.part.8+0x5029/0x5660 [ 35.530009] _do_fork+0x155/0x7e0 [ 35.530009] SyS_clone+0x19/0x20 [ 35.530009] do_syscall_64+0x195/0x3a0 [ 35.530009] return_from_SYSCALL_64+0x0/0x6a [ 35.530009] INFO: Freed in put_io_context+0xe7/0x120 age=0 cpu=0 pid=1060 [ 35.530009] __slab_free+0x27b/0x3d0 [ 35.530009] 
kmem_cache_free+0x1fb/0x220 [ 35.530009] put_io_context+0xe7/0x120 [ 35.530009] put_io_context_active+0x238/0x380 [ 35.530009] exit_io_context+0x66/0x80 [ 35.530009] do_exit+0x158e/0x2b90 [ 35.530009] do_group_exit+0xe5/0x2b0 [ 35.530009] SyS_exit_group+0x1d/0x20 [ 35.530009] entry_SYSCALL_64_fastpath+0x1a/0xa4 [ 35.530009] INFO: Slab 0xffffea00019bcd00 objects=20 used=4 fp=0xffff880066f34ff0 flags=0x1fffe0000004080 [ 35.530009] INFO: Object 0xffff880066f34e58 @offset=3672 fp=0x0000000000000001 [ 35.530009] ================================================================== Fix it by grabbing the task lock while we poke at the io_context. Reported-by: Dmitry Vyukov Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/ioprio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/ioprio.c b/block/ioprio.c index cc7800e9eb441..01b8116298a13 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -150,8 +150,10 @@ static int get_task_ioprio(struct task_struct *p) if (ret) goto out; ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + task_unlock(p); out: return ret; } From d29e5fa5859c37ae2076f1a0fa28d894e2857249 Mon Sep 17 00:00:00 2001 From: Taras Kondratiuk Date: Wed, 13 Jul 2016 22:05:38 +0000 Subject: [PATCH 048/118] mmc: block: fix packed command header endianness commit f68381a70bb2b26c31b13fdaf67c778f92fd32b4 upstream. The code that fills packed command header assumes that CPU runs in little-endian mode. Hence the header is malformed in big-endian mode and causes MMC data transfer errors: [ 563.200828] mmcblk0: error -110 transferring data, sector 2048, nr 8, cmd response 0x900, card status 0xc40 [ 563.219647] mmcblk0: packed cmd failed, nr 2, sectors 16, failure index: -1 Convert header data to LE. 
Signed-off-by: Taras Kondratiuk Fixes: ce39f9d17c14 ("mmc: support packed write command for eMMC4.5 devices") Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/card/block.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index c641c202fe7e4..64950035613be 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -1767,8 +1767,8 @@ static void mmc_blk_packed_hdr_wrq_prep(struct mmc_queue_req *mqrq, packed_cmd_hdr = packed->cmd_hdr; memset(packed_cmd_hdr, 0, sizeof(packed->cmd_hdr)); - packed_cmd_hdr[0] = (packed->nr_entries << 16) | - (PACKED_CMD_WR << 8) | PACKED_CMD_VER; + packed_cmd_hdr[0] = cpu_to_le32((packed->nr_entries << 16) | + (PACKED_CMD_WR << 8) | PACKED_CMD_VER); hdr_blocks = mmc_large_sector(card) ? 8 : 1; /* @@ -1782,14 +1782,14 @@ static void mmc_blk_packed_hdr_wrq_prep(struct mmc_queue_req *mqrq, ((brq->data.blocks * brq->data.blksz) >= card->ext_csd.data_tag_unit_size); /* Argument of CMD23 */ - packed_cmd_hdr[(i * 2)] = + packed_cmd_hdr[(i * 2)] = cpu_to_le32( (do_rel_wr ? MMC_CMD23_ARG_REL_WR : 0) | (do_data_tag ? MMC_CMD23_ARG_TAG_REQ : 0) | - blk_rq_sectors(prq); + blk_rq_sectors(prq)); /* Argument of CMD18 or CMD25 */ - packed_cmd_hdr[((i * 2)) + 1] = + packed_cmd_hdr[((i * 2)) + 1] = cpu_to_le32( mmc_card_blockaddr(card) ? - blk_rq_pos(prq) : blk_rq_pos(prq) << 9; + blk_rq_pos(prq) : blk_rq_pos(prq) << 9); packed->blocks += blk_rq_sectors(prq); i++; } From 34bf12312bd4222a1b945be3f58173edc8aa3f22 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jun 2016 15:53:54 +0200 Subject: [PATCH 049/118] sched/fair: Fix effective_load() to consistently use smoothed load commit 7dd4912594daf769a46744848b05bd5bc6d62469 upstream. 
Starting with the following commit: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") calc_tg_weight() doesn't compute the right value as expected by effective_load(). The difference is in the 'correction' term. In order to ensure \Sum rw_j >= rw_i we cannot use tg->load_avg directly, since that might be lagging a correction on the current cfs_rq->avg.load_avg value. Therefore we use tg->load_avg - cfs_rq->tg_load_avg_contrib + cfs_rq->avg.load_avg. Now, per the referenced commit, calc_tg_weight() doesn't use cfs_rq->avg.load_avg, as is later used in @w, but uses cfs_rq->load.weight instead. So stop using calc_tg_weight() and do it explicitly. The effects of this bug are wake_affine() making randomly poor choices in cgroup-intense workloads. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51c615279b233..b8b516c37bf1c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -687,8 +687,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { @@ -4594,19 +4592,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long w, W; + struct cfs_rq *cfs_rq = se->my_q; + long W, w = cfs_rq_load_avg(cfs_rq); - tg = se->my_q->tg; + tg = cfs_rq->tg; /* * W = @wg + \Sum rw_j */ - W = wg + calc_tg_weight(tg, se->my_q); + W = wg + 
atomic_long_read(&tg->load_avg); + + /* Ensure \Sum rw_j >= rw_i */ + W -= cfs_rq->tg_load_avg_contrib; + W += w; /* * w = rw_i + @wl */ - w = cfs_rq_load_avg(se->my_q) + wl; + w += wl; /* * wl = S * s'_i; see (2) From b82c78948a5311ca3952900dffbee5c932c2d03b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 4 Jul 2016 16:49:48 +0200 Subject: [PATCH 050/118] ovl: handle ATTR_KILL* commit b99c2d913810e56682a538c9f2394d76fca808f8 upstream. Before 4bacc9c9234c ("overlayfs: Make f_path...") file->f_path pointed to the underlying file, hence suid/sgid removal on write worked fine. After that patch file->f_path pointed to the overlay file, and the file mode bits weren't copied to overlay_inode->i_mode. So the suid/sgid removal simply stopped working. The fix is to copy the mode bits, but then ovl_setattr() needs to clear ATTR_MODE to avoid the BUG() in notify_change(). So do this first, then in the next patch copy the mode. Reported-by: Eryu Guan Signed-off-by: Miklos Szeredi Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay and f_inode to the underlay") Cc: Eric Schultz Cc: Eric Hameleers [backported by Eric Hameleers as seen in https://bugzilla.kernel.org/show_bug.cgi?id=150711] Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 0597820f5d9d7..4f729ffff75d4 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -63,6 +63,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) { upperdentry = ovl_dentry_upper(dentry); + if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + mutex_lock(&upperdentry->d_inode->i_mutex); err = notify_change(upperdentry, attr, NULL); if (!err) From b4fedbef96b8d29f336d355d2b0858518e405090 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Dec 2015 23:33:18 +0100 Subject: [PATCH 051/118] perf/x86: fix PEBS issues on Intel Atom/Core2 
commit 1424a09a9e1839285e948d4ea9fdfca26c9a2086 upstream. This patch fixes broken PEBS support on Intel Atom and Core2 due to wrong pointer arithmetic in intel_pmu_drain_pebs_core(). The get_next_pebs_record_by_bit() was called on PEBS format fmt0 which does not use the pebs_record_nhm layout. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Fixes: 21509084f999 ("perf/x86/intel: Handle multiple records in the PEBS buffer") Link: http://lkml.kernel.org/r/1449182000-31524-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7abb2b88572e0..1e7de3cefc9c4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1110,6 +1110,13 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) void *at; u64 pebs_status; + /* + * fmt0 does not have a status bitfield (does not use + * perf_record_nhm format) + */ + if (x86_pmu.intel_cap.pebs_format < 1) + return base; + if (base == NULL) return NULL; @@ -1195,7 +1202,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) if (!event->attr.precise_ip) return; - n = (top - at) / x86_pmu.pebs_record_size; + n = top - at; if (n <= 0) return; From 63b9e0f32f72892de7064c6888484b881ddbb42f Mon Sep 17 00:00:00 2001 From: Wolfgang Grandegger Date: Mon, 13 Jun 2016 15:44:19 +0200 Subject: [PATCH 052/118] can: at91_can: RX queue could get stuck at high bus load commit 43200a4480cbbe660309621817f54cbb93907108 upstream. At high bus load it could happen that "at91_poll()" enters with all RX message boxes filled up. 
If then at the end the "quota" is exceeded as well, "rx_next" will not be reset to the first RX mailbox and hence the interrupts remain disabled. Signed-off-by: Wolfgang Grandegger Tested-by: Amr Bekhit Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/at91_can.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index 8b3275d7792ac..8f5e93cb79752 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -712,9 +712,10 @@ static int at91_poll_rx(struct net_device *dev, int quota) /* upper group completed, look again in lower */ if (priv->rx_next > get_mb_rx_low_last(priv) && - quota > 0 && mb > get_mb_rx_last(priv)) { + mb > get_mb_rx_last(priv)) { priv->rx_next = get_mb_rx_first(priv); - goto again; + if (quota > 0) + goto again; } return received; From 1cee72ed4856504fd597145ce10b29751c4d27a1 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Thu, 16 Jun 2016 11:10:19 -0500 Subject: [PATCH 053/118] can: c_can: Update D_CAN TX and RX functions to 32 bit - fix Altera Cyclone access commit 427460c83cdf55069eee49799a0caef7dde8df69 upstream. When testing CAN write floods on Altera's CycloneV, the first 2 bytes are sometimes 0x00, 0x00 or corrupted instead of the values sent. Also observed bytes 4 & 5 were corrupted in some cases. The D_CAN Data registers are 32 bits and changing from 16 bit writes to 32 bit writes fixes the problem. Testing performed on Altera CycloneV (D_CAN). Requesting tests on other C_CAN & D_CAN platforms. 
Reported-by: Richard Andrysek Signed-off-by: Thor Thayer Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/c_can/c_can.c | 38 ++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index f91b094288dad..e3dccd3200d5d 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -332,9 +332,23 @@ static void c_can_setup_tx_object(struct net_device *dev, int iface, priv->write_reg(priv, C_CAN_IFACE(MSGCTRL_REG, iface), ctrl); - for (i = 0; i < frame->can_dlc; i += 2) { - priv->write_reg(priv, C_CAN_IFACE(DATA1_REG, iface) + i / 2, - frame->data[i] | (frame->data[i + 1] << 8)); + if (priv->type == BOSCH_D_CAN) { + u32 data = 0, dreg = C_CAN_IFACE(DATA1_REG, iface); + + for (i = 0; i < frame->can_dlc; i += 4, dreg += 2) { + data = (u32)frame->data[i]; + data |= (u32)frame->data[i + 1] << 8; + data |= (u32)frame->data[i + 2] << 16; + data |= (u32)frame->data[i + 3] << 24; + priv->write_reg32(priv, dreg, data); + } + } else { + for (i = 0; i < frame->can_dlc; i += 2) { + priv->write_reg(priv, + C_CAN_IFACE(DATA1_REG, iface) + i / 2, + frame->data[i] | + (frame->data[i + 1] << 8)); + } } } @@ -402,10 +416,20 @@ static int c_can_read_msg_object(struct net_device *dev, int iface, u32 ctrl) } else { int i, dreg = C_CAN_IFACE(DATA1_REG, iface); - for (i = 0; i < frame->can_dlc; i += 2, dreg ++) { - data = priv->read_reg(priv, dreg); - frame->data[i] = data; - frame->data[i + 1] = data >> 8; + if (priv->type == BOSCH_D_CAN) { + for (i = 0; i < frame->can_dlc; i += 4, dreg += 2) { + data = priv->read_reg32(priv, dreg); + frame->data[i] = data; + frame->data[i + 1] = data >> 8; + frame->data[i + 2] = data >> 16; + frame->data[i + 3] = data >> 24; + } + } else { + for (i = 0; i < frame->can_dlc; i += 2, dreg++) { + data = priv->read_reg(priv, dreg); + frame->data[i] = data; + frame->data[i + 1] = data >> 8; + } } } From 
864844524efebf19da164ed38f25aa3fb3a2d2de Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 21 Jun 2016 12:14:07 +0200 Subject: [PATCH 054/118] can: fix handling of unmodifiable configuration options fix commit bce271f255dae8335dc4d2ee2c4531e09cc67f5a upstream. With upstream commit bb208f144cf3f59 (can: fix handling of unmodifiable configuration options) a new can_validate() function was introduced. When invoking 'ip link set can0 type can' without any configuration data can_validate() tries to validate the content without taking into account that there's totally no content. This patch adds a check for missing content. Reported-by: ajneu Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 910c12e2638e3..348dd5001fa48 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -798,6 +798,9 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[]) * - control mode with CAN_CTRLMODE_FD set */ + if (!data) + return 0; + if (data[IFLA_CAN_CTRLMODE]) { struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]); From d9e1886bddeb99038c127f384c254a7c4997ecc5 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 21 Jun 2016 15:45:47 +0200 Subject: [PATCH 055/118] can: fix oops caused by wrong rtnl dellink usage commit 25e1ed6e64f52a692ba3191c4fde650aab3ecc07 upstream. For 'real' hardware CAN devices the netlink interface is used to set CAN specific communication parameters. Real CAN hardware can not be created nor removed with the ip tool ... This patch adds a private dellink function for the CAN device driver interface that does just nothing. It's a follow up to commit 993e6f2fd ("can: fix oops caused by wrong rtnl newlink usage") but for dellink. 
Reported-by: ajneu Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 348dd5001fa48..ad535a854e5cf 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -1011,6 +1011,11 @@ static int can_newlink(struct net *src_net, struct net_device *dev, return -EOPNOTSUPP; } +static void can_dellink(struct net_device *dev, struct list_head *head) +{ + return; +} + static struct rtnl_link_ops can_link_ops __read_mostly = { .kind = "can", .maxtype = IFLA_CAN_MAX, @@ -1019,6 +1024,7 @@ static struct rtnl_link_ops can_link_ops __read_mostly = { .validate = can_validate, .newlink = can_newlink, .changelink = can_changelink, + .dellink = can_dellink, .get_size = can_get_size, .fill_info = can_fill_info, .get_xstats_size = can_get_xstats_size, From b3a061d1d8288e89a899653fff4ef021df8ed2b3 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 3 Jul 2016 10:54:54 +0200 Subject: [PATCH 056/118] RDS: fix rds_tcp_init() error path commit 3dad5424adfb346c871847d467f97dcdca64ea97 upstream. If register_pernet_subsys() fails, we shouldn't try to call unregister_pernet_subsys(). Fixes: 467fa15356 ("RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.") Cc: Sowmini Varadhan Cc: David S. Miller Signed-off-by: Vegard Nossum Acked-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/rds/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 9d6ddbacd8750..18e50a8fc05f2 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -421,7 +421,7 @@ static int rds_tcp_init(void) ret = rds_tcp_recv_init(); if (ret) - goto out_slab; + goto out_pernet; ret = rds_trans_register(&rds_tcp_transport); if (ret) @@ -433,8 +433,9 @@ static int rds_tcp_init(void) out_recv: rds_tcp_recv_exit(); -out_slab: +out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); +out_slab: kmem_cache_destroy(rds_tcp_conn_slab); out: return ret; From c4c2a8f5b740e3ce527357fba43c68dfc3e982ba Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 23 Jun 2016 15:05:26 -0400 Subject: [PATCH 057/118] SCSI: fix new bug in scsi_dev_info_list string matching commit 5e7ff2ca7f2da55fe777167849d0c93403bd0dc8 upstream. Commit b704f70ce200 ("SCSI: fix bug in scsi_dev_info_list matching") changed the way vendor- and model-string matching was carried out in the routine that looks up entries in a SCSI devinfo list. The new matching code failed to take into account the case of a maximum-length string; in such cases it could end up testing for a terminating '\0' byte beyond the end of the memory allocated to the string. This out-of-bounds bug was detected by UBSAN. I don't know if anybody has actually encountered this bug. The symptom would be that a device entry in the blacklist might not be matched properly if it contained an 8-character vendor name or a 16-character model name. Such entries certainly exist in scsi_static_device_list. This patch fixes the problem by adding a check for a maximum-length string before the '\0' test. Signed-off-by: Alan Stern Fixes: b704f70ce200 ("SCSI: fix bug in scsi_dev_info_list matching") Tested-by: Wilfried Klaebe Signed-off-by: Martin K. 
Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_devinfo.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 93cbefa75b26d..11cdb172cfafd 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -426,7 +426,7 @@ static struct scsi_dev_info_list *scsi_dev_info_list_find(const char *vendor, * here, and we don't know what device it is * trying to work with, leave it as-is. */ - vmax = 8; /* max length of vendor */ + vmax = sizeof(devinfo->vendor); vskip = vendor; while (vmax > 0 && *vskip == ' ') { vmax--; @@ -436,7 +436,7 @@ static struct scsi_dev_info_list *scsi_dev_info_list_find(const char *vendor, while (vmax > 0 && vskip[vmax - 1] == ' ') --vmax; - mmax = 16; /* max length of model */ + mmax = sizeof(devinfo->model); mskip = model; while (mmax > 0 && *mskip == ' ') { mmax--; @@ -452,10 +452,12 @@ static struct scsi_dev_info_list *scsi_dev_info_list_find(const char *vendor, * Behave like the older version of get_device_flags. */ if (memcmp(devinfo->vendor, vskip, vmax) || - devinfo->vendor[vmax]) + (vmax < sizeof(devinfo->vendor) && + devinfo->vendor[vmax])) continue; if (memcmp(devinfo->model, mskip, mmax) || - devinfo->model[mmax]) + (mmax < sizeof(devinfo->model) && + devinfo->model[mmax])) continue; return devinfo; } else { From 87271783380afbd50d13333fadb1e3a93017d5da Mon Sep 17 00:00:00 2001 From: Brian King Date: Mon, 27 Jun 2016 09:09:40 -0500 Subject: [PATCH 058/118] ipr: Clear interrupt on croc/crocodile when running with LSI commit 54e430bbd490e18ab116afa4cd90dcc45787b3df upstream. If we fall back to using LSI on the Croc or Crocodile chip we need to clear the interrupt so we don't hang the system. Tested-by: Benjamin Herrenschmidt Signed-off-by: Brian King Signed-off-by: Martin K. 
Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ipr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 43ac62623bf26..7a58128a00000 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -10095,6 +10095,7 @@ static int ipr_probe_ioa(struct pci_dev *pdev, ioa_cfg->intr_flag = IPR_USE_MSI; else { ioa_cfg->intr_flag = IPR_USE_LSI; + ioa_cfg->clear_isr = 1; ioa_cfg->nvectors = 1; dev_info(&pdev->dev, "Cannot enable MSI.\n"); } From d863bec646a590584eabcb40550bff0708c26b0d Mon Sep 17 00:00:00 2001 From: James Patrick-Evans Date: Fri, 15 Jul 2016 16:40:45 +0100 Subject: [PATCH 059/118] media: fix airspy usb probe error path commit aa93d1fee85c890a34f2510a310e55ee76a27848 upstream. Fix a memory leak on probe error of the airspy usb device driver. The problem is triggered when more than 64 usb devices register with v4l2 of type VFL_TYPE_SDR or VFL_TYPE_SUBDEV. The memory leak is caused by the probe function of the airspy driver mishandling errors and not freeing the corresponding control structures when an error occurs registering the device to v4l2 core. A badusb device can emulate 64 of these devices, and then through continual emulated connect/disconnect of the 65th device, cause the kernel to run out of RAM and crash the kernel, thus causing a local DOS vulnerability.
Fixes CVE-2016-5400 Signed-off-by: James Patrick-Evans Reviewed-by: Kees Cook Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/airspy/airspy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/media/usb/airspy/airspy.c b/drivers/media/usb/airspy/airspy.c index 565a593107476..34b35ebd60ac0 100644 --- a/drivers/media/usb/airspy/airspy.c +++ b/drivers/media/usb/airspy/airspy.c @@ -1073,7 +1073,7 @@ static int airspy_probe(struct usb_interface *intf, if (ret) { dev_err(s->dev, "Failed to register as video device (%d)\n", ret); - goto err_unregister_v4l2_dev; + goto err_free_controls; } dev_info(s->dev, "Registered as %s\n", video_device_node_name(&s->vdev)); @@ -1082,7 +1082,6 @@ static int airspy_probe(struct usb_interface *intf, err_free_controls: v4l2_ctrl_handler_free(&s->hdl); -err_unregister_v4l2_dev: v4l2_device_unregister(&s->v4l2_dev); err_free_mem: kfree(s); From 470f47fcf2a5f9c22081c1c4708e6948e3c2dc13 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Jul 2016 01:39:11 +0300 Subject: [PATCH 060/118] posix_cpu_timer: Exit early when process has been reaped commit 2c13ce8f6b2f6fd9ba2f9261b1939fc0f62d1307 upstream. Variable "now" seems to be genuinely used uninitialized if branch if (CPUCLOCK_PERTHREAD(timer->it_clock)) { is not taken and branch if (unlikely(sighand == NULL)) { is taken. In this case the process has been reaped and the timer is marked as disarmed anyway. So none of the postprocessing of the sample is required. Return right away.
Signed-off-by: Alexey Dobriyan Link: http://lkml.kernel.org/r/20160707223911.GA26483@p183.telecom.by Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/time/posix-cpu-timers.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f5e86d282d520..80016b329d944 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -808,6 +808,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); + return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); unlock_task_sighand(p, &flags); From 1b0b5ca8f498a29c8646a3fd3bd5accbb8f8a156 Mon Sep 17 00:00:00 2001 From: Lukasz Gemborowski Date: Mon, 27 Jun 2016 12:57:47 +0200 Subject: [PATCH 061/118] i2c: mux: reg: wrong condition checked for of_address_to_resource return value commit 22ebf00eb56fe77922de8138aa9af9996582c2b3 upstream. 
of_address_to_resource returns 0 on a successful call, but devm_ioremap_resource is called only if it returns a non-zero value Signed-off-by: Lukasz Gemborowski Reviewed-by: Alexander Sverdlin Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/muxes/i2c-mux-reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c index 5fbd5bd0878f1..49fc2c7e560a6 100644 --- a/drivers/i2c/muxes/i2c-mux-reg.c +++ b/drivers/i2c/muxes/i2c-mux-reg.c @@ -150,7 +150,7 @@ static int i2c_mux_reg_probe_dt(struct regmux *mux, mux->data.idle_in_use = true; /* map address from "reg" if exists */ - if (of_address_to_resource(np, 0, &res)) { + if (of_address_to_resource(np, 0, &res) == 0) { mux->data.reg_size = resource_size(&res); mux->data.reg = devm_ioremap_resource(&pdev->dev, &res); if (IS_ERR(mux->data.reg)) From 79cc80f89c4219fc03644c3b30602184a44fb54e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 18 Jul 2016 18:40:00 -0400 Subject: [PATCH 062/118] libata: LITE-ON CX1-JB256-HP needs lower max_sectors commit 1488a1e3828d60d74c9b802a05e24c0487babe4e upstream. Since 34b48db66e08 ("block: remove artifical max_hw_sectors cap"), max_sectors is no longer limited to BLK_DEF_MAX_SECTORS and LITE-ON CX1-JB256-HP keeps timing out with higher max_sectors. Revert it to the previous value.
Signed-off-by: Tejun Heo Reported-by: dgerasimov@gmail.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=121671 Fixes: 34b48db66e08 ("block: remove artifical max_hw_sectors cap") Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index b79cb10e289e8..bd370c98f77d2 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4138,6 +4138,12 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { */ { "ST380013AS", "3.20", ATA_HORKAGE_MAX_SEC_1024 }, + /* + * Device times out with higher max sects. + * https://bugzilla.kernel.org/show_bug.cgi?id=121671 + */ + { "LITEON CX1-JB256-HP", NULL, ATA_HORKAGE_MAX_SEC_1024 }, + /* Devices we expect to fail diagnostics */ /* Devices where NCQ should be avoided */ From 032951d32c13b7564dfba82758260cb7aa1149d2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 19 Jul 2016 03:50:28 +0200 Subject: [PATCH 063/118] libceph: apply new_state before new_up_client on incrementals commit 930c532869774ebf8af9efe9484c597f896a7d46 upstream. Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. 
A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin Signed-off-by: Greg Kroah-Hartman --- net/ceph/osdmap.c | 156 +++++++++++++++++++++++++++++++++------------- 1 file changed, 113 insertions(+), 43 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d8f581d9f1f7..ddc3573894b09 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1191,6 +1191,115 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) return map; } +/* + * Encoding order is (new_up_client, new_state, new_weight). Need to + * apply in the (new_weight, new_state, new_up_client) order, because + * an incremental map may look like e.g. + * + * new_up_client: { osd=6, addr=... 
} # set osd_state and addr + * new_state: { osd=6, xorstate=EXISTS } # clear osd_state + */ +static int decode_new_up_state_weight(void **p, void *end, + struct ceph_osdmap *map) +{ + void *new_up_client; + void *new_state; + void *new_weight_end; + u32 len; + + new_up_client = *p; + ceph_decode_32_safe(p, end, len, e_inval); + len *= sizeof(u32) + sizeof(struct ceph_entity_addr); + ceph_decode_need(p, end, len, e_inval); + *p += len; + + new_state = *p; + ceph_decode_32_safe(p, end, len, e_inval); + len *= sizeof(u32) + sizeof(u8); + ceph_decode_need(p, end, len, e_inval); + *p += len; + + /* new_weight */ + ceph_decode_32_safe(p, end, len, e_inval); + while (len--) { + s32 osd; + u32 w; + + ceph_decode_need(p, end, 2*sizeof(u32), e_inval); + osd = ceph_decode_32(p); + w = ceph_decode_32(p); + BUG_ON(osd >= map->max_osd); + pr_info("osd%d weight 0x%x %s\n", osd, w, + w == CEPH_OSD_IN ? "(in)" : + (w == CEPH_OSD_OUT ? "(out)" : "")); + map->osd_weight[osd] = w; + + /* + * If we are marking in, set the EXISTS, and clear the + * AUTOOUT and NEW bits. 
+ */ + if (w) { + map->osd_state[osd] |= CEPH_OSD_EXISTS; + map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT | + CEPH_OSD_NEW); + } + } + new_weight_end = *p; + + /* new_state (up/down) */ + *p = new_state; + len = ceph_decode_32(p); + while (len--) { + s32 osd; + u8 xorstate; + int ret; + + osd = ceph_decode_32(p); + xorstate = ceph_decode_8(p); + if (xorstate == 0) + xorstate = CEPH_OSD_UP; + BUG_ON(osd >= map->max_osd); + if ((map->osd_state[osd] & CEPH_OSD_UP) && + (xorstate & CEPH_OSD_UP)) + pr_info("osd%d down\n", osd); + if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && + (xorstate & CEPH_OSD_EXISTS)) { + pr_info("osd%d does not exist\n", osd); + map->osd_weight[osd] = CEPH_OSD_IN; + ret = set_primary_affinity(map, osd, + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); + if (ret) + return ret; + memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr)); + map->osd_state[osd] = 0; + } else { + map->osd_state[osd] ^= xorstate; + } + } + + /* new_up_client */ + *p = new_up_client; + len = ceph_decode_32(p); + while (len--) { + s32 osd; + struct ceph_entity_addr addr; + + osd = ceph_decode_32(p); + ceph_decode_copy(p, &addr, sizeof(addr)); + ceph_decode_addr(&addr); + BUG_ON(osd >= map->max_osd); + pr_info("osd%d up\n", osd); + map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; + map->osd_addr[osd] = addr; + } + + *p = new_weight_end; + return 0; + +e_inval: + return -EINVAL; +} + /* * decode and apply an incremental map update. 
*/ @@ -1290,49 +1399,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, __remove_pg_pool(&map->pg_pools, pi); } - /* new_up */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - u32 osd; - struct ceph_entity_addr addr; - ceph_decode_32_safe(p, end, osd, e_inval); - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); - ceph_decode_addr(&addr); - pr_info("osd%d up\n", osd); - BUG_ON(osd >= map->max_osd); - map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS; - map->osd_addr[osd] = addr; - } - - /* new_state */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - u32 osd; - u8 xorstate; - ceph_decode_32_safe(p, end, osd, e_inval); - xorstate = **(u8 **)p; - (*p)++; /* clean flag */ - if (xorstate == 0) - xorstate = CEPH_OSD_UP; - if (xorstate & CEPH_OSD_UP) - pr_info("osd%d down\n", osd); - if (osd < map->max_osd) - map->osd_state[osd] ^= xorstate; - } - - /* new_weight */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - u32 osd, off; - ceph_decode_need(p, end, sizeof(u32)*2, e_inval); - osd = ceph_decode_32(p); - off = ceph_decode_32(p); - pr_info("osd%d weight 0x%x %s\n", osd, off, - off == CEPH_OSD_IN ? "(in)" : - (off == CEPH_OSD_OUT ? "(out)" : "")); - if (osd < map->max_osd) - map->osd_weight[osd] = off; - } + /* new_up_client, new_state, new_weight */ + err = decode_new_up_state_weight(p, end, map); + if (err) + goto bad; /* new_pg_temp */ err = decode_new_pg_temp(p, end, map); From 703cfaf375e83159d2113774faa53d2c68c86d67 Mon Sep 17 00:00:00 2001 From: Dmitri Epshtein Date: Wed, 6 Jul 2016 04:18:58 +0200 Subject: [PATCH 064/118] net: mvneta: set real interrupt per packet for tx_done commit 06708f81528725148473c0869d6af5f809c6824b upstream. Commit aebea2ba0f74 ("net: mvneta: fix Tx interrupt delay") intended to set coalescing threshold to a value guaranteeing interrupt generation per each sent packet, so that buffers can be released with no delay. 
In fact, setting the threshold to '1' was wrong, because it causes an interrupt every two packets. According to the documentation, the reason is the following: an interrupt occurs once the sent buffers counter reaches a value that is higher than the one specified in MVNETA_TXQ_SIZE_REG(q). This behavior was confirmed during tests. Also, when testing the SoC working as a NAS device, better performance was observed with int-per-packet, as it strongly depends on the fact that all transmitted packets are released immediately. This commit makes the NETA controller work in interrupt-per-sent-packet mode by setting the coalescing threshold to 0. Signed-off-by: Dmitri Epshtein Signed-off-by: Marcin Wojtas Fixes: aebea2ba0f74 ("net: mvneta: fix Tx interrupt delay") Acked-by: Willy Tarreau Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index a4ac6fedac75b..71ec9cb08e067 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -226,7 +226,7 @@ /* Various constants */ /* Coalescing */ -#define MVNETA_TXDONE_COAL_PKTS 1 +#define MVNETA_TXDONE_COAL_PKTS 0 /* interrupt per packet */ #define MVNETA_RX_COAL_PKTS 32 #define MVNETA_RX_COAL_USEC 100 From c800964923a365289152304bdc047f0d470dbcab Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 28 Jun 2016 18:55:23 +0300 Subject: [PATCH 065/118] intel_th: pci: Add Kaby Lake PCH-H support commit 7a1a47ce35821b40f5b2ce46379ba14393bc3873 upstream. This adds Intel(R) Trace Hub PCI ID for Kaby Lake PCH-H. 
Signed-off-by: Alexander Shishkin Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index 641e87936064b..d57a2f75dccf2 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -67,6 +67,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa126), .driver_data = (kernel_ulong_t)0, }, + { + /* Kaby Lake PCH-H */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa2a6), + .driver_data = (kernel_ulong_t)0, + }, { 0 }, }; From 33f9cff6ec2fbfcf0b40d4328d292c745185fdf4 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Jun 2016 11:51:44 +0300 Subject: [PATCH 066/118] intel_th: Fix a deadlock in modprobing commit a36aa80f3cb2540fb1dbad6240852de4365a2e82 upstream. Driver initialization tries to request a hub (GTH) driver module from its probe callback, resulting in a deadlock. This patch solves the problem by adding a deferred work for requesting the hub module. 
Signed-off-by: Alexander Shishkin Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/core.c | 35 ++++++++++++++++++++++++++- drivers/hwtracing/intel_th/intel_th.h | 3 +++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c index 165d3001c3015..c6ec5c62b7a9e 100644 --- a/drivers/hwtracing/intel_th/core.c +++ b/drivers/hwtracing/intel_th/core.c @@ -419,6 +419,38 @@ static struct intel_th_subdevice { }, }; +#ifdef CONFIG_MODULES +static void __intel_th_request_hub_module(struct work_struct *work) +{ + struct intel_th *th = container_of(work, struct intel_th, + request_module_work); + + request_module("intel_th_%s", th->hub->name); +} + +static int intel_th_request_hub_module(struct intel_th *th) +{ + INIT_WORK(&th->request_module_work, __intel_th_request_hub_module); + schedule_work(&th->request_module_work); + + return 0; +} + +static void intel_th_request_hub_module_flush(struct intel_th *th) +{ + flush_work(&th->request_module_work); +} +#else +static inline int intel_th_request_hub_module(struct intel_th *th) +{ + return -EINVAL; +} + +static inline void intel_th_request_hub_module_flush(struct intel_th *th) +{ +} +#endif /* CONFIG_MODULES */ + static int intel_th_populate(struct intel_th *th, struct resource *devres, unsigned int ndevres, int irq) { @@ -488,7 +520,7 @@ static int intel_th_populate(struct intel_th *th, struct resource *devres, /* need switch driver to be loaded to enumerate the rest */ if (subdev->type == INTEL_TH_SWITCH && !req) { th->hub = thdev; - err = request_module("intel_th_%s", subdev->name); + err = intel_th_request_hub_module(th); if (!err) req++; } @@ -603,6 +635,7 @@ void intel_th_free(struct intel_th *th) { int i; + intel_th_request_hub_module_flush(th); for (i = 0; i < TH_SUBDEVICE_MAX; i++) if (th->thdev[i] != th->hub) intel_th_device_remove(th->thdev[i]); diff --git a/drivers/hwtracing/intel_th/intel_th.h 
b/drivers/hwtracing/intel_th/intel_th.h index 57fd72b20fae3..d03a6cd1c65d5 100644 --- a/drivers/hwtracing/intel_th/intel_th.h +++ b/drivers/hwtracing/intel_th/intel_th.h @@ -197,6 +197,9 @@ struct intel_th { int id; int major; +#ifdef CONFIG_MODULES + struct work_struct request_module_work; +#endif /* CONFIG_MODULES */ #ifdef CONFIG_INTEL_TH_DEBUG struct dentry *dbg; #endif From 8e510cd92199e863bd457f6b56ad85e00dfb3cb3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 3 Aug 2016 13:44:27 +0200 Subject: [PATCH 067/118] vfs: fix deadlock in file_remove_privs() on overlayfs commit c1892c37769cf89c7e7ba57528ae2ccb5d153c9b upstream. file_remove_privs() is called with inode lock on file_inode(), which proceeds to calling notify_change() on file->f_path.dentry. Which triggers the WARN_ON_ONCE(!inode_is_locked(inode)) in addition to deadlocking later when ovl_setattr tries to lock the underlying inode again. Fix this mess by not mixing the layers, but doing everything on underlying dentry/inode. 
Signed-off-by: Miklos Szeredi Fixes: 07a2daab49c5 ("ovl: Copy up underlying inode's ->i_mode to overlay inode") Signed-off-by: Greg Kroah-Hartman --- fs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 1be5f9003eb38..b0edef500590c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1733,8 +1733,8 @@ static int __remove_privs(struct dentry *dentry, int kill) */ int file_remove_privs(struct file *file) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = d_inode(dentry); + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); int kill; int error = 0; @@ -1742,7 +1742,7 @@ int file_remove_privs(struct file *file) if (IS_NOSEC(inode)) return 0; - kill = file_needs_remove_privs(file); + kill = dentry_needs_remove_privs(dentry); if (kill < 0) return kill; if (kill) From 133cec911c639d2cdf544ed602442951f702e08c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 10 Aug 2016 11:49:43 +0200 Subject: [PATCH 068/118] Linux 4.4.17 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index da7621cadc8e5..76d34f763a412 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 16 +SUBLEVEL = 17 EXTRAVERSION = NAME = Blurry Fish Butt From 72c2d3bccaba4a0a4de354f9d2d24eccd05bfccf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Jul 2016 10:04:02 +0200 Subject: [PATCH 069/118] tcp: make challenge acks less predictable [ Upstream commit 75ff39ccc1bd5d3c455b6822ab09e533c551f758 ] Yue Cao claims that current host rate limiting of challenge ACKS (RFC 5961) could leak enough information to allow a patient attacker to hijack TCP sessions. He will soon provide details in an academic paper. This patch increases the default limit from 100 to 1000, and adds some randomization so that the attacker can no longer hijack sessions without spending a considerable amount of probes. 
Based on initial analysis and patch from Linus. Note that we also have per socket rate limiting, so it is tempting to remove the host limit in the future. v2: randomize the count of challenge acks per second, not the period. Fixes: 282f23c6ee34 ("tcp: implement RFC 5961 3.2") Reported-by: Yue Cao Signed-off-by: Eric Dumazet Suggested-by: Linus Torvalds Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d4c51158470f5..05f10df6ee861 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,7 +89,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 100; +int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -3427,7 +3427,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); - u32 now; + u32 count, now; /* First check our per-socket dupack rate limit. */ if (tcp_oow_rate_limited(sock_net(sk), skb, @@ -3435,13 +3435,18 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) &tp->last_oow_ack_time)) return; - /* Then check the check host-wide RFC 5961 rate limit. */ + /* Then check host-wide RFC 5961 rate limit. 
*/ now = jiffies / HZ; if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + challenge_timestamp = now; - challenge_count = 0; + WRITE_ONCE(challenge_count, half + + prandom_u32_max(sysctl_tcp_challenge_ack_limit)); } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + count = READ_ONCE(challenge_count); + if (count > 0) { + WRITE_ONCE(challenge_count, count - 1); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } From 5413f1a526d2d51d7a5768133c90936c017165c6 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 14 Jul 2016 11:38:40 -0400 Subject: [PATCH 070/118] tcp: enable per-socket rate limiting of all 'challenge acks' [ Upstream commit 083ae308280d13d187512b9babe3454342a7987e ] The per-socket rate limit for 'challenge acks' was introduced in the context of limiting ack loops: commit f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock") And I think it can be extended to rate limit all 'challenge acks' on a per-socket basis. Since we have the global tcp_challenge_ack_limit, this patch allows for tcp_challenge_ack_limit to be set to a large value and effectively rely on the per-socket limit, or set tcp_challenge_ack_limit to a lower value and still prevents a single connection from consuming the entire challenge ack quota. It further moves in the direction of eliminating the global limit at some point, as Eric Dumazet has suggested. This is a follow-up to: Subject: tcp: make challenge acks less predictable Cc: Eric Dumazet Cc: David S. Miller Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Yue Cao Signed-off-by: Jason Baron Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 05f10df6ee861..12b98e257c5f2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3390,6 +3390,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } +static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, + u32 *last_oow_ack_time) +{ + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS @@ -3403,21 +3420,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, /* Data packets without SYNs are not likely part of an ACK loop. */ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && !tcp_hdr(skb)->syn) - goto not_rate_limited; - - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); - - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { - NET_INC_STATS_BH(net, mib_idx); - return true; /* rate-limited: don't send yet! */ - } - } - - *last_oow_ack_time = tcp_time_stamp; + return false; -not_rate_limited: - return false; /* not rate-limited: go ahead, send dupack now! 
*/ + return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } /* RFC 5961 7 [ACK Throttling] */ @@ -3430,9 +3435,9 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) u32 count, now; /* First check our per-socket dupack rate limit. */ - if (tcp_oow_rate_limited(sock_net(sk), skb, - LINUX_MIB_TCPACKSKIPPEDCHALLENGE, - &tp->last_oow_ack_time)) + if (__tcp_oow_rate_limited(sock_net(sk), + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) return; /* Then check host-wide RFC 5961 rate limit. */ From a9c221859696f976ba47ba39178af1175e4558e0 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 10 Jul 2016 21:11:55 +0300 Subject: [PATCH 071/118] ipv4: reject RTNH_F_DEAD and RTNH_F_LINKDOWN from user space [ Upstream commit 80610229ef7b26615dbb6cb6e873709a60bacc9f ] Vegard Nossum is reporting for a crash in fib_dump_info when nh_dev = NULL and fib_nhs == 1: Pid: 50, comm: netlink.exe Not tainted 4.7.0-rc5+ RIP: 0033:[<00000000602b3d18>] RSP: 0000000062623890 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 000000006261b800 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000024 RDI: 000000006245ba00 RBP: 00000000626238f0 R08: 000000000000029c R09: 0000000000000000 R10: 0000000062468038 R11: 000000006245ba00 R12: 000000006245ba00 R13: 00000000625f96c0 R14: 00000000601e16f0 R15: 0000000000000000 Kernel panic - not syncing: Kernel mode fault at addr 0x2e0, ip 0x602b3d18 CPU: 0 PID: 50 Comm: netlink.exe Not tainted 4.7.0-rc5+ #581 Stack: 626238f0 960226a02 00000400 000000fe 62623910 600afca7 62623970 62623a48 62468038 00000018 00000000 00000000 Call Trace: [<602b3e93>] rtmsg_fib+0xd3/0x190 [<602b6680>] fib_table_insert+0x260/0x500 [<602b0e5d>] inet_rtm_newroute+0x4d/0x60 [<60250def>] rtnetlink_rcv_msg+0x8f/0x270 [<60267079>] netlink_rcv_skb+0xc9/0xe0 [<60250d4b>] rtnetlink_rcv+0x3b/0x50 [<60265400>] netlink_unicast+0x1a0/0x2c0 [<60265e47>] netlink_sendmsg+0x3f7/0x470 [<6021dc9a>] sock_sendmsg+0x3a/0x90 [<6021e0d0>] 
___sys_sendmsg+0x300/0x360 [<6021fa64>] __sys_sendmsg+0x54/0xa0 [<6021fac0>] SyS_sendmsg+0x10/0x20 [<6001ea68>] handle_syscall+0x88/0x90 [<600295fd>] userspace+0x3fd/0x500 [<6001ac55>] fork_handler+0x85/0x90 $ addr2line -e vmlinux -i 0x602b3d18 include/linux/inetdevice.h:222 net/ipv4/fib_semantics.c:1264 The problem happens when RTNH_F_LINKDOWN is provided from user space when creating routes that do not use the flag; it was caught with a netlink fuzzer. Currently, the kernel allows user space to set both flags in nh_flags and fib_flags, but this is not intentional; the assumption was that they are not set. Fix this by rejecting both flags with EINVAL. Reported-by: Vegard Nossum Fixes: 0eeb075fad73 ("net: ipv4 sysctl option to ignore routes when nexthop link is down") Signed-off-by: Julian Anastasov Cc: Andy Gospodarek Cc: Dinesh Dutt Cc: Scott Feldman Reviewed-by: Andy Gospodarek Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/fib_semantics.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2b68418c71980..ffe95d954007d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -479,6 +479,9 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; + if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + return -EINVAL; + nexthop_nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; nexthop_nh->nh_oif = rtnh->rtnh_ifindex; @@ -1003,6 +1006,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (fib_props[cfg->fc_type].scope > cfg->fc_scope) goto err_inval; + if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + goto err_inval; + #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); From 0020fa536cc610216f80b798b9a1c9b13c3a37fb Mon Sep 17 00:00:00 2001 From: Beniamino Galvani Date: Wed, 13 Jul 2016 18:25:08 +0200 Subject: [PATCH 072/118] 
bonding: set carrier off for devices created through netlink [ Upstream commit 005db31d5f5f7c31cfdc43505d77eb3ca5cf8ec6 ] Commit e826eafa65c6 ("bonding: Call netif_carrier_off after register_netdevice") moved netif_carrier_off() from bond_init() to bond_create(), but the latter is called only for initial default devices and ones created through sysfs: $ modprobe bonding $ echo +bond1 > /sys/class/net/bonding_masters $ ip link add bond2 type bond $ grep "MII Status" /proc/net/bonding/* /proc/net/bonding/bond0:MII Status: down /proc/net/bonding/bond1:MII Status: down /proc/net/bonding/bond2:MII Status: up Ensure that carrier is initially off also for devices created through netlink. Signed-off-by: Beniamino Galvani Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/bonding/bond_netlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index db760e84119fc..b8df0f5e8c25a 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -446,7 +446,11 @@ static int bond_newlink(struct net *src_net, struct net_device *bond_dev, if (err < 0) return err; - return register_netdevice(bond_dev); + err = register_netdevice(bond_dev); + + netif_carrier_off(bond_dev); + + return err; } static size_t bond_get_size(const struct net_device *bond_dev) From 863c8bb8be39ad11f0d9d66a431b3d9ca5c11dd7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Jul 2016 15:42:52 -0700 Subject: [PATCH 073/118] net: bgmac: Fix infinite loop in bgmac_dma_tx_add() [ Upstream commit e86663c475d384ab5f46cb5637e9b7ad08c5c505 ] Nothing is decrementing the index "i" while we are cleaning up the fragments we could not successfully transmit. Fixes: 9cde94506eacf ("bgmac: implement scatter/gather support") Reported-by: coverity (CID 1352048) Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/bgmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 28f7610b03feb..c32f5d32f8118 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -219,7 +219,7 @@ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac, dma_unmap_single(dma_dev, slot->dma_addr, skb_headlen(skb), DMA_TO_DEVICE); - while (i > 0) { + while (i-- > 0) { int index = (ring->end + i) % BGMAC_TX_RING_SLOTS; struct bgmac_slot_info *slot = &ring->slots[index]; u32 ctl1 = le32_to_cpu(ring->cpu_base[index].ctl1); From fc9b7c086b6743aa4b1a70ada58352c665ada49a Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 23 Jul 2016 07:43:50 +0200 Subject: [PATCH 074/118] net/irda: fix NULL pointer dereference on memory allocation failure [ Upstream commit d3e6952cfb7ba5f4bfa29d4803ba91f96ce1204d ] I ran into this: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 2 PID: 2012 Comm: trinity-c3 Not tainted 4.7.0-rc7+ #19 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff8800b745f2c0 ti: ffff880111740000 task.ti: ffff880111740000 RIP: 0010:[] [] irttp_connect_request+0x36/0x710 RSP: 0018:ffff880111747bb8 EFLAGS: 00010286 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000069dd8358 RDX: 0000000000000009 RSI: 0000000000000027 RDI: 0000000000000048 RBP: ffff880111747c00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000069dd8358 R11: 1ffffffff0759723 R12: 0000000000000000 R13: ffff88011a7e4780 R14: 0000000000000027 R15: 0000000000000000 FS: 00007fc738404700(0000) GS:ffff88011af00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc737fdfb10 CR3: 0000000118087000 CR4: 00000000000006e0 
Stack: 0000000000000200 ffff880111747bd8 ffffffff810ee611 ffff880119f1f220 ffff880119f1f4f8 ffff880119f1f4f0 ffff88011a7e4780 ffff880119f1f232 ffff880119f1f220 ffff880111747d58 ffffffff82bca542 0000000000000000 Call Trace: [] irda_connect+0x562/0x1190 [] SYSC_connect+0x202/0x2a0 [] SyS_connect+0x9/0x10 [] do_syscall_64+0x19c/0x410 [] entry_SYSCALL64_slow_path+0x25/0x25 Code: 41 89 ca 48 89 e5 41 57 41 56 41 55 41 54 41 89 d7 53 48 89 fb 48 83 c7 48 48 89 fa 41 89 f6 48 c1 ea 03 48 83 ec 20 4c 8b 65 10 <0f> b6 04 02 84 c0 74 08 84 c0 0f 8e 4c 04 00 00 80 7b 48 00 74 RIP [] irttp_connect_request+0x36/0x710 RSP ---[ end trace 4cda2588bc055b30 ]--- The problem is that irda_open_tsap() can fail and leave self->tsap = NULL, and then irttp_connect_request() almost immediately dereferences it. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/irda/af_irda.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 923abd6b30640..8d2f7c9b491da 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1024,8 +1024,11 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr, } /* Check if we have opened a local TSAP */ - if (!self->tsap) - irda_open_tsap(self, LSAP_ANY, addr->sir_name); + if (!self->tsap) { + err = irda_open_tsap(self, LSAP_ANY, addr->sir_name); + if (err) + goto out; + } /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; From e23696bc441f5e4fefb18e81d51069632480f64a Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Mon, 25 Jul 2016 19:07:46 +0300 Subject: [PATCH 075/118] qed: Fix setting/clearing bit in completion bitmap [ Upstream commit 59d3f1ceb69b54569685d0c34dff16a1e0816b19 ] Slowpath completion handling is incorrectly changing SPQ_RING_SIZE bits instead of a single one. 
Fixes: 76a9a3642a0b ("qed: fix handling of concurrent ramrods") Signed-off-by: Manish Chopra Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qed/qed_spq.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index 3dd548ab8df14..40365cb1abe6c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -794,13 +794,12 @@ int qed_spq_completion(struct qed_hwfn *p_hwfn, * in a bitmap and increasing the chain consumer only * for the first successive completed entries. */ - bitmap_set(p_spq->p_comp_bitmap, pos, SPQ_RING_SIZE); + __set_bit(pos, p_spq->p_comp_bitmap); while (test_bit(p_spq->comp_bitmap_idx, p_spq->p_comp_bitmap)) { - bitmap_clear(p_spq->p_comp_bitmap, - p_spq->comp_bitmap_idx, - SPQ_RING_SIZE); + __clear_bit(p_spq->comp_bitmap_idx, + p_spq->p_comp_bitmap); p_spq->comp_bitmap_idx++; qed_chain_return_produced(&p_spq->chain); } From 9c946c931b63068c4197d9d0b4d24635418bc67d Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Fri, 29 Jul 2016 09:34:02 -0400 Subject: [PATCH 076/118] tcp: consider recv buf for the initial window scale [ Upstream commit f626300a3e776ccc9671b0dd94698fb3aa315966 ] tcp_select_initial_window() intends to advertise a window scaling for the maximum possible window size. To do so, it considers the maximum of net.ipv4.tcp_rmem[2] and net.core.rmem_max as the only possible upper-bounds. However, users with CAP_NET_ADMIN can use SO_RCVBUFFORCE to set the socket's receive buffer size to values larger than net.ipv4.tcp_rmem[2] and net.core.rmem_max. Thus, SO_RCVBUFFORCE is effectively ignored by tcp_select_initial_window(). To fix this, consider the maximum of net.ipv4.tcp_rmem[2], net.core.rmem_max and socket's initial buffer space. 
Fixes: b0573dea1fb3 ("[NET]: Introduce SO_{SND,RCV}BUFFORCE socket options") Signed-off-by: Soheil Hassas Yeganeh Suggested-by: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7c9883ab56e54..660c967ba84a0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -239,7 +239,8 @@ void tcp_select_initial_window(int __space, __u32 mss, /* Set window scaling on max possible window * See RFC1323 for an explanation of the limit to 14 */ - space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); + space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > 65535 && (*rcv_wscale) < 14) { space >>= 1; From 694dfd0ef02ded5b6fbea03a12350ee8a74921d5 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 31 May 2016 03:33:57 +0100 Subject: [PATCH 077/118] ipath: Restrict use of the write() interface Commit e6bd18f57aad ("IB/security: Restrict use of the write() interface") fixed a security problem with various write() implementations in the Infiniband subsystem. In older kernel versions the ipath_write() function has the same problem and needs the same restriction. (The ipath driver has been completely removed upstream.) 
Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rdma/ipath/ipath_file_ops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c index 13c3cd11ab92a..05d30f433b194 100644 --- a/drivers/staging/rdma/ipath/ipath_file_ops.c +++ b/drivers/staging/rdma/ipath/ipath_file_ops.c @@ -45,6 +45,8 @@ #include #include +#include + #include "ipath_kernel.h" #include "ipath_common.h" #include "ipath_user_sdma.h" @@ -2243,6 +2245,9 @@ static ssize_t ipath_write(struct file *fp, const char __user *data, ssize_t ret = 0; void *dest; + if (WARN_ON_ONCE(!ib_safe_file_access(fp))) + return -EACCES; + if (count < sizeof(cmd.type)) { ret = -EINVAL; goto bail; From 5a6f9d06d844763261f89850f33a4b84cfc0f1c1 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 1 Dec 2015 10:16:42 +0100 Subject: [PATCH 078/118] scsi: ignore errors from scsi_dh_add_device() commit 221255aee67ec1c752001080aafec0c4e9390d95 upstream. device handler initialisation might fail due to a number of reasons. But as device_handlers are optional this shouldn't cause us to disable the device entirely. So just ignore errors from scsi_dh_add_device(). Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Hannes Reinecke Signed-off-by: Martin K. 
Petersen Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_sysfs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index f7ae898833dd9..7232d43e2207d 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -1058,11 +1058,12 @@ int scsi_sysfs_add_sdev(struct scsi_device *sdev) } error = scsi_dh_add_device(sdev); - if (error) { + if (error) + /* + * device_handler is optional, so any error can be ignored + */ sdev_printk(KERN_INFO, sdev, "failed to add device handler: %d\n", error); - return error; - } device_enable_async_suspend(&sdev->sdev_dev); error = device_add(&sdev->sdev_dev); From 02170f4afcb4514270fcd39cec05650b7858c605 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 3 Feb 2016 01:00:29 +0100 Subject: [PATCH 079/118] PNP: Add Haswell-ULT to Intel MCH size workaround commit ed1f0eeebaeeb7f790e9e7642116a208581e5bfc upstream. Add device ID 0x0a04 for Haswell-ULT to the list of devices with MCH problems. 
From a Lenovo ThinkPad T440S: [ 0.188604] pnp: PnP ACPI init [ 0.189044] system 00:00: [mem 0x00000000-0x0009ffff] could not be reserved [ 0.189048] system 00:00: [mem 0x000c0000-0x000c3fff] could not be reserved [ 0.189050] system 00:00: [mem 0x000c4000-0x000c7fff] could not be reserved [ 0.189052] system 00:00: [mem 0x000c8000-0x000cbfff] could not be reserved [ 0.189054] system 00:00: [mem 0x000cc000-0x000cffff] could not be reserved [ 0.189056] system 00:00: [mem 0x000d0000-0x000d3fff] has been reserved [ 0.189058] system 00:00: [mem 0x000d4000-0x000d7fff] has been reserved [ 0.189060] system 00:00: [mem 0x000d8000-0x000dbfff] has been reserved [ 0.189061] system 00:00: [mem 0x000dc000-0x000dffff] has been reserved [ 0.189063] system 00:00: [mem 0x000e0000-0x000e3fff] could not be reserved [ 0.189065] system 00:00: [mem 0x000e4000-0x000e7fff] could not be reserved [ 0.189067] system 00:00: [mem 0x000e8000-0x000ebfff] could not be reserved [ 0.189069] system 00:00: [mem 0x000ec000-0x000effff] could not be reserved [ 0.189071] system 00:00: [mem 0x000f0000-0x000fffff] could not be reserved [ 0.189073] system 00:00: [mem 0x00100000-0xdf9fffff] could not be reserved [ 0.189075] system 00:00: [mem 0xfec00000-0xfed3ffff] could not be reserved [ 0.189078] system 00:00: [mem 0xfed4c000-0xffffffff] could not be reserved [ 0.189082] system 00:00: Plug and Play ACPI device, IDs PNP0c01 (active) [ 0.189216] system 00:01: [io 0x1800-0x189f] could not be reserved [ 0.189220] system 00:01: [io 0x0800-0x087f] has been reserved [ 0.189222] system 00:01: [io 0x0880-0x08ff] has been reserved [ 0.189224] system 00:01: [io 0x0900-0x097f] has been reserved [ 0.189226] system 00:01: [io 0x0980-0x09ff] has been reserved [ 0.189229] system 00:01: [io 0x0a00-0x0a7f] has been reserved [ 0.189231] system 00:01: [io 0x0a80-0x0aff] has been reserved [ 0.189233] system 00:01: [io 0x0b00-0x0b7f] has been reserved [ 0.189235] system 00:01: [io 0x0b80-0x0bff] has been reserved [ 0.189238] 
system 00:01: [io 0x15e0-0x15ef] has been reserved [ 0.189240] system 00:01: [io 0x1600-0x167f] has been reserved [ 0.189242] system 00:01: [io 0x1640-0x165f] has been reserved [ 0.189246] system 00:01: [mem 0xf8000000-0xfbffffff] could not be reserved [ 0.189249] system 00:01: [mem 0x00000000-0x00000fff] could not be reserved [ 0.189251] system 00:01: [mem 0xfed1c000-0xfed1ffff] has been reserved [ 0.189254] system 00:01: [mem 0xfed10000-0xfed13fff] has been reserved [ 0.189256] system 00:01: [mem 0xfed18000-0xfed18fff] has been reserved [ 0.189258] system 00:01: [mem 0xfed19000-0xfed19fff] has been reserved [ 0.189261] system 00:01: [mem 0xfed45000-0xfed4bfff] has been reserved [ 0.189264] system 00:01: Plug and Play ACPI device, IDs PNP0c02 (active) [....] [ 0.583653] resource sanity check: requesting [mem 0xfed10000-0xfed15fff], which spans more than pnp 00:01 [mem 0xfed10000-0xfed13fff] [ 0.583654] ------------[ cut here ]------------ [ 0.583660] WARNING: CPU: 0 PID: 1 at arch/x86/mm/ioremap.c:198 __ioremap_caller+0x2c5/0x380() [ 0.583661] Info: mapping multiple BARs. Your kernel is fine. [ 0.583662] Modules linked in: [ 0.583666] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.3.3-303.fc23.x86_64 #1 [ 0.583668] Hardware name: LENOVO 20AR001GXS/20AR001GXS, BIOS GJET86WW (2.36 ) 12/04/2015 [ 0.583670] 0000000000000000 0000000014cf7e59 ffff880214a1baf8 ffffffff813a625f [ 0.583673] ffff880214a1bb40 ffff880214a1bb30 ffffffff810a07c2 00000000fed10000 [ 0.583675] ffffc90000cb8000 0000000000006000 0000000000000000 ffff8800d6381040 [ 0.583678] Call Trace: [ 0.583683] [] dump_stack+0x44/0x55 [ 0.583686] [] warn_slowpath_common+0x82/0xc0 [ 0.583688] [] warn_slowpath_fmt+0x5c/0x80 [ 0.583692] [] ? 
iomem_map_sanity_check+0xba/0xd0 [ 0.583695] [] __ioremap_caller+0x2c5/0x380 [ 0.583698] [] ioremap_nocache+0x17/0x20 [ 0.583701] [] snb_uncore_imc_init_box+0x79/0xb0 [ 0.583705] [] uncore_pci_probe+0xd0/0x1b0 [ 0.583707] [] local_pci_probe+0x45/0xa0 [ 0.583710] [] pci_device_probe+0xfd/0x140 [ 0.583713] [] driver_probe_device+0x222/0x480 [ 0.583715] [] __driver_attach+0x84/0x90 [ 0.583717] [] ? driver_probe_device+0x480/0x480 [ 0.583720] [] bus_for_each_dev+0x6c/0xc0 [ 0.583722] [] driver_attach+0x1e/0x20 [ 0.583724] [] bus_add_driver+0x1eb/0x280 [ 0.583727] [] ? uncore_cpu_setup+0x12/0x12 [ 0.583729] [] driver_register+0x60/0xe0 [ 0.583733] [] __pci_register_driver+0x4c/0x50 [ 0.583736] [] intel_uncore_init+0xe2/0x2e6 [ 0.583738] [] ? uncore_cpu_setup+0x12/0x12 [ 0.583741] [] do_one_initcall+0xb3/0x200 [ 0.583745] [] ? parse_args+0x1a0/0x4a0 [ 0.583749] [] kernel_init_freeable+0x189/0x223 [ 0.583752] [] ? rest_init+0x80/0x80 [ 0.583754] [] kernel_init+0xe/0xe0 [ 0.583758] [] ret_from_fork+0x3f/0x70 [ 0.583760] [] ? rest_init+0x80/0x80 [ 0.583765] ---[ end trace 077c426a39e018aa ]--- 00:00.0 Host bridge [0600]: Intel Corporation Haswell-ULT DRAM Controller [8086:0a04] (rev 0b) Subsystem: Lenovo Device [17aa:220c] Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=fast >TAbort- SERR- Kernel driver in use: hsw_uncore Link: https://bugzilla.redhat.com/show_bug.cgi?id=1300955 Tested-by: Signed-off-by: Josh Boyer Signed-off-by: Rafael J. 
Wysocki Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/pnp/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pnp/quirks.c b/drivers/pnp/quirks.c index 943c1cb9566c8..f5444f7ecc411 100644 --- a/drivers/pnp/quirks.c +++ b/drivers/pnp/quirks.c @@ -342,6 +342,7 @@ static void quirk_amd_mmconfig_area(struct pnp_dev *dev) /* Device IDs of parts that have 32KB MCH space */ static const unsigned int mch_quirk_devices[] = { 0x0154, /* Ivy Bridge */ + 0x0a04, /* Haswell-ULT */ 0x0c00, /* Haswell */ }; From d71d4aceae67acf0dd95fa288439d9801f76c9cb Mon Sep 17 00:00:00 2001 From: Christophe Le Roy Date: Fri, 11 Dec 2015 09:13:42 +0100 Subject: [PATCH 080/118] PNP: Add Broadwell to Intel MCH size workaround commit a77060f07ffc6ac978e280e738302f3e5572a99e upstream. Add device ID 0x1604 for Broadwell to commit cb171f7abb9a ("PNP: Work around BIOS defects in Intel MCH area reporting"). From a Lenovo ThinkPad T550: system 00:01: [io 0x1800-0x189f] could not be reserved system 00:01: [io 0x0800-0x087f] has been reserved system 00:01: [io 0x0880-0x08ff] has been reserved system 00:01: [io 0x0900-0x097f] has been reserved system 00:01: [io 0x0980-0x09ff] has been reserved system 00:01: [io 0x0a00-0x0a7f] has been reserved system 00:01: [io 0x0a80-0x0aff] has been reserved system 00:01: [io 0x0b00-0x0b7f] has been reserved system 00:01: [io 0x0b80-0x0bff] has been reserved system 00:01: [io 0x15e0-0x15ef] has been reserved system 00:01: [io 0x1600-0x167f] has been reserved system 00:01: [io 0x1640-0x165f] has been reserved system 00:01: [mem 0xf8000000-0xfbffffff] could not be reserved system 00:01: [mem 0xfed1c000-0xfed1ffff] has been reserved system 00:01: [mem 0xfed10000-0xfed13fff] has been reserved system 00:01: [mem 0xfed18000-0xfed18fff] has been reserved system 00:01: [mem 0xfed19000-0xfed19fff] has been reserved system 00:01: [mem 0xfed45000-0xfed4bfff] has been reserved system 00:01: Plug and Play ACPI device, IDs PNP0c02 (active) [...] 
resource sanity check: requesting [mem 0xfed10000-0xfed15fff], which spans more than pnp 00:01 [mem 0xfed10000-0xfed13fff] ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1 at /build/linux-CrHvZ_/linux-4.2.6/arch/x86/mm/ioremap.c:198 __ioremap_caller+0x2ee/0x360() Info: mapping multiple BARs. Your kernel is fine. Modules linked in: CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.2.0-1-amd64 #1 Debian 4.2.6-1 Hardware name: LENOVO 20CKCTO1WW/20CKCTO1WW, BIOS N11ET34W (1.10 ) 08/20/2015 0000000000000000 ffffffff817e6868 ffffffff8154e2f6 ffff8802241efbf8 ffffffff8106e5b1 ffffc90000e98000 0000000000006000 ffffc90000e98000 0000000000006000 0000000000000000 ffffffff8106e62a ffffffff817e68c8 Call Trace: [] ? dump_stack+0x40/0x50 [] ? warn_slowpath_common+0x81/0xb0 [] ? warn_slowpath_fmt+0x4a/0x50 [] ? iomem_map_sanity_check+0xb3/0xc0 [] ? __ioremap_caller+0x2ee/0x360 [] ? snb_uncore_imc_init_box+0x66/0x90 [] ? uncore_pci_probe+0xc8/0x1a0 [] ? local_pci_probe+0x3f/0xa0 [] ? pci_device_probe+0xc4/0x110 [] ? driver_probe_device+0x1ee/0x450 [] ? __driver_attach+0x7b/0x80 [] ? driver_probe_device+0x450/0x450 [] ? bus_for_each_dev+0x5a/0x90 [] ? bus_add_driver+0x1f1/0x290 [] ? uncore_cpu_setup+0xc/0xc [] ? driver_register+0x5f/0xe0 [] ? intel_uncore_init+0xcc/0x2b0 [] ? uncore_cpu_setup+0xc/0xc [] ? do_one_initcall+0xce/0x200 [] ? parse_args+0x140/0x4e0 [] ? kernel_init_freeable+0x162/0x1e8 [] ? rest_init+0x80/0x80 [] ? kernel_init+0xe/0xf0 [] ? ret_from_fork+0x3f/0x70 [] ? 
rest_init+0x80/0x80 ---[ end trace 472e7959536abf12 ]--- 00:00.0 Host bridge: Intel Corporation Broadwell-U Host Bridge -OPI (rev 09) Subsystem: Lenovo Device 2223 Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=fast >TAbort- SERR- Kernel driver in use: bdw_uncore 00: 86 80 04 16 06 00 90 20 09 00 00 06 00 00 00 00 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20: 00 00 00 00 00 00 00 00 00 00 00 00 aa 17 23 22 30: 00 00 00 00 e0 00 00 00 00 00 00 00 00 00 00 00 Signed-off-by: Christophe Le Roy Signed-off-by: Rafael J. Wysocki Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/pnp/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pnp/quirks.c b/drivers/pnp/quirks.c index f5444f7ecc411..d28e3ab9479c6 100644 --- a/drivers/pnp/quirks.c +++ b/drivers/pnp/quirks.c @@ -344,6 +344,7 @@ static const unsigned int mch_quirk_devices[] = { 0x0154, /* Ivy Bridge */ 0x0a04, /* Haswell-ULT */ 0x0c00, /* Haswell */ + 0x1604, /* Broadwell */ }; static struct pci_dev *get_intel_host(void) From 6e1242497cdf8274b8a27f24325634089c77285e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 8 Jan 2016 17:58:49 +0100 Subject: [PATCH 081/118] HID: sony: do not bail out when the sixaxis refuses the output report commit 19f4c2ba869517048add62c202f9645b6adf5dfb upstream. When setting the operational mode, some third party (Speedlink Strike-FX) gamepads refuse the output report. Failing here means we refuse to initialize the gamepad while this should be harmless. The weird part is that the initial commit that added this: a7de9b8 ("HID: sony: Enable Gasia third-party PS3 controllers") mentions this very same controller as one requiring this output report. Anyway, it's broken for one user at least, so let's change it. We will report an error, but at least the controller should work. 
And no, these devices present themselves as legacy Sony controllers (VID:PID of 054C:0268, as in the official ones) so there are no ways of discriminating them from the official ones. https://bugzilla.redhat.com/show_bug.cgi?id=1255325 Reported-and-tested-by: Max Fedotov Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-sony.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 774cd22105665..21febbb0d84e6 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -1418,8 +1418,10 @@ static int sixaxis_set_operational_usb(struct hid_device *hdev) } ret = hid_hw_output_report(hdev, buf, 1); - if (ret < 0) - hid_err(hdev, "can't set operational mode: step 3\n"); + if (ret < 0) { + hid_info(hdev, "can't set operational mode: step 3, ignoring\n"); + ret = 0; + } out: kfree(buf); From 979a61a02992e2029fcedcdf32c05050aa652c9c Mon Sep 17 00:00:00 2001 From: Hector Marco-Gisbert Date: Thu, 10 Mar 2016 20:51:00 +0100 Subject: [PATCH 082/118] x86/mm/32: Enable full randomization on i386 and X86_32 commit 8b8addf891de8a00e4d39fc32f93f7c5eb8feceb upstream. Currently on i386 and on X86_64 when emulating X86_32 in legacy mode, only the stack and the executable are randomized but not other mmapped files (libraries, vDSO, etc.). This patch enables randomization for the libraries, vDSO and mmap requests on i386 and in X86_32 in legacy mode. By default on i386 there are 8 bits for the randomization of the libraries, vDSO and mmaps which only uses 1MB of VA. This patch preserves the original randomness, using 1MB of VA out of 3GB or 4GB. We think that 1MB out of 3GB is not a big cost for having the ASLR. 
The first obvious security benefit is that all objects are randomized (not only the stack and the executable) in legacy mode which highly increases the ASLR effectiveness, otherwise the attackers may use these non-randomized areas. But also sensitive setuid/setgid applications are more secure because currently, attackers can disable the randomization of these applications by setting the ulimit stack to "unlimited". This is a very old and widely known trick to disable the ASLR in i386 which has been allowed for too long. Another trick used to disable the ASLR was to set the ADDR_NO_RANDOMIZE personality flag, but fortunately this doesn't work on setuid/setgid applications because there are security checks which clear security-relevant flags. This patch always randomizes the mmap_legacy_base address, removing the possibility to disable the ASLR by setting the stack to "unlimited". Signed-off-by: Hector Marco-Gisbert Acked-by: Ismael Ripoll Ripoll Acked-by: Kees Cook Acked-by: Arjan van de Ven Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: kees Cook Link: http://lkml.kernel.org/r/1457639460-5242-1-git-send-email-hecmargi@upv.es Signed-off-by: Ingo Molnar Cc: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/mmap.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 844b06d67df4d..307f60ecfc6de 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -93,18 +93,6 @@ static unsigned long mmap_base(unsigned long rnd) return PAGE_ALIGN(TASK_SIZE - gap - rnd); } -/* - * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 - * does, but not when emulating X86_32 - */ -static unsigned long mmap_legacy_base(unsigned long rnd) -{ - if (mmap_is_ia32()) - return TASK_UNMAPPED_BASE; - else - return TASK_UNMAPPED_BASE + rnd; -} - /* * This function, called very early during the creation of a new * process VM image, sets up 
which VM layout function to use: @@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - mm->mmap_legacy_base = mmap_legacy_base(random_factor); + mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; From 3088903a55f218c0d3758de086ede3901b8711b0 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 9 Jun 2016 16:56:28 +0300 Subject: [PATCH 083/118] i2c: i801: Allow ACPI SystemIO OpRegion to conflict with PCI BAR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a7ae81952cdab56a1277bd2f9ed7284c0f575120 upstream. On many Intel systems the BIOS declares a SystemIO OpRegion below the SMBus PCI device as can be seen in ACPI DSDT table from Lenovo Yoga 900: Device (SBUS) { OperationRegion (SMBI, SystemIO, (SBAR << 0x05), 0x10) Field (SMBI, ByteAcc, NoLock, Preserve) { HSTS, 8, Offset (0x02), HCON, 8, HCOM, 8, TXSA, 8, DAT0, 8, DAT1, 8, HBDR, 8, PECR, 8, RXSA, 8, SDAT, 16 } There are also a bunch of AML methods that the BIOS can use to access these fields. On most of the systems in question the AML methods accessing the SMBI OpRegion are never used. Now, because of this SMBI OpRegion many systems fail to load the SMBus driver with an error looking like one below: ACPI Warning: SystemIO range 0x0000000000003040-0x000000000000305F conflicts with OpRegion 0x0000000000003040-0x000000000000304F (\_SB.PCI0.SBUS.SMBI) (20160108/utaddress-255) ACPI: If an ACPI driver is available for this device, you should use it instead of the native driver The reason is that this SMBI OpRegion conflicts with the PCI BAR used by the SMBus driver. It turns out that we can install a custom SystemIO address space handler for the SMBus device to intercept all accesses through that OpRegion. This allows us to share the PCI BAR with the AML code if it for some reason is using it. 
We do not expect that this OpRegion handler will ever be called but if it is we print a warning and prevent all access from the SMBus driver itself. Link: https://bugzilla.kernel.org/show_bug.cgi?id=110041 Reported-by: Andy Lutomirski Reported-by: Pali Rohár Suggested-by: Rafael J. Wysocki Signed-off-by: Mika Westerberg Acked-by: Rafael J. Wysocki Reviewed-by: Jean Delvare Reviewed-by: Benjamin Tissoires Tested-by: Pali Rohár Tested-by: Jean Delvare Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-i801.c | 103 +++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 9 deletions(-) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 27fa0cb09538c..85f39cc3e2765 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -244,6 +244,13 @@ struct i801_priv { struct platform_device *mux_pdev; #endif struct platform_device *tco_pdev; + + /* + * If set to true the host controller registers are reserved for + * ACPI AML use. Protected by acpi_lock. 
+ */ + bool acpi_reserved; + struct mutex acpi_lock; }; #define FEATURE_SMBUS_PEC (1 << 0) @@ -714,9 +721,15 @@ static s32 i801_access(struct i2c_adapter *adap, u16 addr, { int hwpec; int block = 0; - int ret, xact = 0; + int ret = 0, xact = 0; struct i801_priv *priv = i2c_get_adapdata(adap); + mutex_lock(&priv->acpi_lock); + if (priv->acpi_reserved) { + mutex_unlock(&priv->acpi_lock); + return -EBUSY; + } + hwpec = (priv->features & FEATURE_SMBUS_PEC) && (flags & I2C_CLIENT_PEC) && size != I2C_SMBUS_QUICK && size != I2C_SMBUS_I2C_BLOCK_DATA; @@ -773,7 +786,8 @@ static s32 i801_access(struct i2c_adapter *adap, u16 addr, default: dev_err(&priv->pci_dev->dev, "Unsupported transaction %d\n", size); - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + goto out; } if (hwpec) /* enable/disable hardware PEC */ @@ -796,11 +810,11 @@ static s32 i801_access(struct i2c_adapter *adap, u16 addr, ~(SMBAUXCTL_CRC | SMBAUXCTL_E32B), SMBAUXCTL(priv)); if (block) - return ret; + goto out; if (ret) - return ret; + goto out; if ((read_write == I2C_SMBUS_WRITE) || (xact == I801_QUICK)) - return 0; + goto out; switch (xact & 0x7f) { case I801_BYTE: /* Result put in SMBHSTDAT0 */ @@ -812,7 +826,10 @@ static s32 i801_access(struct i2c_adapter *adap, u16 addr, (inb_p(SMBHSTDAT1(priv)) << 8); break; } - return 0; + +out: + mutex_unlock(&priv->acpi_lock); + return ret; } @@ -1249,6 +1266,72 @@ static void i801_add_tco(struct i801_priv *priv) priv->tco_pdev = pdev; } +#ifdef CONFIG_ACPI +static acpi_status +i801_acpi_io_handler(u32 function, acpi_physical_address address, u32 bits, + u64 *value, void *handler_context, void *region_context) +{ + struct i801_priv *priv = handler_context; + struct pci_dev *pdev = priv->pci_dev; + acpi_status status; + + /* + * Once BIOS AML code touches the OpRegion we warn and inhibit any + * further access from the driver itself. This device is now owned + * by the system firmware. 
+ */ + mutex_lock(&priv->acpi_lock); + + if (!priv->acpi_reserved) { + priv->acpi_reserved = true; + + dev_warn(&pdev->dev, "BIOS is accessing SMBus registers\n"); + dev_warn(&pdev->dev, "Driver SMBus register access inhibited\n"); + } + + if ((function & ACPI_IO_MASK) == ACPI_READ) + status = acpi_os_read_port(address, (u32 *)value, bits); + else + status = acpi_os_write_port(address, (u32)*value, bits); + + mutex_unlock(&priv->acpi_lock); + + return status; +} + +static int i801_acpi_probe(struct i801_priv *priv) +{ + struct acpi_device *adev; + acpi_status status; + + adev = ACPI_COMPANION(&priv->pci_dev->dev); + if (adev) { + status = acpi_install_address_space_handler(adev->handle, + ACPI_ADR_SPACE_SYSTEM_IO, i801_acpi_io_handler, + NULL, priv); + if (ACPI_SUCCESS(status)) + return 0; + } + + return acpi_check_resource_conflict(&priv->pci_dev->resource[SMBBAR]); +} + +static void i801_acpi_remove(struct i801_priv *priv) +{ + struct acpi_device *adev; + + adev = ACPI_COMPANION(&priv->pci_dev->dev); + if (!adev) + return; + + acpi_remove_address_space_handler(adev->handle, + ACPI_ADR_SPACE_SYSTEM_IO, i801_acpi_io_handler); +} +#else +static inline int i801_acpi_probe(struct i801_priv *priv) { return 0; } +static inline void i801_acpi_remove(struct i801_priv *priv) { } +#endif + static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) { unsigned char temp; @@ -1266,6 +1349,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) priv->adapter.dev.parent = &dev->dev; ACPI_COMPANION_SET(&priv->adapter.dev, ACPI_COMPANION(&dev->dev)); priv->adapter.retries = 3; + mutex_init(&priv->acpi_lock); priv->pci_dev = dev; switch (dev->device) { @@ -1328,10 +1412,8 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) return -ENODEV; } - err = acpi_check_resource_conflict(&dev->resource[SMBBAR]); - if (err) { + if (i801_acpi_probe(priv)) return -ENODEV; - } err = pcim_iomap_regions(dev, 1 << SMBBAR, 
dev_driver_string(&dev->dev)); @@ -1340,6 +1422,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) "Failed to request SMBus region 0x%lx-0x%Lx\n", priv->smba, (unsigned long long)pci_resource_end(dev, SMBBAR)); + i801_acpi_remove(priv); return err; } @@ -1404,6 +1487,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) err = i2c_add_adapter(&priv->adapter); if (err) { dev_err(&dev->dev, "Failed to add SMBus adapter\n"); + i801_acpi_remove(priv); return err; } @@ -1422,6 +1506,7 @@ static void i801_remove(struct pci_dev *dev) i801_del_mux(priv); i2c_del_adapter(&priv->adapter); + i801_acpi_remove(priv); pci_write_config_byte(dev, SMBHSTCFG, priv->original_hstcfg); platform_device_unregister(priv->tco_pdev); From 66e5d7b47c864f1821041f77752930ec3b8dfc22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Mon, 7 Mar 2016 21:15:36 +0100 Subject: [PATCH 084/118] cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4d06dd537f95683aba3651098ae288b7cbff8274 upstream. usbnet_link_change will call schedule_work and should be avoided if bind is failing. Otherwise we will end up with scheduled work referring to a netdev which has gone away. Instead of making the call conditional, we can just defer it to usbnet_probe, using the driver_info flag made for this purpose. Fixes: 8a34b0ae8778 ("usbnet: cdc_ncm: apply usbnet_link_change") Reported-by: Andrey Konovalov Suggested-by: Linus Torvalds Signed-off-by: Bjørn Mork Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/cdc_ncm.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index a790d5f90b837..e0e94b855bbe6 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -952,8 +952,6 @@ EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) { - int ret; - /* MBIM backwards compatible function? */ if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM) return -ENODEV; @@ -962,16 +960,7 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) * Additionally, generic NCM devices are assumed to accept arbitrarily * placed NDP. */ - ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); - - /* - * We should get an event when network connection is "connected" or - * "disconnected". Set network connection in "disconnected" state - * (carrier is OFF) during attach, so the IP network stack does not - * start IPv6 negotiation and more. 
- */ - usbnet_link_change(dev, 0, 0); - return ret; + return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); } static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max) @@ -1554,7 +1543,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", - .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET, + .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET + | FLAG_LINK_INTR, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1567,7 +1557,7 @@ static const struct driver_info cdc_ncm_info = { static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN, + | FLAG_LINK_INTR | FLAG_WWAN, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1580,7 +1570,7 @@ static const struct driver_info wwan_info = { static const struct driver_info wwan_noarp_info = { .description = "Mobile Broadband Network Device (NO ARP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN | FLAG_NOARP, + | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, From 0107ea0e0928c8a077f0f912c809f2b86fa7496c Mon Sep 17 00:00:00 2001 From: Dave Weinstein Date: Thu, 28 Jul 2016 11:55:41 -0700 Subject: [PATCH 085/118] arm: oabi compat: add missing access checks commit 7de249964f5578e67b99699c5f0b405738d820a2 upstream. Add access checks to sys_oabi_epoll_wait() and sys_oabi_semtimedop(). This fixes CVE-2016-3857, a local privilege escalation under CONFIG_OABI_COMPAT. 
Reported-by: Chiachih Wu Reviewed-by: Kees Cook Reviewed-by: Nicolas Pitre Signed-off-by: Dave Weinstein Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/sys_oabi-compat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 087acb569b63a..5f221acd21aeb 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -279,8 +279,12 @@ asmlinkage long sys_oabi_epoll_wait(int epfd, mm_segment_t fs; long ret, err, i; - if (maxevents <= 0 || maxevents > (INT_MAX/sizeof(struct epoll_event))) + if (maxevents <= 0 || + maxevents > (INT_MAX/sizeof(*kbuf)) || + maxevents > (INT_MAX/sizeof(*events))) return -EINVAL; + if (!access_ok(VERIFY_WRITE, events, sizeof(*events) * maxevents)) + return -EFAULT; kbuf = kmalloc(sizeof(*kbuf) * maxevents, GFP_KERNEL); if (!kbuf) return -ENOMEM; @@ -317,6 +321,8 @@ asmlinkage long sys_oabi_semtimedop(int semid, if (nsops < 1 || nsops > SEMOPM) return -EINVAL; + if (!access_ok(VERIFY_READ, tsops, sizeof(*tsops) * nsops)) + return -EFAULT; sops = kmalloc(sizeof(*sops) * nsops, GFP_KERNEL); if (!sops) return -ENOMEM; From cca36a7dad58fc7a95944319e48162194ead6f00 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Jul 2016 11:43:37 +0100 Subject: [PATCH 086/118] KEYS: 64-bit MIPS needs to use compat_sys_keyctl for 32-bit userspace commit 20f06ed9f61a185c6dabd662c310bed6189470df upstream. MIPS64 needs to use compat_sys_keyctl for 32-bit userspace rather than calling sys_keyctl. The latter will work in a lot of cases, thereby hiding the issue. 
Reported-by: Stephan Mueller Signed-off-by: David Howells Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: keyrings@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/13832/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 5a69eb48d0a8c..ee93d5fe61d70 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -344,7 +344,7 @@ EXPORT(sysn32_call_table) PTR sys_ni_syscall /* available, was setaltroot */ PTR sys_add_key PTR sys_request_key - PTR sys_keyctl /* 6245 */ + PTR compat_sys_keyctl /* 6245 */ PTR sys_set_thread_area PTR sys_inotify_init PTR sys_inotify_add_watch diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index e4b6d7c978226..b77052ec6fb21 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -500,7 +500,7 @@ EXPORT(sys32_call_table) PTR sys_ni_syscall /* available, was setaltroot */ PTR sys_add_key /* 4280 */ PTR sys_request_key - PTR sys_keyctl + PTR compat_sys_keyctl PTR sys_set_thread_area PTR sys_inotify_init PTR sys_inotify_add_watch /* 4285 */ From 4cf8f0b0b3e635d8a17f19ef3a183c4c95a4af39 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Mon, 13 Jun 2016 17:03:48 +0200 Subject: [PATCH 087/118] Revert "s390/kdump: Clear subchannel ID to signal non-CCW/SCSI IPL" commit 5419447e2142d6ed68c9f5c1a28630b3a290a845 upstream. This reverts commit 852ffd0f4e23248b47531058e531066a988434b5. There are use cases where an intermediate boot kernel (1) uses kexec to boot the final production kernel (2). For this scenario we should provide the original boot information to the production kernel (2). Therefore clearing the boot information during kexec() should not be done. 
Reported-by: Steffen Maier Signed-off-by: Michael Holzheu Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/ipl.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index b1f0a90f933bb..42570d8fb265f 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2070,13 +2070,6 @@ void s390_reset_system(void (*fn_pre)(void), S390_lowcore.program_new_psw.addr = PSW_ADDR_AMODE | (unsigned long) s390_base_pgm_handler; - /* - * Clear subchannel ID and number to signal new kernel that no CCW or - * SCSI IPL has been done (for kexec and kdump) - */ - S390_lowcore.subchannel_id = 0; - S390_lowcore.subchannel_nr = 0; - /* Store status at absolute zero */ store_status(); From 6090bfb684a9985e29c3c0aae52a4b93f967e90f Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 18 Nov 2015 11:41:05 -0800 Subject: [PATCH 088/118] apparmor: fix ref count leak when profile sha1 hash is read commit 0b938a2e2cf0b0a2c8bac9769111545aff0fee97 upstream. Signed-off-by: John Johansen Acked-by: Seth Arnold Signed-off-by: Greg Kroah-Hartman --- security/apparmor/apparmorfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index ad4fa49ad1db2..9068369f8a1bc 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -331,6 +331,7 @@ static int aa_fs_seq_hash_show(struct seq_file *seq, void *v) seq_printf(seq, "%.2x", profile->hash[i]); seq_puts(seq, "\n"); } + aa_put_profile(profile); return 0; } From 93f84c8864658c740d205624ab9d23ceca235e46 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 3 Jul 2016 17:01:26 -0400 Subject: [PATCH 089/118] random: strengthen input validation for RNDADDTOENTCNT commit 86a574de4590ffe6fd3f3ca34cdcf655a78e36ec upstream. Don't allow RNDADDTOENTCNT or RNDADDENTROPY to accept a negative entropy value. 
It doesn't make any sense to subtract from the entropy counter, and it can trigger a warning: random: negative entropy/overflow: pool input count -40000 ------------[ cut here ]------------ WARNING: CPU: 3 PID: 6828 at drivers/char/random.c:670[< none >] credit_entropy_bits+0x21e/0xad0 drivers/char/random.c:670 Modules linked in: CPU: 3 PID: 6828 Comm: a.out Not tainted 4.7.0-rc4+ #4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 ffffffff880b58e0 ffff88005dd9fcb0 ffffffff82cc838f ffffffff87158b40 fffffbfff1016b1c 0000000000000000 0000000000000000 ffffffff87158b40 ffffffff83283dae 0000000000000009 ffff88005dd9fcf8 ffffffff8136d27f Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x12e/0x18f lib/dump_stack.c:51 [] __warn+0x19f/0x1e0 kernel/panic.c:516 [] warn_slowpath_null+0x2c/0x40 kernel/panic.c:551 [] credit_entropy_bits+0x21e/0xad0 drivers/char/random.c:670 [< inline >] credit_entropy_bits_safe drivers/char/random.c:734 [] random_ioctl+0x21d/0x250 drivers/char/random.c:1546 [< inline >] vfs_ioctl fs/ioctl.c:43 [] do_vfs_ioctl+0x18c/0xff0 fs/ioctl.c:674 [< inline >] SYSC_ioctl fs/ioctl.c:689 [] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:680 [] entry_SYSCALL_64_fastpath+0x23/0xc1 arch/x86/entry/entry_64.S:207 ---[ end trace 5d4902b2ba842f1f ]--- This was triggered using the test program: // autogenerated by syzkaller (http://github.com/google/syzkaller) int main() { int fd = open("/dev/random", O_RDWR); int val = -5000; ioctl(fd, RNDADDTOENTCNT, &val); return 0; } It's harmless in that (a) only root can trigger it, and (b) after complaining the code never does let the entropy count go negative, but it's better to simply not allow userspace to pass in a negative entropy value altogether.
Google-Bug-Id: #29575089 Reported-By: Dmitry Vyukov Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index d0da5d852d41e..0227b0465b404 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -722,15 +722,18 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits) } } -static void credit_entropy_bits_safe(struct entropy_store *r, int nbits) +static int credit_entropy_bits_safe(struct entropy_store *r, int nbits) { const int nbits_max = (int)(~0U >> (ENTROPY_SHIFT + 1)); + if (nbits < 0) + return -EINVAL; + /* Cap the value to avoid overflows */ nbits = min(nbits, nbits_max); - nbits = max(nbits, -nbits_max); credit_entropy_bits(r, nbits); + return 0; } /********************************************************************* @@ -1542,8 +1545,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return -EPERM; if (get_user(ent_count, p)) return -EFAULT; - credit_entropy_bits_safe(&input_pool, ent_count); - return 0; + return credit_entropy_bits_safe(&input_pool, ent_count); case RNDADDENTROPY: if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1557,8 +1559,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) size); if (retval < 0) return retval; - credit_entropy_bits_safe(&input_pool, ent_count); - return 0; + return credit_entropy_bits_safe(&input_pool, ent_count); case RNDZAPENTCNT: case RNDCLEARPOOL: /* From 5c7d0f49cf1492866fa619af4538f56938abe07d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2016 15:16:07 -0700 Subject: [PATCH 090/118] devpts: clean up interface to pty drivers commit 67245ff332064c01b760afa7a384ccda024bfd24 upstream. This gets rid of the horrible notion of having that struct inode *ptmx_inode be the linchpin of the interface between the pty code and devpts. 
By de-emphasizing the ptmx inode, a lot of things actually get cleaner, and we will have a much saner way forward. In particular, this will allow us to associate with any particular devpts instance at open-time, and not be artificially tied to one particular ptmx inode. The patch itself is actually fairly straightforward, and apart from some locking and return path cleanups it's pretty mechanical: - the interfaces that devpts exposes all take "struct pts_fs_info *" instead of "struct inode *ptmx_inode" now. NOTE! The "struct pts_fs_info" thing is a completely opaque structure as far as the pty driver is concerned: it's still declared entirely internally to devpts. So the pty code can't actually access it in any way, just pass it as a "cookie" to the devpts code. - the "look up the pts fs info" is now a single clear operation, that also does the reference count increment on the pts superblock. So "devpts_add/del_ref()" is gone, and replaced by a "lookup and get ref" operation (devpts_get_ref(inode)), along with a "put ref" op (devpts_put_ref()). - the pty master "tty->driver_data" field now contains the pts_fs_info, not the ptmx inode. - because we don't care about the ptmx inode any more as some kind of base index, the ref counting can now drop the inode games - it just gets the ref on the superblock. - the pts_fs_info now has a back-pointer to the super_block. That's so that we can easily look up the information we actually need. Although quite often, the pts fs info was actually all we wanted, and not having to look it up based on some magical inode makes things more straightforward. In particular, now that "devpts_get_ref(inode)" operation should really be the *only* place we need to look up what devpts instance we're associated with, and we do it exactly once, at ptmx_open() time. 
The other side of this is that one ptmx node could now be associated with multiple different devpts instances - you could have a single /dev/ptmx node, and then have multiple mount namespaces with their own instances of devpts mounted on /dev/pts/. And that's all perfectly sane in a model where we just look up the pts instance at open time. This will eventually allow us to get rid of our odd single-vs-multiple pts instance model, but this patch in itself changes no semantics, only an internal binding model. Cc: Eric Biederman Cc: Peter Anvin Cc: Andy Lutomirski Cc: Al Viro Cc: Peter Hurley Cc: Serge Hallyn Cc: Willy Tarreau Cc: Aurelien Jarno Cc: Alan Cox Cc: Jann Horn Cc: Greg KH Cc: Jiri Slaby Cc: Florian Weimer Signed-off-by: Linus Torvalds Cc: Francesco Ruggeri Cc: "Herton R. Krzesinski" Signed-off-by: Greg Kroah-Hartman --- drivers/tty/pty.c | 63 +++++++++++++++++++-------------------- fs/devpts/inode.c | 49 +++++++++++++++--------------- include/linux/devpts_fs.h | 34 +++++++-------------- 3 files changed, 64 insertions(+), 82 deletions(-) diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 7865228f664f9..807d801456864 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -679,14 +679,14 @@ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) /* this is called once with whichever end is closed last */ static void pty_unix98_shutdown(struct tty_struct *tty) { - struct inode *ptmx_inode; + struct pts_fs_info *fsi; if (tty->driver->subtype == PTY_TYPE_MASTER) - ptmx_inode = tty->driver_data; + fsi = tty->driver_data; else - ptmx_inode = tty->link->driver_data; - devpts_kill_index(ptmx_inode, tty->index); - devpts_del_ref(ptmx_inode); + fsi = tty->link->driver_data; + devpts_kill_index(fsi, tty->index); + devpts_put_ref(fsi); } static const struct tty_operations ptm_unix98_ops = { @@ -738,6 +738,7 @@ static const struct tty_operations pty_unix98_ops = { static int ptmx_open(struct inode *inode, struct file *filp) { + struct 
pts_fs_info *fsi; struct tty_struct *tty; struct inode *slave_inode; int retval; @@ -752,47 +753,41 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; + fsi = devpts_get_ref(inode, filp); + retval = -ENODEV; + if (!fsi) + goto out_free_file; + /* find a device that is not in use. */ mutex_lock(&devpts_mutex); - index = devpts_new_index(inode); - if (index < 0) { - retval = index; - mutex_unlock(&devpts_mutex); - goto err_file; - } - + index = devpts_new_index(fsi); mutex_unlock(&devpts_mutex); - mutex_lock(&tty_mutex); - tty = tty_init_dev(ptm_driver, index); + retval = index; + if (index < 0) + goto out_put_ref; - if (IS_ERR(tty)) { - retval = PTR_ERR(tty); - goto out; - } + mutex_lock(&tty_mutex); + tty = tty_init_dev(ptm_driver, index); /* The tty returned here is locked so we can safely drop the mutex */ mutex_unlock(&tty_mutex); - set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ - tty->driver_data = inode; + retval = PTR_ERR(tty); + if (IS_ERR(tty)) + goto out; /* - * In the case where all references to ptmx inode are dropped and we - * still have /dev/tty opened pointing to the master/slave pair (ptmx - * is closed/released before /dev/tty), we must make sure that the inode - * is still valid when we call the final pty_unix98_shutdown, thus we - * hold an additional reference to the ptmx inode. For the same /dev/tty - * last close case, we also need to make sure the super_block isn't - * destroyed (devpts instance unmounted), before /dev/tty is closed and - * on its release devpts_kill_index is called. 
+ * From here on out, the tty is "live", and the index and + * fsi will be killed/put by the tty_release() */ - devpts_add_ref(inode); + set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ + tty->driver_data = fsi; tty_add_file(tty, filp); - slave_inode = devpts_pty_new(inode, + slave_inode = devpts_pty_new(fsi, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index), index, tty->link); if (IS_ERR(slave_inode)) { @@ -811,12 +806,14 @@ static int ptmx_open(struct inode *inode, struct file *filp) return 0; err_release: tty_unlock(tty); + // This will also put-ref the fsi tty_release(inode, filp); return retval; out: - mutex_unlock(&tty_mutex); - devpts_kill_index(inode, index); -err_file: + devpts_kill_index(fsi, index); +out_put_ref: + devpts_put_ref(fsi); +out_free_file: tty_free_file(filp); return retval; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 706de324f2a61..c82edb0491170 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -128,6 +128,7 @@ static const match_table_t tokens = { struct pts_fs_info { struct ida allocated_ptys; struct pts_mount_opts mount_opts; + struct super_block *sb; struct dentry *ptmx_dentry; }; @@ -358,7 +359,7 @@ static const struct super_operations devpts_sops = { .show_options = devpts_show_options, }; -static void *new_pts_fs_info(void) +static void *new_pts_fs_info(struct super_block *sb) { struct pts_fs_info *fsi; @@ -369,6 +370,7 @@ static void *new_pts_fs_info(void) ida_init(&fsi->allocated_ptys); fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->sb = sb; return fsi; } @@ -384,7 +386,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; - s->s_fs_info = new_pts_fs_info(); + s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; @@ -524,17 +526,14 @@ static struct file_system_type devpts_fs_type = { * to the System V naming convention */ -int devpts_new_index(struct inode *ptmx_inode) +int 
devpts_new_index(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi; int index; int ida_ret; - if (!sb) + if (!fsi) return -ENODEV; - fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -564,11 +563,8 @@ int devpts_new_index(struct inode *ptmx_inode) return index; } -void devpts_kill_index(struct inode *ptmx_inode, int idx) +void devpts_kill_index(struct pts_fs_info *fsi, int idx) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); - mutex_lock(&allocated_ptys_lock); ida_remove(&fsi->allocated_ptys, idx); pty_count--; @@ -578,21 +574,25 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) /* * pty code needs to hold extra references in case of last /dev/tty close */ - -void devpts_add_ref(struct inode *ptmx_inode) +struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; + struct pts_fs_info *fsi; + + sb = pts_sb_from_inode(ptmx_inode); + if (!sb) + return NULL; + fsi = DEVPTS_SB(sb); + if (!fsi) + return NULL; atomic_inc(&sb->s_active); - ihold(ptmx_inode); + return fsi; } -void devpts_del_ref(struct inode *ptmx_inode) +void devpts_put_ref(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - - iput(ptmx_inode); - deactivate_super(sb); + deactivate_super(fsi->sb); } /** @@ -604,22 +604,21 @@ void devpts_del_ref(struct inode *ptmx_inode) * * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. 
*/ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, +struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index, void *priv) { struct dentry *dentry; - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; struct inode *inode; struct dentry *root; - struct pts_fs_info *fsi; struct pts_mount_opts *opts; char s[12]; - if (!sb) + if (!fsi) return ERR_PTR(-ENODEV); + sb = fsi->sb; root = sb->s_root; - fsi = DEVPTS_SB(sb); opts = &fsi->mount_opts; inode = new_inode(sb); diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index e0ee0b3000b2d..358a4db72a27d 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -15,38 +15,24 @@ #include +struct pts_fs_info; + #ifdef CONFIG_UNIX98_PTYS -int devpts_new_index(struct inode *ptmx_inode); -void devpts_kill_index(struct inode *ptmx_inode, int idx); -void devpts_add_ref(struct inode *ptmx_inode); -void devpts_del_ref(struct inode *ptmx_inode); +/* Look up a pts fs info and get a ref to it */ +struct pts_fs_info *devpts_get_ref(struct inode *, struct file *); +void devpts_put_ref(struct pts_fs_info *); + +int devpts_new_index(struct pts_fs_info *); +void devpts_kill_index(struct pts_fs_info *, int); + /* mknod in devpts */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, - void *priv); +struct inode *devpts_pty_new(struct pts_fs_info *, dev_t, int, void *); /* get private structure */ void *devpts_get_priv(struct inode *pts_inode); /* unlink */ void devpts_pty_kill(struct inode *inode); -#else - -/* Dummy stubs in the no-pty case */ -static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; } -static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { } -static inline void devpts_add_ref(struct inode *ptmx_inode) { } -static inline void devpts_del_ref(struct inode *ptmx_inode) { } -static inline struct inode *devpts_pty_new(struct inode *ptmx_inode, - 
dev_t device, int index, void *priv) -{ - return ERR_PTR(-EINVAL); -} -static inline void *devpts_get_priv(struct inode *pts_inode) -{ - return NULL; -} -static inline void devpts_pty_kill(struct inode *inode) { } - #endif From 8f5b8210fff0e8469c056b82490d786bc6bde92a Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:41:57 -0600 Subject: [PATCH 091/118] x86/mm/pat: Add support of non-default PAT MSR setting commit 02f037d641dc6672be5cfe7875a48ab99b95b154 upstream. In preparation for fixing a regression caused by: 9cd25aac1f44 ("x86/mm/pat: Emulate PAT when it is disabled")' ... PAT needs to support a case that PAT MSR is initialized with a non-default value. When pat_init() is called and PAT is disabled, it initializes the PAT table with the BIOS default value. Xen, however, sets PAT MSR with a non-default value to enable WC. This causes inconsistency between the PAT table and PAT MSR when PAT is set to disable on Xen. Change pat_init() to handle the PAT disable cases properly. Add init_cache_modes() to handle two cases when PAT is set to disable. 1. CPU supports PAT: Set PAT table to be consistent with PAT MSR. 2. CPU does not support PAT: Set PAT table to be consistent with PWT and PCD bits in a PTE. Note, __init_cache_modes(), renamed from pat_init_cache_modes(), will be changed to a static function in a later patch. Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-2-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pat.h | 2 +- arch/x86/mm/pat.c | 73 +++++++++++++++++++++++++++----------- arch/x86/xen/enlighten.c | 2 +- 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index ca6c228d5e628..97ea55bc2b54e 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -6,7 +6,7 @@ bool pat_enabled(void); extern void pat_init(void); -void pat_init_cache_modes(u64); +void __init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 188e3e07eeeba..86066ffb014ae 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -180,7 +180,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. 
*/ -void pat_init_cache_modes(u64 pat) +void __init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; @@ -206,9 +206,6 @@ static void pat_bsp_init(u64 pat) return; } - if (!pat_enabled()) - goto done; - rdmsrl(MSR_IA32_CR_PAT, tmp_pat); if (!tmp_pat) { pat_disable("PAT MSR is 0, disabled."); @@ -217,15 +214,11 @@ static void pat_bsp_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); -done: - pat_init_cache_modes(pat); + __init_cache_modes(pat); } static void pat_ap_init(u64 pat) { - if (!pat_enabled()) - return; - if (!cpu_has_pat) { /* * If this happens we are on a secondary CPU, but switched to @@ -237,18 +230,32 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -void pat_init(void) +static void init_cache_modes(void) { - u64 pat; - struct cpuinfo_x86 *c = &boot_cpu_data; + u64 pat = 0; + static int init_cm_done; - if (!pat_enabled()) { + if (init_cm_done) + return; + + if (boot_cpu_has(X86_FEATURE_PAT)) { + /* + * CPU supports PAT. Set PAT table to be consistent with + * PAT MSR. This case supports "nopat" boot option, and + * virtual machine environments which support PAT without + * MTRRs. In specific, Xen has unique setup to PAT MSR. + * + * If PAT MSR returns 0, it is considered invalid and emulates + * as No PAT. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + } + + if (!pat) { /* * No PAT. Emulate the PAT table that corresponds to the two - * cache bits, PWT (Write Through) and PCD (Cache Disable). This - * setup is the same as the BIOS default setup when the system - * has PAT but the "nopat" boot option has been specified. This - * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * cache bits, PWT (Write Through) and PCD (Cache Disable). + * This setup is also the same as the BIOS default setup. 
* * PTE encoding: * @@ -265,10 +272,36 @@ void pat_init(void) */ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + } + + __init_cache_modes(pat); + + init_cm_done = 1; +} + +/** + * pat_init - Initialize PAT MSR and PAT table + * + * This function initializes PAT MSR and PAT table with an OS-defined value + * to enable additional cache attributes, WC and WT. + * + * This function must be called on all CPUs using the specific sequence of + * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this + * procedure for PAT. + */ +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!pat_enabled()) { + init_cache_modes(); + return; + } - } else if ((c->x86_vendor == X86_VENDOR_INTEL) && - (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || - ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { /* * PAT support with the lower four entries. Intel Pentium 2, * 3, M, and 4 are affected by PAT errata, which makes the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index beab8c706ac95..cf8d1bcabc56e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1632,7 +1632,7 @@ asmlinkage __visible void __init xen_start_kernel(void) * configuration. */ rdmsrl(MSR_IA32_CR_PAT, pat); - pat_init_cache_modes(pat); + __init_cache_modes(pat); /* keep using Xen gdt for now; no urgent need to change it */ From d50e8b108ef8980bd193de587d984e986be2ecc1 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:41:58 -0600 Subject: [PATCH 092/118] x86/mm/pat: Add pat_disable() interface commit 224bb1e5d67ba0f2872c98002d6a6f991ac6fd4a upstream. In preparation for fixing a regression caused by: 9cd25aac1f44 ("x86/mm/pat: Emulate PAT when it is disabled") ... 
PAT needs to provide an interface that prevents the OS from initializing the PAT MSR. PAT MSR initialization must be done on all CPUs using the specific sequence of operations defined in the Intel SDM. This requires MTRRs to be enabled since pat_init() is called as part of MTRR init from mtrr_rendezvous_handler(). Make pat_disable() as the interface that prevents the OS from initializing the PAT MSR. MTRR will call this interface when it cannot provide the SDM-defined sequence to initialize PAT. This also assures that pat_disable() called from pat_bsp_init() will set the PAT table properly when CPU does not support PAT. Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Robert Elliott Cc: Toshi Kani Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-3-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pat.h | 1 + arch/x86/mm/pat.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 97ea55bc2b54e..0ad356c066eff 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -5,6 +5,7 @@ #include bool pat_enabled(void); +void pat_disable(const char *reason); extern void pat_init(void); void __init_cache_modes(u64); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 86066ffb014ae..a10dd4fcd5384 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -39,11 +39,22 @@ static bool boot_cpu_done; static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); +static void init_cache_modes(void); -static inline void pat_disable(const char *reason) +void pat_disable(const char *reason) { 
+ if (!__pat_enabled) + return; + + if (boot_cpu_done) { + WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); + return; + } + __pat_enabled = 0; pr_info("x86/PAT: %s\n", reason); + + init_cache_modes(); } static int __init nopat(char *str) From 32c854288949a34f4dc08655d0c4b0294916e6c0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:41:59 -0600 Subject: [PATCH 093/118] x86/mm/pat: Replace cpu_has_pat with boot_cpu_has() commit d63dcf49cf5ae5605f4d14229e3888e104f294b1 upstream. Borislav Petkov suggested: > Please use on init paths boot_cpu_has(X86_FEATURE_PAT) and on fast > paths static_cpu_has(X86_FEATURE_PAT). No more of that cpu_has_XXX > ugliness. Replace the use of cpu_has_pat on init paths with boot_cpu_has(). Suggested-by: Borislav Petkov Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Robert Elliott Cc: Toshi Kani Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-4-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/pat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index a10dd4fcd5384..869bb3f03a2ce 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -212,7 +212,7 @@ static void pat_bsp_init(u64 pat) { u64 tmp_pat; - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { pat_disable("PAT not supported by CPU."); return; } @@ -230,7 +230,7 @@ static void pat_bsp_init(u64 pat) static void pat_ap_init(u64 pat) { - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. We have no way to undo PAT. 
From 594055cf63d2ed5b06387f91ce9505ae651fc38d Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:00 -0600 Subject: [PATCH 094/118] x86/mtrr: Fix Xorg crashes in Qemu sessions commit edfe63ec97ed8d4496225f7ba54c9ce4207c5431 upstream. A Xorg failure on qemu32 was reported as a regression [1] caused by commit 9cd25aac1f44 ("x86/mm/pat: Emulate PAT when it is disabled"). This patch fixes the Xorg crash. Negative effects of this regression were the following two failures [2] in Xorg on QEMU with QEMU CPU model "qemu32" (-cpu qemu32), which were triggered by the fact that its virtual CPU does not support MTRRs. #1. copy_process() failed in the check in reserve_pfn_range() copy_process copy_mm dup_mm dup_mmap copy_page_range track_pfn_copy reserve_pfn_range A WC map request was tracked as WC in memtype, which set a PTE as UC (pgprot) per __cachemode2pte_tbl[]. This led to this error in reserve_pfn_range() called from track_pfn_copy(), which obtained a pgprot from a PTE. It converts pgprot to page_cache_mode, which does not necessarily result in the original page_cache_mode since __cachemode2pte_tbl[] redirects multiple types to UC. #2. error path in copy_process() then hit WARN_ON_ONCE in untrack_pfn(). x86/PAT: Xorg:509 map pfn expected mapping type uncached-minus for [mem 0xfd000000-0xfdffffff], got write-combining Call Trace: dump_stack warn_slowpath_common ? untrack_pfn ? untrack_pfn warn_slowpath_null untrack_pfn ? __kunmap_atomic unmap_single_vma ? pagevec_move_tail_fn unmap_vmas exit_mmap mmput copy_process.part.47 _do_fork SyS_clone do_syscall_32_irqs_on entry_INT80_32 These negative effects are caused by two separate bugs, but they can be addressed in separate patches. Fixing the pat_init() issue described below addresses the root cause, and prevents Xorg from hitting these cases. When the CPU does not support MTRRs, MTRR does not call pat_init(), which leaves PAT enabled without initializing PAT.
This pat_init() issue is a long-standing issue, but manifested as issue #1 (and then hit issue #2) with the above-mentioned commit because the memtype now tracks cache attribute with 'page_cache_mode'. This pat_init() issue existed before the commit, but we used pgprot in memtype. Hence, we did not have issue #1 before. But WC request resulted in WT in effect because WC pgprot is actually WT when PAT is not initialized. This is not how it was designed to work. When PAT is set to disable properly, WC is converted to UC. The use of WT can result in a system crash if the target range does not support WT. Fortunately, nobody ran into such issue before. To fix this pat_init() issue, PAT code has been enhanced to provide pat_disable() interface. Call this interface when MTRRs are disabled. By setting PAT to disable properly, PAT bypasses the memtype check, and avoids issue #1. [1]: https://lkml.org/lkml/2016/3/3/828 [2]: https://lkml.org/lkml/2016/3/4/775 Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R.
Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-5-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mtrr.h | 6 +++++- arch/x86/kernel/cpu/mtrr/main.c | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index b94f6f64e23d0..dbff1456d2152 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -24,6 +24,7 @@ #define _ASM_X86_MTRR_H #include +#include /* @@ -83,9 +84,12 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } +static inline void mtrr_bp_init(void) +{ + pat_disable("MTRRs disabled, skipping PAT initialization too."); +} #define mtrr_ap_init() do {} while (0) -#define mtrr_bp_init() do {} while (0) #define set_mtrr_aps_delayed_init() do {} while (0) #define mtrr_aps_init() do {} while (0) #define mtrr_bp_restore() do {} while (0) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index f891b4750f04c..1b3417db125b2 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -759,8 +759,16 @@ void __init mtrr_bp_init(void) } } - if (!mtrr_enabled()) + if (!mtrr_enabled()) { pr_info("MTRR: Disabled\n"); + + /* + * PAT initialization relies on MTRR's rendezvous handler. + * Skip PAT init until the handler can initialize both + * features independently. 
+ */ + pat_disable("MTRRs disabled, skipping PAT initialization too."); + } } void mtrr_ap_init(void) From a23b299b4a7d0083a3bdb61d1586956f817e8961 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:01 -0600 Subject: [PATCH 095/118] x86/mtrr: Fix PAT init handling when MTRR is disabled commit ad025a73f0e9344ac73ffe1b74c184033e08e7d5 upstream. get_mtrr_state() calls pat_init() on BSP even if MTRR is disabled. This results in calling pat_init() on BSP only since APs do not call pat_init() when MTRR is disabled. This inconsistency between BSP and APs leads to undefined behavior. Make BSP's calling condition to pat_init() consistent with AP's, mtrr_ap_init() and mtrr_aps_init(). Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-6-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mtrr/generic.c | 24 ++++++++++++++---------- arch/x86/kernel/cpu/mtrr/main.c | 3 +++ arch/x86/kernel/cpu/mtrr/mtrr.h | 1 + 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3b533cf37c745..b5624fafa44a5 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -444,11 +444,24 @@ static void __init print_mtrr_state(void) pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } +/* PAT setup for BP. 
We need to go through sync steps here */ +void __init mtrr_bp_pat_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + pat_init(); + + post_set(); + local_irq_restore(flags); +} + /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; - unsigned long flags; unsigned lo, dummy; unsigned int i; @@ -481,15 +494,6 @@ bool __init get_mtrr_state(void) mtrr_state_set = 1; - /* PAT setup for BP. We need to go through sync steps here */ - local_irq_save(flags); - prepare_set(); - - pat_init(); - - post_set(); - local_irq_restore(flags); - return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 1b3417db125b2..fa77ac8291f03 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -752,6 +752,9 @@ void __init mtrr_bp_init(void) /* BIOS may override */ __mtrr_enabled = get_mtrr_state(); + if (mtrr_enabled()) + mtrr_bp_pat_init(); + if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 951884dcc4335..6c7ced07d16d1 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -52,6 +52,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); +void mtrr_bp_pat_init(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); From 26b340ea33f49af99449607c20c97fa3f499c5fa Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:02 -0600 Subject: [PATCH 096/118] x86/xen, pat: Remove PAT table init code from Xen commit 88ba281108ed0c25c9d292b48bd3f272fcb90dd0 upstream. Xen supports PAT without MTRRs for its guests. 
In order to enable WC attribute, it was necessary for xen_start_kernel() to call pat_init_cache_modes() to update PAT table before starting guest kernel. Now that the kernel initializes PAT table to the BIOS handoff state when MTRR is disabled, this Xen-specific PAT init code is no longer necessary. Delete it from xen_start_kernel(). Also change __init_cache_modes() to a static function since PAT table should not be tweaked by other modules. Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Acked-by: Juergen Gross Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-7-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pat.h | 1 - arch/x86/mm/pat.c | 2 +- arch/x86/xen/enlighten.c | 9 --------- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 0ad356c066eff..0b1ff4c1c14e7 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -7,7 +7,6 @@ bool pat_enabled(void); void pat_disable(const char *reason); extern void pat_init(void); -void __init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 869bb3f03a2ce..9222e6ae449af 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -191,7 +191,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. 
*/ -void __init_cache_modes(u64 pat) +static void __init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cf8d1bcabc56e..ffa41591bff92 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -74,7 +74,6 @@ #include #include #include -#include #include #ifdef CONFIG_ACPI @@ -1519,7 +1518,6 @@ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; - u64 pat; int rc; if (!xen_start_info) @@ -1627,13 +1625,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->nr_pages); xen_reserve_special_pages(); - /* - * Modify the cache mode translation tables to match Xen's PAT - * configuration. - */ - rdmsrl(MSR_IA32_CR_PAT, pat); - __init_cache_modes(pat); - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 From e270fdc5237154293d0b58d85ee6584742f98aeb Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:03 -0600 Subject: [PATCH 097/118] x86/pat: Document the PAT initialization sequence commit b6350c21cfe8aa9d65e189509a23c0ea4b8362c2 upstream. Update PAT documentation to describe how PAT is initialized under various configurations. Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-8-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- Documentation/x86/pat.txt | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index 54944c71b819b..2a4ee6302122f 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -196,3 +196,35 @@ Another, more verbose way of getting PAT related debug messages is with "debugpat" boot parameter. With this parameter, various debug messages are printed to dmesg log. +PAT Initialization +------------------ + +The following table describes how PAT is initialized under various +configurations. The PAT MSR must be updated by Linux in order to support WC +and WT attributes. Otherwise, the PAT MSR has the value programmed in it +by the firmware. Note, Xen enables WC attribute in the PAT MSR for guests. 
+ + MTRR PAT Call Sequence PAT State PAT MSR + ========================================================= + E E MTRR -> PAT init Enabled OS + E D MTRR -> PAT init Disabled - + D E MTRR -> PAT disable Disabled BIOS + D D MTRR -> PAT disable Disabled - + - np/E PAT -> PAT disable Disabled BIOS + - np/D PAT -> PAT disable Disabled - + E !P/E MTRR -> PAT init Disabled BIOS + D !P/E MTRR -> PAT disable Disabled BIOS + !M !P/E MTRR stub -> PAT disable Disabled BIOS + + Legend + ------------------------------------------------ + E Feature enabled in CPU + D Feature disabled/unsupported in CPU + np "nopat" boot option specified + !P CONFIG_X86_PAT option unset + !M CONFIG_MTRR option unset + Enabled PAT state set to enabled + Disabled PAT state set to disabled + OS PAT initializes PAT MSR with OS setting + BIOS PAT keeps PAT MSR with BIOS setting + From fb93281fa225923c89cf94db59abfd98bba4709f Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 11 Apr 2016 13:36:00 -0600 Subject: [PATCH 098/118] x86/mm/pat: Fix BUG_ON() in mmap_mem() on QEMU/i386 commit 1886297ce0c8d563a08c8a8c4c0b97743e06cd37 upstream. The following BUG_ON() crash was reported on QEMU/i386: kernel BUG at arch/x86/mm/physaddr.c:79! Call Trace: phys_mem_access_prot_allowed mmap_mem ? mmap_region mmap_region do_mmap vm_mmap_pgoff SyS_mmap_pgoff do_int80_syscall_32 entry_INT80_32 after commit: edfe63ec97ed ("x86/mtrr: Fix Xorg crashes in Qemu sessions") PAT is now set to disabled state when MTRRs are disabled. Thus, reactivating the __pa(high_memory) check in phys_mem_access_prot_allowed(). When CONFIG_DEBUG_VIRTUAL is set, __pa() calls __phys_addr(), which in turn calls slow_virt_to_phys() for 'high_memory'. Because 'high_memory' is set to (the max direct mapped virt addr + 1), it is not a valid virtual address. Hence, slow_virt_to_phys() returns 0 and hit the BUG_ON. Using __pa_nodebug() instead of __pa() will fix this BUG_ON. 
However, this code block, originally written for Pentiums and earlier, is no longer adequate since a 32-bit Xen guest has MTRRs disabled and supports ZONE_HIGHMEM. In this setup, this code sets UC attribute for accessing RAM in high memory range. Delete this code block as it has been unused for a long time. Reported-by: kernel test robot Reviewed-by: Borislav Petkov Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: David Vrabel Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1460403360-25441-1-git-send-email-toshi.kani@hpe.com Link: https://lkml.org/lkml/2016/4/1/608 Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/pat.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 9222e6ae449af..6ad687d104caf 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -777,25 +777,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; -#ifdef CONFIG_X86_32 - /* - * On the PPro and successors, the MTRRs are used to set - * memory types for physical addresses outside main memory, - * so blindly setting UC or PWT on those pages is wrong. - * For Pentiums and earlier, the surround logic should disable - * caching for the high addresses through the KEN pin, but - * we maintain the tradition of paranoia in this code. 
- */ - if (!pat_enabled() && - !(boot_cpu_has(X86_FEATURE_MTRR) || - boot_cpu_has(X86_FEATURE_K6_MTRR) || - boot_cpu_has(X86_FEATURE_CYRIX_ARR) || - boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && - (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - pcm = _PAGE_CACHE_MODE_UC; - } -#endif - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | cachemode2protval(pcm)); return 1; From 821d5e6b8aed558a989514ea85fa14e097111cf0 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Mon, 8 Feb 2016 11:05:28 -0800 Subject: [PATCH 099/118] drm/i915: Pretend cursor is always on for ILK-style WM calculations (v2) commit e2e407dc093f530b771ee8bf8fe1be41e3cea8b3 upstream. Due to our lack of two-step watermark programming, our driver has historically pretended that the cursor plane is always on for the purpose of watermark calculations; this helps avoid serious flickering when the cursor turns off/on (e.g., when the user moves the mouse pointer to a different screen). That workaround was accidentally dropped as we started working toward atomic watermark updates. Since we still aren't quite there yet with two-stage updates, we need to resurrect the workaround and treat the cursor as always active. v2: Tweak cursor width calculations slightly to more closely match the logic we used before the atomic overhaul began. 
(Ville) Cc: simdev11@outlook.com Cc: manfred.kitzbichler@gmail.com Cc: drm-intel-fixes@lists.freedesktop.org Reported-by: simdev11@outlook.com Reported-by: manfred.kitzbichler@gmail.com Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93892 Fixes: 43d59eda1 ("drm/i915: Eliminate usage of plane_wm_parameters from ILK-style WM code (v2)") Signed-off-by: Matt Roper Link: http://patchwork.freedesktop.org/patch/msgid/1454479611-6804-1-git-send-email-matthew.d.roper@intel.com (cherry picked from commit b2435692dbb709d4c8ff3b2f2815c9b8423b72bb) Signed-off-by: Jani Nikula Link: http://patchwork.freedesktop.org/patch/msgid/1454958328-30129-1-git-send-email-matthew.d.roper@intel.com Tested-by: Jay Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_pm.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 62284e45d5312..eb434881ddbcb 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -1789,16 +1789,20 @@ static uint32_t ilk_compute_cur_wm(const struct intel_crtc_state *cstate, const struct intel_plane_state *pstate, uint32_t mem_value) { - int bpp = pstate->base.fb ? pstate->base.fb->bits_per_pixel / 8 : 0; + /* + * We treat the cursor plane as always-on for the purposes of watermark + * calculation. Until we have two-stage watermark programming merged, + * this is necessary to avoid flickering. + */ + int cpp = 4; + int width = pstate->visible ? pstate->base.crtc_w : 64; - if (!cstate->base.active || !pstate->visible) + if (!cstate->base.active) return 0; return ilk_wm_method2(ilk_pipe_pixel_rate(cstate), cstate->base.adjusted_mode.crtc_htotal, - drm_rect_width(&pstate->dst), - bpp, - mem_value); + width, cpp, mem_value); } /* Only for WM_LP. 
*/ From 3cde0e742e29d112aca58731a77d8a3aee386fb8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Jul 2016 11:42:38 +0100 Subject: [PATCH 100/118] x86/syscalls/64: Add compat_sys_keyctl for 32-bit userspace commit f7d665627e103e82d34306c7d3f6f46f387c0d8b upstream. x86_64 needs to use compat_sys_keyctl for 32-bit userspace rather than calling sys_keyctl(). The latter will work in a lot of cases, thereby hiding the issue. Reported-by: Stephan Mueller Tested-by: Stephan Mueller Signed-off-by: David Howells Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keyrings@vger.kernel.org Cc: linux-security-module@vger.kernel.org Link: http://lkml.kernel.org/r/146961615805.14395.5581949237156769439.stgit@warthog.procyon.org.uk Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index f17705e1332cc..e62f4401e7928 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -294,7 +294,7 @@ # 285 sys_setaltroot 286 i386 add_key sys_add_key 287 i386 request_key sys_request_key -288 i386 keyctl sys_keyctl +288 i386 keyctl sys_keyctl compat_sys_keyctl 289 i386 ioprio_set sys_ioprio_set 290 i386 ioprio_get sys_ioprio_get 291 i386 inotify_init sys_inotify_init From 9a95c0cfc6f21b9ac66269d4782ea5a0f58cdf91 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 29 Jul 2016 10:40:31 +0200 Subject: [PATCH 101/118] block: fix use-after-free in seq file commit 77da160530dd1dc94f6ae15a981f24e5f0021e84 upstream. 
I got a KASAN report of use-after-free: ================================================================== BUG: KASAN: use-after-free in klist_iter_exit+0x61/0x70 at addr ffff8800b6581508 Read of size 8 by task trinity-c1/315 ============================================================================= BUG kmalloc-32 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in disk_seqf_start+0x66/0x110 age=144 cpu=1 pid=315 ___slab_alloc+0x4f1/0x520 __slab_alloc.isra.58+0x56/0x80 kmem_cache_alloc_trace+0x260/0x2a0 disk_seqf_start+0x66/0x110 traverse+0x176/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a INFO: Freed in disk_seqf_stop+0x42/0x50 age=160 cpu=1 pid=315 __slab_free+0x17a/0x2c0 kfree+0x20a/0x220 disk_seqf_stop+0x42/0x50 traverse+0x3b5/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a CPU: 1 PID: 315 Comm: trinity-c1 Tainted: G B 4.7.0+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 ffffea0002d96000 ffff880119b9f918 ffffffff81d6ce81 ffff88011a804480 ffff8800b6581500 ffff880119b9f948 ffffffff8146c7bd ffff88011a804480 ffffea0002d96000 ffff8800b6581500 fffffffffffffff4 ffff880119b9f970 Call Trace: [] dump_stack+0x65/0x84 [] print_trailer+0x10d/0x1a0 [] object_err+0x2f/0x40 [] kasan_report_error+0x221/0x520 [] __asan_report_load8_noabort+0x3e/0x40 [] klist_iter_exit+0x61/0x70 [] class_dev_iter_exit+0x9/0x10 [] disk_seqf_stop+0x3a/0x50 [] seq_read+0x4b2/0x11a0 [] proc_reg_read+0xbc/0x180 [] do_loop_readv_writev+0x134/0x210 [] 
do_readv_writev+0x565/0x660 [] vfs_readv+0x67/0xa0 [] do_preadv+0x126/0x170 [] SyS_preadv+0xc/0x10 This problem can occur in the following situation: open() - pread() - .seq_start() - iter = kmalloc() // succeeds - seqf->private = iter - .seq_stop() - kfree(seqf->private) - pread() - .seq_start() - iter = kmalloc() // fails - .seq_stop() - class_dev_iter_exit(seqf->private) // boom! old pointer As the comment in disk_seqf_stop() says, stop is called even if start failed, so we need to reinitialise the private pointer to NULL when seq iteration stops. An alternative would be to set the private pointer to NULL when the kmalloc() in disk_seqf_start() fails. Signed-off-by: Vegard Nossum Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/genhd.c b/block/genhd.c index e5cafa51567c9..d2a1d43bf9faf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -831,6 +831,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) if (iter) { class_dev_iter_exit(iter); kfree(iter); + seqf->private = NULL; } } From 62659f0b9ed71ffb8a1e66a42eb52ab8ddadb77a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 2 Aug 2016 14:03:07 -0700 Subject: [PATCH 102/118] sysv, ipc: fix security-layer leaking commit 9b24fef9f0410fb5364245d6cc2bd044cc064007 upstream. Commit 53dad6d3a8e5 ("ipc: fix race with LSMs") updated ipc_rcu_putref() to receive rcu freeing function but used generic ipc_rcu_free() instead of msg_rcu_free() which does security cleaning. Running LTP msgsnd06 with kmemleak gives the following: cat /sys/kernel/debug/kmemleak unreferenced object 0xffff88003c0a11f8 (size 8): comm "msgsnd06", pid 1645, jiffies 4294672526 (age 6.549s) hex dump (first 8 bytes): 1b 00 00 00 01 00 00 00 ........ 
backtrace: kmemleak_alloc+0x23/0x40 kmem_cache_alloc_trace+0xe1/0x180 selinux_msg_queue_alloc_security+0x3f/0xd0 security_msg_queue_alloc+0x2e/0x40 newque+0x4e/0x150 ipcget+0x159/0x1b0 SyS_msgget+0x39/0x40 entry_SYSCALL_64_fastpath+0x13/0x8f Manfred Spraul suggested to fix sem.c as well and Davidlohr Bueso to only use ipc_rcu_free in case of security allocation failure in newary() Fixes: 53dad6d3a8e ("ipc: fix race with LSMs") Link: http://lkml.kernel.org/r/1470083552-22966-1-git-send-email-fabf@skynet.be Signed-off-by: Fabian Frederick Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- ipc/msg.c | 2 +- ipc/sem.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 1471db9a7e611..c6521c205cb40 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -680,7 +680,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, rcu_read_lock(); ipc_lock_object(&msq->q_perm); - ipc_rcu_putref(msq, ipc_rcu_free); + ipc_rcu_putref(msq, msg_rcu_free); /* raced with RMID? 
*/ if (!ipc_valid_object(&msq->q_perm)) { err = -EIDRM; diff --git a/ipc/sem.c b/ipc/sem.c index b471e5a3863dd..20d07008ad5e0 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -442,7 +442,7 @@ static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns static inline void sem_lock_and_putref(struct sem_array *sma) { sem_lock(sma, NULL, -1); - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) @@ -1385,7 +1385,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, rcu_read_unlock(); sem_io = ipc_alloc(sizeof(ushort)*nsems); if (sem_io == NULL) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); return -ENOMEM; } @@ -1419,20 +1419,20 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, if (nsems > SEMMSL_FAST) { sem_io = ipc_alloc(sizeof(ushort)*nsems); if (sem_io == NULL) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); return -ENOMEM; } } if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); err = -EFAULT; goto out_free; } for (i = 0; i < nsems; i++) { if (sem_io[i] > SEMVMX) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); err = -ERANGE; goto out_free; } @@ -1722,7 +1722,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) /* step 2: allocate new undo structure */ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); if (!new) { - ipc_rcu_putref(sma, ipc_rcu_free); + ipc_rcu_putref(sma, sem_rcu_free); return ERR_PTR(-ENOMEM); } From 3d1c64d81fd887ec0cac56f0299c2234a5450011 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Tue, 19 Jul 2016 12:48:01 -0700 Subject: [PATCH 103/118] fuse: fsync() did not return IO errors commit ac7f052b9e1534c8248f814b6f0068ad8d4a06d2 upstream. 
Due to implementation of fuse writeback filemap_write_and_wait_range() does not catch errors. We have to do this directly after fuse_sync_writes() Signed-off-by: Alexey Kuznetsov Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi Fixes: 4d99ff8f12eb ("fuse: Turn writeback cache on") Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c2e340d6ec6e8..82f714229b1fe 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -462,6 +462,21 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, goto out; fuse_sync_writes(inode); + + /* + * Due to implementation of fuse writeback + * filemap_write_and_wait_range() does not catch errors. + * We have to do this directly after fuse_sync_writes() + */ + if (test_bit(AS_ENOSPC, &file->f_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags)) + err = -ENOSPC; + if (test_bit(AS_EIO, &file->f_mapping->flags) && + test_and_clear_bit(AS_EIO, &file->f_mapping->flags)) + err = -EIO; + if (err) + goto out; + err = sync_inode_metadata(inode, 1); if (err) goto out; From 9ca5f11d9261e7ed491e425b2efde5e9cecf1447 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Tue, 19 Jul 2016 18:12:26 -0700 Subject: [PATCH 104/118] fuse: fuse_flush must check mapping->flags for errors commit 9ebce595f63a407c5cec98f98f9da8459b73740a upstream. fuse_flush() calls write_inode_now() that triggers writeback, but actual writeback will happen later, on fuse_sync_writes(). If an error happens, fuse_writepage_end() will set error bit in mapping->flags. So, we have to check mapping->flags after fuse_sync_writes(). 
Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi Fixes: 4d99ff8f12eb ("fuse: Turn writeback cache on") Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 82f714229b1fe..d58d4c0af0ce5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -417,6 +417,15 @@ static int fuse_flush(struct file *file, fl_owner_t id) fuse_sync_writes(inode); mutex_unlock(&inode->i_mutex); + if (test_bit(AS_ENOSPC, &file->f_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags)) + err = -ENOSPC; + if (test_bit(AS_EIO, &file->f_mapping->flags) && + test_and_clear_bit(AS_EIO, &file->f_mapping->flags)) + err = -EIO; + if (err) + return err; + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; From b6e0a217f621c62a2abd3ce4c6ae8146c8122e98 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 25 Jul 2016 21:17:04 +0800 Subject: [PATCH 105/118] fuse: fix wrong assignment of ->flags in fuse_send_init() commit 9446385f05c9af25fed53dbed3cc75763730be52 upstream. FUSE_HAS_IOCTL_DIR should be assigned to ->flags, it may be a typo. 
Signed-off-by: Wei Fang Signed-off-by: Miklos Szeredi Fixes: 69fe05c90ed5 ("fuse: add missing INIT flags") Signed-off-by: Greg Kroah-Hartman --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2913db2a5b99b..0d5e8e59b390e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -926,7 +926,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | - FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT; req->in.h.opcode = FUSE_INIT; From 92f71339bceeda3a13b71e9663bf422bf3d3e941 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Wed, 6 Jul 2016 11:32:20 +0800 Subject: [PATCH 106/118] fs/dcache.c: avoid soft-lockup in dput() commit 47be61845c775643f1aa4d2a54343549f943c94c upstream. We triggered soft-lockup under stress test which open/access/write/close one file concurrently on more than five different CPUs: WARN: soft lockup - CPU#0 stuck for 11s! [who:30631] ... [] dput+0x100/0x298 [] terminate_walk+0x4c/0x60 [] path_lookupat+0x5cc/0x7a8 [] filename_lookup+0x38/0xf0 [] user_path_at_empty+0x78/0xd0 [] user_path_at+0x1c/0x28 [] SyS_faccessat+0xb4/0x230 ->d_lock trylock may failed many times because of concurrently operations, and dput() may execute a long time. Fix this by replacing cpu_relax() with cond_resched(). dput() used to be sleepable, so make it sleepable again should be safe. 
Signed-off-by: Wei Fang Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/dcache.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 108d7d810be3f..71b6056ad35dc 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -578,7 +578,6 @@ static struct dentry *dentry_kill(struct dentry *dentry) failed: spin_unlock(&dentry->d_lock); - cpu_relax(); return dentry; /* try again with same dentry */ } @@ -752,6 +751,8 @@ void dput(struct dentry *dentry) return; repeat: + might_sleep(); + rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); @@ -783,8 +784,10 @@ void dput(struct dentry *dentry) kill_it: dentry = dentry_kill(dentry); - if (dentry) + if (dentry) { + cond_resched(); goto repeat; + } } EXPORT_SYMBOL(dput); From 148fbb966837725e6ff8f151ae6053521d04882c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 15 Jun 2016 22:27:05 +0800 Subject: [PATCH 107/118] crypto: gcm - Filter out async ghash if necessary commit b30bdfa86431afbafe15284a3ad5ac19b49b88e3 upstream. As it is if you ask for a sync gcm you may actually end up with an async one because it does not filter out async implementations of ghash. This patch fixes this by adding the necessary filter when looking for ghash. 
Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/gcm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/gcm.c b/crypto/gcm.c index bec329b3de8d7..d9ea5f9c05741 100644 --- a/crypto/gcm.c +++ b/crypto/gcm.c @@ -639,7 +639,9 @@ static int crypto_gcm_create_common(struct crypto_template *tmpl, ghash_alg = crypto_find_alg(ghash_name, &crypto_ahash_type, CRYPTO_ALG_TYPE_HASH, - CRYPTO_ALG_TYPE_AHASH_MASK); + CRYPTO_ALG_TYPE_AHASH_MASK | + crypto_requires_sync(algt->type, + algt->mask)); if (IS_ERR(ghash_alg)) return PTR_ERR(ghash_alg); From 08bb036c9d82ec70fd88c7e08345373a97f98637 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 12 Jul 2016 13:17:57 +0800 Subject: [PATCH 108/118] crypto: scatterwalk - Fix test in scatterwalk_done commit 5f070e81bee35f1b7bd1477bb223a873ff657803 upstream. When there is more data to be processed, the current test in scatterwalk_done may prevent us from calling pagedone even when we should. In particular, if we're on an SG entry spanning multiple pages where the last page is not a full page, we will incorrectly skip calling pagedone on the second last page. This patch fixes this by adding a separate test for whether we've reached the end of a page. 
Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/scatterwalk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c index ea5815c5e1281..bc769c448d4a9 100644 --- a/crypto/scatterwalk.c +++ b/crypto/scatterwalk.c @@ -72,7 +72,8 @@ static void scatterwalk_pagedone(struct scatter_walk *walk, int out, void scatterwalk_done(struct scatter_walk *walk, int out, int more) { - if (!(scatterwalk_pagelen(walk) & (PAGE_SIZE - 1)) || !more) + if (!more || walk->offset >= walk->sg->offset + walk->sg->length || + !(walk->offset & (PAGE_SIZE - 1))) scatterwalk_pagedone(walk, out, more); } EXPORT_SYMBOL_GPL(scatterwalk_done); From 9e38db20d794504bb52f9592c90cdc8754f97251 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 30 Jun 2016 11:53:46 -0400 Subject: [PATCH 109/118] ext4: check for extents that wrap around commit f70749ca42943faa4d4dcce46dfdcaadb1d0c4b6 upstream. An extent with lblock = 4294967295 and len = 1 will pass the ext4_valid_extent() test: ext4_lblk_t last = lblock + len - 1; if (len == 0 || lblock > last) return 0; since last = 4294967295 + 1 - 1 = 4294967295. This would later trigger the BUG_ON(es->es_lblk + es->es_len < es->es_lblk) in ext4_es_end(). We can simplify it by removing the - 1 altogether and changing the test to use lblock + len <= lblock, since now if len = 0, then lblock + 0 == lblock and it fails, and if len > 0 then lblock + len > lblock in order to pass (i.e. it doesn't overflow). 
Fixes: 5946d0893 ("ext4: check for overlapping extents in ext4_valid_extent_entries()") Fixes: 2f974865f ("ext4: check for zero length extent explicitly") Cc: Eryu Guan Signed-off-by: Phil Turnbull Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 62880586ed85f..8eac7d586997c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -376,9 +376,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); - ext4_lblk_t last = lblock + len - 1; - if (len == 0 || lblock > last) + /* + * We allow neither: + * - zero length + * - overflow/wrap-around + */ + if (lblock + len <= lblock) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } From 5a7f477c725e866729307ff87011f8dd812a3cdf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Jul 2016 10:14:01 -0400 Subject: [PATCH 110/118] ext4: fix deadlock during page writeback commit 646caa9c8e196880b41cd3e3d33a2ebc752bdb85 upstream. Commit 06bd3c36a733 (ext4: fix data exposure after a crash) uncovered a deadlock in ext4_writepages() which was previously much harder to hit. After this commit xfstest generic/130 reproduces the deadlock on small filesystems. The problem happens when ext4_do_update_inode() sets LARGE_FILE feature and marks current inode handle as synchronous. That subsequently results in ext4_journal_stop() called from ext4_writepages() to block waiting for transaction commit while still holding page locks, reference to io_end, and some prepared bio in mpd structure each of which can possibly block transaction commit from completing and thus results in deadlock. 
Fix the problem by releasing page locks, io_end reference, and submitting prepared bio before calling ext4_journal_stop(). [ Changed to defer the call to ext4_journal_stop() only if the handle is synchronous. --tytso ] Reported-and-tested-by: Eryu Guan Signed-off-by: Theodore Ts'o Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e31d762eedce1..6af24fe4ae2d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2589,13 +2589,36 @@ static int ext4_writepages(struct address_space *mapping, done = true; } } - ext4_journal_stop(handle); + /* + * Caution: If the handle is synchronous, + * ext4_journal_stop() can wait for transaction commit + * to finish which may depend on writeback of pages to + * complete or on page lock to be released. In that + * case, we have to wait until after after we have + * submitted all the IO, released page locks we hold, + * and dropped io_end reference (for extent conversion + * to be able to complete) before stopping the handle. + */ + if (!ext4_handle_valid(handle) || handle->h_sync == 0) { + ext4_journal_stop(handle); + handle = NULL; + } /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); - /* Drop our io_end reference we got from init */ - ext4_put_io_end(mpd.io_submit.io_end); + /* + * Drop our io_end reference we got from init. We have + * to be careful and use deferred io_end finishing if + * we are still holding the transaction as we can + * release the last reference to io_end which may end + * up doing unwritten extent conversion. 
+ */ + if (handle) { + ext4_put_io_end_defer(mpd.io_submit.io_end); + ext4_journal_stop(handle); + } else + ext4_put_io_end(mpd.io_submit.io_end); if (ret == -ENOSPC && sbi->s_journal) { /* From 175f36cb34d4b06ca2384073f2b741db2e0f915b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 4 Jul 2016 11:03:00 -0400 Subject: [PATCH 111/118] ext4: don't call ext4_should_journal_data() on the journal inode commit 6a7fd522a7c94cdef0a3b08acf8e6702056e635c upstream. If ext4_fill_super() fails early, it's possible for ext4_evict_inode() to call ext4_should_journal_data() before superblock options and flags are fully set up. In that case, the iput() on the journal inode can end up causing a BUG(). Work around this problem by reordering the tests so we only call ext4_should_journal_data() after we know it's not the journal inode. Fixes: 2d859db3e4 ("ext4: fix data corruption in inodes with journalled data") Fixes: 2b405bfa84 ("ext4: fix data=journal fast mount/umount hang") Cc: Jan Kara Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6af24fe4ae2d1..9a5ad0f0d3ed3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -205,9 +205,9 @@ void ext4_evict_inode(struct inode *inode) * Note that directories do not have this problem because they * don't use page cache. 
*/ - if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && - inode->i_ino != EXT4_JOURNAL_INO) { + if (inode->i_ino != EXT4_JOURNAL_INO && + ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; From f8d4d52ce410c804d56fab866fa9fd2ec04d8d6e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 5 Jul 2016 20:01:52 -0400 Subject: [PATCH 112/118] ext4: validate s_reserved_gdt_blocks on mount commit 5b9554dc5bf008ae7f68a52e3d7e76c0920938a2 upstream. If s_reserved_gdt_blocks is extremely large, it's possible for ext4_init_block_bitmap(), which is called when ext4 sets up an uninitialized block bitmap, to corrupt random kernel memory. Add the same checks which e2fsck has --- it must never be larger than blocksize / sizeof(__u32) --- and then add a backup check in ext4_init_block_bitmap() in case the superblock gets modified after the file system is mounted. 
Reported-by: Vegard Nossum Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/balloc.c | 3 +++ fs/ext4/super.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fe1f50fe764ff..f97110461c196 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -208,6 +208,9 @@ static int ext4_init_block_bitmap(struct super_block *sb, memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); + if ((bit_max >> 3) >= bh->b_size) + return -EFSCORRUPTED; + for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 852c26806af27..2d7b5462bcaa2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3372,6 +3372,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { + ext4_msg(sb, KERN_ERR, + "Number of reserved GDT blocks insanely large: %d", + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); + goto failed_mount; + } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { if (blocksize != PAGE_SIZE) { ext4_msg(sb, KERN_ERR, From db82c747482bab275cd639ed0007ee27ec0c35a1 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 14 Jul 2016 23:21:35 -0400 Subject: [PATCH 113/118] ext4: short-cut orphan cleanup on error commit c65d5c6c81a1f27dec5f627f67840726fcd146de upstream. If we encounter a filesystem error during orphan cleanup, we should stop. Otherwise, we may end up in an infinite loop where the same inode is processed again and again. EXT4-fs (loop0): warning: checktime reached, running e2fsck is recommended EXT4-fs error (device loop0): ext4_mb_generate_buddy:758: group 2, block bitmap and bg descriptor inconsistent: 6117 vs 0 free clusters Aborting journal on device loop0-8. 
EXT4-fs (loop0): Remounting filesystem read-only EXT4-fs error (device loop0) in ext4_free_blocks:4895: Journal has aborted EXT4-fs error (device loop0) in ext4_do_update_inode:4893: Journal has aborted EXT4-fs error (device loop0) in ext4_do_update_inode:4893: Journal has aborted EXT4-fs error (device loop0) in ext4_ext_remove_space:3068: IO failure EXT4-fs error (device loop0) in ext4_ext_truncate:4667: Journal has aborted EXT4-fs error (device loop0) in ext4_orphan_del:2927: Journal has aborted EXT4-fs error (device loop0) in ext4_do_update_inode:4893: Journal has aborted EXT4-fs (loop0): Inode 16 (00000000618192a0): orphan list check failed! [...] EXT4-fs (loop0): Inode 16 (0000000061819748): orphan list check failed! [...] EXT4-fs (loop0): Inode 16 (0000000061819bf0): orphan list check failed! [...] See-also: c9eb13a9105 ("ext4: fix hang when processing corrupted orphaned inode list") Cc: Jan Kara Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2d7b5462bcaa2..c542ebcf7a92f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2240,6 +2240,16 @@ static void ext4_orphan_cleanup(struct super_block *sb, while (es->s_last_orphan) { struct inode *inode; + /* + * We may have encountered an error during cleanup; if + * so, skip the rest. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + es->s_last_orphan = 0; + break; + } + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; From 3a22cf0c7b597f7139d3fdd27fa70aa55aa6d977 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 14 Jul 2016 23:02:47 -0400 Subject: [PATCH 114/118] ext4: fix reference counting bug on block allocation error commit 554a5ccc4e4a20c5f3ec859de0842db4b4b9c77e upstream. 
If we hit this error when mounted with errors=continue or errors=remount-ro: EXT4-fs error (device loop0): ext4_mb_mark_diskspace_used:2940: comm ext4.exe: Allocating blocks 5090-6081 which overlap fs metadata then ext4_mb_new_blocks() will call ext4_mb_release_context() and try to continue. However, ext4_mb_release_context() is the wrong thing to call here since we are still actually using the allocation context. Instead, just error out. We could retry the allocation, but there is a possibility of getting stuck in an infinite loop instead, so this seems safer. [ Fixed up so we don't return EAGAIN to userspace. --tytso ] Fixes: 8556e8f3b6 ("ext4: Don't allow new groups to be added during block allocation") Signed-off-by: Vegard Nossum Signed-off-by: Theodore Ts'o Cc: Aneesh Kumar K.V Signed-off-by: Greg Kroah-Hartman --- fs/ext4/mballoc.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cf734170daa94..c4dcac8a018d2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2932,7 +2932,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error - * Fix the bitmap and repeat the block allocation + * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. 
*/ ext4_lock_group(sb, ac->ac_b_ex.fe_group); @@ -2941,7 +2941,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) - err = -EAGAIN; + err = -EFSCORRUPTED; goto out_err; } @@ -4506,18 +4506,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); - if (*errp == -EAGAIN) { - /* - * drop the reference that we took - * in ext4_mb_use_best_found - */ - ext4_mb_release_context(ac); - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; - ac->ac_status = AC_STATUS_CONTINUE; - goto repeat; - } else if (*errp) { + if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { From 8627c7750a66a46d56d3564e1e881aa53764497c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jul 2016 15:44:57 -0700 Subject: [PATCH 115/118] mm: memcontrol: fix cgroup creation failure after many small jobs commit 73f576c04b9410ed19660f74f97521bee6e1c546 upstream. The memory controller has quite a bit of state that usually outlives the cgroup and pins its CSS until said state disappears. At the same time it imposes a 16-bit limit on the CSS ID space to economically store IDs in the wild. Consequently, when we use cgroups to contain frequent but small and short-lived jobs that leave behind some page cache, we quickly run into the 64k limitations of outstanding CSSs. Creating a new cgroup fails with -ENOSPC while there are only a few, or even no user-visible cgroups in existence. Although pinning CSSs past cgroup removal is common, there are only two instances that actually need an ID after a cgroup is deleted: cache shadow entries and swapout records. Cache shadow entries reference the ID weakly and can deal with the CSS having disappeared when it's looked up later. They pose no hurdle. 
Swap-out records do need to pin the css to hierarchically attribute swapins after the cgroup has been deleted; though the only pages that remain swapped out after offlining are tmpfs/shmem pages. And those references are under the user's control, so they are manageable. This patch introduces a private 16-bit memcg ID and switches swap and cache shadow entries over to using that. This ID can then be recycled after offlining when the CSS remains pinned only by objects that don't specifically need it. This script demonstrates the problem by faulting one cache page in a new cgroup and deleting it again: set -e mkdir -p pages for x in `seq 128000`; do [ $((x % 1000)) -eq 0 ] && echo $x mkdir /cgroup/foo echo $$ >/cgroup/foo/cgroup.procs echo trex >pages/$x echo $$ >/cgroup/cgroup.procs rmdir /cgroup/foo done When run on an unpatched kernel, we eventually run out of possible IDs even though there are no visible cgroups: [root@ham ~]# ./cssidstress.sh [...] 65000 mkdir: cannot create directory '/cgroup/foo': No space left on device After this patch, the IDs get released upon cgroup destruction and the cache and css objects get released once memory reclaim kicks in. 
[hannes@cmpxchg.org: init the IDR] Link: http://lkml.kernel.org/r/20160621154601.GA22431@cmpxchg.org Fixes: b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") Link: http://lkml.kernel.org/r/20160617162516.GD19084@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: John Garcia Reviewed-by: Vladimir Davydov Acked-by: Tejun Heo Cc: Nikolay Borisov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- include/linux/memcontrol.h | 8 ++++ mm/memcontrol.c | 91 ++++++++++++++++++++++++++++++-------- mm/slab_common.c | 4 +- 3 files changed, 83 insertions(+), 20 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cd0e2413c358d..435fd8426b8ac 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -174,6 +174,11 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; +struct mem_cgroup_id { + int id; + atomic_t ref; +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -183,6 +188,9 @@ struct mem_cgroup_thresholds { struct mem_cgroup { struct cgroup_subsys_state css; + /* Private memcg ID. Used to ID objects that outlive the cgroup */ + struct mem_cgroup_id id; + /* Accounted resources */ struct page_counter memory; struct page_counter memsw; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 67648e6b2ac85..e139c982b143b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -272,21 +272,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - return memcg->css.id; -} - -/* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. 
(dropping - * refcnt from swap can be called against removed memcg.) - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); + return memcg->id.id; } /* Writing them here to avoid exposing memcg's inner layout */ @@ -4124,6 +4110,60 @@ static struct cftype mem_cgroup_legacy_files[] = { { }, /* terminate */ }; +/* + * Private memory cgroup IDR + * + * Swap-out records and page cache shadow entries need to store memcg + * references in constrained space, so we maintain an ID space that is + * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of + * memory-controlled cgroups to 64k. + * + * However, there usually are many references to the offline CSS after + * the cgroup has been destroyed, such as page cache or reclaimable + * slab objects, that don't need to hang on to the ID. We want to keep + * those dead CSS from occupying IDs, or we might quickly exhaust the + * relatively small ID space and prevent the creation of new cgroups + * even when there are much fewer than 64k cgroups - possibly none. + * + * Maintain a private 16-bit ID space for memcg, and allow the ID to + * be freed and recycled when it's no longer needed, which is usually + * when the CSS is offlined. + * + * The only exception to that are records of swapped out tmpfs/shmem + * pages that need to be attributed to live ancestors on swapin. But + * those references are manageable from userspace.
+ */ + +static DEFINE_IDR(mem_cgroup_idr); + +static void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + atomic_inc(&memcg->id.ref); +} + +static void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + if (atomic_dec_and_test(&memcg->id.ref)) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + + /* Memcg ID pins CSS */ + css_put(&memcg->css); + } +} + +/** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up + * + * Caller must hold rcu_read_lock(). + */ +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&mem_cgroup_idr, id); +} + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -4178,6 +4218,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg_wb_domain_init(memcg, GFP_KERNEL)) goto out_free_stat; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, + 1, MEM_CGROUP_ID_MAX, + GFP_KERNEL); + if (memcg->id.id < 0) + goto out_free_stat; + return memcg; out_free_stat: @@ -4263,9 +4309,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return &memcg->css; free_out: + idr_remove(&mem_cgroup_idr, memcg->id.id); __mem_cgroup_free(memcg); return ERR_PTR(error); } @@ -4277,8 +4325,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); int ret; - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; + /* Online state pins memcg ID, memcg ID pins CSS */ + mem_cgroup_id_get(mem_cgroup_from_css(css)); + css_get(css); if (!parent) return 0; @@ -4352,6 +4401,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_deactivate_kmem(memcg); wb_memcg_offline(memcg); + + mem_cgroup_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ 
-5685,6 +5736,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); @@ -5703,6 +5755,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); + + if (!mem_cgroup_is_root(memcg)) + css_put(&memcg->css); } /** @@ -5726,7 +5781,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memsw, 1); mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); + mem_cgroup_id_put(memcg); } rcu_read_unlock(); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 3c6a86b4ec25f..bec2fce9fafc3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -521,8 +521,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); - cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - css->id, memcg_name_buf); + cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name, + css->serial_nr, memcg_name_buf); if (!cache_name) goto out_unlock; From a0fddee3fb342a4150c83c36e317660663691a72 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 11 Aug 2016 15:33:00 -0700 Subject: [PATCH 116/118] mm: memcontrol: fix swap counter leak on swapout from offline cgroup commit 1f47b61fb4077936465dcde872a4e5cc4fe708da upstream. An offline memory cgroup might have anonymous memory or shmem left charged to it and no swap. Since only swap entries pin the id of an offline cgroup, such a cgroup will have no id and so an attempt to swapout its anon/shmem will not store memory cgroup info in the swap cgroup map. As a result, memcg->swap or memcg->memsw will never get uncharged from it and any of its ascendants. 
Fix this by always charging swapout to the first ancestor cgroup that hasn't released its id yet. [hannes@cmpxchg.org: add comment to mem_cgroup_swapout] [vdavydov@virtuozzo.com: use WARN_ON_ONCE() in mem_cgroup_id_get_online()] Link: http://lkml.kernel.org/r/20160803123445.GJ13263@esperanza Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs") Link: http://lkml.kernel.org/r/5336daa5c9a32e776067773d9da655d2dc126491.1470219853.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: [3.19+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e139c982b143b..27eaee2acaae5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4141,6 +4141,24 @@ static void mem_cgroup_id_get(struct mem_cgroup *memcg) atomic_inc(&memcg->id.ref); } +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!atomic_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so its refcount must + * always be >= 1.
+ */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + static void mem_cgroup_id_put(struct mem_cgroup *memcg) { if (atomic_dec_and_test(&memcg->id.ref)) { @@ -5721,7 +5739,7 @@ subsys_initcall(mem_cgroup_init); */ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *swap_memcg; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5736,16 +5754,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; - mem_cgroup_id_get(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* + * In case the memcg owning these pages has been offlined and doesn't + * have an ID allocated to it anymore, charge the closest online + * ancestor for the swap instead and transfer the memory+swap charge. + */ + swap_memcg = mem_cgroup_id_get_online(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(swap_memcg, true); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); + if (memcg != swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, 1); + } + /* * Interrupts should be disabled here because the caller holds the * mapping->tree_lock lock which is taken with interrupts-off. It is From eccccb42d44f44badcfbdbb4e21a4f30d9694666 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 11 Aug 2016 15:33:03 -0700 Subject: [PATCH 117/118] mm: memcontrol: fix memcg id ref counter on swap charge move commit 615d66c37c755c49ce022c9e5ac0875d27d2603d upstream. 
Since commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") swap entries do not pin memcg->css.refcnt directly. Instead, they pin memcg->id.ref. So we should adjust the reference counters accordingly when moving swap charges between cgroups. Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs") Link: http://lkml.kernel.org/r/9ce297c64954a42dc90b543bc76106c4a94f07e8.1470219853.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 27eaee2acaae5..6b90d184e9c0e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4136,9 +4136,9 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); -static void mem_cgroup_id_get(struct mem_cgroup *memcg) +static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - atomic_inc(&memcg->id.ref); + atomic_add(n, &memcg->id.ref); } static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) @@ -4159,9 +4159,9 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) return memcg; } -static void mem_cgroup_id_put(struct mem_cgroup *memcg) +static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - if (atomic_dec_and_test(&memcg->id.ref)) { + if (atomic_sub_and_test(n, &memcg->id.ref)) { idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; @@ -4170,6 +4170,16 @@ static void mem_cgroup_id_put(struct mem_cgroup *memcg) } } +static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + mem_cgroup_id_get_many(memcg, 1); +} + +static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + mem_cgroup_id_put_many(memcg, 1); 
+} + /** * mem_cgroup_from_id - look up a memcg from a memcg id * @id: the memcg id to look up @@ -4854,6 +4864,8 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + mem_cgroup_id_put_many(mc.from, mc.moved_swap); + /* * we charged both to->memory and to->memsw, so we * should uncharge to->memory. @@ -4861,9 +4873,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - css_put_many(&mc.from->css, mc.moved_swap); + mem_cgroup_id_get_many(mc.to, mc.moved_swap); + css_put_many(&mc.to->css, mc.moved_swap); - /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); From e4884275a4bb1cbce5a24a507c3e267c887dc1bd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Aug 2016 09:31:54 +0200 Subject: [PATCH 118/118] Linux 4.4.18 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 76d34f763a412..eaedea88a8a75 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 17 +SUBLEVEL = 18 EXTRAVERSION = NAME = Blurry Fish Butt