From 469bcef53c546bb792aa66303933272991b7831d Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 4 Jul 2017 11:10:39 +0200 Subject: [PATCH 001/154] irqchip/atmel-aic: Fix unbalanced of_node_put() in aic_common_irq_fixup() aic_common_irq_fixup() is calling twice of_node_put() on the same node thus leading to an unbalanced refcount on the root node. Signed-off-by: Boris Brezillon Reported-by: Alexandre Belloni Fixes: b2f579b58e93 ("irqchip: atmel-aic: Add irq fixup infrastructure") Cc: Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-atmel-aic-common.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 28b26c80f4cf93..7c5a43488d27d1 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -196,7 +196,6 @@ static void __init aic_common_irq_fixup(const struct of_device_id *matches) return; match = of_match_node(matches, root); - of_node_put(root); if (match) { void (*fixup)(struct device_node *) = match->data; From 277867ade8262583f4280cadbe90e0031a3706a7 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 4 Jul 2017 11:10:40 +0200 Subject: [PATCH 002/154] irqchip/atmel-aic: Fix unbalanced refcount in aic_common_rtc_irq_fixup() of_find_compatible_node() is calling of_node_put() on its first argument thus leading to an unbalanced of_node_get/put() issue if the node has not been retained before that. Instead of passing the root node, pass NULL, which does exactly the same: iterate over all DT nodes, starting from the root node. 
Signed-off-by: Boris Brezillon Reported-by: Alexandre Belloni Fixes: 3d61467f9bab ("irqchip: atmel-aic: Implement RTC irq fixup") Cc: Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-atmel-aic-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 7c5a43488d27d1..05650709972566 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -142,9 +142,9 @@ void __init aic_common_rtc_irq_fixup(struct device_node *root) struct device_node *np; void __iomem *regs; - np = of_find_compatible_node(root, NULL, "atmel,at91rm9200-rtc"); + np = of_find_compatible_node(NULL, NULL, "atmel,at91rm9200-rtc"); if (!np) - np = of_find_compatible_node(root, NULL, + np = of_find_compatible_node(NULL, NULL, "atmel,at91sam9x5-rtc"); if (!np) From 0a46230bf03549435156b36dee9e7489b8270be7 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 4 Jul 2017 11:10:41 +0200 Subject: [PATCH 003/154] irqchip/atmel-aic: Remove root argument from ->fixup() prototype We are no longer using the root argument passed to the ->fixup() hooks. Remove it. 
Signed-off-by: Boris Brezillon Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-atmel-aic-common.c | 8 ++++---- drivers/irqchip/irq-atmel-aic-common.h | 4 ++-- drivers/irqchip/irq-atmel-aic.c | 14 +++++++------- drivers/irqchip/irq-atmel-aic5.c | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 05650709972566..072bd227b6c677 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -137,7 +137,7 @@ static void __init aic_common_ext_irq_of_init(struct irq_domain *domain) #define AT91_RTC_IMR 0x28 #define AT91_RTC_IRQ_MASK 0x1f -void __init aic_common_rtc_irq_fixup(struct device_node *root) +void __init aic_common_rtc_irq_fixup(void) { struct device_node *np; void __iomem *regs; @@ -165,7 +165,7 @@ void __init aic_common_rtc_irq_fixup(struct device_node *root) #define AT91_RTT_ALMIEN (1 << 16) /* Alarm Interrupt Enable */ #define AT91_RTT_RTTINCIEN (1 << 17) /* Real Time Timer Increment Interrupt Enable */ -void __init aic_common_rtt_irq_fixup(struct device_node *root) +void __init aic_common_rtt_irq_fixup(void) { struct device_node *np; void __iomem *regs; @@ -198,8 +198,8 @@ static void __init aic_common_irq_fixup(const struct of_device_id *matches) match = of_match_node(matches, root); if (match) { - void (*fixup)(struct device_node *) = match->data; - fixup(root); + void (*fixup)(void) = match->data; + fixup(); } of_node_put(root); diff --git a/drivers/irqchip/irq-atmel-aic-common.h b/drivers/irqchip/irq-atmel-aic-common.h index af60376d50debe..242e62c1851ead 100644 --- a/drivers/irqchip/irq-atmel-aic-common.h +++ b/drivers/irqchip/irq-atmel-aic-common.h @@ -33,8 +33,8 @@ struct irq_domain *__init aic_common_of_init(struct device_node *node, const char *name, int nirqs, const struct of_device_id *matches); -void __init aic_common_rtc_irq_fixup(struct device_node *root); +void __init 
aic_common_rtc_irq_fixup(void); -void __init aic_common_rtt_irq_fixup(struct device_node *root); +void __init aic_common_rtt_irq_fixup(void); #endif /* __IRQ_ATMEL_AIC_COMMON_H */ diff --git a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c index 37f952dd9fc94b..bb1ad451392fd8 100644 --- a/drivers/irqchip/irq-atmel-aic.c +++ b/drivers/irqchip/irq-atmel-aic.c @@ -209,20 +209,20 @@ static const struct irq_domain_ops aic_irq_ops = { .xlate = aic_irq_domain_xlate, }; -static void __init at91rm9200_aic_irq_fixup(struct device_node *root) +static void __init at91rm9200_aic_irq_fixup(void) { - aic_common_rtc_irq_fixup(root); + aic_common_rtc_irq_fixup(); } -static void __init at91sam9260_aic_irq_fixup(struct device_node *root) +static void __init at91sam9260_aic_irq_fixup(void) { - aic_common_rtt_irq_fixup(root); + aic_common_rtt_irq_fixup(); } -static void __init at91sam9g45_aic_irq_fixup(struct device_node *root) +static void __init at91sam9g45_aic_irq_fixup(void) { - aic_common_rtc_irq_fixup(root); - aic_common_rtt_irq_fixup(root); + aic_common_rtc_irq_fixup(); + aic_common_rtt_irq_fixup(); } static const struct of_device_id aic_irq_fixups[] __initconst = { diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index c04ee9a23d094f..6acad2ea0fb356 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -305,9 +305,9 @@ static const struct irq_domain_ops aic5_irq_ops = { .xlate = aic5_irq_domain_xlate, }; -static void __init sama5d3_aic_irq_fixup(struct device_node *root) +static void __init sama5d3_aic_irq_fixup(void) { - aic_common_rtc_irq_fixup(root); + aic_common_rtc_irq_fixup(); } static const struct of_device_id aic5_irq_fixups[] __initconst = { From 456c59c31c5126fe31c64956c43670060ea9debd Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 4 Jul 2017 10:56:34 +0100 Subject: [PATCH 004/154] irqchip/gic-v2: Report failures in gic_irq_domain_alloc If the GIC cannot map an IRQ via 
irq_domain_ops->alloc(), it doesn't return an error code. This can cause a problem with drivers, where it thinks it has successfully got an IRQ for the device, but requesting the same ends up failure with -ENOSYS (as the IRQ's chip is not set). Fixes: commit 9a1091ef0017c ("irqchip: gic: Support hierarchy irq domain.") Cc: Yingjoe Chen Cc: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 1b1df4f770bdef..940c1627875866 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1027,8 +1027,11 @@ static int gic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, if (ret) return ret; - for (i = 0; i < nr_irqs; i++) - gic_irq_domain_map(domain, virq + i, hwirq + i); + for (i = 0; i < nr_irqs; i++) { + ret = gic_irq_domain_map(domain, virq + i, hwirq + i); + if (ret) + return ret; + } return 0; } From 63c16c6eacb69d0cbdaee5dea0dd56d238375fe6 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 4 Jul 2017 10:56:33 +0100 Subject: [PATCH 005/154] irqchip/gic-v3: Report failures in gic_irq_domain_alloc If the GIC cannot map an IRQ via irq_domain_ops->alloc(), it doesn't return an error code. This can cause a problem with drivers, where it thinks it has successfully got an IRQ for the device, but requesting the same ends up failure with -ENOSYS (as the IRQ's chip is not set). 
Fixes: commit 443acc4f37f6 ("irqchip: GICv3: Convert to domain hierarchy") Cc: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index dbffb7ab62033b..47630e9998b3df 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -831,8 +831,11 @@ static int gic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, if (ret) return ret; - for (i = 0; i < nr_irqs; i++) - gic_irq_domain_map(domain, virq + i, hwirq + i); + for (i = 0; i < nr_irqs; i++) { + ret = gic_irq_domain_map(domain, virq + i, hwirq + i); + if (ret) + return ret; + } return 0; } From 65a30f8b300107266f316d550f060ccc186201a3 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 4 Jul 2017 10:56:35 +0100 Subject: [PATCH 006/154] irqchip/gic-v3: Honor forced affinity setting Honor the 'force' flag for set_affinity, by selecting a CPU from the given mask (which may not be reported "online" by the cpu_online_mask). Some drivers, like ARM PMU, rely on it. 
Cc: Marc Zyngier Reported-by: Mark Rutland Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 47630e9998b3df..5ba64a7584a3ee 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -640,11 +640,16 @@ static void gic_smp_init(void) static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, bool force) { - unsigned int cpu = cpumask_any_and(mask_val, cpu_online_mask); + unsigned int cpu; void __iomem *reg; int enabled; u64 val; + if (force) + cpu = cpumask_first(mask_val); + else + cpu = cpumask_any_and(mask_val, cpu_online_mask); + if (cpu >= nr_cpu_ids) return -EINVAL; From 05969566e6d64113a861adc6c17cbba685c640b3 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 30 Jun 2017 17:43:02 -0300 Subject: [PATCH 007/154] ARM: dts: imx7d-sdb: Put pinctrl_spi4 in the correct location pinctrl_spi4 pin group is not part of the low power iomux controller, so move it under the normal iomuxc node. 
Fixes: 184f39b57cab6 ("ARM: dts: imx7d-sdb: Add GPIO expander node") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx7d-sdb.dts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm/boot/dts/imx7d-sdb.dts b/arch/arm/boot/dts/imx7d-sdb.dts index 54c45402286b10..0a24d1bf3c3934 100644 --- a/arch/arm/boot/dts/imx7d-sdb.dts +++ b/arch/arm/boot/dts/imx7d-sdb.dts @@ -557,6 +557,14 @@ >; }; + pinctrl_spi4: spi4grp { + fsl,pins = < + MX7D_PAD_GPIO1_IO09__GPIO1_IO9 0x59 + MX7D_PAD_GPIO1_IO12__GPIO1_IO12 0x59 + MX7D_PAD_GPIO1_IO13__GPIO1_IO13 0x59 + >; + }; + pinctrl_tsc2046_pendown: tsc2046_pendown { fsl,pins = < MX7D_PAD_EPDC_BDR1__GPIO2_IO29 0x59 @@ -697,13 +705,5 @@ fsl,pins = < MX7D_PAD_LPSR_GPIO1_IO01__PWM1_OUT 0x110b0 >; - - pinctrl_spi4: spi4grp { - fsl,pins = < - MX7D_PAD_GPIO1_IO09__GPIO1_IO9 0x59 - MX7D_PAD_GPIO1_IO12__GPIO1_IO12 0x59 - MX7D_PAD_GPIO1_IO13__GPIO1_IO13 0x59 - >; - }; }; }; From fa405fd9dd7b0a5367fb5a773e93ac59efb98f44 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Tue, 11 Jul 2017 09:40:15 +0200 Subject: [PATCH 008/154] ARM: dts: at91: sama5d2: use sama5d2 compatible string for SMC A new compatible string has been introduced for sama5d2 SMC to allow to manage the registers mapping change. 
Signed-off-by: Ludovic Desroches Signed-off-by: Alexandre Belloni --- arch/arm/boot/dts/sama5d2.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi index cc06da39436684..3e6e2dbc259532 100644 --- a/arch/arm/boot/dts/sama5d2.dtsi +++ b/arch/arm/boot/dts/sama5d2.dtsi @@ -1048,7 +1048,7 @@ }; hsmc: hsmc@f8014000 { - compatible = "atmel,sama5d3-smc", "syscon", "simple-mfd"; + compatible = "atmel,sama5d2-smc", "syscon", "simple-mfd"; reg = <0xf8014000 0x1000>; interrupts = <5 IRQ_TYPE_LEVEL_HIGH 6>; clocks = <&hsmc_clk>; From 8ff235fe7a8c4a5824c223b9996457174442e73a Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Fri, 7 Jul 2017 15:33:10 +0200 Subject: [PATCH 009/154] ARM: dts: at91: sama5d2: fix EBI/NAND controllers declaration Fix HSMC interrupt ID, PMECC registers and EBI ones. Fixes: d9c41bf30cf8 ("ARM: dts: at91: Declare EBI/NAND controllers") Signed-off-by: Ludovic Desroches Acked-by: Nicolas Ferre Signed-off-by: Alexandre Belloni --- arch/arm/boot/dts/sama5d2.dtsi | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi index 3e6e2dbc259532..60e69aeacbdbf4 100644 --- a/arch/arm/boot/dts/sama5d2.dtsi +++ b/arch/arm/boot/dts/sama5d2.dtsi @@ -303,7 +303,7 @@ #size-cells = <1>; atmel,smc = <&hsmc>; reg = <0x10000000 0x10000000 - 0x40000000 0x30000000>; + 0x60000000 0x30000000>; ranges = <0x0 0x0 0x10000000 0x10000000 0x1 0x0 0x60000000 0x10000000 0x2 0x0 0x70000000 0x10000000 @@ -1050,16 +1050,16 @@ hsmc: hsmc@f8014000 { compatible = "atmel,sama5d2-smc", "syscon", "simple-mfd"; reg = <0xf8014000 0x1000>; - interrupts = <5 IRQ_TYPE_LEVEL_HIGH 6>; + interrupts = <17 IRQ_TYPE_LEVEL_HIGH 6>; clocks = <&hsmc_clk>; #address-cells = <1>; #size-cells = <1>; ranges; - pmecc: ecc-engine@ffffc070 { + pmecc: ecc-engine@f8014070 { compatible = "atmel,sama5d2-pmecc"; - reg = <0xffffc070 0x490>, - <0xffffc500 
0x100>; + reg = <0xf8014070 0x490>, + <0xf8014500 0x100>; }; }; From 29178c1473fd4ad9c523b41bb18a047749c66d11 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Jun 2017 18:10:28 +0200 Subject: [PATCH 010/154] ARC: defconfig: Cleanup from old Kconfig options Remove old, dead Kconfig option INET_LRO. It is gone since commit 7bbf3cae65b6 ("ipv4: Remove inet_lro library"). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Vineet Gupta --- arch/arc/configs/nps_defconfig | 1 - arch/arc/configs/tb10x_defconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig index ede625c7621661..7c9c706ae7f66e 100644 --- a/arch/arc/configs/nps_defconfig +++ b/arch/arc/configs/nps_defconfig @@ -39,7 +39,6 @@ CONFIG_IP_PNP=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_LRO is not set # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set # CONFIG_WIRELESS is not set diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index 4c5118384eb5f1..f3018254939508 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -38,7 +38,6 @@ CONFIG_IP_MULTICAST=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_LRO is not set # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set # CONFIG_WIRELESS is not set From 293b915fd9bebf33cdc906516fb28d54649a25ac Mon Sep 17 00:00:00 2001 From: Oscar Campos Date: Tue, 18 Jul 2017 17:20:36 -0700 Subject: [PATCH 011/154] Input: trackpoint - assume 3 buttons when buttons detection fails Trackpoint buttons detection fails on ThinkPad 570 and 470 series; this causes the middle button of the trackpoint not to be recognized. 
As I don't believe there is any trackpoint with less than 3 buttons this patch just assumes three buttons when the extended button information read fails. Signed-off-by: Oscar Campos Acked-by: Peter Hutterer Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/trackpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index 922ea02edcc3ef..20b5b21c1bba88 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -380,8 +380,8 @@ int trackpoint_detect(struct psmouse *psmouse, bool set_properties) return 0; if (trackpoint_read(ps2dev, TP_EXT_BTN, &button_info)) { - psmouse_warn(psmouse, "failed to get extended button data\n"); - button_info = 0; + psmouse_warn(psmouse, "failed to get extended button data, assuming 3 buttons\n"); + button_info = 0x33; } psmouse->private = kzalloc(sizeof(struct trackpoint_data), GFP_KERNEL); From d1ce263feb40e6b3208f3e1ebec6dbe86df6f522 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 12 Jul 2017 15:25:09 +0200 Subject: [PATCH 012/154] irqchip/gic-v3-its: Remove ACPICA version check for ACPI NUMA The version check was added due to dependency to a618c7f89a02 ACPICA: Add support for new SRAT subtable Now, that this code is in the kernel, remove the check. This is esp. useful to enable backports. 
Signed-off-by: Robert Richter Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fed99c55e2f430..3bfbf8d96a0e1d 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1835,7 +1835,7 @@ static int __init its_of_probe(struct device_node *node) #define ACPI_GICV3_ITS_MEM_SIZE (SZ_128K) -#if defined(CONFIG_ACPI_NUMA) && (ACPI_CA_VERSION >= 0x20170531) +#ifdef CONFIG_ACPI_NUMA struct its_srat_map { /* numa node id */ u32 numa_node; From 39a06b67c2c1256bcf2361a1f67d2529f70ab206 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 18 Jul 2017 18:37:55 +0100 Subject: [PATCH 013/154] irqchip/gic: Ensure we have an ISB between ack and ->handle_irq Devices that expose their interrupt status registers via system registers (e.g. Statistical profiling, CPU PMU, DynamIQ PMU, arch timer, vgic (although unused by Linux), ...) rely on a context synchronising operation on the CPU to ensure that the updated status register is visible to the CPU when handling the interrupt. This usually happens as a result of taking the IRQ exception in the first place, but there are two race scenarios where this isn't the case. For example, let's say we have two peripherals (X and Y), where Y uses a system register for its interrupt status. Case 1: 1. CPU takes an IRQ exception as a result of X raising an interrupt 2. Y then raises its interrupt line, but the update to its system register is not yet visible to the CPU 3. The GIC decides to expose Y's interrupt number first in the Ack register 4. The CPU runs the IRQ handler for Y, but the status register is stale Case 2: 1. CPU takes an IRQ exception as a result of X raising an interrupt 2. CPU reads the interrupt number for X from the Ack register and runs its IRQ handler 3. 
Y raises its interrupt line and the Ack register is updated, but again, the update to its system register is not yet visible to the CPU. 4. Since the GIC drivers poll the Ack register, we read Y's interrupt number and run its handler without a context synchronisation operation, therefore seeing the stale register value. In either case, we run the risk of missing an IRQ. This patch solves the problem by ensuring that we execute an ISB in the GIC drivers prior to invoking the interrupt handler. This is already the case for GICv3 and EOIMode 1 (the usual case for the host). Cc: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 2 ++ drivers/irqchip/irq-gic.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 5ba64a7584a3ee..984c3ecfd22c21 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -353,6 +353,8 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs if (static_key_true(&supports_deactivate)) gic_write_eoir(irqnr); + else + isb(); err = handle_domain_irq(gic_data.domain, irqnr, regs); if (err) { diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 940c1627875866..d3e7c43718b82b 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -361,6 +361,7 @@ static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs) if (likely(irqnr > 15 && irqnr < 1020)) { if (static_key_true(&supports_deactivate)) writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI); + isb(); handle_domain_irq(gic->domain, irqnr, regs); continue; } @@ -401,10 +402,12 @@ static void gic_handle_cascade_irq(struct irq_desc *desc) goto out; cascade_irq = irq_find_mapping(chip_data->domain, gic_irq); - if (unlikely(gic_irq < 32 || gic_irq > 1020)) + if (unlikely(gic_irq < 32 || gic_irq > 1020)) { handle_bad_irq(desc); - else + } else { + isb(); 
generic_handle_irq(cascade_irq); + } out: chained_irq_exit(chip, desc); From f862b31514bad66e48d9d4ff6036ee051cf36a6f Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Mon, 26 Jun 2017 14:47:25 +0300 Subject: [PATCH 014/154] ARC: [plat-axs10x]: prepare dts files for enabling PAE40 on axs103 Enable 64-bit addressing where it is needed, to make it possible to enable PAE40 on axs103. This patch doesn't affect any functionality. Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/axc001.dtsi | 20 +++++++++----------- arch/arc/boot/dts/axc003.dtsi | 21 ++++++++++----------- arch/arc/boot/dts/axc003_idu.dtsi | 21 ++++++++++----------- arch/arc/boot/dts/axs10x_mb.dtsi | 2 +- 4 files changed, 30 insertions(+), 34 deletions(-) diff --git a/arch/arc/boot/dts/axc001.dtsi b/arch/arc/boot/dts/axc001.dtsi index 53ce226f77a598..a380ffa1a4589b 100644 --- a/arch/arc/boot/dts/axc001.dtsi +++ b/arch/arc/boot/dts/axc001.dtsi @@ -15,15 +15,15 @@ / { compatible = "snps,arc"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; cpu_card { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; core_clk: core_clk { #clock-cells = <0>; @@ -91,23 +91,21 @@ mb_intc: dw-apb-ictl@0xe0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; - reg = < 0xe0012000 0x200 >; + reg = < 0x0 0xe0012000 0x0 0x200 >; interrupt-controller; interrupt-parent = <&core_intc>; interrupts = < 7 >; }; memory { - #address-cells = <1>; - #size-cells = <1>; - ranges = <0x00000000 0x80000000 0x20000000>; device_type = "memory"; - reg = <0x80000000 0x1b000000>; /* (512 - 32) MiB */ + /* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */ + reg = <0x0 0x80000000 0x0 0x1b000000>; /* (512 - 32) MiB */ }; reserved-memory { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; ranges; /* * We 
just move frame buffer area to the very end of @@ -118,7 +116,7 @@ */ frame_buffer: frame_buffer@9e000000 { compatible = "shared-dma-pool"; - reg = <0x9e000000 0x2000000>; + reg = <0x0 0x9e000000 0x0 0x2000000>; no-map; }; }; diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index 14df46f141bf34..cc9239ef8d08c9 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -14,15 +14,15 @@ / { compatible = "snps,arc"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; cpu_card { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; core_clk: core_clk { #clock-cells = <0>; @@ -94,30 +94,29 @@ mb_intc: dw-apb-ictl@0xe0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; - reg = < 0xe0012000 0x200 >; + reg = < 0x0 0xe0012000 0x0 0x200 >; interrupt-controller; interrupt-parent = <&core_intc>; interrupts = < 24 >; }; memory { - #address-cells = <1>; - #size-cells = <1>; - ranges = <0x00000000 0x80000000 0x40000000>; device_type = "memory"; - reg = <0x80000000 0x20000000>; /* 512MiB */ + /* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */ + reg = <0x0 0x80000000 0x0 0x20000000 /* 512 MiB low mem */ + 0x1 0xc0000000 0x0 0x40000000>; /* 1 GiB highmem */ }; reserved-memory { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; ranges; /* * Move frame buffer out of IOC aperture (0x8z-0xAz). 
*/ frame_buffer: frame_buffer@be000000 { compatible = "shared-dma-pool"; - reg = <0xbe000000 0x2000000>; + reg = <0x0 0xbe000000 0x0 0x2000000>; no-map; }; }; diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi index 695f9fa1996bcb..4ebb2170abecc7 100644 --- a/arch/arc/boot/dts/axc003_idu.dtsi +++ b/arch/arc/boot/dts/axc003_idu.dtsi @@ -14,15 +14,15 @@ / { compatible = "snps,arc"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; cpu_card { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; core_clk: core_clk { #clock-cells = <0>; @@ -100,30 +100,29 @@ mb_intc: dw-apb-ictl@0xe0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; - reg = < 0xe0012000 0x200 >; + reg = < 0x0 0xe0012000 0x0 0x200 >; interrupt-controller; interrupt-parent = <&idu_intc>; interrupts = <0>; }; memory { - #address-cells = <1>; - #size-cells = <1>; - ranges = <0x00000000 0x80000000 0x40000000>; device_type = "memory"; - reg = <0x80000000 0x20000000>; /* 512MiB */ + /* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */ + reg = <0x0 0x80000000 0x0 0x20000000 /* 512 MiB low mem */ + 0x1 0xc0000000 0x0 0x40000000>; /* 1 GiB highmem */ }; reserved-memory { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; ranges; /* * Move frame buffer out of IOC aperture (0x8z-0xAz). 
*/ frame_buffer: frame_buffer@be000000 { compatible = "shared-dma-pool"; - reg = <0xbe000000 0x2000000>; + reg = <0x0 0xbe000000 0x0 0x2000000>; no-map; }; }; diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi index 41cfb29b62c142..0ff7e07edcd4d2 100644 --- a/arch/arc/boot/dts/axs10x_mb.dtsi +++ b/arch/arc/boot/dts/axs10x_mb.dtsi @@ -13,7 +13,7 @@ compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; - ranges = <0x00000000 0xe0000000 0x10000000>; + ranges = <0x00000000 0x0 0xe0000000 0x10000000>; interrupt-parent = <&mb_intc>; i2sclk: i2sclk@100a0 { From 33460f86ad2c982f3172a10b17948ccaf923f07f Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 28 Jul 2017 16:53:50 +0530 Subject: [PATCH 015/154] ARC: [plat-sim] Include this platform unconditionally Essentially remove CONFIG_ARC_PLAT_SIM There is no need for any platform specific code, just the board DTS match strings which we can include unconditionally Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 1 - arch/arc/Makefile | 2 +- arch/arc/configs/haps_hs_defconfig | 1 - arch/arc/configs/haps_hs_smp_defconfig | 1 - arch/arc/configs/nsim_700_defconfig | 1 - arch/arc/configs/nsim_hs_defconfig | 1 - arch/arc/configs/nsim_hs_smp_defconfig | 1 - arch/arc/configs/nsimosci_defconfig | 1 - arch/arc/configs/nsimosci_hs_defconfig | 1 - arch/arc/configs/nsimosci_hs_smp_defconfig | 1 - arch/arc/plat-sim/Kconfig | 13 ------------- arch/arc/plat-sim/platform.c | 5 ++++- 12 files changed, 5 insertions(+), 24 deletions(-) delete mode 100644 arch/arc/plat-sim/Kconfig diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index a5459698f0ee37..7db85ab00c5223 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -96,7 +96,6 @@ menu "ARC Architecture Configuration" menu "ARC Platform/SoC/Board" -source "arch/arc/plat-sim/Kconfig" source "arch/arc/plat-tb10x/Kconfig" source "arch/arc/plat-axs10x/Kconfig" #New platform adds here diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 
44ef35d3395646..3a61cfcc38c0dd 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -107,7 +107,7 @@ core-y += arch/arc/ # w/o this dtb won't embed into kernel binary core-y += arch/arc/boot/dts/ -core-$(CONFIG_ARC_PLAT_SIM) += arch/arc/plat-sim/ +core-y += arch/arc/plat-sim/ core-$(CONFIG_ARC_PLAT_TB10X) += arch/arc/plat-tb10x/ core-$(CONFIG_ARC_PLAT_AXS10X) += arch/arc/plat-axs10x/ core-$(CONFIG_ARC_PLAT_EZNPS) += arch/arc/plat-eznps/ diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig index 57b3e599322f83..db04ea4dd2d972 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -21,7 +21,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs" CONFIG_PREEMPT=y diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig index f85985adebb245..821a2e562f3f12 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_SMP=y CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs_idu" diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig index b0066a749d4c49..6dff83a238b859 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ARC_BUILTIN_DTB_NAME="nsim_700" CONFIG_PREEMPT=y # CONFIG_COMPACTION is not set diff --git a/arch/arc/configs/nsim_hs_defconfig b/arch/arc/configs/nsim_hs_defconfig index ebe9ebb9293330..31ee51b987e7c5 100644 --- a/arch/arc/configs/nsim_hs_defconfig +++ 
b/arch/arc/configs/nsim_hs_defconfig @@ -26,7 +26,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs" CONFIG_PREEMPT=y diff --git a/arch/arc/configs/nsim_hs_smp_defconfig b/arch/arc/configs/nsim_hs_smp_defconfig index 4bde43278be675..8d3b1f67cae421 100644 --- a/arch/arc/configs/nsim_hs_smp_defconfig +++ b/arch/arc/configs/nsim_hs_smp_defconfig @@ -24,7 +24,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_SMP=y CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs_idu" diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index f6fb3d26557eb7..6168ce2ac2efdd 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci" # CONFIG_COMPACTION is not set CONFIG_NET=y diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig index b9f0fe00044b6c..a70bdeb2b3fd03 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci_hs" # CONFIG_COMPACTION is not set diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig index 155add7761ed63..ef96406c446e82 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -18,7 +18,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set 
# CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_SMP=y # CONFIG_ARC_TIMERS_64BIT is not set diff --git a/arch/arc/plat-sim/Kconfig b/arch/arc/plat-sim/Kconfig deleted file mode 100644 index ac6af96a82f320..00000000000000 --- a/arch/arc/plat-sim/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -# -# Copyright (C) 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. -# - -menuconfig ARC_PLAT_SIM - bool "ARC nSIM based simulation virtual platforms" - help - Support for nSIM based ARC simulation platforms - This includes the standalone nSIM (uart only) vs. System C OSCI VP diff --git a/arch/arc/plat-sim/platform.c b/arch/arc/plat-sim/platform.c index aea87389e44bd5..5cda56b1a2ead7 100644 --- a/arch/arc/plat-sim/platform.c +++ b/arch/arc/plat-sim/platform.c @@ -20,11 +20,14 @@ */ static const char *simulation_compat[] __initconst = { +#ifdef CONFIG_ISA_ARCOMPACT "snps,nsim", - "snps,nsim_hs", "snps,nsimosci", +#else + "snps,nsim_hs", "snps,nsimosci_hs", "snps,zebu_hs", +#endif NULL, }; From b37174d95b0251611a80ef60abf03752e9d66d67 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Fri, 7 Jul 2017 12:25:14 +0300 Subject: [PATCH 016/154] ARCv2: SLC: Make sure busy bit is set properly for region ops c70c473396cb "ARCv2: SLC: Make sure busy bit is set properly on SLC flushing" fixes problem for entire SLC operation where the problem was initially caught. But given a nature of the issue it is perfectly possible for busy bit to be read incorrectly even when region operation was started. So extending initial fix for regional operation as well. 
Signed-off-by: Alexey Brodkin Cc: stable@vger.kernel.org #4.10 Signed-off-by: Vineet Gupta --- arch/arc/mm/cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index a867575a758b98..bebc24cb791266 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -697,6 +697,9 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op) write_aux_reg(ARC_REG_SLC_RGN_END, (paddr + sz + l2_line_sz - 1)); write_aux_reg(ARC_REG_SLC_RGN_START, paddr); + /* Make sure "busy" bit reports correct stataus, see STAR 9001165532 */ + read_aux_reg(ARC_REG_SLC_CTRL); + while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY); spin_unlock_irqrestore(&lock, flags); From 2e332fec2f2c996f8d5447b0946ca43bb0ae4b42 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 18 Jul 2017 12:14:09 -0700 Subject: [PATCH 017/154] ARC: dma: implement dma_unmap_page and sg variant Signed-off-by: Vineet Gupta --- arch/arc/mm/dma.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 2a07e6ecafbd76..1d0326d874e741 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -153,6 +153,19 @@ static void _dma_cache_sync(phys_addr_t paddr, size_t size, } } +/* + * arc_dma_map_page - map a portion of a page for streaming DMA + * + * Ensure that any data held in the cache is appropriately discarded + * or written back. + * + * The device owns this memory once this call has completed. The CPU + * can regain ownership by calling dma_unmap_page(). 
+ * + * Note: while it takes struct page as arg, caller can "abuse" it to pass + * a region larger than PAGE_SIZE, provided it is physically contiguous + * and this still works correctly + */ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -165,6 +178,24 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page, return plat_phys_to_dma(dev, paddr); } +/* + * arc_dma_unmap_page - unmap a buffer previously mapped through dma_map_page() + * + * After this call, reads by the CPU to the buffer are guaranteed to see + * whatever the device wrote there. + * + * Note: historically this routine was not implemented for ARC + */ +static void arc_dma_unmap_page(struct device *dev, dma_addr_t handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t paddr = plat_dma_to_phys(dev, handle); + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + _dma_cache_sync(paddr, size, dir); +} + static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { @@ -178,6 +209,18 @@ static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg, return nents; } +static void arc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) + arc_dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir, + attrs); +} + static void arc_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) { @@ -223,7 +266,9 @@ const struct dma_map_ops arc_dma_ops = { .free = arc_dma_free, .mmap = arc_dma_mmap, .map_page = arc_dma_map_page, + .unmap_page = arc_dma_unmap_page, .map_sg = arc_dma_map_sg, + .unmap_sg = arc_dma_unmap_sg, .sync_single_for_device = arc_dma_sync_single_for_device, 
.sync_single_for_cpu = arc_dma_sync_single_for_cpu, .sync_sg_for_cpu = arc_dma_sync_sg_for_cpu, From 7d79cee2c6540ea64dd917a14e2fd63d4ac3d3c0 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Tue, 1 Aug 2017 12:58:47 +0300 Subject: [PATCH 018/154] ARCv2: PAE40: Explicitly set MSB counterpart of SLC region ops addresses It is necessary to explicitly set both SLC_AUX_RGN_START1 and SLC_AUX_RGN_END1 which hold MSB bits of the physical address correspondingly of region start and end otherwise SLC region operation is executed in unpredictable manner Without this patch, SLC flushes on HSDK (IOC disabled) were taking seconds. Cc: stable@vger.kernel.org #4.4+ Reported-by: Vladimir Kondratiev Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta [vgupta: PAR40 regs only written if PAE40 exist] --- arch/arc/include/asm/cache.h | 2 ++ arch/arc/mm/cache.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index 19ebddffb279db..02fd1cece6ef33 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -96,7 +96,9 @@ extern unsigned long perip_base, perip_end; #define ARC_REG_SLC_FLUSH 0x904 #define ARC_REG_SLC_INVALIDATE 0x905 #define ARC_REG_SLC_RGN_START 0x914 +#define ARC_REG_SLC_RGN_START1 0x915 #define ARC_REG_SLC_RGN_END 0x916 +#define ARC_REG_SLC_RGN_END1 0x917 /* Bit val in SLC_CONTROL */ #define SLC_CTRL_DIS 0x001 diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index bebc24cb791266..874913b3e82655 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -665,6 +665,7 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op) static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned int ctrl; + phys_addr_t end; spin_lock_irqsave(&lock, flags); @@ -694,8 +695,16 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op) * END needs to be setup before START (latter triggers the operation) * END can't be same as 
START, so add (l2_line_sz - 1) to sz */ - write_aux_reg(ARC_REG_SLC_RGN_END, (paddr + sz + l2_line_sz - 1)); - write_aux_reg(ARC_REG_SLC_RGN_START, paddr); + end = paddr + sz + l2_line_sz - 1; + if (is_pae40_enabled()) + write_aux_reg(ARC_REG_SLC_RGN_END1, upper_32_bits(end)); + + write_aux_reg(ARC_REG_SLC_RGN_END, lower_32_bits(end)); + + if (is_pae40_enabled()) + write_aux_reg(ARC_REG_SLC_RGN_START1, upper_32_bits(paddr)); + + write_aux_reg(ARC_REG_SLC_RGN_START, lower_32_bits(paddr)); /* Make sure "busy" bit reports correct stataus, see STAR 9001165532 */ read_aux_reg(ARC_REG_SLC_CTRL); From b5ddb6d54729d814356937572d6c9b599f10c29f Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 3 Aug 2017 17:45:44 +0530 Subject: [PATCH 019/154] ARCv2: PAE40: set MSB even if !CONFIG_ARC_HAS_PAE40 but PAE exists in SoC PAE40 configuration in hardware extends some of the address registers for TLB/cache ops to 2 words. So far kernel was NOT setting the higher word if feature was not enabled in software which is wrong. Those need to be set to 0 in such case. Normally this would be done in the cache flush / tlb ops, however since these registers only exist conditionally, this would have to be conditional to a flag being set on boot which is expensive/ugly - especially for the more common case of PAE exists but not in use.
Optimize that by zero'ing them once at boot - nobody will write to them afterwards Cc: stable@vger.kernel.org #4.4+ Signed-off-by: Vineet Gupta --- arch/arc/include/asm/mmu.h | 2 ++ arch/arc/mm/cache.c | 34 ++++++++++++++++++++++++++++------ arch/arc/mm/tlb.c | 12 +++++++++++- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/arch/arc/include/asm/mmu.h b/arch/arc/include/asm/mmu.h index db7319e9b506e9..efb79fafff1d12 100644 --- a/arch/arc/include/asm/mmu.h +++ b/arch/arc/include/asm/mmu.h @@ -94,6 +94,8 @@ static inline int is_pae40_enabled(void) return IS_ENABLED(CONFIG_ARC_HAS_PAE40); } +extern int pae40_exist_but_not_enab(void); + #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 874913b3e82655..7db283b46ebde8 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -1123,6 +1123,13 @@ noinline void __init arc_ioc_setup(void) __dc_enable(); } +/* + * Cache related boot time checks/setups only needed on master CPU: + * - Geometry checks (kernel build and hardware agree: e.g. L1_CACHE_BYTES) + * Assume SMP only, so all cores will have same cache config. A check on + * one core suffices for all + * - IOC setup / dma callbacks only need to be done once + */ void __init arc_cache_init_master(void) { unsigned int __maybe_unused cpu = smp_processor_id(); @@ -1202,12 +1209,27 @@ void __ref arc_cache_init(void) printk(arc_cache_mumbojumbo(0, str, sizeof(str))); - /* - * Only master CPU needs to execute rest of function: - * - Assume SMP so all cores will have same cache config so - * any geomtry checks will be same for all - * - IOC setup / dma callbacks only need to be setup once - */ if (!cpu) arc_cache_init_master(); + + /* + * In PAE regime, TLB and cache maintenance ops take wider addresses + * And even if PAE is not enabled in kernel, the upper 32-bits still need + * to be zeroed to keep the ops sane. 
+ * As an optimization for more common !PAE enabled case, zero them out + * once at init, rather than checking/setting to 0 for every runtime op + */ + if (is_isa_arcv2() && pae40_exist_but_not_enab()) { + + if (IS_ENABLED(CONFIG_ARC_HAS_ICACHE)) + write_aux_reg(ARC_REG_IC_PTAG_HI, 0); + + if (IS_ENABLED(CONFIG_ARC_HAS_DCACHE)) + write_aux_reg(ARC_REG_DC_PTAG_HI, 0); + + if (l2_line_sz) { + write_aux_reg(ARC_REG_SLC_RGN_END1, 0); + write_aux_reg(ARC_REG_SLC_RGN_START1, 0); + } + } } diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index d0126fdfe2d854..b181f3ee38aab5 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -104,6 +104,8 @@ /* A copy of the ASID from the PID reg is kept in asid_cache */ DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE; +static int __read_mostly pae_exists; + /* * Utility Routine to erase a J-TLB entry * Caller needs to setup Index Reg (manually or via getIndex) @@ -784,7 +786,7 @@ void read_decode_mmu_bcr(void) mmu->u_dtlb = mmu4->u_dtlb * 4; mmu->u_itlb = mmu4->u_itlb * 4; mmu->sasid = mmu4->sasid; - mmu->pae = mmu4->pae; + pae_exists = mmu->pae = mmu4->pae; } } @@ -809,6 +811,11 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len) return buf; } +int pae40_exist_but_not_enab(void) +{ + return pae_exists && !is_pae40_enabled(); +} + void arc_mmu_init(void) { char str[256]; @@ -859,6 +866,9 @@ void arc_mmu_init(void) /* swapper_pg_dir is the pgd for the kernel, used by vmalloc */ write_aux_reg(ARC_REG_SCRATCH_DATA0, swapper_pg_dir); #endif + + if (pae40_exist_but_not_enab()) + write_aux_reg(ARC_REG_TLBPD1HI, 0); } /* From 9e01e2d56db23485a75864b6aeee8e443f024ddb Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Wed, 2 Aug 2017 12:51:29 -0700 Subject: [PATCH 020/154] soc: imx: gpcv2: fix regulator deferred probe If a regulator requests a deferred probe, the power domain gets initialized twice. 
This leads to a list double add (without list debugging the kernel hangs due to the double add later): WARNING: CPU: 0 PID: 19 at lib/list_debug.c:31 __list_add_valid+0xbc/0xc4 list_add double add: new=c1229754, prev=c12383b4, next=c1229754. Initialize the power domain after we get the regulator. Also do not print an error in case the regulator defers probing. Cc: Fabio Estevam Cc: Andrey Smirnov Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Fixes: 03aa12629fc4 ("soc: imx: Add GPCv2 power gating driver") Signed-off-by: Stefan Agner Acked-by: Andrey Smirnov Tested-by: Andrey Smirnov Signed-off-by: Shawn Guo --- drivers/soc/imx/gpcv2.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/soc/imx/gpcv2.c b/drivers/soc/imx/gpcv2.c index 3039072911a5bc..afc7ecc3c18761 100644 --- a/drivers/soc/imx/gpcv2.c +++ b/drivers/soc/imx/gpcv2.c @@ -200,16 +200,11 @@ static int imx7_pgc_domain_probe(struct platform_device *pdev) domain->dev = &pdev->dev; - ret = pm_genpd_init(&domain->genpd, NULL, true); - if (ret) { - dev_err(domain->dev, "Failed to init power domain\n"); - return ret; - } - domain->regulator = devm_regulator_get_optional(domain->dev, "power"); if (IS_ERR(domain->regulator)) { if (PTR_ERR(domain->regulator) != -ENODEV) { - dev_err(domain->dev, "Failed to get domain's regulator\n"); + if (PTR_ERR(domain->regulator) != -EPROBE_DEFER) + dev_err(domain->dev, "Failed to get domain's regulator\n"); return PTR_ERR(domain->regulator); } } else { @@ -217,6 +212,12 @@ static int imx7_pgc_domain_probe(struct platform_device *pdev) domain->voltage, domain->voltage); } + ret = pm_genpd_init(&domain->genpd, NULL, true); + if (ret) { + dev_err(domain->dev, "Failed to init power domain\n"); + return ret; + } + ret = of_genpd_add_provider_simple(domain->dev->of_node, &domain->genpd); if (ret) { From 8317562097acec4c9e9750eb91115687931bca35 Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Wed, 2 Aug 2017 22:06:11 
+0200 Subject: [PATCH 021/154] ARM: dts: i.MX25: add ranges to tscadc Add a ranges; line to the tscadc node. This creates a 1:1 mapping between the addresses used by tscadc and those in its child nodes (adc, tsc). Without such a mapping, the reg = ... lines in the tsc and adc nodes do not create a resource. Probing the fsl-imx25-tcq and fsl-imx25-tsadc drivers will then fail since there's no IORESOURCE_MEM. Signed-off-by: Martin Kaiser Fixes: 92f651f39b42 ("ARM: dts: imx25: Add TSC and ADC support") Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx25.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/imx25.dtsi b/arch/arm/boot/dts/imx25.dtsi index dfcc8e00cf1c53..0ade3619f3c3f1 100644 --- a/arch/arm/boot/dts/imx25.dtsi +++ b/arch/arm/boot/dts/imx25.dtsi @@ -297,6 +297,7 @@ #address-cells = <1>; #size-cells = <1>; status = "disabled"; + ranges; adc: adc@50030800 { compatible = "fsl,imx25-gcq"; From 8cd7b51ff57c74260b20c97623b0e0d420c22be8 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 2 Aug 2017 10:26:58 +0000 Subject: [PATCH 022/154] arm64: renesas: salvator-common: avoid audio_clkout naming conflict clock name of "audio_clkout" is used by Renesas sound driver. This duplicated naming breaks its clock registering/unregistering. Especially, when unbind/bind it can't handle clkout correctly. This patch renames "audio_clkout" to "audio-clkout" to avoid naming conflict. 
Fixes: 8a8f181d2cfd ("arm64: renesas: salvator-x: use CS2000 as AUDIO_CLK_B") Signed-off-by: Kuninori Morimoto Signed-off-by: Simon Horman --- arch/arm64/boot/dts/renesas/salvator-common.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/renesas/salvator-common.dtsi b/arch/arm64/boot/dts/renesas/salvator-common.dtsi index a451996f590a51..f903957da504a9 100644 --- a/arch/arm64/boot/dts/renesas/salvator-common.dtsi +++ b/arch/arm64/boot/dts/renesas/salvator-common.dtsi @@ -45,7 +45,7 @@ stdout-path = "serial0:115200n8"; }; - audio_clkout: audio_clkout { + audio_clkout: audio-clkout { /* * This is same as <&rcar_sound 0> * but needed to avoid cs2000/rcar_sound probe dead-lock From c017d21147848fe017772764a77a7f32c5b017f9 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 27 Jul 2017 15:38:17 -0700 Subject: [PATCH 023/154] irqchip: brcmstb-l2: Define an irq_pm_shutdown function The Broadcom STB platforms support S5 and we allow specific hardware wake-up events to take us out of this state. Because we were not defining an irq_pm_shutdown() function pointer, we would not be correctly masking non-wakeup events, which would result in spurious wake-ups from sources that were not explicitly configured for wake-up. 
Fixes: 7f646e92766e ("irqchip: brcmstb-l2: Add Broadcom Set Top Box Level-2 interrupt controller") Acked-by: Gregory Fong Signed-off-by: Florian Fainelli Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-brcmstb-l2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index bddf169c4b37b7..b009b916a29235 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -189,6 +189,7 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, ct->chip.irq_suspend = brcmstb_l2_intc_suspend; ct->chip.irq_resume = brcmstb_l2_intc_resume; + ct->chip.irq_pm_shutdown = brcmstb_l2_intc_suspend; if (data->can_wake) { /* This IRQ chip can wake the system, set all child interrupts From 41e327b586762833e48b3703d53312ac32f05f24 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Mon, 7 Aug 2017 21:35:05 +0800 Subject: [PATCH 024/154] quota: correct space limit check Currently we compare total space (curspace + rsvspace) with space limit in quota-tools when setting grace time and also in check_bdq(), but we are missing rsvspace in other places; correct them. This patch also fixes the incorrect zeroing of dqb_btime and the grace time update failure when rsvspace is in use (e.g. the ext4 dalloc feature).
Signed-off-by: zhangyi (F) Signed-off-by: Jan Kara --- fs/quota/dquot.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 53a17496c5c536..566e6ef99f077c 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1124,6 +1124,10 @@ void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) WARN_ON_ONCE(1); dquot->dq_dqb.dqb_rsvspace = 0; } + if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= + dquot->dq_dqb.dqb_bsoftlimit) + dquot->dq_dqb.dqb_btime = (time64_t) 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) @@ -1145,7 +1149,8 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number) dquot->dq_dqb.dqb_curspace -= number; else dquot->dq_dqb.dqb_curspace = 0; - if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= + dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } @@ -1381,14 +1386,18 @@ static int info_idq_free(struct dquot *dquot, qsize_t inodes) static int info_bdq_free(struct dquot *dquot, qsize_t space) { + qsize_t tspace; + + tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace; + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || - dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + tspace <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_NOWARN; - if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit) + if (tspace - space <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_BSOFTBELOW; - if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit && - dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit) + if (tspace >= dquot->dq_dqb.dqb_bhardlimit && + tspace - space < dquot->dq_dqb.dqb_bhardlimit) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } @@ -2681,7 +2690,7 @@ static int do_set_dqblk(struct 
dquot *dquot, struct qc_dqblk *di) if (check_blim) { if (!dm->dqb_bsoftlimit || - dm->dqb_curspace < dm->dqb_bsoftlimit) { + dm->dqb_curspace + dm->dqb_rsvspace < dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_SPC_TIMER)) From be37aa4b993bd5d4191f76a7bd43be33f987b972 Mon Sep 17 00:00:00 2001 From: Michael Hernandez Date: Mon, 31 Jul 2017 14:45:10 -0700 Subject: [PATCH 025/154] scsi: qla2xxx: Fix system crash while triggering FW dump This patch fixes system hang/crash while firmware dump is attempted with Block MQ enabled in qla2xxx driver. Fix is to remove check in fw dump template entries for existing request and response queues so that full buffer size is calculated during template size calculation. Following stack trace is seen during firmware dump capture process [ 694.390588] qla2xxx [0000:81:00.0]-5003:11: ISP System Error - mbx1=4b1fh mbx2=10h mbx3=2ah mbx7=0h. [ 694.402336] BUG: unable to handle kernel paging request at ffffc90008c7b000 [ 694.402372] IP: memcpy_erms+0x6/0x10 [ 694.402386] PGD 105f01a067 [ 694.402386] PUD 85f89c067 [ 694.402398] PMD 10490cb067 [ 694.402409] PTE 0 [ 694.402421] [ 694.402437] Oops: 0002 [#1] PREEMPT SMP [ 694.402452] Modules linked in: netconsole configfs qla2xxx scsi_transport_fc nvme_fc nvme_fabrics bnep bluetooth rfkill xt_tcpudp unix_diag xt_multiport ip6table_filter ip6_tables iptable_filter ip_tables x_tables af_packet iscsi_ibft iscsi_boot_sysfs xfs libcrc32c ipmi_ssif sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass igb crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel iTCO_wdt aes_x86_64 crypto_simd ptp iTCO_vendor_support glue_helper cryptd lpc_ich joydev i2c_i801 pcspkr ioatdma mei_me pps_core tpm_tis mei mfd_core acpi_power_meter tpm_tis_core ipmi_si ipmi_devintf tpm ipmi_msghandler shpchp wmi dca button acpi_pad btrfs xor uas usb_storage hid_generic usbhid raid6_pq crc32c_intel ast 
i2c_algo_bit drm_kms_helper syscopyarea sysfillrect [ 694.402692] sysimgblt fb_sys_fops xhci_pci ttm ehci_pci sr_mod xhci_hcd cdrom ehci_hcd drm usbcore sg [ 694.402730] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.10.0-1-default+ #19 [ 694.402753] Hardware name: Supermicro X10DRi/X10DRi, BIOS 1.1a 10/16/2015 [ 694.402776] task: ffffffff81c0e4c0 task.stack: ffffffff81c00000 [ 694.402798] RIP: 0010:memcpy_erms+0x6/0x10 [ 694.402813] RSP: 0018:ffff88085fc03cd0 EFLAGS: 00210006 [ 694.402832] RAX: ffffc90008c7ae0c RBX: 0000000000000004 RCX: 000000000001fe0c [ 694.402856] RDX: 0000000000020000 RSI: ffff8810332c01f4 RDI: ffffc90008c7b000 [ 694.402879] RBP: ffff88085fc03d18 R08: 0000000000020000 R09: 0000000000279e0a [ 694.402903] R10: 0000000000000000 R11: f000000000000000 R12: ffff88085fc03d80 [ 694.402927] R13: ffffc90008a01000 R14: ffffc90008a056d4 R15: ffff881052ef17e0 [ 694.402951] FS: 0000000000000000(0000) GS:ffff88085fc00000(0000) knlGS:0000000000000000 [ 694.402977] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 694.403012] CR2: ffffc90008c7b000 CR3: 0000000001c09000 CR4: 00000000001406f0 [ 694.403036] Call Trace: [ 694.403047] [ 694.403072] ? qla27xx_fwdt_entry_t263+0x18e/0x380 [qla2xxx] [ 694.403099] qla27xx_walk_template+0x9d/0x1a0 [qla2xxx] [ 694.403124] qla27xx_fwdump+0x1f3/0x272 [qla2xxx] [ 694.403149] qla2x00_async_event+0xb08/0x1a50 [qla2xxx] [ 694.403169] ? enqueue_task_fair+0xa2/0x9d0 Signed-off-by: Mike Hernandez Signed-off-by: Joe Carnuccio Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. 
Petersen --- drivers/scsi/qla2xxx/qla_tmpl.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_tmpl.c b/drivers/scsi/qla2xxx/qla_tmpl.c index 33142610882f4d..b18646d6057f47 100644 --- a/drivers/scsi/qla2xxx/qla_tmpl.c +++ b/drivers/scsi/qla2xxx/qla_tmpl.c @@ -401,9 +401,6 @@ qla27xx_fwdt_entry_t263(struct scsi_qla_host *vha, for (i = 0; i < vha->hw->max_req_queues; i++) { struct req_que *req = vha->hw->req_q_map[i]; - if (!test_bit(i, vha->hw->req_qid_map)) - continue; - if (req || !buf) { length = req ? req->length : REQUEST_ENTRY_CNT_24XX; @@ -418,9 +415,6 @@ qla27xx_fwdt_entry_t263(struct scsi_qla_host *vha, for (i = 0; i < vha->hw->max_rsp_queues; i++) { struct rsp_que *rsp = vha->hw->rsp_q_map[i]; - if (!test_bit(i, vha->hw->rsp_qid_map)) - continue; - if (rsp || !buf) { length = rsp ? rsp->length : RESPONSE_ENTRY_CNT_MQ; @@ -660,9 +654,6 @@ qla27xx_fwdt_entry_t274(struct scsi_qla_host *vha, for (i = 0; i < vha->hw->max_req_queues; i++) { struct req_que *req = vha->hw->req_q_map[i]; - if (!test_bit(i, vha->hw->req_qid_map)) - continue; - if (req || !buf) { qla27xx_insert16(i, buf, len); qla27xx_insert16(1, buf, len); @@ -675,9 +666,6 @@ qla27xx_fwdt_entry_t274(struct scsi_qla_host *vha, for (i = 0; i < vha->hw->max_rsp_queues; i++) { struct rsp_que *rsp = vha->hw->rsp_q_map[i]; - if (!test_bit(i, vha->hw->rsp_qid_map)) - continue; - if (rsp || !buf) { qla27xx_insert16(i, buf, len); qla27xx_insert16(1, buf, len); From 180efde0a3f43dbe533e4be203c2918793482d4e Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Tue, 1 Aug 2017 14:42:54 +0200 Subject: [PATCH 026/154] scsi: st: fix blk_get_queue usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If blk_queue_get() in st_probe fails, disk->queue must not be set to SDp->request_queue, as that would result in put_disk() dropping a not taken reference. Thus, disk->queue should be set only after a successful blk_queue_get(). 
Fixes: 2b5bebccd282 ("st: Take additional queue ref in st_probe") Signed-off-by: Bodo Stroesser Acked-by: Shirish Pargaonkar Signed-off-by: Hannes Reinecke Reviewed-by: Ewan D. Milne Acked-by: Kai Mäkisara Signed-off-by: Martin K. Petersen --- drivers/scsi/st.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 8e5013d9cad445..94e402ed30f6ae 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4299,11 +4299,11 @@ static int st_probe(struct device *dev) kref_init(&tpnt->kref); tpnt->disk = disk; disk->private_data = &tpnt->driver; - disk->queue = SDp->request_queue; /* SCSI tape doesn't register this gendisk via add_disk(). Manually * take queue reference that release_disk() expects. */ - if (!blk_get_queue(disk->queue)) + if (!blk_get_queue(SDp->request_queue)) goto out_put_disk; + disk->queue = SDp->request_queue; tpnt->driver = &st_template; tpnt->device = SDp; From b0e17a9b0df29590c45dfb296f541270a5941f41 Mon Sep 17 00:00:00 2001 From: Brian King Date: Tue, 1 Aug 2017 10:21:30 -0500 Subject: [PATCH 027/154] scsi: ipr: Fix scsi-mq lockdep issue Fixes the following lockdep warning that can occur when scsi-mq is enabled with ipr due to ipr calling scsi_unblock_requests from irq context. The fix is to move the call to scsi_unblock_requests to ipr's existing workqueue. 
stack backtrace: CPU: 28 PID: 0 Comm: swapper/28 Not tainted 4.13.0-rc2-gcc6x-gf74c89b #1 Call Trace: [c000001fffe97550] [c000000000b50818] dump_stack+0xe8/0x160 (unreliable) [c000001fffe97590] [c0000000001586d0] print_usage_bug+0x2d0/0x390 [c000001fffe97640] [c000000000158f34] mark_lock+0x7a4/0x8e0 [c000001fffe976f0] [c00000000015a000] __lock_acquire+0x6a0/0x1a70 [c000001fffe97860] [c00000000015befc] lock_acquire+0xec/0x2e0 [c000001fffe97930] [c000000000b71514] _raw_spin_lock+0x44/0x70 [c000001fffe97960] [c0000000005b60f4] blk_mq_sched_dispatch_requests+0xa4/0x2a0 [c000001fffe979c0] [c0000000005acac0] __blk_mq_run_hw_queue+0x100/0x2c0 [c000001fffe97a00] [c0000000005ad478] __blk_mq_delay_run_hw_queue+0x118/0x130 [c000001fffe97a40] [c0000000005ad61c] blk_mq_start_hw_queues+0x6c/0xa0 [c000001fffe97a80] [c000000000797aac] scsi_kick_queue+0x2c/0x60 [c000001fffe97aa0] [c000000000797cf0] scsi_run_queue+0x210/0x360 [c000001fffe97b10] [c00000000079b888] scsi_run_host_queues+0x48/0x80 [c000001fffe97b40] [c0000000007b6090] ipr_ioa_bringdown_done+0x70/0x1e0 [c000001fffe97bc0] [c0000000007bc860] ipr_reset_ioa_job+0x80/0xf0 [c000001fffe97bf0] [c0000000007b4d50] ipr_reset_timer_done+0xd0/0x100 [c000001fffe97c30] [c0000000001937bc] call_timer_fn+0xdc/0x4b0 [c000001fffe97cf0] [c000000000193d08] expire_timers+0x178/0x330 [c000001fffe97d60] [c0000000001940c8] run_timer_softirq+0xb8/0x120 [c000001fffe97de0] [c000000000b726a8] __do_softirq+0x168/0x6d8 [c000001fffe97ef0] [c0000000000df2c8] irq_exit+0x108/0x150 [c000001fffe97f10] [c000000000017bf4] __do_irq+0x2a4/0x4a0 [c000001fffe97f90] [c00000000002da50] call_do_irq+0x14/0x24 [c0000007fad93aa0] [c000000000017e8c] do_IRQ+0x9c/0x140 [c0000007fad93af0] [c000000000008b98] hardware_interrupt_common+0x138/0x140 Reported-by: Michael Ellerman Signed-off-by: Brian King Signed-off-by: Martin K. 
Petersen --- drivers/scsi/ipr.c | 33 +++++++++++++++++++-------------- drivers/scsi/ipr.h | 2 ++ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index b0c68d24db011b..da5bdbdcce5272 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -3351,6 +3351,16 @@ static void ipr_worker_thread(struct work_struct *work) return; } + if (ioa_cfg->scsi_unblock) { + ioa_cfg->scsi_unblock = 0; + ioa_cfg->scsi_blocked = 0; + spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); + scsi_unblock_requests(ioa_cfg->host); + spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags); + if (ioa_cfg->scsi_blocked) + scsi_block_requests(ioa_cfg->host); + } + if (!ioa_cfg->scan_enabled) { spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); return; @@ -7211,9 +7221,8 @@ static int ipr_ioa_bringdown_done(struct ipr_cmnd *ipr_cmd) ENTER; if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) { ipr_trace; - spin_unlock_irq(ioa_cfg->host->host_lock); - scsi_unblock_requests(ioa_cfg->host); - spin_lock_irq(ioa_cfg->host->host_lock); + ioa_cfg->scsi_unblock = 1; + schedule_work(&ioa_cfg->work_q); } ioa_cfg->in_reset_reload = 0; @@ -7287,13 +7296,7 @@ static int ipr_ioa_reset_done(struct ipr_cmnd *ipr_cmd) list_add_tail(&ipr_cmd->queue, &ipr_cmd->hrrq->hrrq_free_q); wake_up_all(&ioa_cfg->reset_wait_q); - spin_unlock(ioa_cfg->host->host_lock); - scsi_unblock_requests(ioa_cfg->host); - spin_lock(ioa_cfg->host->host_lock); - - if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].allow_cmds) - scsi_block_requests(ioa_cfg->host); - + ioa_cfg->scsi_unblock = 1; schedule_work(&ioa_cfg->work_q); LEAVE; return IPR_RC_JOB_RETURN; @@ -9249,8 +9252,11 @@ static void _ipr_initiate_ioa_reset(struct ipr_ioa_cfg *ioa_cfg, spin_unlock(&ioa_cfg->hrrq[i]._lock); } wmb(); - if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) + if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) { + ioa_cfg->scsi_unblock = 0; + ioa_cfg->scsi_blocked = 1; scsi_block_requests(ioa_cfg->host); + } 
ipr_cmd = ipr_get_free_ipr_cmnd(ioa_cfg); ioa_cfg->reset_cmd = ipr_cmd; @@ -9306,9 +9312,8 @@ static void ipr_initiate_ioa_reset(struct ipr_ioa_cfg *ioa_cfg, wake_up_all(&ioa_cfg->reset_wait_q); if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) { - spin_unlock_irq(ioa_cfg->host->host_lock); - scsi_unblock_requests(ioa_cfg->host); - spin_lock_irq(ioa_cfg->host->host_lock); + ioa_cfg->scsi_unblock = 1; + schedule_work(&ioa_cfg->work_q); } return; } else { diff --git a/drivers/scsi/ipr.h b/drivers/scsi/ipr.h index e98a87a653357b..c7f0e9e3cd7d49 100644 --- a/drivers/scsi/ipr.h +++ b/drivers/scsi/ipr.h @@ -1488,6 +1488,8 @@ struct ipr_ioa_cfg { u8 cfg_locked:1; u8 clear_isr:1; u8 probe_done:1; + u8 scsi_unblock:1; + u8 scsi_blocked:1; u8 revid; From 424f727b94132a4193af401dd823c44612d1d59f Mon Sep 17 00:00:00 2001 From: Brian King Date: Tue, 1 Aug 2017 13:45:36 -0500 Subject: [PATCH 028/154] scsi: ses: Fix wrong page error If a SES device returns an error on a requested diagnostic page, we are currently printing an error indicating the wrong page was received. Fix this up to simply return a failure and only check the returned page when the diagnostic page buffer was populated by the device. Signed-off-by: Brian King Signed-off-by: Martin K. 
Petersen --- drivers/scsi/ses.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index f1cdf32d751412..8927f9f54ad926 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -99,7 +99,7 @@ static int ses_recv_diag(struct scsi_device *sdev, int page_code, ret = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, bufflen, NULL, SES_TIMEOUT, SES_RETRIES, NULL); - if (unlikely(!ret)) + if (unlikely(ret)) return ret; recv_page_code = ((unsigned char *)buf)[0]; From 7dc88d2afb6e5c6bd6bbedc394eb43c1e3114bdd Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Sat, 22 Jul 2017 10:28:50 +0800 Subject: [PATCH 029/154] arm64: allwinner: a64: bananapi-m64: add missing ethernet0 alias The EMAC Ethernet controller was enabled, but an accompanying alias was not added. This results in unstable numbering if other Ethernet devices, such as a USB dongle, are present. Also, the bootloader uses the alias to assign a generated stable MAC address to the device node. 
Signed-off-by: Icenowy Zheng Signed-off-by: Maxime Ripard Fixes: e7295499903d ("arm64: allwinner: bananapi-m64: Enable dwmac-sun8i") [wens@csie.org: Rewrite commit log as fixing a previous patch with Fixes] Signed-off-by: Chen-Yu Tsai --- arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts index 0d1f026d831aac..ba2fde2909f949 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts @@ -51,6 +51,7 @@ compatible = "sinovoip,bananapi-m64", "allwinner,sun50i-a64"; aliases { + ethernet0 = &emac; serial0 = &uart0; serial1 = &uart1; }; From dff751c68904cf587d918cfb6b2f5b0112f73bc9 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Sat, 22 Jul 2017 10:28:51 +0800 Subject: [PATCH 030/154] arm64: allwinner: a64: pine64: add missing ethernet0 alias The EMAC Ethernet controller was enabled, but an accompanying alias was not added. This results in unstable numbering if other Ethernet devices, such as a USB dongle, are present. Also, the bootloader uses the alias to assign a generated stable MAC address to the device node. 
Signed-off-by: Icenowy Zheng Signed-off-by: Maxime Ripard Fixes: 970239437493 ("arm64: allwinner: pine64: Enable dwmac-sun8i") [wens@csie.org: Rewrite commit log as fixing a previous patch with Fixes] Signed-off-by: Chen-Yu Tsai --- arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts index 08cda24ea194cb..827168bc22ed2c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts @@ -51,6 +51,7 @@ compatible = "pine64,pine64", "allwinner,sun50i-a64"; aliases { + ethernet0 = &emac; serial0 = &uart0; serial1 = &uart1; serial2 = &uart2; From 56a9155074b4d23aa07a98c35c6f107dd50a9367 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Sat, 22 Jul 2017 10:28:52 +0800 Subject: [PATCH 031/154] arm64: allwinner: a64: sopine: add missing ethernet0 alias The EMAC Ethernet controller was enabled, but an accompanying alias was not added. This results in unstable numbering if other Ethernet devices, such as a USB dongle, are present. Also, the bootloader uses the alias to assign a generated stable MAC address to the device node. 
Signed-off-by: Icenowy Zheng Signed-off-by: Maxime Ripard Fixes: 96219b004865 ("arm64: allwinner: a64: add device tree for SoPine with baseboard") [wens@csie.org: Rewrite commit log as fixing a previous patch with Fixes] Signed-off-by: Chen-Yu Tsai --- arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts index 17eb1cc5bf6b40..216e3a5dafaef8 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts @@ -53,6 +53,7 @@ "allwinner,sun50i-a64"; aliases { + ethernet0 = &emac; serial0 = &uart0; }; From bfe334924ccd9f4a53f30240c03cf2f43f5b2df1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Aug 2017 19:39:30 +0200 Subject: [PATCH 032/154] perf/x86: Fix RDPMC vs. mm_struct tracking Vince reported the following rdpmc() testcase failure: > Failing test case: > > fd=perf_event_open(); > addr=mmap(fd); > exec() // without closing or unmapping the event > fd=perf_event_open(); > addr=mmap(fd); > rdpmc() // GPFs due to rdpmc being disabled The problem is of course that exec() plays tricks with what is current->mm, only destroying the old mappings after having installed the new mm. Fix this confusion by passing along vma->vm_mm instead of relying on current->mm. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 1e0fb9ec679c ("perf: Add pmu callbacks to track event mapping and unmapping") Link: http://lkml.kernel.org/r/20170802173930.cstykcqefmqt7jau@hirez.programming.kicks-ass.net [ Minor cleanups. 
] Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 16 +++++++--------- include/linux/perf_event.h | 4 ++-- kernel/events/core.c | 6 +++--- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8e3db8f642a7a0..af12e294caeda5 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2114,7 +2114,7 @@ static void refresh_pce(void *ignored) load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm)); } -static void x86_pmu_event_mapped(struct perf_event *event) +static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; @@ -2129,22 +2129,20 @@ static void x86_pmu_event_mapped(struct perf_event *event) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_exclusive(&current->mm->mmap_sem); + lockdep_assert_held_exclusive(&mm->mmap_sem); - if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } -static void x86_pmu_event_unmapped(struct perf_event *event) +static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!current->mm) - return; if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; - if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a3b873fc59e417..b14095bcf4bb40 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -310,8 +310,8 @@
struct pmu { * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ - void (*event_mapped) (struct perf_event *event); /*optional*/ - void (*event_unmapped) (struct perf_event *event); /*optional*/ + void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ + void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d4c..a654b8a3586fc1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -5113,7 +5113,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) unsigned long size = perf_data_size(rb); if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5411,7 +5411,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); return ret; } From 9b231d9f47c6114d317ce28cff92a74ad80547f5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Aug 2017 15:42:09 +0200 Subject: [PATCH 033/154] perf/core: Fix time on IOC_ENABLE Vince reported that when we do IOC_ENABLE/IOC_DISABLE while the task is SIGSTOP'ed state the timestamps go wobbly. It turns out we indeed fail to correctly account time while in 'OFF' state and doing IOC_ENABLE without getting scheduled in exposes the problem. 
Further thinking about this problem, it occurred to me that we can suffer a similar fate when we migrate an uncore event between CPUs. The perf_event_install() on the 'new' CPU will do add_event_to_ctx() which will reset all the time stamp, resulting in a subsequent update_event_times() to overwrite the total_time_* fields with smaller values. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index a654b8a3586fc1..ee20d4c546b5eb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2217,6 +2217,33 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } +/* + * Complement to update_event_times(). This computes the tstamp_* values to + * continue 'enabled' state from @now, and effectively discards the time + * between the prior tstamp_stopped and now (as we were in the OFF state, or + * just switched (context) time base). + * + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and + * cannot have been scheduled in yet. And going into INACTIVE state means + * '@event->tstamp_stopped = @now'. + * + * Thus given the rules of update_event_times(): + * + * total_time_enabled = tstamp_stopped - tstamp_enabled + * total_time_running = tstamp_stopped - tstamp_running + * + * We can insert 'tstamp_stopped == now' and reverse them to compute new + * tstamp_* values. 
+ */ +static void __perf_event_enable_time(struct perf_event *event, u64 now) +{ + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); + + event->tstamp_stopped = now; + event->tstamp_enabled = now - event->total_time_enabled; + event->tstamp_running = now - event->total_time_running; +} + static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -2224,9 +2251,12 @@ static void add_event_to_ctx(struct perf_event *event, list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; + /* + * We can be called with event->state == STATE_OFF when we create with + * .disabled = 1. In that case the IOC_ENABLE will call this function. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2471,10 +2501,11 @@ static void __perf_event_mark_enabled(struct perf_event *event) u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; + __perf_event_enable_time(event, tstamp); list_for_each_entry(sub, &event->sibling_list, group_entry) { + /* XXX should not be > INACTIVE if event isn't */ if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + __perf_event_enable_time(sub, tstamp); } } From e93c17301ac55321fc18e0f8316e924e58a83c8c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 7 Aug 2017 19:43:13 -0700 Subject: [PATCH 034/154] x86/asm/64: Clear AC on NMI entries This closes a hole in our SMAP implementation. This patch comes from grsecurity. Good catch! Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/314cc9f294e8f14ed85485727556ad4f15bb1659.1502159503.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d271fb79248f35..6d078b89a5e887 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1211,6 +1211,8 @@ ENTRY(nmi) * other IST entries. */ + ASM_CLAC + /* Use %rdx as our temp variable throughout */ pushq %rdx From d197f7988721221fac64f899efd7657c15281810 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 31 Jul 2017 11:37:28 -0700 Subject: [PATCH 035/154] clocksource/drivers/arm_arch_timer: Fix mem frame loop initialization The loop to find the best memory frame in arch_timer_mem_acpi_init() initializes the loop counter with itself ('i = i'), which is suspicious in the first place and pointed out by clang. The loop condition is 'i < timer_count' and a prior for loop exits when 'i' reaches 'timer_count', therefore the second loop is never executed. Initialize the loop counter with 0 to iterate over all timers, which supposedly was the intention before the typo monster attacked. 
Fixes: c2743a36765d3 ("clocksource: arm_arch_timer: add GTDT support for memory-mapped timer") Signed-off-by: Matthias Kaehlcke Reported-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_arch_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index aae87c4c546ee0..72bbfccef1132c 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -1440,7 +1440,7 @@ static int __init arch_timer_mem_acpi_init(int platform_timer_count) * While unlikely, it's theoretically possible that none of the frames * in a timer expose the combination of feature we want. */ - for (i = i; i < timer_count; i++) { + for (i = 0; i < timer_count; i++) { timer = &timers[i]; frame = arch_timer_mem_find_best_frame(timer); From 5c23a558a65406cac472df07fd26a2688a42cad2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 30 Jun 2017 01:14:45 -0500 Subject: [PATCH 036/154] clocksource/drivers/em_sti: Fix error return codes in em_sti_probe() Propagate the return values of platform_get_irq and devm_request_irq on failure. Cc: Frans Klaver Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Daniel Lezcano --- drivers/clocksource/em_sti.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c index bc48cbf6a79571..269db74a065815 100644 --- a/drivers/clocksource/em_sti.c +++ b/drivers/clocksource/em_sti.c @@ -305,7 +305,7 @@ static int em_sti_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { dev_err(&pdev->dev, "failed to get irq\n"); - return -EINVAL; + return irq; } /* map memory, let base point to the STI instance */ @@ -314,11 +314,12 @@ static int em_sti_probe(struct platform_device *pdev) if (IS_ERR(p->base)) return PTR_ERR(p->base); - if (devm_request_irq(&pdev->dev, irq, em_sti_interrupt, - IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING, - dev_name(&pdev->dev), p)) { + ret = devm_request_irq(&pdev->dev, irq, em_sti_interrupt, + IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING, + dev_name(&pdev->dev), p); + if (ret) { dev_err(&pdev->dev, "failed to request low IRQ\n"); - return -ENOENT; + return ret; } /* get hold of clock */ From 10e66760fa8ee11f254a69433fc132d04758a5fc Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 3 Aug 2017 12:58:18 +0200 Subject: [PATCH 037/154] x86/smpboot: Unbreak CPU0 hotplug A hang on CPU0 onlining after a preceding offlining is observed. Trace shows that CPU0 is stuck in check_tsc_sync_target() waiting for source CPU to run check_tsc_sync_source() but this never happens. Source CPU, in its turn, is stuck on synchronize_sched() which is called from native_cpu_up() -> do_boot_cpu() -> unregister_nmi_handler(). So it's a classic ABBA deadlock, due to the use of synchronize_sched() in unregister_nmi_handler(). Fix the bug by moving unregister_nmi_handler() from do_boot_cpu() to native_cpu_up() after cpu onlining is done. 
Signed-off-by: Vitaly Kuznetsov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170803105818.9934-1-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b474c8de7fba09..54b9e89d4d6be3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -971,7 +971,8 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) * Returns zero if CPU booted OK, else error code from * ->wakeup_secondary_cpu. */ -static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, + int *cpu0_nmi_registered) { volatile u32 *trampoline_status = (volatile u32 *) __va(real_mode_header->trampoline_status); @@ -979,7 +980,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long start_ip = real_mode_header->trampoline_start; unsigned long boot_error = 0; - int cpu0_nmi_registered = 0; unsigned long timeout; idle->thread.sp = (unsigned long)task_pt_regs(idle); @@ -1035,7 +1035,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, - &cpu0_nmi_registered); + cpu0_nmi_registered); if (!boot_error) { /* @@ -1080,12 +1080,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) */ smpboot_restore_warm_reset_vector(); } - /* - * Clean up the nmi handler. Do this after the callin and callout sync - * to avoid impact of possible long unregister time. 
- */ - if (cpu0_nmi_registered) - unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); return boot_error; } @@ -1093,8 +1087,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); + int cpu0_nmi_registered = 0; unsigned long flags; - int err; + int err, ret = 0; WARN_ON(irqs_disabled()); @@ -1131,10 +1126,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) common_cpu_up(cpu, tidle); - err = do_boot_cpu(apicid, cpu, tidle); + err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); if (err) { pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); - return -EIO; + ret = -EIO; + goto unreg_nmi; } /* @@ -1150,7 +1146,15 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) touch_nmi_watchdog(); } - return 0; +unreg_nmi: + /* + * Clean up the nmi handler. Do this after the callin and callout sync + * to avoid impact of possible long unregister time. + */ + if (cpu0_nmi_registered) + unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); + + return ret; } /** From fdf6e7a8c96ebe115b6460768c82dd136ecbd8db Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Wed, 26 Jul 2017 18:15:49 +0800 Subject: [PATCH 038/154] irqchip/gic-v3-its: Allow GIC ITS number more than MAX_NUMNODES When enabling ITS NUMA support on D05, I got the boot log: [ 0.000000] SRAT: PXM 0 -> ITS 0 -> Node 0 [ 0.000000] SRAT: PXM 0 -> ITS 1 -> Node 0 [ 0.000000] SRAT: PXM 0 -> ITS 2 -> Node 0 [ 0.000000] SRAT: PXM 1 -> ITS 3 -> Node 1 [ 0.000000] SRAT: ITS affinity exceeding max count[4] This is wrong on D05 as we have 8 ITSs with 4 NUMA nodes. So dynamically alloc the memory needed instead of using its_srat_maps[MAX_NUMNODES], which count the number of ITS entry(ies) in SRAT and alloc its_srat_maps as needed, then build the mapping of numa node to ITS ID. Of course, its_srat_maps will be freed after ITS probing because we don't need that after boot. 
After doing this, I got what I wanted: [ 0.000000] SRAT: PXM 0 -> ITS 0 -> Node 0 [ 0.000000] SRAT: PXM 0 -> ITS 1 -> Node 0 [ 0.000000] SRAT: PXM 0 -> ITS 2 -> Node 0 [ 0.000000] SRAT: PXM 1 -> ITS 3 -> Node 1 [ 0.000000] SRAT: PXM 2 -> ITS 4 -> Node 2 [ 0.000000] SRAT: PXM 2 -> ITS 5 -> Node 2 [ 0.000000] SRAT: PXM 2 -> ITS 6 -> Node 2 [ 0.000000] SRAT: PXM 3 -> ITS 7 -> Node 3 Fixes: dbd2b8267233 ("irqchip/gic-v3-its: Add ACPI NUMA node mapping") Signed-off-by: Hanjun Guo Reviewed-by: Lorenzo Pieralisi Cc: Ganapatrao Kulkarni Cc: John Garry Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 38 ++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 3bfbf8d96a0e1d..5fd5f62c925d2c 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1843,7 +1843,7 @@ struct its_srat_map { u32 its_id; }; -static struct its_srat_map its_srat_maps[MAX_NUMNODES] __initdata; +static struct its_srat_map *its_srat_maps __initdata; static int its_in_srat __initdata; static int __init acpi_get_its_numa_node(u32 its_id) @@ -1857,6 +1857,12 @@ static int __init acpi_get_its_numa_node(u32 its_id) return NUMA_NO_NODE; } +static int __init gic_acpi_match_srat_its(struct acpi_subtable_header *header, + const unsigned long end) +{ + return 0; +} + static int __init gic_acpi_parse_srat_its(struct acpi_subtable_header *header, const unsigned long end) { @@ -1873,12 +1879,6 @@ static int __init gic_acpi_parse_srat_its(struct acpi_subtable_header *header, return -EINVAL; } - if (its_in_srat >= MAX_NUMNODES) { - pr_err("SRAT: ITS affinity exceeding max count[%d]\n", - MAX_NUMNODES); - return -EINVAL; - } - node = acpi_map_pxm_to_node(its_affinity->proximity_domain); if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { @@ -1897,14 +1897,37 @@ static int __init gic_acpi_parse_srat_its(struct acpi_subtable_header *header, static void __init 
acpi_table_parse_srat_its(void) { + int count; + + count = acpi_table_parse_entries(ACPI_SIG_SRAT, + sizeof(struct acpi_table_srat), + ACPI_SRAT_TYPE_GIC_ITS_AFFINITY, + gic_acpi_match_srat_its, 0); + if (count <= 0) + return; + + its_srat_maps = kmalloc(count * sizeof(struct its_srat_map), + GFP_KERNEL); + if (!its_srat_maps) { + pr_warn("SRAT: Failed to allocate memory for its_srat_maps!\n"); + return; + } + acpi_table_parse_entries(ACPI_SIG_SRAT, sizeof(struct acpi_table_srat), ACPI_SRAT_TYPE_GIC_ITS_AFFINITY, gic_acpi_parse_srat_its, 0); } + +/* free the its_srat_maps after ITS probing */ +static void __init acpi_its_srat_maps_free(void) +{ + kfree(its_srat_maps); +} #else static void __init acpi_table_parse_srat_its(void) { } static int __init acpi_get_its_numa_node(u32 its_id) { return NUMA_NO_NODE; } +static void __init acpi_its_srat_maps_free(void) { } #endif static int __init gic_acpi_parse_madt_its(struct acpi_subtable_header *header, @@ -1951,6 +1974,7 @@ static void __init its_acpi_probe(void) acpi_table_parse_srat_its(); acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR, gic_acpi_parse_madt_its, 0); + acpi_its_srat_maps_free(); } #else static void __init its_acpi_probe(void) { } From a008873740a3d44946c7e72e456f15146cfd7287 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 10 Aug 2017 15:41:17 +0100 Subject: [PATCH 039/154] irqchip/gic-v3-its-platform-msi: Fix msi-parent parsing loop While parsing the msi-parent property to chase up the IRQ domain a given device belongs to, the index into the msi-parent tuple should be incremented to ensure all properties entries are taken into account. Current code missed the index update so the parsing loop does not work in case multiple msi-parent phandles are present and may turn into an infinite loop in of_pmsi_get_dev_id() if phandle at index 0 does not correspond to the domain we are actually looking-up. Fix the code by updating the phandle index at each iteration in of_pmsi_get_dev_id(). 
Fixes: deac7fc1c87f ("irqchip/gic-v3-its: Parse new version of msi-parent property") Signed-off-by: Lorenzo Pieralisi Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its-platform-msi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/irq-gic-v3-its-platform-msi.c b/drivers/irqchip/irq-gic-v3-its-platform-msi.c index 249240d9a4259e..833a90fe33aed8 100644 --- a/drivers/irqchip/irq-gic-v3-its-platform-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-platform-msi.c @@ -43,6 +43,7 @@ static int of_pmsi_get_dev_id(struct irq_domain *domain, struct device *dev, *dev_id = args.args[0]; break; } + index++; } while (!ret); return ret; From 9157259d16a8ee8116a98d32f29b797689327e8d Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 3 Aug 2017 09:17:21 -0400 Subject: [PATCH 040/154] mm: add pmd_t initializer __pmd() to work around a GCC bug. THP migration is added but only supports x86_64 at the moment. For all other architectures, swp_entry_to_pmd() only returns a zero pmd_t. Due to a GCC zero initializer bug #53119, the standard (pmd_t){0} initializer is not accepted by all GCC versions. __pmd() is a feasible workaround. In addition, sparc32's pmd_t is an array instead of a single value, so we need (pmd_t){ {0}, } instead of (pmd_t){0}. Thus, a different __pmd() definition is needed in sparc32. Signed-off-by: Zi Yan Signed-off-by: David S. 
Miller --- arch/sparc/include/asm/page_32.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index 0efd0583a8c9da..6249214148c24b 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -68,6 +68,7 @@ typedef struct { unsigned long iopgprot; } iopgprot_t; #define iopgprot_val(x) ((x).iopgprot) #define __pte(x) ((pte_t) { (x) } ) +#define __pmd(x) ((pmd_t) { { (x) }, }) #define __iopte(x) ((iopte_t) { (x) } ) #define __pgd(x) ((pgd_t) { (x) } ) #define __ctxd(x) ((ctxd_t) { (x) } ) @@ -95,6 +96,7 @@ typedef unsigned long iopgprot_t; #define iopgprot_val(x) (x) #define __pte(x) (x) +#define __pmd(x) ((pmd_t) { { (x) }, }) #define __iopte(x) (x) #define __pgd(x) (x) #define __ctxd(x) (x) From c587c79f90632df59c61383c6abebb2e07a81911 Mon Sep 17 00:00:00 2001 From: Doug Smythies Date: Tue, 8 Aug 2017 14:05:12 -0700 Subject: [PATCH 041/154] cpufreq: intel_pstate: report correct CPU frequencies during trace The intel_pstate CPU frequency scaling driver has always calculated CPU frequency incorrectly. Recent changes have eliminated most of the issues, however the frequency reported in the trace buffer, if used, is incorrect. It remains desirable that cpu->pstate.scaling still be a nice round number for things such as when setting max and min frequencies. So the proposal is to just fix the reported frequency in the trace data. Fixes what remains of [1]. Link: https://bugzilla.kernel.org/show_bug.cgi?id=96521 # [1] Signed-off-by: Doug Smythies Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/intel_pstate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 0566455f233ed3..65ee4fcace1f26 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1613,8 +1613,7 @@ static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) static inline int32_t get_avg_frequency(struct cpudata *cpu) { - return mul_ext_fp(cpu->sample.core_avg_perf, - cpu->pstate.max_pstate_physical * cpu->pstate.scaling); + return mul_ext_fp(cpu->sample.core_avg_perf, cpu_khz); } static inline int32_t get_avg_pstate(struct cpudata *cpu) From 8e2f3bce05e056575c2c84a344a8291fdabb5f21 Mon Sep 17 00:00:00 2001 From: Doug Smythies Date: Tue, 8 Aug 2017 14:12:49 -0700 Subject: [PATCH 042/154] cpufreq: x86: Disable interrupts during MSRs reading According to Intel 64 and IA-32 Architectures SDM, Volume 3, Chapter 14.2, "Software needs to exercise care to avoid delays between the two RDMSRs (for example interrupts)". So, disable interrupts during reading MSRs IA32_APERF and IA32_MPERF. See also: commit 4ab60c3f32c7 (cpufreq: intel_pstate: Disable interrupts during MSRs reading). Signed-off-by: Doug Smythies Reviewed-by: Len Brown Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/cpu/aperfmperf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 7cf7c70b6ef2a2..0ee83321a3136f 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -40,13 +40,16 @@ static void aperfmperf_snapshot_khz(void *dummy) struct aperfmperf_sample *s = this_cpu_ptr(&samples); ktime_t now = ktime_get(); s64 time_delta = ktime_ms_delta(now, s->time); + unsigned long flags; /* Don't bother re-computing within the cache threshold time. 
*/ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) return; + local_irq_save(flags); rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); + local_irq_restore(flags); aperf_delta = aperf - s->aperf; mperf_delta = mperf - s->mperf; From a8ec3ee861b6e4e6b82a98777c65510ae63766c1 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Thu, 10 Aug 2017 18:07:36 +0300 Subject: [PATCH 043/154] arc: Mask individual IRQ lines during core INTC init ARC cores on reset have all interrupt lines of built-in INTC enabled. Which means once we globally enable interrupts (very early on boot) faulty hardware blocks may trigger an interrupt that Linux kernel cannot handle yet as corresponding handler is not yet installed. In that case system falls in "interrupt storm" and basically never does anything useful except entering and exiting generic IRQ handling code. One real example of that kind of problematic hardware is DW GMAC which also has interrupts enabled on reset and if Ethernet PHY informs GMAC about link state, GMAC immediately reports that upstream to ARC core and here we are. Now with that change we mask all individual IRQ lines making entire system more fool-proof. [This patch was motivated by Adaptrum platform support] Signed-off-by: Alexey Brodkin Cc: Eugeniy Paltsev Tested-by: Alexandru Gagniuc Signed-off-by: Vineet Gupta --- arch/arc/kernel/intc-arcv2.c | 3 +++ arch/arc/kernel/intc-compact.c | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c index f928795fd07a81..cf90714a676d6b 100644 --- a/arch/arc/kernel/intc-arcv2.c +++ b/arch/arc/kernel/intc-arcv2.c @@ -75,10 +75,13 @@ void arc_init_IRQ(void) * Set a default priority for all available interrupts to prevent * switching of register banks if Fast IRQ and multiple register banks * are supported by CPU. + * Also disable all IRQ lines so faulty external hardware won't + * trigger interrupt that kernel is not ready to handle. 
*/ for (i = NR_EXCEPTIONS; i < irq_bcr.irqs + NR_EXCEPTIONS; i++) { write_aux_reg(AUX_IRQ_SELECT, i); write_aux_reg(AUX_IRQ_PRIORITY, ARCV2_IRQ_DEF_PRIO); + write_aux_reg(AUX_IRQ_ENABLE, 0); } /* setup status32, don't enable intr yet as kernel doesn't want */ diff --git a/arch/arc/kernel/intc-compact.c b/arch/arc/kernel/intc-compact.c index 7e608c6b0a0186..cef388025adf43 100644 --- a/arch/arc/kernel/intc-compact.c +++ b/arch/arc/kernel/intc-compact.c @@ -27,7 +27,7 @@ */ void arc_init_IRQ(void) { - int level_mask = 0; + int level_mask = 0, i; /* Is timer high priority Interrupt (Level2 in ARCompact jargon) */ level_mask |= IS_ENABLED(CONFIG_ARC_COMPACT_IRQ_LEVELS) << TIMER0_IRQ; @@ -40,6 +40,18 @@ void arc_init_IRQ(void) if (level_mask) pr_info("Level-2 interrupts bitset %x\n", level_mask); + + /* + * Disable all IRQ lines so faulty external hardware won't + * trigger interrupt that kernel is not ready to handle. + */ + for (i = TIMER0_IRQ; i < NR_CPU_IRQS; i++) { + unsigned int ienb; + + ienb = read_aux_reg(AUX_IENABLE); + ienb &= ~(1 << i); + write_aux_reg(AUX_IENABLE, ienb); + } } /* From 4d3a869333b74352c372077f316756d38cae09b1 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 11 Aug 2017 09:51:41 +0200 Subject: [PATCH 044/154] ALSA: seq: Fix CONFIG_SND_SEQ_MIDI dependency The commit 0181307abc1d ("ALSA: seq: Reorganize kconfig and build") rewrote the dependency of each sequencer module in a standard way, but there was one change applied mistakenly: CONFIG_SND_SEQ_MIDI isn't enabled properly by CONFIG_SND_RAWMIDI. I seem to have changed the wrong one instead, CONFIG_SND_SEQ_MIDI_EMUL, which is eventually reverse-selected by CONFIG_SND_SEQ_MIDI itself. This ended up the lack of snd-seq-midi module as reported below. The fix is to put def_tristate properly to CONFIG_SND_SEQ_MIDI instead of *_MIDI_EMUL entry. 
Fixes: 0181307abc1d ("ALSA: seq: Reorganize kconfig and build") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196633 Signed-off-by: Takashi Iwai --- sound/core/seq/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/core/seq/Kconfig b/sound/core/seq/Kconfig index a536760a94c265..45c1336c6597f1 100644 --- a/sound/core/seq/Kconfig +++ b/sound/core/seq/Kconfig @@ -47,10 +47,10 @@ config SND_SEQ_HRTIMER_DEFAULT timer. config SND_SEQ_MIDI_EVENT - def_tristate SND_RAWMIDI + tristate config SND_SEQ_MIDI - tristate + def_tristate SND_RAWMIDI select SND_SEQ_MIDI_EVENT config SND_SEQ_MIDI_EMUL From 9e80dbd87286d3252ac2f78c6465c16e2ec8d476 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Jul 2017 10:22:25 +0300 Subject: [PATCH 045/154] clocksource/drivers/timer-of: Checking for IS_ERR() instead of NULL The current code checks the return value of the of_io_request_and_map() function as it was returning a NULL pointer in case of error. However, it returns an error code encoded in the pointer return value, not a NULL value. Fix this by checking the returned pointer against IS_ERR() and return the error with PTR_ERR(). Signed-off-by: Dan Carpenter Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-of.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-of.c b/drivers/clocksource/timer-of.c index d509b500a7b5f8..4d7aef9d9c1542 100644 --- a/drivers/clocksource/timer-of.c +++ b/drivers/clocksource/timer-of.c @@ -128,9 +128,9 @@ static __init int timer_base_init(struct device_node *np, const char *name = of_base->name ? 
of_base->name : np->full_name; of_base->base = of_io_request_and_map(np, of_base->index, name); - if (!of_base->base) { + if (IS_ERR(of_base->base)) { pr_err("Failed to iomap (%s)\n", name); - return -ENXIO; + return PTR_ERR(of_base->base); } return 0; From 599dc457c79bde8bd4fe8bbb2ba1f30ef3d7a5c8 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 18 Jul 2017 09:25:39 +0100 Subject: [PATCH 046/154] clocksource/drivers/Kconfig: Fix CLKSRC_PISTACHIO dependencies In v4.13, CLKSRC_PISTACHIO can select TIMER_OF on architectures without GENERIC_CLOCKEVENTS, resulting in a struct clock_event_device missing some required features and build breakage compiling timer_of.c. One of the symbols selecting TIMER_OF is CLKSRC_PISTACHIO, so add the dependency on GENERIC_CLOCKEVENTS. Thanks to kbuild test robot for finding this error (https://lkml.org/lkml/2017/7/16/249) Signed-off-by: Matt Redfearn Suggested-by: Ian Abbott Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index fcae5ca6ac9234..54a67f8a28ebfb 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -262,7 +262,7 @@ config CLKSRC_LPC32XX config CLKSRC_PISTACHIO bool "Clocksource for Pistachio SoC" if COMPILE_TEST - depends on HAS_IOMEM + depends on GENERIC_CLOCKEVENTS && HAS_IOMEM select TIMER_OF help Enables the clocksource for the Pistachio SoC. From 5442c26995527245c94c4a49e535eae8a60a5299 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 1 Aug 2017 20:55:52 +0200 Subject: [PATCH 047/154] x86/cpufeature, kvm/svm: Rename (shorten) the new "virtualized VMSAVE/VMLOAD" CPUID flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "virtual_vmload_vmsave" is what is going to land in /proc/cpuinfo now as per v4.13-rc4, for a single feature bit which is clearly too long. 
So rename it to what it is called in the processor manual. "v_vmsave_vmload" is a bit shorter, after all. We could go more aggressively here but having it the same as in the processor manual is advantageous. Signed-off-by: Borislav Petkov Acked-by: Radim Krčmář Cc: Janakarajan Natarajan Cc: Jörg Rödel Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kvm-ML Link: http://lkml.kernel.org/r/20170801185552.GA3743@nazgul.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kvm/svm.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ca3c48c0872f4b..5a28e8e55e36fd 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -286,7 +286,7 @@ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1107626938ccff..56ba05312759d3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1100,7 +1100,7 @@ static __init int svm_hardware_setup(void) if (vls) { if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) || + !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || !IS_ENABLED(CONFIG_X86_64)) { vls = false; } else { From b45e4c45b13275a6b4a3f83ae8301a1963fbe5d0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 10 Aug 2017 16:57:09 +0100 Subject: [PATCH 048/154] x86: Mark various structures and functions as 'static' Mark a couple of 
structures and functions as 'static', pointed out by Sparse: warning: symbol 'bts_pmu' was not declared. Should it be static? warning: symbol 'p4_event_aliases' was not declared. Should it be static? warning: symbol 'rapl_attr_groups' was not declared. Should it be static? symbol 'process_uv2_message' was not declared. Should it be static? Signed-off-by: Colin Ian King Acked-by: Andrew Banman # for the UV change Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Will Deacon Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20170810155709.7094-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 2 +- arch/x86/events/intel/p4.c | 2 +- arch/x86/events/intel/rapl.c | 2 +- arch/x86/platform/uv/tlb_uv.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 8ae8c5ce3a1f94..ddd8d3516bfcfa 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -69,7 +69,7 @@ struct bts_buffer { struct bts_phys buf[0]; }; -struct pmu bts_pmu; +static struct pmu bts_pmu; static size_t buf_size(struct page *page) { diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index eb0533558c2b70..d32c0eed38ca92 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -587,7 +587,7 @@ static __initconst const u64 p4_hw_cache_event_ids * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are * either up to date automatically or not applicable at all. 
*/ -struct p4_event_alias { +static struct p4_event_alias { u64 original; u64 alternative; } p4_event_aliases[] = { diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index a45e2114a84609..8e2457cb6b4a41 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -559,7 +559,7 @@ static struct attribute_group rapl_pmu_format_group = { .attrs = rapl_formats_attr, }; -const struct attribute_group *rapl_attr_groups[] = { +static const struct attribute_group *rapl_attr_groups[] = { &rapl_pmu_attr_group, &rapl_pmu_format_group, &rapl_pmu_events_group, diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3e4bdb442fbcfc..f44c0bc95aa2f4 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -26,7 +26,7 @@ static struct bau_operations ops __ro_after_init; /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ -static int timeout_base_ns[] = { +static const int timeout_base_ns[] = { 20, 160, 1280, @@ -1216,7 +1216,7 @@ static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. * Such a message must be ignored. */ -void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) { unsigned long mmr_image; unsigned char swack_vec; From adb4f11e0a8f4e29900adb2b7af28b6bbd5c1fa4 Mon Sep 17 00:00:00 2001 From: Ding Tianhong Date: Thu, 10 Aug 2017 10:52:45 +0800 Subject: [PATCH 049/154] clocksource/drivers/arm_arch_timer: Avoid infinite recursion when ftrace is enabled On platforms with an arch timer erratum workaround, it's possible for arch_timer_reg_read_stable() to recurse into itself when certain tracing options are enabled, leading to stack overflows and related problems. 
For example, when PREEMPT_TRACER and FUNCTION_GRAPH_TRACER are selected, it's possible to trigger this with: $ mount -t debugfs nodev /sys/kernel/debug/ $ echo function_graph > /sys/kernel/debug/tracing/current_tracer The problem is that in such cases, preempt_disable() instrumentation attempts to acquire a timestamp via trace_clock(), resulting in a call back to arch_timer_reg_read_stable(), and hence recursion. This patch changes arch_timer_reg_read_stable() to use preempt_{disable,enable}_notrace(), which avoids this. This problem is similar to the one fixed by upstream commit 96b3d28bf4 ("sched/clock: Prevent tracing recursion in sched_clock_cpu()"). Fixes: 6acc71ccac71 ("arm64: arch_timer: Allows a CPU-specific erratum to only affect a subset of CPUs") Signed-off-by: Ding Tianhong Acked-by: Mark Rutland Acked-by: Marc Zyngier Signed-off-by: Daniel Lezcano --- arch/arm64/include/asm/arch_timer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h index 74d08e44a651b9..a652ce0a5cb2c3 100644 --- a/arch/arm64/include/asm/arch_timer.h +++ b/arch/arm64/include/asm/arch_timer.h @@ -65,13 +65,13 @@ DECLARE_PER_CPU(const struct arch_timer_erratum_workaround *, u64 _val; \ if (needs_unstable_timer_counter_workaround()) { \ const struct arch_timer_erratum_workaround *wa; \ - preempt_disable(); \ + preempt_disable_notrace(); \ wa = __this_cpu_read(timer_unstable_counter_workaround); \ if (wa && wa->read_##reg) \ _val = wa->read_##reg(); \ else \ _val = read_sysreg(reg); \ - preempt_enable(); \ + preempt_enable_notrace(); \ } else { \ _val = read_sysreg(reg); \ } \ From c44245b3d5435f533ca8346ece65918f84c057f9 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 11 Aug 2017 09:00:06 -0700 Subject: [PATCH 050/154] xfs: fix inobt inode allocation search optimization When we try to allocate a free inode by searching the inobt, we try to find the inode nearest the parent inode by
searching chunks both left and right of the chunk containing the parent. As an optimization, we cache the leftmost and rightmost records that we previously searched; if we do another allocation with the same parent inode, we'll pick up the search where it last left off. There's a bug in the case where we found a free inode to the left of the parent's chunk: we need to update the cached left and right records, but because we already reassigned the right record to point to the left, we end up assigning the left record to both the cached left and right records. This isn't a correctness problem strictly, but it can result in the next allocation rechecking chunks unnecessarily or allocating inodes further away from the parent than it needs to. Fix it by swapping the record pointer after we update the cached left and right records. Fixes: bd169565993b ("xfs: speed up free inode search") Signed-off-by: Omar Sandoval Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index ffd5a15d1bb6d0..abf5beaae907d3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1246,13 +1246,13 @@ xfs_dialloc_ag_inobt( /* free inodes to the left? 
*/ if (useleft && trec.ir_freecount) { - rec = trec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); cur = tcur; pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; + rec = trec; goto alloc_inode; } From e28ae8e428fefe2facd72cea9f29906ecb9c861d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2017 12:45:35 -0700 Subject: [PATCH 051/154] iomap: fix integer truncation issues in the zeroing and dirtying helpers Fix the min_t calls in the zeroing and dirtying helpers to perform the comparisons on 64-bit types, which prevents them from incorrectly being truncated, and larger zeroing operations being stuck in a never ending loop. Special thanks to Markus Stockhausen for spotting the bug. Reported-by: Paul Menzel Tested-by: Paul Menzel Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/iomap.c b/fs/iomap.c index 039266128b7ff0..59cc98ad7577b4 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -278,7 +278,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned long bytes; /* Bytes to write to page */ offset = (pos & (PAGE_SIZE - 1)); - bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + bytes = min_t(loff_t, PAGE_SIZE - offset, length); rpage = __iomap_read_page(inode, pos); if (IS_ERR(rpage)) @@ -373,7 +373,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, unsigned offset, bytes; offset = pos & (PAGE_SIZE - 1); /* Within page */ - bytes = min_t(unsigned, PAGE_SIZE - offset, count); + bytes = min_t(loff_t, PAGE_SIZE - offset, count); if (IS_DAX(inode)) status = iomap_dax_zero(pos, offset, bytes, iomap); From d86e63e1f0b7868c55c8d4a54854b85e2bac690b Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Fri, 11 Aug 2017 22:27:35 +0800 Subject: [PATCH 052/154] arm64: allwinner: h5: fix pinctrl IRQs The pin controller of H5
has three IRQs at the chip's GIC, which represents three banks of pinctrl IRQs. However, the device tree used to miss the third IRQ of the pin controller, which makes the PG bank IRQ not usable. Add the missing IRQ to the pinctrl node. Fixes: 4e36de179f27 ("arm64: allwinner: h5: add Allwinner H5 .dtsi") Signed-off-by: Icenowy Zheng Signed-off-by: Chen-Yu Tsai --- arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi index 732e2e06f503c8..d9a720bff05d39 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi @@ -120,5 +120,8 @@ }; &pio { + interrupts = , + , + ; compatible = "allwinner,sun50i-h5-pinctrl"; }; From 8df4b0031067758d8b0a3bfde7d35e980d0376d5 Mon Sep 17 00:00:00 2001 From: "Shih-Yuan Lee (FourDollars)" Date: Mon, 14 Aug 2017 18:00:47 +0800 Subject: [PATCH 053/154] ALSA: hda/realtek - Fix pincfg for Dell XPS 13 9370 The initial pin configs for Dell headset mode of ALC3271 has changed. /sys/class/sound/hwC0D0/init_pin_configs: (BIOS 0.1.4) 0x12 0xb7a60130 0x13 0xb8a61140 0x14 0x40000000 0x16 0x411111f0 0x17 0x90170110 0x18 0x411111f0 0x19 0x411111f0 0x1a 0x411111f0 0x1b 0x411111f0 0x1d 0x4087992d 0x1e 0x411111f0 0x21 0x04211020 has changed to ... 
/sys/class/sound/hwC0D0/init_pin_configs: (BIOS 0.2.0) 0x12 0xb7a60130 0x13 0x40000000 0x14 0x411111f0 0x16 0x411111f0 0x17 0x90170110 0x18 0x411111f0 0x19 0x411111f0 0x1a 0x411111f0 0x1b 0x411111f0 0x1d 0x4067992d 0x1e 0x411111f0 0x21 0x04211020 Fixes: b4576de87243 ("ALSA: hda/realtek - Fix typo of pincfg for Dell quirk") Signed-off-by: Shih-Yuan Lee (FourDollars) Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a91a9ef00c4061..217bb582aff16a 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6647,7 +6647,6 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { SND_HDA_PIN_QUIRK(0x10ec0299, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, ALC225_STANDARD_PINS, {0x12, 0xb7a60130}, - {0x13, 0xb8a61140}, {0x17, 0x90170110}), {} }; From 26a72e8a8d0707f1d49133a19c027a3af9fcfdcb Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Fri, 4 Aug 2017 15:03:48 +0100 Subject: [PATCH 054/154] drm/i915: remove unused function declaration This function is not part of the driver anymore. 
Signed-off-by: Lionel Landwerlin Fixes: 90f4fcd56bda ("drm/i915: Remove forced stop ring on suspend/unload") Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20170804140348.24971-1-lionel.g.landwerlin@intel.com (cherry picked from commit fe29133df37ac31de9e657ad91bcf74cdfe8c4cd) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_lrc.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h index 52b3a1fd4059ba..57ef5833c42749 100644 --- a/drivers/gpu/drm/i915/intel_lrc.h +++ b/drivers/gpu/drm/i915/intel_lrc.h @@ -63,7 +63,6 @@ enum { }; /* Logical Rings */ -void intel_logical_ring_stop(struct intel_engine_cs *engine); void intel_logical_ring_cleanup(struct intel_engine_cs *engine); int logical_render_ring_init(struct intel_engine_cs *engine); int logical_xcs_ring_init(struct intel_engine_cs *engine); From a0125a932e917cb507b682cb66645efdca1f8cab Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 8 Aug 2017 14:19:04 +0100 Subject: [PATCH 055/154] drm/i915: Perform an invalidate prior to executing golden renderstate As we may have just bound the renderstate into the GGTT for execution, we need to ensure that the GTT TLB are also flushed. On snb-gt2, this would cause a random GPU hang at the start of a new context (e.g. boot) and on snb-gt1, it was causing the renderstate batch to take ~10s. It was the GPU hang that revealed the truth, as the CS gleefully executed beyond the end of the golden renderstate batch, a good indicator for a GTT TLB miss. 
Fixes: 20fe17aa52dc ("drm/i915: Remove redundant TLB invalidate on switching contexts") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20170808131904.1385-1-chris@chris-wilson.co.uk Reviewed-by: Mika Kuoppala Cc: # v4.12-rc1+ (cherry picked from commit 802673d66f8a6ded5d2689d597853c7bb3a70163) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem_render_state.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c index 7032c542a9b1d0..4dd4c2159a92e2 100644 --- a/drivers/gpu/drm/i915/i915_gem_render_state.c +++ b/drivers/gpu/drm/i915/i915_gem_render_state.c @@ -242,6 +242,10 @@ int i915_gem_render_state_emit(struct drm_i915_gem_request *req) goto err_unpin; } + ret = req->engine->emit_flush(req, EMIT_INVALIDATE); + if (ret) + goto err_unpin; + ret = req->engine->emit_bb_start(req, so->batch_offset, so->batch_size, I915_DISPATCH_SECURE); From 1dd7a3e7af70ebdd0cdd937b180726d15a4f0948 Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Wed, 9 Aug 2017 13:07:02 -0700 Subject: [PATCH 056/154] drm/i915/cnl: Add slice and subslice information to debugfs. A missing part to EU slice power gating is the debugfs interface. This patch actually should have been squashed to the initial EU slice power gating one. v2: Initial patch was merged without this part. 
Fixes: c7ae7e9ab207 ("drm/i915/cnl: Configure EU slice power gating.") Cc: Joonas Lahtinen Signed-off-by: Rodrigo Vivi Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20170809200702.11236-1-rodrigo.vivi@intel.com (cherry picked from commit 7ea1adf30f82a4c0910524ac06f8f1f26281bb23) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 00d8967c851204..d1bd53b7373844 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -4580,7 +4580,7 @@ static void gen9_sseu_device_status(struct drm_i915_private *dev_priv, sseu->slice_mask |= BIT(s); - if (IS_GEN9_BC(dev_priv)) + if (IS_GEN9_BC(dev_priv) || IS_CANNONLAKE(dev_priv)) sseu->subslice_mask = INTEL_INFO(dev_priv)->sseu.subslice_mask; From 7eceb9d04966435ed2d03f5554413715ab3cb34a Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 17 Jul 2017 12:58:54 -0700 Subject: [PATCH 057/154] drm/i915: Return correct EDP voltage swing table for 0.85V For 0.85V cnl_get_buf_trans_edp() returns the DP table, instead of EDP. Use the correct table. 
The error was pointed out by this clang warning: drivers/gpu/drm/i915/intel_ddi.c:392:39: warning: variable 'cnl_ddi_translations_edp_0_85V' is not needed and will not be emitted [-Wunneeded-internal-declaration] static const struct cnl_ddi_buf_trans cnl_ddi_translations_edp_0_85V[] = { Fixes: cf54ca8bc567 ("drm/i915/cnl: Implement voltage swing sequence.") Signed-off-by: Matthias Kaehlcke Reviewed-by: Manasi Navare Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20170717195854.192139-1-mka@chromium.org (cherry picked from commit 50946c89850db13bd672c664aec6cf4551f71fe9) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_ddi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 9edeaaef77adf6..d3b3252a874252 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -1762,7 +1762,7 @@ cnl_get_buf_trans_edp(struct drm_i915_private *dev_priv, if (dev_priv->vbt.edp.low_vswing) { if (voltage == VOLTAGE_INFO_0_85V) { *n_entries = ARRAY_SIZE(cnl_ddi_translations_edp_0_85V); - return cnl_ddi_translations_dp_0_85V; + return cnl_ddi_translations_edp_0_85V; } else if (voltage == VOLTAGE_INFO_0_95V) { *n_entries = ARRAY_SIZE(cnl_ddi_translations_edp_0_95V); return cnl_ddi_translations_edp_0_95V; From 430ffaf46c05bda56535893f38e684f5418c4c93 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 12 Aug 2017 16:27:24 +0100 Subject: [PATCH 058/154] drm/i915: Suppress switch_mm emission between the same aliasing_ppgtt When switching between contexts using the aliasing_ppgtt, the VM is shared. We don't need to reload the PD registers unless they are dirty. Martin Peres reported an issue that looks like corruption between Haswell context switches, bisecting to commit f9326be5f1d3 ("drm/i915: Rearrange switch_context to load the aliasing ppgtt on first use"). 
Switching between the same mm (the aliasing_ppgtt is used for all contexts in this case) should be a nop, but appears to trigger some side-effects in the context switch. However, as we know the switch is redundant in this case, we can skip it and continue to ignore the issue until somebody feels strong enough to investigate full-ppgtt on gen7 again! Except.. Martin was using full-ppgtt which is not supported as it doesn't work correctly yet. So whilst the bisect did yield valuable information about the failures, the fix should not have any user impact under default settings, with the exception of a slightly lower throughput on xcs as the VM would always be reloaded. v2: Also remember to set the legacy_active_context following the switch on xcs (commit e8a9c58fcd9a ("drm/i915: Unify active context tracking between legacy/execlists/guc")) Fixes: f9326be5f1d3 ("drm/i915: Rearrange switch_context to load the aliasing ppgtt on first use") Fixes: e8a9c58fcd9a ("drm/i915: Unify active context tracking between legacy/execlists/guc") Reported-by: Martin Peres Signed-off-by: Chris Wilson Cc: Martin Peres Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20170812152724.6883-1-chris@chris-wilson.co.uk (cherry picked from commit 12124bea5b82dc1e917304aed703c27292270051) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem_context.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 39ed58a21fc1f5..e1e971ee2ed57a 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -688,19 +688,19 @@ static inline bool skip_rcs_switch(struct i915_hw_ppgtt *ppgtt, } static bool -needs_pd_load_pre(struct i915_hw_ppgtt *ppgtt, - struct intel_engine_cs *engine, - struct i915_gem_context *to) +needs_pd_load_pre(struct i915_hw_ppgtt *ppgtt, struct intel_engine_cs *engine) { + struct 
i915_gem_context *from = engine->legacy_active_context; + if (!ppgtt) return false; /* Always load the ppgtt on first use */ - if (!engine->legacy_active_context) + if (!from) return true; /* Same context without new entries, skip */ - if (engine->legacy_active_context == to && + if ((!from->ppgtt || from->ppgtt == ppgtt) && !(intel_engine_flag(engine) & ppgtt->pd_dirty_rings)) return false; @@ -744,7 +744,7 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) if (skip_rcs_switch(ppgtt, engine, to)) return 0; - if (needs_pd_load_pre(ppgtt, engine, to)) { + if (needs_pd_load_pre(ppgtt, engine)) { /* Older GENs and non render rings still want the load first, * "PP_DCLV followed by PP_DIR_BASE register through Load * Register Immediate commands in Ring Buffer before submitting @@ -841,7 +841,7 @@ int i915_switch_context(struct drm_i915_gem_request *req) struct i915_hw_ppgtt *ppgtt = to->ppgtt ?: req->i915->mm.aliasing_ppgtt; - if (needs_pd_load_pre(ppgtt, engine, to)) { + if (needs_pd_load_pre(ppgtt, engine)) { int ret; trace_switch_mm(engine, to); @@ -852,6 +852,7 @@ int i915_switch_context(struct drm_i915_gem_request *req) ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine); } + engine->legacy_active_context = to; return 0; } From 781cc76e0c2469cb7ac12ba238a4ea006978e321 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 8 Aug 2017 10:08:26 +0200 Subject: [PATCH 059/154] drm/i915: Avoid the gpu reset vs. modeset deadlock ... using the biggest hammer we have. This is essentially a weaponized version of the timeout-based wedging Chris added in commit 36703e79a982c8ce5a8e43833291f2719e92d0d1 Author: Chris Wilson Date: Thu Jun 22 11:56:25 2017 +0100 drm/i915: Break modeset deadlocks on reset Because defense-in-depth is good it's good to still have both. Also note that with the locking change we can now restrict this a lot (old gpus and special testing only), so this doesn't kill the TDR benefits on at least anything remotely modern. 
And furthermore with a few tricks it should be possible to make a much more educated guess about whether an atomic commit is stuck waiting on the gpu (atomic_t counting the pending i915_sw_fence used by the atomic modeset code should do it), so we can improve this. But for now just start with something that is guaranteed to recover faster, for much better CI throughput. This de facto reverts TDR on these platforms, but there's not really a single commit to specify as the sole offender. v2: Add a debug message to explain what's going on. We can't DRM_ERROR because that spams CI. And the timeout based fallback still prints a DRM_ERROR, in case something goes wrong. v3: Fix comment layout (Michel) Fixes: 4680816be336 ("drm/i915: Wait first for submission, before waiting for request completion") Fixes: 221fe7994554 ("drm/i915: Perform a direct reset of the GPU from the waiter") Cc: Chris Wilson Cc: Mika Kuoppala Cc: Joonas Lahtinen Cc: Tvrtko Ursulin (v2) Cc: Michel Thierry Reviewed-by: Tvrtko Ursulin (v2) Reviewed-by: Michel Thierry Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20170808080828.23650-1-daniel.vetter@ffwll.ch (cherry picked from commit 97154ec242c14f646a3ab3b4da8f838d197f300d) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 9471c88d449eaf..cc484b56eeaa33 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -3485,6 +3485,13 @@ void intel_prepare_reset(struct drm_i915_private *dev_priv) !gpu_reset_clobbers_display(dev_priv)) return; + /* We have a modeset vs reset deadlock, defensively unbreak it. + * + * FIXME: We can do a _lot_ better, this is just a first iteration.
+ */ + i915_gem_set_wedged(dev_priv); + DRM_DEBUG_DRIVER("Wedging GPU to avoid deadlocks with pending modeset updates\n"); + /* * Need mode_config.mutex so that we don't * trample ongoing ->detect() and whatnot. From 1874064eed0502bd9bef7be8023757b0c4f26883 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 14 Aug 2017 20:11:26 -0700 Subject: [PATCH 060/154] Input: elan_i2c - add ELAN0608 to the ACPI table Similar to commit 722c5ac708b4f ("Input: elan_i2c - add ELAN0605 to the ACPI table"), ELAN0608 should be handled by elan_i2c. This touchpad can be found in Lenovo ideapad 320-14IKB. BugLink: https://bugs.launchpad.net/bugs/1708852 Signed-off-by: Kai-Heng Feng Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index 3b616cb7c67f88..9fe3908d12d5ea 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -1248,6 +1248,7 @@ static const struct acpi_device_id elan_acpi_id[] = { { "ELAN0100", 0 }, { "ELAN0600", 0 }, { "ELAN0605", 0 }, + { "ELAN0608", 0 }, { "ELAN1000", 0 }, { } }; From 76988690402dde2880bfe06ecccf381d48ba8e1c Mon Sep 17 00:00:00 2001 From: KT Liao Date: Mon, 14 Aug 2017 20:11:59 -0700 Subject: [PATCH 061/154] Input: elan_i2c - Add another Lenovo ACPI ID for upcoming Lenovo NB Add 2 new IDs (ELAN0609 and ELAN060B) to the list of ACPI IDs that should be handled by the driver.
Signed-off-by: KT Liao Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index 9fe3908d12d5ea..714cf7f9b13859 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -1249,6 +1249,9 @@ static const struct acpi_device_id elan_acpi_id[] = { { "ELAN0600", 0 }, { "ELAN0605", 0 }, { "ELAN0608", 0 }, + { "ELAN0605", 0 }, + { "ELAN0609", 0 }, + { "ELAN060B", 0 }, { "ELAN1000", 0 }, { } }; From 7e1d90f60a0d501c8503e636942ca704a454d910 Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Mon, 14 Aug 2017 14:46:01 -0700 Subject: [PATCH 062/154] ALSA: seq: 2nd attempt at fixing race creating a queue commit 4842e98f26dd80be3623c4714a244ba52ea096a8 ("ALSA: seq: Fix race at creating a queue") attempted to fix a race reported by syzkaller. That fix has been described as follows: " When a sequencer queue is created in snd_seq_queue_alloc(),it adds the new queue element to the public list before referencing it. Thus the queue might be deleted before the call of snd_seq_queue_use(), and it results in the use-after-free error, as spotted by syzkaller. The fix is to reference the queue object at the right time. " Even with that fix in place, syzkaller reported a use-after-free error. It specifically pointed to the last instruction "return q->queue" in snd_seq_queue_alloc(). The pointer q is being used after kfree() has been called on it. It turned out that there is still a small window where a race can happen. The window opens at snd_seq_ioctl_create_queue()->snd_seq_queue_alloc()->queue_list_add() and closes at snd_seq_ioctl_create_queue()->queueptr()->snd_use_lock_use(). Between these two calls, a different thread could delete the queue and possibly re-create a different queue in the same location in queue_list. 
This change prevents this situation by calling snd_use_lock_use() from snd_seq_queue_alloc() prior to calling queue_list_add(). It is then the caller's responsibility to call snd_use_lock_free(&q->use_lock). Fixes: 4842e98f26dd ("ALSA: seq: Fix race at creating a queue") Reported-by: Dmitry Vyukov Cc: Signed-off-by: Daniel Mentz Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 13 ++++--------- sound/core/seq/seq_queue.c | 14 +++++++++----- sound/core/seq/seq_queue.h | 2 +- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 272c55fe17c88a..ea2d0ae85bd367 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1502,16 +1502,11 @@ static int snd_seq_ioctl_unsubscribe_port(struct snd_seq_client *client, static int snd_seq_ioctl_create_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; - int result; struct snd_seq_queue *q; - result = snd_seq_queue_alloc(client->number, info->locked, info->flags); - if (result < 0) - return result; - - q = queueptr(result); - if (q == NULL) - return -EINVAL; + q = snd_seq_queue_alloc(client->number, info->locked, info->flags); + if (IS_ERR(q)) + return PTR_ERR(q); info->queue = q->queue; info->locked = q->locked; @@ -1521,7 +1516,7 @@ static int snd_seq_ioctl_create_queue(struct snd_seq_client *client, void *arg) if (!info->name[0]) snprintf(info->name, sizeof(info->name), "Queue-%d", q->queue); strlcpy(q->name, info->name, sizeof(q->name)); - queuefree(q); + snd_use_lock_free(&q->use_lock); return 0; } diff --git a/sound/core/seq/seq_queue.c b/sound/core/seq/seq_queue.c index 450c5187eecb6b..79e0c5604ef806 100644 --- a/sound/core/seq/seq_queue.c +++ b/sound/core/seq/seq_queue.c @@ -184,22 +184,26 @@ void __exit snd_seq_queues_delete(void) static void queue_use(struct snd_seq_queue *queue, int client, int use); /* allocate a new queue - - * return queue index value or 
negative value for error + * return pointer to new queue or ERR_PTR(-errno) for error + * The new queue's use_lock is set to 1. It is the caller's responsibility to + * call snd_use_lock_free(&q->use_lock). */ -int snd_seq_queue_alloc(int client, int locked, unsigned int info_flags) +struct snd_seq_queue *snd_seq_queue_alloc(int client, int locked, unsigned int info_flags) { struct snd_seq_queue *q; q = queue_new(client, locked); if (q == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); q->info_flags = info_flags; queue_use(q, client, 1); + snd_use_lock_use(&q->use_lock); if (queue_list_add(q) < 0) { + snd_use_lock_free(&q->use_lock); queue_delete(q); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } - return q->queue; + return q; } /* delete a queue - queue must be owned by the client */ diff --git a/sound/core/seq/seq_queue.h b/sound/core/seq/seq_queue.h index 30c8111477f61e..719093489a2c4e 100644 --- a/sound/core/seq/seq_queue.h +++ b/sound/core/seq/seq_queue.h @@ -71,7 +71,7 @@ void snd_seq_queues_delete(void); /* create new queue (constructor) */ -int snd_seq_queue_alloc(int client, int locked, unsigned int flags); +struct snd_seq_queue *snd_seq_queue_alloc(int client, int locked, unsigned int flags); /* delete queue (destructor) */ int snd_seq_queue_delete(int client, int queueid); From a8e800fe0f68bc28ce309914f47e432742b865ed Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 14 Aug 2017 14:35:50 +0200 Subject: [PATCH 063/154] ALSA: usb-audio: Apply sample rate quirk to Sennheiser headset A Senheisser headset requires the typical sample-rate quirk for avoiding spurious errors from inquiring the current sample rate like: usb 1-1: 2:1: cannot get freq at ep 0x4 usb 1-1: 3:1: cannot get freq at ep 0x83 The USB ID 1395:740a has to be added to the entries in snd_usb_get_sample_rate_quirk(). 
Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1052580 Cc: Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index d7b0b0a3a2db55..f3e9e30172f301 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1142,6 +1142,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ case USB_ID(0x05A3, 0x9420): /* ELP HD USB Camera */ case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */ + case USB_ID(0x1395, 0x740a): /* Sennheiser DECT */ case USB_ID(0x1901, 0x0191): /* GE B850V3 CP2114 audio interface */ case USB_ID(0x1de7, 0x0013): /* Phoenix Audio MT202exe */ case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */ From 84393817db09bb436e934f8f8cc981cbca9ea4dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 13:03:47 +0200 Subject: [PATCH 064/154] x86/mtrr: Prevent CPU hotplug lock recursion Larry reported a CPU hotplug lock recursion in the MTRR code. ============================================ WARNING: possible recursive locking detected systemd-udevd/153 is trying to acquire lock: (cpu_hotplug_lock.rw_sem){.+.+.+}, at: [] stop_machine+0x16/0x30 but task is already holding lock: (cpu_hotplug_lock.rw_sem){.+.+.+}, at: [] mtrr_add_page+0x83/0x470 .... cpus_read_lock+0x48/0x90 stop_machine+0x16/0x30 mtrr_add_page+0x18b/0x470 mtrr_add+0x3e/0x70 mtrr_add_page() holds the hotplug rwsem already and calls stop_machine() which acquires it again. Call stop_machine_cpuslocked() instead. Reported-and-tested-by: Larry Finger Reported-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708140920250.1865@nanos Cc: "Paul E. 
McKenney" Cc: Borislav Petkov --- arch/x86/kernel/cpu/mtrr/main.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index c5bb63be4ba1e6..40d5a8a752125e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -237,6 +237,18 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); } +static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); +} + static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { @@ -370,7 +382,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { - set_mtrr(i, base, size, type); + set_mtrr_cpuslocked(i, base, size, type); if (likely(replace < 0)) { mtrr_usage_table[i] = 1; } else { @@ -378,7 +390,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (increment) mtrr_usage_table[i]++; if (unlikely(replace != i)) { - set_mtrr(replace, 0, 0, 0); + set_mtrr_cpuslocked(replace, 0, 0, 0); mtrr_usage_table[replace] = 0; } } @@ -506,7 +518,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) goto out; } if (--mtrr_usage_table[reg] < 1) - set_mtrr(reg, 0, 0, 0); + set_mtrr_cpuslocked(reg, 0, 0, 0); error = reg; out: mutex_unlock(&mtrr_mutex); From 3280d66a6363af0df0441709bc0bc302bd9a2510 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 14 Aug 2017 16:40:11 -0400 Subject: [PATCH 065/154] blk-mq: Fix queue usage on failed request allocation blk_mq_get_request() does not release the callers queue usage 
counter when allocation fails. The caller still needs to account for its own queue usage when it is unable to allocate a request. Fixes: 1ad43c0078b7 ("blk-mq: don't leak preempt counter/q_usage_counter when allocating rq failed") Reported-by: Max Gurtovoy Reviewed-by: Ming Lei Reviewed-by: Sagi Grimberg Tested-by: Max Gurtovoy Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 535cbdf32aabb2..4603b115e23488 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -360,12 +360,12 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, return ERR_PTR(ret); rq = blk_mq_get_request(q, NULL, op, &alloc_data); + blk_queue_exit(q); if (!rq) return ERR_PTR(-EWOULDBLOCK); blk_mq_put_ctx(alloc_data.ctx); - blk_queue_exit(q); rq->__data_len = 0; rq->__sector = (sector_t) -1; @@ -411,12 +411,11 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, alloc_data.ctx = __blk_mq_get_ctx(q, cpu); rq = blk_mq_get_request(q, NULL, op, &alloc_data); + blk_queue_exit(q); if (!rq) return ERR_PTR(-EWOULDBLOCK); - blk_queue_exit(q); - return rq; } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); From 462cdace790ac2ed6aad1b19c9c0af0143b6aab0 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 18 Jul 2017 15:01:00 +0100 Subject: [PATCH 066/154] xen: fix bio vec merging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current test for bio vec merging is not fully accurate and can be tricked into merging bios when certain grant combinations are used. The result of these malicious bio merges is a bio that extends past the memory page used by any of the originating bios. Take into account the following scenario, where a guest creates two grant references that point to the same mfn, ie: grant 1 -> mfn A, grant 2 -> mfn A. 
These references are then used in a PV block request, and mapped by the backend domain, thus obtaining two different pfns that point to the same mfn, pfn B -> mfn A, pfn C -> mfn A. If those grants happen to be used in two consecutive sectors of a disk IO operation becoming two different bios in the backend domain, the checks in xen_biovec_phys_mergeable will succeed, because bfn1 == bfn2 (they both point to the same mfn). However due to the bio merging, the backend domain will end up with a bio that expands past mfn A into mfn A + 1. Fix this by making sure the check in xen_biovec_phys_mergeable takes into account the offset and the length of the bio, this basically replicates what's done in __BIOVEC_PHYS_MERGEABLE using mfns (bus addresses). While there also remove the usage of __BIOVEC_PHYS_MERGEABLE, since that's already checked by the callers of xen_biovec_phys_mergeable. CC: stable@vger.kernel.org Reported-by: "Jan H. Schönherr" Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/biomerge.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index 4da69dbf7dcad7..1bdd02a6d6ac75 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -10,8 +10,7 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page)); unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page)); - return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && - ((bfn1 == bfn2) || ((bfn1+1) == bfn2)); + return bfn1 + PFN_DOWN(vec1->bv_offset + vec1->bv_len) == bfn2; #else /* * XXX: Add support for merging bio_vec when using different page From b15bd8cb37598afb2963f7eb9e2de468d2d60a2f Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Wed, 9 Aug 2017 15:31:40 -0700 Subject: [PATCH 067/154] xen-blkfront: use a right index when checking requests Since commit d05d7f40791c ("Merge branch 'for-4.8/core' of
git://git.kernel.dk/linux-block") and 3fc9d690936f ("Merge branch 'for-4.8/drivers' of git://git.kernel.dk/linux-block"), blkfront_resume() has been using an index for iterating ring_info to check request when iterating blk_shadow in an inner loop. This seems to have been accidentally introduced during the massive rewrite of the block layer macros in the commits. This may cause crash like this: [11798.057074] BUG: unable to handle kernel NULL pointer dereference at 0000000000000048 [11798.058832] IP: [] blkfront_resume+0x10a/0x610 .... [11798.061063] Call Trace: [11798.061063] [] xenbus_dev_resume+0x53/0x140 [11798.061063] [] ? xenbus_dev_probe+0x150/0x150 [11798.061063] [] dpm_run_callback+0x3e/0x110 [11798.061063] [] device_resume+0x88/0x190 [11798.061063] [] dpm_resume+0x100/0x2d0 [11798.061063] [] dpm_resume_end+0x11/0x20 [11798.061063] [] do_suspend+0xe8/0x1a0 [11798.061063] [] shutdown_handler+0xfd/0x130 [11798.061063] [] ? split+0x110/0x110 [11798.061063] [] xenwatch_thread+0x86/0x120 [11798.061063] [] ? prepare_to_wait_event+0x110/0x110 [11798.061063] [] kthread+0xd7/0xf0 [11798.061063] [] ? kfree+0x121/0x170 [11798.061063] [] ? kthread_park+0x60/0x60 [11798.061063] [] ? call_usermodehelper_exec_work+0xb0/0xb0 [11798.061063] [] ? call_usermodehelper_exec_async+0x13a/0x140 [11798.061063] [] ret_from_fork+0x25/0x30 Use the right index in the inner loop. 
Fixes: d05d7f40791c ("Merge branch 'for-4.8/core' of git://git.kernel.dk/linux-block") Fixes: 3fc9d690936f ("Merge branch 'for-4.8/drivers' of git://git.kernel.dk/linux-block") Signed-off-by: Munehisa Kamata Reviewed-by: Thomas Friebel Reviewed-by: Eduardo Valentin Reviewed-by: Boris Ostrovsky Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Reviewed-by: Roger Pau Monne Cc: xen-devel@lists.xenproject.org Cc: stable@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 98e34e4c62b8b2..2468c28d477110 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2075,9 +2075,9 @@ static int blkfront_resume(struct xenbus_device *dev) /* * Get the bios in the request so we can re-queue them. */ - if (req_op(shadow[i].request) == REQ_OP_FLUSH || - req_op(shadow[i].request) == REQ_OP_DISCARD || - req_op(shadow[i].request) == REQ_OP_SECURE_ERASE || + if (req_op(shadow[j].request) == REQ_OP_FLUSH || + req_op(shadow[j].request) == REQ_OP_DISCARD || + req_op(shadow[j].request) == REQ_OP_SECURE_ERASE || shadow[j].request->cmd_flags & REQ_FUA) { /* * Flush operations don't contain bios, so From 7a7c286d07f9c704e8fd11dd960bf421cc67b66b Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Fri, 11 Aug 2017 09:34:33 +0800 Subject: [PATCH 068/154] drm/amdgpu: save list length when fence is signaled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update the list first to avoid redundant checks. 
Signed-off-by: Chunming Zhou Reviewed-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c index a6899180b26572..c586f44312f977 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c @@ -244,6 +244,12 @@ struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync, struct dma_fence *f = e->fence; struct amd_sched_fence *s_fence = to_amd_sched_fence(f); + if (dma_fence_is_signaled(f)) { + hash_del(&e->node); + dma_fence_put(f); + kmem_cache_free(amdgpu_sync_slab, e); + continue; + } if (ring && s_fence) { /* For fences from the same ring it is sufficient * when they are scheduled. @@ -256,13 +262,6 @@ struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync, } } - if (dma_fence_is_signaled(f)) { - hash_del(&e->node); - dma_fence_put(f); - kmem_cache_free(amdgpu_sync_slab, e); - continue; - } - return f; } From d76036ab47eafa6ce52b69482e91ca3ba337d6d6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2017 13:00:36 +0200 Subject: [PATCH 069/154] audit: Fix use after free in audit_remove_watch_rule() audit_remove_watch_rule() drops watch's reference to parent but then continues to work with it. That is not safe as parent can get freed once we drop our reference. The following is a trivial reproducer: mount -o loop image /mnt touch /mnt/file auditctl -w /mnt/file -p wax umount /mnt auditctl -D Grab our own reference in audit_remove_watch_rule() earlier to make sure mark does not get freed under us. 
CC: stable@vger.kernel.org Reported-by: Tony Jones Signed-off-by: Jan Kara Tested-by: Tony Jones Signed-off-by: Paul Moore --- kernel/audit_watch.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e0656bd6303684..1c7ded42f82f0e 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule) list_del(&krule->rlist); if (list_empty(&watch->rules)) { + /* + * audit_remove_watch() drops our reference to 'parent' which + * can get freed. Grab our own reference to be safe. + */ + audit_get_parent(parent); audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - audit_get_parent(parent); + if (list_empty(&parent->watches)) fsnotify_destroy_mark(&parent->mark, audit_watch_group); - audit_put_parent(parent); - } + audit_put_parent(parent); } } From b5fed474b98332559f2590c6bc90388a0899e793 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2017 13:00:37 +0200 Subject: [PATCH 070/154] audit: Receive unmount event Although audit_watch_handle_event() can handle FS_UNMOUNT event, it is not part of AUDIT_FS_WATCH mask and thus such event never gets to audit_watch_handle_event(). Thus fsnotify marks are deleted by fsnotify subsystem on unmount without audit being notified about that which leads to a strange state of existing audit rules with dead fsnotify marks. Add FS_UNMOUNT to the mask of events to be received so that audit can clean up its state accordingly. Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1c7ded42f82f0e..d1b5857b7e3302 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. 
*/ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) + FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT) static void audit_free_parent(struct audit_parent *parent) { From 4098116039911e8870d84c975e2ec22dab65a909 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Sat, 12 Aug 2017 23:36:47 +0200 Subject: [PATCH 071/154] parisc: pci memory bar assignment fails with 64bit kernels on dino/cujo For 64bit kernels the lmmio_space_offset of the host bridge window isn't set correctly on systems with dino/cujo PCI host bridges. This leads to not assigned memory bars and failing drivers, which need to use these bars. Signed-off-by: Thomas Bogendoerfer Cc: Acked-by: Helge Deller Signed-off-by: Helge Deller --- drivers/parisc/dino.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index 5c63b920b4713e..ed92c1254cff47 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -956,7 +956,7 @@ static int __init dino_probe(struct parisc_device *dev) dino_dev->hba.dev = dev; dino_dev->hba.base_addr = ioremap_nocache(hpa, 4096); - dino_dev->hba.lmmio_space_offset = 0; /* CPU addrs == bus addrs */ + dino_dev->hba.lmmio_space_offset = PCI_F_EXTEND; spin_lock_init(&dino_dev->dinosaur_pen); dino_dev->hba.iommu = ccio_get_iommu(dev); From 42819eb7a0957cc340ad4ed8bba736bab5ebc464 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 14 Aug 2017 22:12:37 +0200 Subject: [PATCH 072/154] nvmet: don't overwrite identify sn/fr with 0-bytes The merged version of my patch "nvmet: don't report 0-bytes in serial number" fails to remove two lines which should have been replaced, so that the space-padded strings are overwritten again with 0-bytes. Fix it. 
Fixes: 42de82a8b544 nvmet: don't report 0-bytes in serial number Signed-off-by: Martin Wilck Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 2d7a98ab53fbf2..a53bb6635b8378 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -199,12 +199,6 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) copy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1); copy_and_pad(id->fr, sizeof(id->fr), UTS_RELEASE, strlen(UTS_RELEASE)); - memset(id->mn, ' ', sizeof(id->mn)); - strncpy((char *)id->mn, "Linux", sizeof(id->mn)); - - memset(id->fr, ' ', sizeof(id->fr)); - strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); - id->rab = 6; /* From 16a5a480f067f945fd27bf91ffdce3f959b0d4b6 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 14 Aug 2017 11:20:32 -0700 Subject: [PATCH 073/154] nvmet-fc: correct use after free on list teardown Use list_for_each_entry_safe to prevent list handling from referencing next pointers directly after list_del's Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 1b7f2520a20db7..b200f9aadd5201 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -704,7 +704,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) { struct nvmet_fc_tgtport *tgtport = queue->assoc->tgtport; struct nvmet_fc_fcp_iod *fod = queue->fod; - struct nvmet_fc_defer_fcp_req *deferfcp; + struct nvmet_fc_defer_fcp_req *deferfcp, *tempptr; unsigned long flags; int i, writedataactive; bool disconnect; @@ -735,7 +735,8 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) } /* Cleanup defer'ed IOs in queue */ - list_for_each_entry(deferfcp, 
&queue->avail_defer_list, req_list) { + list_for_each_entry_safe(deferfcp, tempptr, &queue->avail_defer_list, + req_list) { list_del(&deferfcp->req_list); kfree(deferfcp); } From 5a69aec945d27e78abac9fd032533d3aaebf7c1e Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 16 Aug 2017 16:01:14 +1000 Subject: [PATCH 074/154] powerpc: Fix VSX enabling/flushing to also test MSR_FP and MSR_VEC VSX uses a combination of the old vector registers, the old FP registers and new "second halves" of the FP registers. Thus when we need to see the VSX state in the thread struct (flush_vsx_to_thread()) or when we'll use the VSX in the kernel (enable_kernel_vsx()) we need to ensure they are all flushed into the thread struct if either of them is individually enabled. Unfortunately we only tested if the whole VSX was enabled, not if they were individually enabled. Fixes: 72cd7b44bc99 ("powerpc: Uncomment and make enable_kernel_vsx() routine available") Cc: stable@vger.kernel.org # v4.3+ Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ec480966f9bf55..1f0fd361e09b94 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -362,7 +362,8 @@ void enable_kernel_vsx(void) cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX); - if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) { + if (current->thread.regs && + (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) { check_if_tm_restore_required(current); /* * If a thread has already been reclaimed then the @@ -386,7 +387,7 @@ void flush_vsx_to_thread(struct task_struct *tsk) { if (tsk->thread.regs) { preempt_disable(); - if (tsk->thread.regs->msr & MSR_VSX) { + if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) { BUG_ON(tsk != current); giveup_vsx(tsk); } From 
a7d2e03928c1936004750c56faf7534c8534f875 Mon Sep 17 00:00:00 2001 From: Bryan Tan Date: Thu, 10 Aug 2017 12:05:02 -0700 Subject: [PATCH 075/154] RDMA/vmw_pvrdma: Report CQ missed events There is a chance of a race between arming the CQ and receiving completions. By reporting CQ missed events any ULPs should poll again to get the completions. Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver") Acked-by: Aditya Sarwade Signed-off-by: Bryan Tan Signed-off-by: Adit Ranadive Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 69bda611d31385..90aa326fd7c097 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -65,13 +65,28 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq, struct pvrdma_dev *dev = to_vdev(ibcq->device); struct pvrdma_cq *cq = to_vcq(ibcq); u32 val = cq->cq_handle; + unsigned long flags; + int has_data = 0; val |= (notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? 
PVRDMA_UAR_CQ_ARM_SOL : PVRDMA_UAR_CQ_ARM; + spin_lock_irqsave(&cq->cq_lock, flags); + pvrdma_write_uar_cq(dev, val); - return 0; + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + unsigned int head; + + has_data = pvrdma_idx_ring_has_data(&cq->ring_state->rx, + cq->ibcq.cqe, &head); + if (unlikely(has_data == PVRDMA_INVALID_IDX)) + dev_err(&dev->pdev->dev, "CQ ring state invalid\n"); + } + + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return has_data; } /** From f67ace2d8868d06710ceea1b10b124eead5040da Mon Sep 17 00:00:00 2001 From: Chien Tin Tung Date: Tue, 8 Aug 2017 20:38:43 -0500 Subject: [PATCH 076/154] i40iw: Fix parsing of query/commit FPM buffers Parsing of commit/query Host Memory Cache Function Private Memory is not skipping over reserved fields and incorrectly assigning those values into object's base/cnt/max_cnt fields. Skip over reserved fields and set correct values. Also correct memory alignment requirement for commit/query FPM buffers. Signed-off-by: Chien Tin Tung Signed-off-by: Shiraz Saleem Signed-off-by: Christopher N Bednarz Signed-off-by: Henry Orosco Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 121 +++++++++++++++-------- drivers/infiniband/hw/i40iw/i40iw_d.h | 4 +- 2 files changed, 83 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index 9ec1ae9a82c984..ef4a73cd171099 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -130,20 +130,32 @@ static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf( u64 base = 0; u32 i, j; u32 k = 0; - u32 low; /* copy base values in obj_info */ - for (i = I40IW_HMC_IW_QP, j = 0; - i <= I40IW_HMC_IW_PBLE; i++, j += 8) { + for (i = I40IW_HMC_IW_QP, j = 0; i <= I40IW_HMC_IW_PBLE; i++, j += 8) { + if ((i == I40IW_HMC_IW_SRQ) || + (i == I40IW_HMC_IW_FSIMC) || + (i == I40IW_HMC_IW_FSIAV)) { + info[i].base = 0; + info[i].cnt = 0; + continue; + } 
get_64bit_val(buf, j, &temp); info[i].base = RS_64_1(temp, 32) * 512; if (info[i].base > base) { base = info[i].base; k = i; } - low = (u32)(temp); - if (low) - info[i].cnt = low; + if (i == I40IW_HMC_IW_APBVT_ENTRY) { + info[i].cnt = 1; + continue; + } + if (i == I40IW_HMC_IW_QP) + info[i].cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_QPS); + else if (i == I40IW_HMC_IW_CQ) + info[i].cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_CQS); + else + info[i].cnt = (u32)(temp); } size = info[k].cnt * info[k].size + info[k].base; if (size & 0x1FFFFF) @@ -154,6 +166,31 @@ static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf( return 0; } +/** + * i40iw_sc_decode_fpm_query() - Decode a 64 bit value into max count and size + * @buf: ptr to fpm query buffer + * @buf_idx: index into buf + * @info: ptr to i40iw_hmc_obj_info struct + * @rsrc_idx: resource index into info + * + * Decode a 64 bit value from fpm query buffer into max count and size + */ +static u64 i40iw_sc_decode_fpm_query(u64 *buf, + u32 buf_idx, + struct i40iw_hmc_obj_info *obj_info, + u32 rsrc_idx) +{ + u64 temp; + u32 size; + + get_64bit_val(buf, buf_idx, &temp); + obj_info[rsrc_idx].max_cnt = (u32)temp; + size = (u32)RS_64_1(temp, 32); + obj_info[rsrc_idx].size = LS_64_1(1, size); + + return temp; +} + /** * i40iw_sc_parse_fpm_query_buf() - parses fpm query buffer * @buf: ptr to fpm query buffer @@ -168,9 +205,9 @@ static enum i40iw_status_code i40iw_sc_parse_fpm_query_buf( struct i40iw_hmc_info *hmc_info, struct i40iw_hmc_fpm_misc *hmc_fpm_misc) { - u64 temp; struct i40iw_hmc_obj_info *obj_info; - u32 i, j, size; + u64 temp; + u32 size; u16 max_pe_sds; obj_info = hmc_info->hmc_obj; @@ -185,41 +222,52 @@ static enum i40iw_status_code i40iw_sc_parse_fpm_query_buf( hmc_fpm_misc->max_sds = max_pe_sds; hmc_info->sd_table.sd_cnt = max_pe_sds + hmc_info->first_sd_index; - for (i = I40IW_HMC_IW_QP, j = 8; - i <= I40IW_HMC_IW_ARP; i++, j += 8) { - get_64bit_val(buf, j, &temp); - if (i == I40IW_HMC_IW_QP) - 
obj_info[i].max_cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_QPS); - else if (i == I40IW_HMC_IW_CQ) - obj_info[i].max_cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_CQS); - else - obj_info[i].max_cnt = (u32)temp; + get_64bit_val(buf, 8, &temp); + obj_info[I40IW_HMC_IW_QP].max_cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_QPS); + size = (u32)RS_64_1(temp, 32); + obj_info[I40IW_HMC_IW_QP].size = LS_64_1(1, size); - size = (u32)RS_64_1(temp, 32); - obj_info[i].size = ((u64)1 << size); - } - for (i = I40IW_HMC_IW_MR, j = 48; - i <= I40IW_HMC_IW_PBLE; i++, j += 8) { - get_64bit_val(buf, j, &temp); - obj_info[i].max_cnt = (u32)temp; - size = (u32)RS_64_1(temp, 32); - obj_info[i].size = LS_64_1(1, size); - } + get_64bit_val(buf, 16, &temp); + obj_info[I40IW_HMC_IW_CQ].max_cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_CQS); + size = (u32)RS_64_1(temp, 32); + obj_info[I40IW_HMC_IW_CQ].size = LS_64_1(1, size); + + i40iw_sc_decode_fpm_query(buf, 32, obj_info, I40IW_HMC_IW_HTE); + i40iw_sc_decode_fpm_query(buf, 40, obj_info, I40IW_HMC_IW_ARP); + + obj_info[I40IW_HMC_IW_APBVT_ENTRY].size = 8192; + obj_info[I40IW_HMC_IW_APBVT_ENTRY].max_cnt = 1; + + i40iw_sc_decode_fpm_query(buf, 48, obj_info, I40IW_HMC_IW_MR); + i40iw_sc_decode_fpm_query(buf, 56, obj_info, I40IW_HMC_IW_XF); - get_64bit_val(buf, 120, &temp); - hmc_fpm_misc->max_ceqs = (u8)RS_64(temp, I40IW_QUERY_FPM_MAX_CEQS); - get_64bit_val(buf, 120, &temp); - hmc_fpm_misc->ht_multiplier = RS_64(temp, I40IW_QUERY_FPM_HTMULTIPLIER); - get_64bit_val(buf, 120, &temp); - hmc_fpm_misc->timer_bucket = RS_64(temp, I40IW_QUERY_FPM_TIMERBUCKET); get_64bit_val(buf, 64, &temp); + obj_info[I40IW_HMC_IW_XFFL].max_cnt = (u32)temp; + obj_info[I40IW_HMC_IW_XFFL].size = 4; hmc_fpm_misc->xf_block_size = RS_64(temp, I40IW_QUERY_FPM_XFBLOCKSIZE); if (!hmc_fpm_misc->xf_block_size) return I40IW_ERR_INVALID_SIZE; + + i40iw_sc_decode_fpm_query(buf, 72, obj_info, I40IW_HMC_IW_Q1); + get_64bit_val(buf, 80, &temp); + obj_info[I40IW_HMC_IW_Q1FL].max_cnt = (u32)temp; 
+ obj_info[I40IW_HMC_IW_Q1FL].size = 4; hmc_fpm_misc->q1_block_size = RS_64(temp, I40IW_QUERY_FPM_Q1BLOCKSIZE); if (!hmc_fpm_misc->q1_block_size) return I40IW_ERR_INVALID_SIZE; + + i40iw_sc_decode_fpm_query(buf, 88, obj_info, I40IW_HMC_IW_TIMER); + + get_64bit_val(buf, 112, &temp); + obj_info[I40IW_HMC_IW_PBLE].max_cnt = (u32)temp; + obj_info[I40IW_HMC_IW_PBLE].size = 8; + + get_64bit_val(buf, 120, &temp); + hmc_fpm_misc->max_ceqs = (u8)RS_64(temp, I40IW_QUERY_FPM_MAX_CEQS); + hmc_fpm_misc->ht_multiplier = RS_64(temp, I40IW_QUERY_FPM_HTMULTIPLIER); + hmc_fpm_misc->timer_bucket = RS_64(temp, I40IW_QUERY_FPM_TIMERBUCKET); + return 0; } @@ -3392,13 +3440,6 @@ enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, u8 hmc_fn_ hmc_info->sd_table.sd_entry = virt_mem.va; } - /* fill size of objects which are fixed */ - hmc_info->hmc_obj[I40IW_HMC_IW_XFFL].size = 4; - hmc_info->hmc_obj[I40IW_HMC_IW_Q1FL].size = 4; - hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size = 8; - hmc_info->hmc_obj[I40IW_HMC_IW_APBVT_ENTRY].size = 8192; - hmc_info->hmc_obj[I40IW_HMC_IW_APBVT_ENTRY].max_cnt = 1; - return ret_code; } diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h index a39ac12b6a7e84..2ebaadbed37940 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_d.h +++ b/drivers/infiniband/hw/i40iw/i40iw_d.h @@ -1507,8 +1507,8 @@ enum { I40IW_CQ0_ALIGNMENT_MASK = (256 - 1), I40IW_HOST_CTX_ALIGNMENT_MASK = (4 - 1), I40IW_SHADOWAREA_MASK = (128 - 1), - I40IW_FPM_QUERY_BUF_ALIGNMENT_MASK = 0, - I40IW_FPM_COMMIT_BUF_ALIGNMENT_MASK = 0 + I40IW_FPM_QUERY_BUF_ALIGNMENT_MASK = (4 - 1), + I40IW_FPM_COMMIT_BUF_ALIGNMENT_MASK = (4 - 1) }; enum i40iw_alignment { From 8129331f01a683ed8d9a9a65ed01b5c6ad26403a Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Tue, 8 Aug 2017 20:38:44 -0500 Subject: [PATCH 077/154] i40iw: Correct variable names Fix incorrect naming of status code and struct. Use inline instead of immediate. 
Signed-off-by: Mustafa Ismail Signed-off-by: Henry Orosco Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_status.h | 2 +- drivers/infiniband/hw/i40iw/i40iw_uk.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_status.h b/drivers/infiniband/hw/i40iw/i40iw_status.h index 91c421762f0679..f7013f11d80859 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_status.h +++ b/drivers/infiniband/hw/i40iw/i40iw_status.h @@ -62,7 +62,7 @@ enum i40iw_status_code { I40IW_ERR_INVALID_ALIGNMENT = -23, I40IW_ERR_FLUSHED_QUEUE = -24, I40IW_ERR_INVALID_PUSH_PAGE_INDEX = -25, - I40IW_ERR_INVALID_IMM_DATA_SIZE = -26, + I40IW_ERR_INVALID_INLINE_DATA_SIZE = -26, I40IW_ERR_TIMEOUT = -27, I40IW_ERR_OPCODE_MISMATCH = -28, I40IW_ERR_CQP_COMPL_ERROR = -29, diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c index b0d3a0e8a9b522..70a6b41980fa87 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_uk.c +++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c @@ -435,7 +435,7 @@ static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp, op_info = &info->op.inline_rdma_write; if (op_info->len > I40IW_MAX_INLINE_DATA_SIZE) - return I40IW_ERR_INVALID_IMM_DATA_SIZE; + return I40IW_ERR_INVALID_INLINE_DATA_SIZE; ret_code = i40iw_inline_data_size_to_wqesize(op_info->len, &wqe_size); if (ret_code) @@ -511,7 +511,7 @@ static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp, op_info = &info->op.inline_send; if (op_info->len > I40IW_MAX_INLINE_DATA_SIZE) - return I40IW_ERR_INVALID_IMM_DATA_SIZE; + return I40IW_ERR_INVALID_INLINE_DATA_SIZE; ret_code = i40iw_inline_data_size_to_wqesize(op_info->len, &wqe_size); if (ret_code) @@ -1187,7 +1187,7 @@ enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size, u8 *wqe_size) { if (data_size > I40IW_MAX_INLINE_DATA_SIZE) - return I40IW_ERR_INVALID_IMM_DATA_SIZE; + return I40IW_ERR_INVALID_INLINE_DATA_SIZE; if (data_size <= 16) 
*wqe_size = I40IW_QP_WQE_MIN_SIZE; From 29c2415a6669bab354f0aa3445777fe147c7a05d Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Tue, 8 Aug 2017 20:38:46 -0500 Subject: [PATCH 078/154] i40iw: Fix typecast of tcp_seq_num The typecast of tcp_seq_num incorrectly uses u8. Fix by casting to u32. Signed-off-by: Mustafa Ismail Signed-off-by: Henry Orosco Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_uk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c index 70a6b41980fa87..1060725d18bce8 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_uk.c +++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c @@ -784,7 +784,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, get_64bit_val(cqe, 0, &qword0); get_64bit_val(cqe, 16, &qword2); - info->tcp_seq_num = (u8)RS_64(qword0, I40IWCQ_TCPSEQNUM); + info->tcp_seq_num = (u32)RS_64(qword0, I40IWCQ_TCPSEQNUM); info->qp_id = (u32)RS_64(qword2, I40IWCQ_QPID); From a28f047e5f9b987d614eeee34388087ffdda3e53 Mon Sep 17 00:00:00 2001 From: Christopher N Bednarz Date: Tue, 8 Aug 2017 20:38:47 -0500 Subject: [PATCH 079/154] i40iw: Use correct alignment for CQ0 memory Utilize correct alignment variable when allocating DMA memory for CQ0. 
Signed-off-by: Christopher N Bednarz Signed-off-by: Henry Orosco Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_puda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index 71050c5d29a05f..7f5583d83622a5 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -685,7 +685,7 @@ static enum i40iw_status_code i40iw_puda_cq_create(struct i40iw_puda_rsrc *rsrc) cqsize = rsrc->cq_size * (sizeof(struct i40iw_cqe)); tsize = cqsize + sizeof(struct i40iw_cq_shadow_area); ret = i40iw_allocate_dma_mem(dev->hw, &rsrc->cqmem, tsize, - I40IW_CQ0_ALIGNMENT_MASK); + I40IW_CQ0_ALIGNMENT); if (ret) return ret; From aa939c12ab8a0c094420ad1b909a957ac590e43e Mon Sep 17 00:00:00 2001 From: Christopher N Bednarz Date: Tue, 8 Aug 2017 20:38:48 -0500 Subject: [PATCH 080/154] i40iw: Fix potential fcn_id_array out of bounds Avoid out of bounds error by utilizing I40IW_MAX_STATS_COUNT instead of I40IW_INVALID_FCN_ID. 
Signed-off-by: Christopher N Bednarz Signed-off-by: Henry Orosco Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index ef4a73cd171099..a49ff2eb6fb3bc 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -4881,7 +4881,7 @@ void i40iw_vsi_stats_free(struct i40iw_sc_vsi *vsi) { u8 fcn_id = vsi->fcn_id; - if ((vsi->stats_fcn_id_alloc) && (fcn_id != I40IW_INVALID_FCN_ID)) + if (vsi->stats_fcn_id_alloc && fcn_id < I40IW_MAX_STATS_COUNT) vsi->dev->fcn_id_array[fcn_id] = false; i40iw_hw_stats_stop_timer(vsi); } From 5b59a3969e95cd9be3699ecf7149ae8ef103b6f5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 8 Aug 2017 18:41:02 +0100 Subject: [PATCH 081/154] IB/hns: fix memory leak on ah on error return path When dmac is NULL, ah is not being freed on the error return path. Fix this by kfree'ing it. 
Detected by CoverityScan, CID#1452636 ("Resource Leak") Fixes: d8966fcd4c25 ("IB/core: Use rdma_ah_attr accessor functions") Signed-off-by: Colin Ian King Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_ah.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index f78a733a63ec7e..d545302b8ef8c5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -64,8 +64,10 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, } else { u8 *dmac = rdma_ah_retrieve_dmac(ah_attr); - if (!dmac) + if (!dmac) { + kfree(ah); return ERR_PTR(-EINVAL); + } memcpy(ah->av.mac, dmac, ETH_ALEN); } From d4ba61d218822578dcf6c2453a38e000b0ea01e6 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 25 Jul 2017 06:51:15 -0700 Subject: [PATCH 082/154] iw_cxgb4: fix misuse of integer variable Fixes: ee30f7d507c0 ("iw_cxgb4: Max fastreg depth depends on DSGL support") Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 5332f06b99ba4c..c2fba76becd4e9 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -661,7 +661,7 @@ struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, rhp = php->rhp; if (mr_type != IB_MR_TYPE_MEM_REG || - max_num_sg > t4_max_fr_depth(&rhp->rdev.lldi.ulptx_memwrite_dsgl && + max_num_sg > t4_max_fr_depth(rhp->rdev.lldi.ulptx_memwrite_dsgl && use_dsgl)) return ERR_PTR(-EINVAL); From 06f8174a97822f6befd28fc2dd315b43b82c700f Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Mon, 17 Jul 2017 14:03:50 -0500 Subject: [PATCH 083/154] IB/core: Protect sysfs entry on ib_unregister_device ib_unregister_device is not protecting removal of sysfs entries. 
A call to ib_register_device in that window can result in duplicate sysfs entry warning. Move mutex_unlock to after ib_device_unregister_sysfs to protect against sysfs entry creation. This issue is exposed during driver load/unload stress test. WARNING: CPU: 5 PID: 4445 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x5f/0x70 sysfs: cannot create duplicate filename '/class/infiniband/i40iw0' Hardware name: Gigabyte Technology Co., Ltd. To be filled by O.E.M./Q87M-D2H BIOS F7 01/17/2014 Workqueue: i40e i40e_service_task [i40e] Call Trace: dump_stack+0x67/0x98 __warn+0xcc/0xf0 warn_slowpath_fmt+0x4a/0x50 ? kernfs_path_from_node+0x4b/0x60 sysfs_warn_dup+0x5f/0x70 sysfs_do_create_link_sd.isra.2+0xb7/0xc0 sysfs_create_link+0x20/0x40 device_add+0x28c/0x600 ib_device_register_sysfs+0x58/0x170 [ib_core] ib_register_device+0x325/0x570 [ib_core] ? i40iw_register_rdma_device+0x1f4/0x400 [i40iw] ? kmem_cache_alloc_trace+0x143/0x330 ? __raw_spin_lock_init+0x2d/0x50 i40iw_register_rdma_device+0x2dc/0x400 [i40iw] i40iw_open+0x10a6/0x1950 [i40iw] ? i40iw_open+0xeab/0x1950 [i40iw] ? i40iw_make_cm_node+0x9c0/0x9c0 [i40iw] i40e_client_subtask+0xa4/0x110 [i40e] i40e_service_task+0xc2d/0x1320 [i40e] process_one_work+0x203/0x710 ? process_one_work+0x16f/0x710 worker_thread+0x126/0x4a0 ? trace_hardirqs_on+0xd/0x10 kthread+0x112/0x150 ? process_one_work+0x710/0x710 ? 
kthread_create_on_node+0x40/0x40 ret_from_fork+0x2e/0x40 ---[ end trace fd11b69e21ea7653 ]--- Couldn't register device i40iw0 with driver model Signed-off-by: Shiraz Saleem Signed-off-by: Sindhu Devale Signed-off-by: Doug Ledford --- drivers/infiniband/core/device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a5dfab6adf495b..221468f7712844 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -537,10 +537,11 @@ void ib_unregister_device(struct ib_device *device) } up_read(&lists_rwsem); - mutex_unlock(&device_mutex); - ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); + + mutex_unlock(&device_mutex); + ib_cache_cleanup_one(device); ib_security_destroy_port_pkey_list(device); From 870201f95fcbd19538aef630393fe9d583eff82e Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 16 Aug 2017 18:57:04 +0300 Subject: [PATCH 084/154] IB/uverbs: Fix NULL pointer dereference during device removal As part of ib_uverbs_remove_one which might be triggered upon reset flow, we trigger IB_EVENT_DEVICE_FATAL event to userspace application. If device was removed after uverbs fd was opened but before ib_uverbs_get_context was called, the event file will be accessed before it was allocated, result in NULL pointer dereference: [ 72.325873] BUG: unable to handle kernel NULL pointer dereference at (null) ... [ 72.325984] IP: _raw_spin_lock_irqsave+0x22/0x40 [ 72.327123] Call Trace: [ 72.327168] ib_uverbs_async_handler.isra.8+0x2e/0x160 [ib_uverbs] [ 72.327216] ? 
synchronize_srcu_expedited+0x27/0x30 [ 72.327269] ib_uverbs_remove_one+0x120/0x2c0 [ib_uverbs] [ 72.327330] ib_unregister_device+0xd0/0x180 [ib_core] [ 72.327373] mlx5_ib_remove+0x74/0x140 [mlx5_ib] [ 72.327422] mlx5_remove_device+0xfb/0x110 [mlx5_core] [ 72.327466] mlx5_unregister_interface+0x3c/0xa0 [mlx5_core] [ 72.327509] mlx5_ib_cleanup+0x10/0x962 [mlx5_ib] [ 72.327546] SyS_delete_module+0x155/0x230 [ 72.328472] ? exit_to_usermode_loop+0x70/0xa6 [ 72.329370] do_syscall_64+0x54/0xc0 [ 72.330262] entry_SYSCALL64_slow_path+0x25/0x25 Fix it by checking that user context was allocated before triggering the event. Fixes: 036b10635739 ('IB/uverbs: Enable device removal when there are active user space applications') Signed-off-by: Maor Gottlieb Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index c023e2c81b8f2b..5e530d2bee4448 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1153,7 +1153,6 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, kref_get(&file->ref); mutex_unlock(&uverbs_dev->lists_mutex); - ib_uverbs_event_handler(&file->event_handler, &event); mutex_lock(&file->cleanup_mutex); ucontext = file->ucontext; @@ -1170,6 +1169,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, * for example due to freeing the resources * (e.g mmput). */ + ib_uverbs_event_handler(&file->event_handler, &event); ib_dev->disassociate_ucontext(ucontext); mutex_lock(&file->cleanup_mutex); ib_uverbs_cleanup_ucontext(file, ucontext, true); From 79db795833bf5c3e798bcd7a5aeeee3fb0505927 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Aug 2017 21:32:19 -0700 Subject: [PATCH 085/154] sparc64: Don't clobber fixed registers in __multi3.
%g4 and %g5 are fixed registers used by the kernel for the thread pointer and the per-cpu offset. Use %o4 and %g7 instead. Diagnosis by Anthony Yznaga. Fixes: 1b4af13ff2cc ("sparc64: Add __multi3 for gcc 7.x and later.") Reported-by: Anatoly Pugachev Tested-by: Anatoly Pugachev Signed-off-by: David S. Miller --- arch/sparc/lib/multi3.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/sparc/lib/multi3.S b/arch/sparc/lib/multi3.S index d6b6c97fe3c730..703127aaf4a501 100644 --- a/arch/sparc/lib/multi3.S +++ b/arch/sparc/lib/multi3.S @@ -5,26 +5,26 @@ .align 4 ENTRY(__multi3) /* %o0 = u, %o1 = v */ mov %o1, %g1 - srl %o3, 0, %g4 - mulx %g4, %g1, %o1 + srl %o3, 0, %o4 + mulx %o4, %g1, %o1 srlx %g1, 0x20, %g3 - mulx %g3, %g4, %g5 - sllx %g5, 0x20, %o5 - srl %g1, 0, %g4 + mulx %g3, %o4, %g7 + sllx %g7, 0x20, %o5 + srl %g1, 0, %o4 sub %o1, %o5, %o5 srlx %o5, 0x20, %o5 - addcc %g5, %o5, %g5 + addcc %g7, %o5, %g7 srlx %o3, 0x20, %o5 - mulx %g4, %o5, %g4 + mulx %o4, %o5, %o4 mulx %g3, %o5, %o5 sethi %hi(0x80000000), %g3 - addcc %g5, %g4, %g5 - srlx %g5, 0x20, %g5 + addcc %g7, %o4, %g7 + srlx %g7, 0x20, %g7 add %g3, %g3, %g3 movcc %xcc, %g0, %g3 - addcc %o5, %g5, %o5 - sllx %g4, 0x20, %g4 - add %o1, %g4, %o1 + addcc %o5, %g7, %o5 + sllx %o4, 0x20, %o4 + add %o1, %o4, %o1 add %o5, %g3, %g2 mulx %g1, %o2, %g1 add %g1, %g2, %g1 From c7b725be84985532161bcb4fbecd056326998a77 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 15 Aug 2017 18:38:42 -0700 Subject: [PATCH 086/154] net: igmp: Use ingress interface rather than vrf device Anuradha reported that statically added groups for interfaces enslaved to a VRF device were not persisting. The problem is that igmp queries and reports need to use the data in the in_dev for the real ingress device rather than the VRF device. Update igmp_rcv accordingly. 
Fixes: e58e41596811 ("net: Enable support for VRF with ipv4 multicast") Reported-by: Anuradha Karuppiah Signed-off-by: David Ahern Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 498706b072fb70..caf2f1101d027b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1007,10 +1007,18 @@ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct net_device *dev = skb->dev; + struct in_device *in_dev; int len = skb->len; bool dropped = true; + if (netif_is_l3_master(dev)) { + dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif); + if (!dev) + goto drop; + } + + in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto drop; From 6170a506899aee3dd4934c928426505e47b1b466 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Wed, 16 Aug 2017 11:09:10 -0700 Subject: [PATCH 087/154] sparc64: remove unnecessary log message There is no need to log message if ATU hvapi couldn't get register. Unlike PCI hvapi, ATU hvapi registration failure is not hard error. Even if ATU hvapi registration fails (on system with ATU or without ATU) system continues with legacy IOMMU. So only log message when ATU hvapi successfully get registered. Signed-off-by: Tushar Dave Signed-off-by: David S. Miller --- arch/sparc/kernel/pci_sun4v.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index f10e2f7123949d..9ebebf1fd93d2f 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -1266,8 +1266,6 @@ static int pci_sun4v_probe(struct platform_device *op) * ATU group, but ATU hcalls won't be available. 
*/ hv_atu = false; - pr_err(PFX "Could not register hvapi ATU err=%d\n", - err); } else { pr_info(PFX "Registered hvapi ATU major[%lu] minor[%lu]\n", vatu_major, vatu_minor); From 47ac5484fd961420e5ec0bb5b972fde381f57365 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Aug 2017 17:39:52 +0200 Subject: [PATCH 088/154] x86: Fix norandmaps/ADDR_NO_RANDOMIZE Documentation/admin-guide/kernel-parameters.txt says: norandmaps Don't use address space randomization. Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space but it doesn't work because arch_rnd() which is used to randomize mm->mmap_base returns a random value unconditionally. And as Kirill pointed out, ADDR_NO_RANDOMIZE is broken by the same reason. Just shift the PF_RANDOMIZE check from arch_mmap_rnd() to arch_rnd(). Fixes: 1b028f784e8c ("x86/mm: Introduce mmap_compat_base() for 32-bit mmap()") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Acked-by: Kirill A. Shutemov Acked-by: Cyrill Gorcunov Reviewed-by: Dmitry Safonov Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170815153952.GA1076@redhat.com --- arch/x86/mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 229d04a83f8561..c94df122815ad9 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -79,13 +79,13 @@ static int mmap_is_legacy(void) static unsigned long arch_rnd(unsigned int rndbits) { + if (!(current->flags & PF_RANDOMIZE)) + return 0; return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; } unsigned long arch_mmap_rnd(void) { - if (!(current->flags & PF_RANDOMIZE)) - return 0; return arch_rnd(mmap_is_ia32() ? 
mmap32_rnd_bits : mmap64_rnd_bits); } From 01578e36163cdd0e4fd61d9976de15f13364e26d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Aug 2017 17:40:11 +0200 Subject: [PATCH 089/154] x86/elf: Remove the unnecessary ADDR_NO_RANDOMIZE checks The ADDR_NO_RANDOMIZE checks in stack_maxrandom_size() and randomize_stack_top() are not required. PF_RANDOMIZE is set by load_elf_binary() only if ADDR_NO_RANDOMIZE is not set, no need to re-check after that. Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Reviewed-by: Dmitry Safonov Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Linus Torvalds Cc: "Kirill A. Shutemov" Link: http://lkml.kernel.org/r/20170815154011.GB1076@redhat.com --- arch/x86/mm/mmap.c | 3 +-- fs/binfmt_elf.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index c94df122815ad9..a88cfbfbd0781a 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -50,8 +50,7 @@ unsigned long tasksize_64bit(void) static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; - if ((current->flags & PF_RANDOMIZE) && - !(current->personality & ADDR_NO_RANDOMIZE)) { + if (current->flags & PF_RANDOMIZE) { max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); max <<= PAGE_SHIFT; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 879ff9c7ffd01a..6466153f2bf099 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -664,8 +664,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; - if ((current->flags & PF_RANDOMIZE) && - !(current->personality & ADDR_NO_RANDOMIZE)) { + if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; From d6957f3396d0b1ee54d183524550d791054b5ebe Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 15 Aug 2017 11:34:19 +0200 Subject: [PATCH 090/154] 
printk-formats.txt: Better describe the difference between %pS and %pF Sometimes people seem unclear when to use the %pS or %pF printk format. For example, see commit 51d96dc2e2dc ("random: fix warning message on ia64 and parisc") which fixed such a wrong format string. The documentation should be more clear about the difference. Signed-off-by: Helge Deller [pmladek@suse.com: Restructure the entire section] Signed-off-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Helge Deller --- Documentation/printk-formats.txt | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 65ea5915178b4b..074670b98bac78 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -58,20 +58,23 @@ Symbols/Function Pointers %ps versatile_init %pB prev_fn_of_versatile_init+0x88/0x88 -For printing symbols and function pointers. The ``S`` and ``s`` specifiers -result in the symbol name with (``S``) or without (``s``) offsets. Where -this is used on a kernel without KALLSYMS - the symbol address is -printed instead. +The ``F`` and ``f`` specifiers are for printing function pointers, +for example, f->func, &gettimeofday. They have the same result as +``S`` and ``s`` specifiers. But they do an extra conversion on +ia64, ppc64 and parisc64 architectures where the function pointers +are actually function descriptors. + +The ``S`` and ``s`` specifiers can be used for printing symbols +from direct addresses, for example, __builtin_return_address(0), +(void *)regs->ip. They result in the symbol name with (``S``) or +without (``s``) offsets. If KALLSYMS are disabled then the symbol +address is printed instead. The ``B`` specifier results in the symbol name with offsets and should be used when printing stack backtraces.
The specifier takes into consideration the effect of compiler optimisations which may occur when tail-call``s are used and marked with the noreturn GCC attribute. -On ia64, ppc64 and parisc64 architectures function pointers are -actually function descriptors which must first be resolved. The ``F`` and -``f`` specifiers perform this resolution and then provide the same -functionality as the ``S`` and ``s`` specifiers. Kernel Pointers =============== From 494bea39f3201776cdfddc232705f54a0bd210c4 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 16 Aug 2017 13:30:07 +0800 Subject: [PATCH 091/154] openvswitch: fix skb_panic due to the incorrect actions attrlen For sw_flow_actions, the actions_len only represents the kernel part's size, and when we dump the actions to the userspace, we will do the conversions, so its true size may become bigger than the actions_len. But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len to alloc the skbuff, so the user_skb's size may become insufficient and oops will happen like this: skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head: ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:129! [...] Call Trace: [] skb_put+0x43/0x44 [] skb_zerocopy+0x6c/0x1f4 [] queue_userspace_packet+0x3a3/0x448 [openvswitch] [] ovs_dp_upcall+0x30/0x5c [openvswitch] [] output_userspace+0x132/0x158 [openvswitch] [] ? ip6_rcv_finish+0x74/0x77 [ipv6] [] do_execute_actions+0xcc1/0xdc8 [openvswitch] [] ovs_execute_actions+0x74/0x106 [openvswitch] [] ovs_dp_process_packet+0xe1/0xfd [openvswitch] [] ? key_extract+0x63c/0x8d5 [openvswitch] [] ovs_vport_receive+0xa1/0xc3 [openvswitch] [...] 
Also we can find that the actions_len is much smaller than the orig_len: crash> struct sw_flow_actions 0xffff8812f539d000 struct sw_flow_actions { rcu = { next = 0xffff8812f5398800, func = 0xffffe3b00035db32 }, orig_len = 1384, actions_len = 592, actions = 0xffff8812f539d01c } So as a quick fix, use the orig_len instead of the actions_len to alloc the user_skb. Last, this oops happened on our system running a relatively old kernel, but the same risk still exists on the mainline, since we use the wrong actions_len from the beginning. Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet upcall to userspace") Cc: Neil McKee Signed-off-by: Liping Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 1 + net/openvswitch/datapath.c | 7 ++++--- net/openvswitch/datapath.h | 2 ++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index e4610676299bcd..a54a556fcdb57d 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1337,6 +1337,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, goto out; } + OVS_CB(skb)->acts_origlen = acts->orig_len; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 45fe8c8a884df3..6b44fe4052825a 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -381,7 +381,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, } static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, - unsigned int hdrlen) + unsigned int hdrlen, int actions_attrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ @@ -398,7 +398,7 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, /* OVS_PACKET_ATTR_ACTIONS */ if (upcall_info->actions_len) - size +=
nla_total_size(upcall_info->actions_len); + size += nla_total_size(actions_attrlen); /* OVS_PACKET_ATTR_MRU */ if (upcall_info->mru) @@ -465,7 +465,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, else hlen = skb->len; - len = upcall_msg_size(upcall_info, hlen - cutlen); + len = upcall_msg_size(upcall_info, hlen - cutlen, + OVS_CB(skb)->acts_origlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 5d8dcd88815f06..480600649d0b03 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -99,11 +99,13 @@ struct datapath { * when a packet is received by OVS. * @mru: The maximum received fragement size; 0 if the packet is not * fragmented. + * @acts_origlen: The netlink size of the flow actions applied to this skb. * @cutlen: The number of bytes from the packet end to be removed. */ struct ovs_skb_cb { struct vport *input_vport; u16 mru; + u16 acts_origlen; u32 cutlen; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) From 120e9dabaf551c6dc03d3a10a1f026376cb1811c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 07:03:15 -0700 Subject: [PATCH 092/154] dccp: defer ccid_hc_tx_delete() at dismantle time syzkaller team reported another problem in DCCP [1] Problem here is that the structure holding RTO timer (ccid2_hc_tx_rto_expire() handler) is freed too soon. We can not use del_timer_sync() to cancel the timer since this timer wants to grab socket lock (that would risk a deadlock) Solution is to defer the freeing of memory when all references to the socket were released. Socket timers do own a reference, so this should fix the issue.
[1] ================================================================== BUG: KASAN: use-after-free in ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144 Read of size 4 at addr ffff8801d2660540 by task kworker/u4:7/3365 CPU: 1 PID: 3365 Comm: kworker/u4:7 Not tainted 4.13.0-rc4+ #3 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events_unbound call_usermodehelper_exec_work Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x24e/0x340 mm/kasan/report.c:409 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429 ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144 call_timer_fn+0x233/0x830 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 [inline] __run_timers+0x7fd/0xb90 kernel/time/timer.c:1601 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:638 [inline] smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:1044 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:702 RIP: 0010:arch_local_irq_enable arch/x86/include/asm/paravirt.h:824 [inline] RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:267 [inline] RIP: 0010:_raw_write_unlock_irq+0x56/0x70 kernel/locking/spinlock.c:343 RSP: 0018:ffff8801cd50eaa8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff10 RAX: dffffc0000000000 RBX: ffffffff85a090c0 RCX: 0000000000000006 RDX: 1ffffffff0b595f3 RSI: 1ffff1003962f989 RDI: ffffffff85acaf98 RBP: ffff8801cd50eab0 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801cc96ea60 R13: dffffc0000000000 R14: ffff8801cc96e4c0 R15: ffff8801cc96e4c0 release_task+0xe9e/0x1a40 
kernel/exit.c:220 wait_task_zombie kernel/exit.c:1162 [inline] wait_consider_task+0x29b8/0x33c0 kernel/exit.c:1389 do_wait_thread kernel/exit.c:1452 [inline] do_wait+0x441/0xa90 kernel/exit.c:1523 kernel_wait4+0x1f5/0x370 kernel/exit.c:1665 SYSC_wait4+0x134/0x140 kernel/exit.c:1677 SyS_wait4+0x2c/0x40 kernel/exit.c:1673 call_usermodehelper_exec_sync kernel/kmod.c:286 [inline] call_usermodehelper_exec_work+0x1a0/0x2c0 kernel/kmod.c:323 process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2097 worker_thread+0x223/0x1860 kernel/workqueue.c:2231 kthread+0x35e/0x430 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:425 Allocated by task 21267: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc+0x127/0x750 mm/slab.c:3561 ccid_new+0x20e/0x390 net/dccp/ccid.c:151 dccp_hdlr_ccid+0x27/0x140 net/dccp/feat.c:44 __dccp_feat_activate+0x142/0x2a0 net/dccp/feat.c:344 dccp_feat_activate_values+0x34e/0xa90 net/dccp/feat.c:1538 dccp_rcv_request_sent_state_process net/dccp/input.c:472 [inline] dccp_rcv_state_process+0xed1/0x1620 net/dccp/input.c:677 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679 sk_backlog_rcv include/net/sock.h:911 [inline] __release_sock+0x124/0x360 net/core/sock.c:2269 release_sock+0xa4/0x2a0 net/core/sock.c:2784 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline] __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682 SYSC_connect+0x204/0x470 net/socket.c:1642 SyS_connect+0x24/0x30 net/socket.c:1623 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 3049: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] 
kmem_cache_free+0x77/0x280 mm/slab.c:3763 ccid_hc_tx_delete+0xc5/0x100 net/dccp/ccid.c:190 dccp_destroy_sock+0x1d1/0x2b0 net/dccp/proto.c:225 inet_csk_destroy_sock+0x166/0x3f0 net/ipv4/inet_connection_sock.c:833 dccp_done+0xb7/0xd0 net/dccp/proto.c:145 dccp_time_wait+0x13d/0x300 net/dccp/minisocks.c:72 dccp_rcv_reset+0x1d1/0x5b0 net/dccp/input.c:160 dccp_rcv_state_process+0x8fc/0x1620 net/dccp/input.c:663 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679 sk_backlog_rcv include/net/sock.h:911 [inline] __sk_receive_skb+0x33e/0xc00 net/core/sock.c:521 dccp_v4_rcv+0xef1/0x1c00 net/dccp/ipv4.c:871 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:248 [inline] ip_local_deliver+0x1ce/0x6d0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:477 [inline] ip_rcv_finish+0x8db/0x19c0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:248 [inline] ip_rcv+0xc3f/0x17d0 net/ipv4/ip_input.c:488 __netif_receive_skb_core+0x19af/0x33d0 net/core/dev.c:4417 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4455 process_backlog+0x203/0x740 net/core/dev.c:5130 napi_poll net/core/dev.c:5527 [inline] net_rx_action+0x792/0x1910 net/core/dev.c:5593 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284 The buggy address belongs to the object at ffff8801d2660100 which belongs to the cache ccid2_hc_tx_sock of size 1240 The buggy address is located 1088 bytes inside of 1240-byte region [ffff8801d2660100, ffff8801d26605d8) The buggy address belongs to the page: page:ffffea0007499800 count:1 mapcount:0 mapping:ffff8801d2660100 index:0x0 compound_mapcount: 0 flags: 0x200000000008100(slab|head) raw: 0200000000008100 ffff8801d2660100 0000000000000000 0000000100000005 raw: ffffea00075271a0 ffffea0007538820 ffff8801d3aef9c0 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801d2660400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801d2660480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
>ffff8801d2660500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801d2660580: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc ffff8801d2660600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/proto.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 86bc40ba6ba5b4..b68168fcc06aa1 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -170,6 +171,15 @@ const char *dccp_packet_name(const int type) EXPORT_SYMBOL_GPL(dccp_packet_name); +static void dccp_sk_destruct(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_tx_ccid = NULL; + inet_sock_destruct(sk); +} + int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); @@ -179,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; + sk->sk_destruct = dccp_sk_destruct; icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; @@ -219,8 +230,7 @@ void dccp_destroy_sock(struct sock *sk) dp->dccps_hc_rx_ackvec = NULL; } ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + dp->dccps_hc_rx_ccid = NULL; /* clean up feature negotiation state */ dccp_feat_list_purge(&dp->dccps_featneg); From 81fbfe8adaf38d4f5a98c19bebfd41c5d6acaee8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 10:36:47 -0700 Subject: [PATCH 093/154] ptr_ring: use kmalloc_array() As found by syzkaller, malicious users can set 
whatever tx_queue_len on a tun device and eventually crash the kernel. Lets remove the ALIGN(XXX, SMP_CACHE_BYTES) thing since a small ring buffer is not fast anyway. Fixes: 2e0ab8ca83c1 ("ptr_ring: array based FIFO for pointers") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Michael S. Tsirkin Cc: Jason Wang Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 9 +++++---- include/linux/skb_array.h | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index d8c97ec8a8e66d..37b4bb2545b32d 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -436,9 +436,9 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, __PTR_RING_PEEK_CALL_v; \ }) -static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp) +static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp) { - return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp); + return kcalloc(size, sizeof(void *), gfp); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) @@ -582,7 +582,8 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. 
*/ -static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings, +static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, + unsigned int nrings, int size, gfp_t gfp, void (*destroy)(void *)) { @@ -590,7 +591,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings, void ***queues; int i; - queues = kmalloc(nrings * sizeof *queues, gfp); + queues = kmalloc_array(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index 35226cd4efb0f3..8621ffdeecbf05 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -193,7 +193,8 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) } static inline int skb_array_resize_multiple(struct skb_array **rings, - int nrings, int size, gfp_t gfp) + int nrings, unsigned int size, + gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple((struct ptr_ring **)rings, From c780a049f9bf442314335372c9abc4548bfe3e44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 11:09:12 -0700 Subject: [PATCH 094/154] ipv4: better IP_MAX_MTU enforcement While working on yet another syzkaller report, I found that our IP_MAX_MTU enforcements were not properly done. gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and final result can be bigger than IP_MAX_MTU :/ This is a problem because device mtu can be changed on other cpus or threads. While this patch does not fix the issue I am working on, it is probably worth addressing it. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/ip.h | 4 ++-- net/ipv4/route.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 821cedcc8e73b6..0cf7f5a65fe6be 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -352,7 +352,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, !forwarding) return dst_mtu(dst); - return min(dst->dev->mtu, IP_MAX_MTU); + return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, @@ -364,7 +364,7 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); } - return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU); + return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } u32 ip_idents_reserve(u32 hash, int segs); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7effa62beed3fa..fe877a4a72b1ec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1267,7 +1267,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) if (mtu) return mtu; - mtu = dst->dev->mtu; + mtu = READ_ONCE(dst->dev->mtu); if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { if (rt->rt_uses_gateway && mtu > 576) From 369157b41cca435442cf5add9df209aaf951860d Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 16 Aug 2017 10:47:03 -0700 Subject: [PATCH 095/154] nvmet-fc: eliminate incorrect static markers on local variables There were 2 statics introduced that were bogus. Removed the static designations. 
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index b200f9aadd5201..309c84aa7595b9 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -394,7 +394,7 @@ nvmet_fc_free_ls_iodlist(struct nvmet_fc_tgtport *tgtport) static struct nvmet_fc_ls_iod * nvmet_fc_alloc_ls_iod(struct nvmet_fc_tgtport *tgtport) { - static struct nvmet_fc_ls_iod *iod; + struct nvmet_fc_ls_iod *iod; unsigned long flags; spin_lock_irqsave(&tgtport->lock, flags); @@ -471,7 +471,7 @@ nvmet_fc_destroy_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, static struct nvmet_fc_fcp_iod * nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue) { - static struct nvmet_fc_fcp_iod *fod; + struct nvmet_fc_fcp_iod *fod; lockdep_assert_held(&queue->qlock); From 187e91fe5e915f4b7f39b824aa422493463e443d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 16 Aug 2017 21:08:08 +0200 Subject: [PATCH 096/154] x86/boot/64/clang: Use fixup_pointer() to access 'next_early_pgt' __startup_64() is normally using fixup_pointer() to access globals in a position-independent fashion. However 'next_early_pgt' was accessed directly, which wasn't guaranteed to work. Luckily GCC was generating a R_X86_64_PC32 PC-relative relocation for 'next_early_pgt', but Clang emitted a R_X86_64_32S, which led to accessing invalid memory and rebooting the kernel. Signed-off-by: Alexander Potapenko Acked-by: Kirill A. Shutemov Cc: Dmitry Vyukov Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Michael Davidson Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: c88d71508e36 ("x86/boot/64: Rewrite startup_64() in C") Link: http://lkml.kernel.org/r/20170816190808.131748-1-glider@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 46c3c73e7f43f5..9ba79543d9ee9f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -53,6 +53,7 @@ void __head __startup_64(unsigned long physaddr) pudval_t *pud; pmdval_t *pmd, pmd_entry; int i; + unsigned int *next_pgt_ptr; /* Is the address too large? */ if (physaddr >> MAX_PHYSMEM_BITS) @@ -91,9 +92,9 @@ void __head __startup_64(unsigned long physaddr) * creates a bunch of nonsense entries but that is fine -- * it avoids problems around wraparound. */ - - pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); - pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); + pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); From ee7b1f31200d9f3cc45e1bd22e962bd6b1d4d611 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 11 Aug 2017 17:29:56 +0100 Subject: [PATCH 097/154] of: fix DMA mask generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Historically, DMA masks have suffered some ambiguity between whether they represent the range of physical memory a device can access, or the address bits a device is capable of driving, particularly since on many platforms the two are equivalent. 
Whilst there are some stragglers left (dma_max_pfn(), I'm looking at you...), the majority of DMA code has been cleaned up to follow the latter definition, not least since it is the only one which makes sense once IOMMUs are involved. In this respect, of_dma_configure() has always done the wrong thing in how it generates initial masks based on "dma-ranges". Although rounding down did not affect the TI Keystone platform where dma_addr + size is already a power of two, in any other case it results in a mask which is at best unnecessarily constrained and at worst unusable. BCM2837 illustrates the problem nicely, where we have a DMA base of 3GB and a size of 1GB - 16MB, giving dma_addr + size = 0xff000000 and a resultant mask of 0x7fffffff, which is then insufficient to even cover the necessary offset, effectively making all DMA addresses out-of-range. This has been hidden until now (mostly because we don't yet prevent drivers from simply overwriting this initial mask later upon probe), but due to recent changes elsewhere now shows up as USB being broken on Raspberry Pi 3. Make it right by rounding up instead of down, such that the mask correctly describes all possible bits the device needs to emit. Fixes: 9a6d7298b083 ("of: Calculate device DMA masks based on DT dma-range size") Reported-by: Stefan Wahren Reported-by: Andreas Färber Reported-by: Hans Verkuil Signed-off-by: Robin Murphy Acked-by: Rob Herring Signed-off-by: Christoph Hellwig --- drivers/of/device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/of/device.c b/drivers/of/device.c index 28c38c756f9285..e0a28ea341fe95 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -89,6 +89,7 @@ int of_dma_configure(struct device *dev, struct device_node *np) bool coherent; unsigned long offset; const struct iommu_ops *iommu; + u64 mask; /* * Set default coherent_dma_mask to 32 bit. 
Drivers are expected to @@ -134,10 +135,9 @@ int of_dma_configure(struct device *dev, struct device_node *np) * Limit coherent and dma mask based on size and default mask * set by the driver. */ - dev->coherent_dma_mask = min(dev->coherent_dma_mask, - DMA_BIT_MASK(ilog2(dma_addr + size))); - *dev->dma_mask = min((*dev->dma_mask), - DMA_BIT_MASK(ilog2(dma_addr + size))); + mask = DMA_BIT_MASK(ilog2(dma_addr + size - 1) + 1); + dev->coherent_dma_mask &= mask; + *dev->dma_mask &= mask; coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", From 0f174b3525a43bd51f9397394763925e0ebe7bc7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 16 Aug 2017 14:18:37 +0200 Subject: [PATCH 098/154] ALSA: usb-audio: Add mute TLV for playback volumes on C-Media devices C-Media devices (at least some models) mute the playback stream when volumes are set to the minimum value. But this isn't informed via TLV and the user-space, typically PulseAudio, gets confused as if it's still played in a low volume. This patch adds the new flag, min_mute, to struct usb_mixer_elem_info for indicating that the mixer element is with the minimum-mute volume. This flag is set for known C-Media devices in snd_usb_mixer_fu_apply_quirk() in turn. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196669 Cc: Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 2 ++ sound/usb/mixer.h | 1 + sound/usb/mixer_quirks.c | 6 ++++++ 3 files changed, 9 insertions(+) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 082736c539bc14..e630813c500862 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -542,6 +542,8 @@ int snd_usb_mixer_vol_tlv(struct snd_kcontrol *kcontrol, int op_flag, if (size < sizeof(scale)) return -ENOMEM; + if (cval->min_mute) + scale[0] = SNDRV_CTL_TLVT_DB_MINMAX_MUTE; scale[2] = cval->dBmin; scale[3] = cval->dBmax; if (copy_to_user(_tlv, scale, sizeof(scale))) diff --git a/sound/usb/mixer.h b/sound/usb/mixer.h index 3417ef347e4043..2b4b067646ab09 100644 --- a/sound/usb/mixer.h +++ b/sound/usb/mixer.h @@ -64,6 +64,7 @@ struct usb_mixer_elem_info { int cached; int cache_val[MAX_CHANNELS]; u8 initialized; + u8 min_mute; void *private_data; }; diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c index e3d1dec48ee49f..e1e7ce9ab217f6 100644 --- a/sound/usb/mixer_quirks.c +++ b/sound/usb/mixer_quirks.c @@ -1878,6 +1878,12 @@ void snd_usb_mixer_fu_apply_quirk(struct usb_mixer_interface *mixer, if (unitid == 7 && cval->control == UAC_FU_VOLUME) snd_dragonfly_quirk_db_scale(mixer, cval, kctl); break; + /* lowest playback value is muted on C-Media devices */ + case USB_ID(0x0d8c, 0x000c): + case USB_ID(0x0d8c, 0x0014): + if (strstr(kctl->id.name, "Playback")) + cval->min_mute = 1; + break; } } From c8c03f1858331e85d397bacccd34ef409aae993c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 16 Aug 2017 17:08:07 -0700 Subject: [PATCH 099/154] pty: fix the cached path of the pty slave file descriptor in the master Christian Brauner reported that if you use the TIOCGPTPEER ioctl() to get a slave pty file descriptor, the resulting file descriptor doesn't look right in /proc//fd/. 
In particular, he wanted to use readlink() on /proc/self/fd/ to get the pathname of the slave pty (basically implementing "ptsname{_r}()"). The reason for that was that we had generated the wrong 'struct path' when we create the pty in ptmx_open(). In particular, the dentry was correct, but the vfsmount pointed to the mount of the ptmx node. That _can_ be correct - in case you use "/dev/pts/ptmx" to open the master - but usually is not. The normal case is to use /dev/ptmx, which then looks up the pts/ directory, and then the vfsmount of the ptmx node is obviously the /dev directory, not the /dev/pts/ directory. We actually did have the right vfsmount available, but in the wrong place (it gets looked up in 'devpts_acquire()' when we get a reference to the pts filesystem), and so ptmx_open() used the wrong mnt pointer. The end result of this confusion was that the pty worked fine, but if you did TIOCGPTPEER to get the slave side of the pty, the end result would also work, but have that dodgy 'struct path'. And then when doing "d_path()" on it to get the pathname, the vfsmount would not match the root of the pts directory, and d_path() would return an empty pathname thinking that the entry had escaped a bind mount into another mount. This fixes the problem by making devpts_acquire() return the vfsmount for the pts filesystem, allowing ptmx_open() to trivially just use the right mount for the pts dentry, and create the proper 'struct path'. 
Reported-by: Christian Brauner Cc: Al Viro Acked-by: Eric Biederman Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 7 +++++-- fs/devpts/inode.c | 4 +++- include/linux/devpts_fs.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 284749fb0f6b96..1fc80ea87c13c0 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -793,6 +793,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) struct tty_struct *tty; struct path *pts_path; struct dentry *dentry; + struct vfsmount *mnt; int retval; int index; @@ -805,7 +806,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; - fsi = devpts_acquire(filp); + fsi = devpts_acquire(filp, &mnt); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; @@ -849,7 +850,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); if (!pts_path) goto err_release; - pts_path->mnt = filp->f_path.mnt; + pts_path->mnt = mnt; pts_path->dentry = dentry; path_get(pts_path); tty->link->driver_data = pts_path; @@ -866,6 +867,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) path_put(pts_path); kfree(pts_path); err_release: + mntput(mnt); tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); @@ -874,6 +876,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); + mntput(mnt); out_free_file: tty_free_file(filp); return retval; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 108df2e3602c2c..44dfbca9306f04 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,7 +133,7 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -struct pts_fs_info *devpts_acquire(struct file *filp) +struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) { struct pts_fs_info *result; struct path path; 
@@ -142,6 +142,7 @@ struct pts_fs_info *devpts_acquire(struct file *filp) path = filp->f_path; path_get(&path); + *ptsmnt = NULL; /* Has the devpts filesystem already been found? */ sb = path.mnt->mnt_sb; @@ -165,6 +166,7 @@ struct pts_fs_info *devpts_acquire(struct file *filp) * pty code needs to hold extra references in case of last /dev/tty close */ atomic_inc(&sb->s_active); + *ptsmnt = mntget(path.mnt); result = DEVPTS_SB(sb); out: diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 277ab9af9ac29a..7883e901f65c82 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,7 +19,7 @@ struct pts_fs_info; -struct pts_fs_info *devpts_acquire(struct file *); +struct pts_fs_info *devpts_acquire(struct file *, struct vfsmount **ptsmnt); void devpts_release(struct pts_fs_info *); int devpts_new_index(struct pts_fs_info *); From 81a0b8d74edd5841be29d223ce44bc8db2b00d09 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Aug 2017 13:57:49 +0200 Subject: [PATCH 100/154] nvme-fabrics: fix reporting of unrecognized options Only print the specified options that are not recognized, instead of the whole list of options. 
Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy --- drivers/nvme/host/fabrics.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 2e582a2409437b..5f5cd306f76d05 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -794,7 +794,8 @@ static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts, int i; for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { - if (opt_tokens[i].token & ~allowed_opts) { + if ((opt_tokens[i].token & opts->mask) && + (opt_tokens[i].token & ~allowed_opts)) { pr_warn("invalid parameter '%s'\n", opt_tokens[i].pattern); } From 014cd0a368dc6351c65d51e4ee34f8573a4a1543 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 17 Aug 2017 20:30:39 +1000 Subject: [PATCH 101/154] bpf: Update sysctl documentation to list all supported architectures The sysctl documentation states that the JIT is only available on x86_64, which is no longer correct. Update the list, and break it out to indicate which architectures support the cBPF JIT (via HAVE_CBPF_JIT) or the eBPF JIT (HAVE_EBPF_JIT). Signed-off-by: Michael Ellerman Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 14db18c970b1b0..b9c3c60780104e 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -36,8 +36,23 @@ bpf_jit_enable -------------- This enables Berkeley Packet Filter Just in Time compiler. -Currently supported on x86_64 architecture, bpf_jit provides a framework -to speed packet filtering, the one used by tcpdump/libpcap for example. 
+ +There are two flavors of JIT, the new eBPF JIT supported on: + - x86_64 + - arm64 + - ppc64 + - sparc64 + - mips64 + +And the older cBPF JIT supported on: + - arm + - mips + - ppc + - sparc + +The BPF JIT provides a framework to speed packet filtering, the one used by +tcpdump/libpcap for example. + Values : 0 - disable the JIT (default value) 1 - enable the JIT From 8204f8ddaafafcae074746fcf2a05a45e6827603 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 10 Aug 2017 14:20:28 -0700 Subject: [PATCH 102/154] xfs: clear MS_ACTIVE after finishing log recovery Way back when we established inode block-map redo log items, it was discovered that we needed to prevent the VFS from evicting inodes during log recovery because any given inode might have bmap redo items to replay even if the inode has no link count and is ultimately deleted, and any eviction of an unlinked inode causes the inode to be truncated and freed too early. To make this possible, we set MS_ACTIVE so that inodes would not be torn down immediately upon release. Unfortunately, this also results in the quota inodes not being released at all if a later part of the mount process should fail, because we never reclaim the inodes. So, set MS_ACTIVE right before we do the last part of log recovery and clear it immediately after we finish the log recovery so that everything will be torn down properly if we abort the mount. Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped") Signed-off-by: Darrick J. 
Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_log.c | 11 +++++++++++ fs/xfs/xfs_mount.c | 10 ---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0053bcf2b10a1d..4ebd0bafc914ce 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -749,9 +749,20 @@ xfs_log_mount_finish( return 0; } + /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion + * of inodes before we're done replaying log items on those + * inodes. Turn it off immediately after recovery finishes + * so that we don't leak the quota inodes if subsequent mount + * activities fail. + */ + mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); + mp->m_super->s_flags &= ~MS_ACTIVE; return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 40d4e8b4e193b4..151a82db094590 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -944,15 +944,6 @@ xfs_mountfs( } } - /* - * During the second phase of log recovery, we need iget and - * iput to behave like they do for an active filesystem. - * xfs_fs_drop_inode needs to be able to prevent the deletion - * of inodes before we're done replaying log items on those - * inodes. - */ - mp->m_super->s_flags |= MS_ACTIVE; - /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently @@ -1028,7 +1019,6 @@ xfs_mountfs( out_quota: xfs_qm_unmount_quotas(mp); out_rtunmount: - mp->m_super->s_flags &= ~MS_ACTIVE; xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); From 77aff8c76425c8f49b50d0b9009915066739e7d2 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Thu, 10 Aug 2017 14:20:29 -0700 Subject: [PATCH 103/154] xfs: don't leak quotacheck dquots when cow recovery If we fail a mount on account of cow recovery errors, it's possible that a previous quotacheck left some dquots in memory. The bailout clause of xfs_mountfs forgets to purge these, and so we leak them. Fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_mount.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 151a82db094590..ea7d4b4e50d0ca 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1024,6 +1024,8 @@ xfs_mountfs( IRELE(rip); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); + /* Clean out dquots that might be in memory after quotacheck. */ + xfs_qm_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); From c40bc54fdf2d52a80f66b365f1eac9d43b32e107 Mon Sep 17 00:00:00 2001 From: Gary Bisson Date: Thu, 17 Aug 2017 15:50:10 +0200 Subject: [PATCH 104/154] ARM: dts: imx6qdl-nitrogen6_som2: fix PCIe reset Previous value was a bad copy of nitrogen6_max device tree. 
Signed-off-by: Gary Bisson Fixes: 3faa1bb2e89c ("ARM: dts: imx: add Boundary Devices Nitrogen6_SOM2 support") Cc: Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi b/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi index aeaa5a6e4fcf46..a24e4f1911abe7 100644 --- a/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi +++ b/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi @@ -507,7 +507,7 @@ pinctrl_pcie: pciegrp { fsl,pins = < /* PCIe reset */ - MX6QDL_PAD_EIM_BCLK__GPIO6_IO31 0x030b0 + MX6QDL_PAD_EIM_DA0__GPIO3_IO00 0x030b0 MX6QDL_PAD_EIM_DA4__GPIO3_IO04 0x030b0 >; }; @@ -668,7 +668,7 @@ &pcie { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_pcie>; - reset-gpio = <&gpio6 31 GPIO_ACTIVE_LOW>; + reset-gpio = <&gpio3 0 GPIO_ACTIVE_LOW>; status = "okay"; }; From e9d8a0fdeacd843c85dcef480cdb2ab76bcdb6e4 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 17 Aug 2017 16:45:06 -0400 Subject: [PATCH 105/154] nvme-pci: set cqe_seen on polled completions Fixes: 920d13a884 ("nvme-pci: factor out the cqe reading mechanics from __nvme_process_cq") Reported-by: Jens Axboe Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 74a124a062640a..925467b31a3339 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -801,6 +801,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, return; } + nvmeq->cqe_seen = 1; req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); nvme_end_request(req, cqe->status, cqe->result); } @@ -830,10 +831,8 @@ static void nvme_process_cq(struct nvme_queue *nvmeq) consumed++; } - if (consumed) { + if (consumed) nvme_ring_cq_doorbell(nvmeq); - nvmeq->cqe_seen = 1; - } } static irqreturn_t nvme_irq(int irq, void *data) From 
ed993c6fdfa7734881a4516852d95ae2d3b604d3 Mon Sep 17 00:00:00 2001 From: Jussi Laako Date: Fri, 18 Aug 2017 10:42:14 +0300 Subject: [PATCH 106/154] ALSA: usb-audio: add DSD support for new Amanero PID Add DSD support for new Amanero Combo384 firmware version with a new PID. This firmware uses DSD_U32_BE. Fixes: 3eff682d765b ("ALSA: usb-audio: Support both DSD LE/BE Amanero firmware versions") Signed-off-by: Jussi Laako Cc: Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index f3e9e30172f301..6a03f9697039cc 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1375,6 +1375,10 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, } } break; + case USB_ID(0x16d0, 0x0a23): + if (fp->altsetting == 2) + return SNDRV_PCM_FMTBIT_DSD_U32_BE; + break; default: break; From 0b36f2bd28d040acedb52f4327eb2441afe4f514 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 18 Aug 2017 10:55:10 +0200 Subject: [PATCH 107/154] ALSA: emu10k1: Fix forgotten user-copy conversion in init code The commit d42fe63d5839 ("ALSA: emu10k1: Get rid of set_fs() usage") converted the user-space copy hack with set_fs() to the direct memcpy(), but one place was forgotten. This resulted in the error from snd_emu10k1_init_efx(), eventually failed to load the driver. Fix the missing piece. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196687 Fixes: d42fe63d5839 ("ALSA: emu10k1: Get rid of set_fs() usage") Signed-off-by: Takashi Iwai --- sound/pci/emu10k1/emufx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c index dc585959ca32c8..a2b56b188be4d9 100644 --- a/sound/pci/emu10k1/emufx.c +++ b/sound/pci/emu10k1/emufx.c @@ -698,10 +698,18 @@ static int copy_gctl(struct snd_emu10k1 *emu, { struct snd_emu10k1_fx8010_control_old_gpr __user *octl; - if (emu->support_tlv) - return copy_from_user(gctl, &_gctl[idx], sizeof(*gctl)); + if (emu->support_tlv) { + if (in_kernel) + memcpy(gctl, (void *)&_gctl[idx], sizeof(*gctl)); + else if (copy_from_user(gctl, &_gctl[idx], sizeof(*gctl))) + return -EFAULT; + return 0; + } + octl = (struct snd_emu10k1_fx8010_control_old_gpr __user *)_gctl; - if (copy_from_user(gctl, &octl[idx], sizeof(*octl))) + if (in_kernel) + memcpy(gctl, (void *)&octl[idx], sizeof(*octl)); + else if (copy_from_user(gctl, &octl[idx], sizeof(*octl))) return -EFAULT; gctl->tlv = NULL; return 0; From 7374bfb82e3844abcc5a5b8034620d80b92b820d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 27 Jul 2017 15:47:33 -0700 Subject: [PATCH 108/154] MAINTAINERS: Remove Jason Cooper's irqchip git tree Jason's irqchip tree does not seem to have been updated for many months now, remove it from the list of trees to avoid any possible confusion. Jason says: "Unfortunately, when I have time for irqchip, I don't always have the time to properly follow up with pull-requests. So, for the time being, I'll stick to reviewing as I can." 
Signed-off-by: Florian Fainelli Signed-off-by: Thomas Gleixner Acked-by: Jason Cooper Cc: marc.zyngier@arm.com Link: http://lkml.kernel.org/r/20170727224733.8288-1-f.fainelli@gmail.com --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f66488dfdbc9cc..b116efa1a0870c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7102,7 +7102,6 @@ M: Marc Zyngier L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core -T: git git://git.infradead.org/users/jcooper/linux.git irqchip/core F: Documentation/devicetree/bindings/interrupt-controller/ F: drivers/irqchip/ From 45bd07ad82622fb7c8dd7504d976b7dd11568965 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 20 Jul 2017 17:00:32 +0530 Subject: [PATCH 109/154] x86: Constify attribute_group structures attribute_groups are not supposed to change at runtime and none of the groups is modified. Mark the non-const structs as const. [ tglx: Folded into one big patch ] Signed-off-by: Arvind Yadav Signed-off-by: Thomas Gleixner Cc: tony.luck@intel.com Cc: bp@alien8.de Link: http://lkml.kernel.org/r/1500550238-15655-2-git-send-email-arvind.yadav.cs@gmail.com --- arch/x86/events/intel/uncore.c | 2 +- arch/x86/events/intel/uncore_nhmex.c | 12 +++---- arch/x86/events/intel/uncore_snb.c | 6 ++-- arch/x86/events/intel/uncore_snbep.c | 42 ++++++++++++------------ arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- arch/x86/kernel/cpu/microcode/core.c | 4 +-- arch/x86/kernel/ksysfs.c | 4 +-- 7 files changed, 36 insertions(+), 36 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 44ec523287f670..1c5390f1cf0992 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -721,7 +721,7 @@ static struct attribute *uncore_pmu_attrs[] = { NULL, }; -static struct attribute_group uncore_pmu_attr_group = { +static const struct attribute_group uncore_pmu_attr_group = { .attrs = 
uncore_pmu_attrs, }; diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index cda56933200503..6a5cbe90f8593e 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -272,7 +272,7 @@ static struct attribute *nhmex_uncore_ubox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_ubox_format_group = { +static const struct attribute_group nhmex_uncore_ubox_format_group = { .name = "format", .attrs = nhmex_uncore_ubox_formats_attr, }; @@ -299,7 +299,7 @@ static struct attribute *nhmex_uncore_cbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_cbox_format_group = { +static const struct attribute_group nhmex_uncore_cbox_format_group = { .name = "format", .attrs = nhmex_uncore_cbox_formats_attr, }; @@ -407,7 +407,7 @@ static struct attribute *nhmex_uncore_bbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_bbox_format_group = { +static const struct attribute_group nhmex_uncore_bbox_format_group = { .name = "format", .attrs = nhmex_uncore_bbox_formats_attr, }; @@ -484,7 +484,7 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_sbox_format_group = { +static const struct attribute_group nhmex_uncore_sbox_format_group = { .name = "format", .attrs = nhmex_uncore_sbox_formats_attr, }; @@ -898,7 +898,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_mbox_format_group = { +static const struct attribute_group nhmex_uncore_mbox_format_group = { .name = "format", .attrs = nhmex_uncore_mbox_formats_attr, }; @@ -1163,7 +1163,7 @@ static struct attribute *nhmex_uncore_rbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_rbox_format_group = { +static const struct attribute_group nhmex_uncore_rbox_format_group = { .name = "format", .attrs = nhmex_uncore_rbox_formats_attr, }; diff --git 
a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index a3dcc12bef4ab3..db1127ce685eb8 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -130,7 +130,7 @@ static struct attribute *snb_uncore_formats_attr[] = { NULL, }; -static struct attribute_group snb_uncore_format_group = { +static const struct attribute_group snb_uncore_format_group = { .name = "format", .attrs = snb_uncore_formats_attr, }; @@ -289,7 +289,7 @@ static struct attribute *snb_uncore_imc_formats_attr[] = { NULL, }; -static struct attribute_group snb_uncore_imc_format_group = { +static const struct attribute_group snb_uncore_imc_format_group = { .name = "format", .attrs = snb_uncore_imc_formats_attr, }; @@ -769,7 +769,7 @@ static struct attribute *nhm_uncore_formats_attr[] = { NULL, }; -static struct attribute_group nhm_uncore_format_group = { +static const struct attribute_group nhm_uncore_format_group = { .name = "format", .attrs = nhm_uncore_formats_attr, }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 4f9127644b80ab..db1fe377e6dd9d 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -602,27 +602,27 @@ static struct uncore_event_desc snbep_uncore_qpi_events[] = { { /* end: all zeroes */ }, }; -static struct attribute_group snbep_uncore_format_group = { +static const struct attribute_group snbep_uncore_format_group = { .name = "format", .attrs = snbep_uncore_formats_attr, }; -static struct attribute_group snbep_uncore_ubox_format_group = { +static const struct attribute_group snbep_uncore_ubox_format_group = { .name = "format", .attrs = snbep_uncore_ubox_formats_attr, }; -static struct attribute_group snbep_uncore_cbox_format_group = { +static const struct attribute_group snbep_uncore_cbox_format_group = { .name = "format", .attrs = snbep_uncore_cbox_formats_attr, }; -static struct attribute_group snbep_uncore_pcu_format_group = { +static 
const struct attribute_group snbep_uncore_pcu_format_group = { .name = "format", .attrs = snbep_uncore_pcu_formats_attr, }; -static struct attribute_group snbep_uncore_qpi_format_group = { +static const struct attribute_group snbep_uncore_qpi_format_group = { .name = "format", .attrs = snbep_uncore_qpi_formats_attr, }; @@ -1431,27 +1431,27 @@ static struct attribute *ivbep_uncore_qpi_formats_attr[] = { NULL, }; -static struct attribute_group ivbep_uncore_format_group = { +static const struct attribute_group ivbep_uncore_format_group = { .name = "format", .attrs = ivbep_uncore_formats_attr, }; -static struct attribute_group ivbep_uncore_ubox_format_group = { +static const struct attribute_group ivbep_uncore_ubox_format_group = { .name = "format", .attrs = ivbep_uncore_ubox_formats_attr, }; -static struct attribute_group ivbep_uncore_cbox_format_group = { +static const struct attribute_group ivbep_uncore_cbox_format_group = { .name = "format", .attrs = ivbep_uncore_cbox_formats_attr, }; -static struct attribute_group ivbep_uncore_pcu_format_group = { +static const struct attribute_group ivbep_uncore_pcu_format_group = { .name = "format", .attrs = ivbep_uncore_pcu_formats_attr, }; -static struct attribute_group ivbep_uncore_qpi_format_group = { +static const struct attribute_group ivbep_uncore_qpi_format_group = { .name = "format", .attrs = ivbep_uncore_qpi_formats_attr, }; @@ -1887,7 +1887,7 @@ static struct attribute *knl_uncore_ubox_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_ubox_format_group = { +static const struct attribute_group knl_uncore_ubox_format_group = { .name = "format", .attrs = knl_uncore_ubox_formats_attr, }; @@ -1927,7 +1927,7 @@ static struct attribute *knl_uncore_cha_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_cha_format_group = { +static const struct attribute_group knl_uncore_cha_format_group = { .name = "format", .attrs = knl_uncore_cha_formats_attr, }; @@ -2037,7 +2037,7 @@ static struct 
attribute *knl_uncore_pcu_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_pcu_format_group = { +static const struct attribute_group knl_uncore_pcu_format_group = { .name = "format", .attrs = knl_uncore_pcu_formats_attr, }; @@ -2187,7 +2187,7 @@ static struct attribute *knl_uncore_irp_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_irp_format_group = { +static const struct attribute_group knl_uncore_irp_format_group = { .name = "format", .attrs = knl_uncore_irp_formats_attr, }; @@ -2385,7 +2385,7 @@ static struct attribute *hswep_uncore_ubox_formats_attr[] = { NULL, }; -static struct attribute_group hswep_uncore_ubox_format_group = { +static const struct attribute_group hswep_uncore_ubox_format_group = { .name = "format", .attrs = hswep_uncore_ubox_formats_attr, }; @@ -2439,7 +2439,7 @@ static struct attribute *hswep_uncore_cbox_formats_attr[] = { NULL, }; -static struct attribute_group hswep_uncore_cbox_format_group = { +static const struct attribute_group hswep_uncore_cbox_format_group = { .name = "format", .attrs = hswep_uncore_cbox_formats_attr, }; @@ -2621,7 +2621,7 @@ static struct attribute *hswep_uncore_sbox_formats_attr[] = { NULL, }; -static struct attribute_group hswep_uncore_sbox_format_group = { +static const struct attribute_group hswep_uncore_sbox_format_group = { .name = "format", .attrs = hswep_uncore_sbox_formats_attr, }; @@ -3314,7 +3314,7 @@ static struct attribute *skx_uncore_cha_formats_attr[] = { NULL, }; -static struct attribute_group skx_uncore_chabox_format_group = { +static const struct attribute_group skx_uncore_chabox_format_group = { .name = "format", .attrs = skx_uncore_cha_formats_attr, }; @@ -3427,7 +3427,7 @@ static struct attribute *skx_uncore_iio_formats_attr[] = { NULL, }; -static struct attribute_group skx_uncore_iio_format_group = { +static const struct attribute_group skx_uncore_iio_format_group = { .name = "format", .attrs = skx_uncore_iio_formats_attr, }; @@ -3484,7 +3484,7 @@ 
static struct attribute *skx_uncore_formats_attr[] = { NULL, }; -static struct attribute_group skx_uncore_format_group = { +static const struct attribute_group skx_uncore_format_group = { .name = "format", .attrs = skx_uncore_formats_attr, }; @@ -3605,7 +3605,7 @@ static struct attribute *skx_upi_uncore_formats_attr[] = { NULL, }; -static struct attribute_group skx_upi_uncore_format_group = { +static const struct attribute_group skx_upi_uncore_format_group = { .name = "format", .attrs = skx_upi_uncore_formats_attr, }; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d7cc190ae45719..f7370abd33c675 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -122,7 +122,7 @@ static struct attribute *thermal_throttle_attrs[] = { NULL }; -static struct attribute_group thermal_attr_group = { +static const struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 9cb98ee103db1b..86e8f0b2537b3e 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -561,7 +561,7 @@ static struct attribute *mc_default_attrs[] = { NULL }; -static struct attribute_group mc_attr_group = { +static const struct attribute_group mc_attr_group = { .attrs = mc_default_attrs, .name = "microcode", }; @@ -707,7 +707,7 @@ static struct attribute *cpu_root_microcode_attrs[] = { NULL }; -static struct attribute_group cpu_root_microcode_group = { +static const struct attribute_group cpu_root_microcode_group = { .name = "microcode", .attrs = cpu_root_microcode_attrs, }; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 4afc67f5facc49..06e1ff5562c0b4 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -55,7 +55,7 @@ static struct bin_attribute *boot_params_data_attrs[] = { NULL, }; 
-static struct attribute_group boot_params_attr_group = { +static const struct attribute_group boot_params_attr_group = { .attrs = boot_params_version_attrs, .bin_attrs = boot_params_data_attrs, }; @@ -202,7 +202,7 @@ static struct bin_attribute *setup_data_data_attrs[] = { NULL, }; -static struct attribute_group setup_data_attr_group = { +static const struct attribute_group setup_data_attr_group = { .attrs = setup_data_type_attrs, .bin_attrs = setup_data_data_attrs, }; From 4dd6a9973b8aaffac4bf37c5bb70e8eae5a7afb4 Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Fri, 28 Jul 2017 09:51:34 -0700 Subject: [PATCH 110/154] soc: ti: ti_sci_pm_domains: Populate name for genpd Commit b6a1d093f96b ("PM / Domains: Extend generic power domain debugfs") now creates a debugfs directory for each genpd based on the name of the genpd. Currently no name is given to the genpd created by ti_sci_pm_domains driver so because of this we see a NULL pointer dereference when it is accessed on boot when the debugfs entry creation is attempted. Give the genpd a name before registering it to avoid this. 
Fixes: 52835d59fc6c ("soc: ti: Add ti_sci_pm_domains driver") Signed-off-by: Dave Gerlach Signed-off-by: Santosh Shilimkar Signed-off-by: Arnd Bergmann --- drivers/soc/ti/ti_sci_pm_domains.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/soc/ti/ti_sci_pm_domains.c b/drivers/soc/ti/ti_sci_pm_domains.c index b0b283810e7224..de31b9389e2ee7 100644 --- a/drivers/soc/ti/ti_sci_pm_domains.c +++ b/drivers/soc/ti/ti_sci_pm_domains.c @@ -176,6 +176,8 @@ static int ti_sci_pm_domain_probe(struct platform_device *pdev) ti_sci_pd->dev = dev; + ti_sci_pd->pd.name = "ti_sci_pd"; + ti_sci_pd->pd.attach_dev = ti_sci_pd_attach_dev; ti_sci_pd->pd.detach_dev = ti_sci_pd_detach_dev; From e8f241893dfbbebe2813c01eac54f263e6a5e59c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Aug 2017 10:53:45 +0100 Subject: [PATCH 111/154] genirq: Restore trigger settings in irq_modify_status() irq_modify_status starts by clearing the trigger settings from irq_data before applying the new settings, but doesn't restore them, leaving them to IRQ_TYPE_NONE. That's pretty confusing to the potential request_irq() that could follow. Instead, snapshot the settings before clearing them, and restore them if the irq_modify_status() invocation was not changing the trigger. 
Fixes: 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") Reported-and-tested-by: jeffy Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jon Hunter Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170818095345.12378-1-marc.zyngier@arm.com --- kernel/irq/chip.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3cc37c0c85e22..3675c6004f2a68 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - unsigned long flags; + unsigned long flags, trigger, tmp; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) @@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); + trigger = irqd_get_trigger_type(&desc->irq_data); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) @@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; + + irqd_set(&desc->irq_data, trigger); irq_put_desc_unlock(desc, flags); } From 7edaeb6841dfb27e362288ab8466ebdc4972e867 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 09:50:13 +0200 Subject: [PATCH 112/154] kernel/watchdog: Prevent false positives with turbo modes The hardlockup detector on x86 uses a performance counter based on unhalted CPU cycles and a periodic hrtimer. 
The hrtimer period is about 2/5 of the performance counter period, so the hrtimer should fire 2-3 times before the performance counter NMI fires. The NMI code checks whether the hrtimer fired since the last invocation. If not, it assumes a hard lockup. The calculation of those periods is based on the nominal CPU frequency. Turbo modes increase the CPU clock frequency and therefore shorten the period of the perf/NMI watchdog. With extreme Turbo-modes (3x nominal frequency) the perf/NMI period is shorter than the hrtimer period which leads to false positives. A simple fix would be to shorten the hrtimer period, but that comes with the side effect of more frequent hrtimer and softlockup thread wakeups, which is not desired. Implement a low pass filter, which checks the perf/NMI period against kernel time. If the perf/NMI fires before 4/5 of the watchdog period has elapsed then the event is ignored and postponed to the next perf/NMI. That solves the problem and avoids the overhead of shorter hrtimer periods and more frequent softlockup thread wakeups. 
Fixes: 58687acba592 ("lockup_detector: Combine nmi_watchdog and softlockup detector") Reported-and-tested-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: dzickus@redhat.com Cc: prarit@redhat.com Cc: ak@linux.intel.com Cc: babu.moger@oracle.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@redhat.com Cc: stable@vger.kernel.org Cc: atomlin@redhat.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708150931310.1886@nanos --- arch/x86/Kconfig | 1 + include/linux/nmi.h | 8 ++++++ kernel/watchdog.c | 1 + kernel/watchdog_hld.c | 59 +++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 7 +++++ 5 files changed, 76 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9ef6..9101bfc85539e5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,6 +100,7 @@ config X86 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 8aa01fd859fb84..a36abe2da13e1a 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -168,6 +168,14 @@ extern int sysctl_hardlockup_all_cpu_backtrace; #define sysctl_softlockup_all_cpu_backtrace 0 #define sysctl_hardlockup_all_cpu_backtrace 0 #endif + +#if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \ + defined(CONFIG_HARDLOCKUP_DETECTOR) +void watchdog_update_hrtimer_threshold(u64 period); +#else +static inline void watchdog_update_hrtimer_threshold(u64 period) { } +#endif + extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06d3389bca0df0..f5d52024f6b72a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -240,6 +240,7 @@ static void set_sample_period(void) * hardlockup detector 
generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + watchdog_update_hrtimer_threshold(sample_period); } /* Commands for resetting the watchdog */ diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 295a0d84934cb1..3a09ea1b1d3d5e 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ + /* + * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 + * + * So it runs effectively with 2.5 times the rate of the NMI + * watchdog. That means the hrtimer should fire 2-3 times before + * the NMI watchdog expires. The NMI watchdog on x86 is based on + * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles + * might run way faster than expected and the NMI fires in a + * smaller period than the one deduced from the nominal CPU + * frequency. Depending on the Turbo-Mode factor this might be fast + * enough to get the NMI period smaller than the hrtimer watchdog + * period and trigger false positives. + * + * The sample threshold is used to check in the NMI handler whether + * the minimum time between two NMI samples has elapsed. That + * prevents false positives. + * + * Set this to 4/5 of the actual watchdog threshold period so the + * hrtimer is guaranteed to fire at least once within the real + * watchdog threshold. 
+ */ + watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ + ktime_t delta, now = ktime_get_mono_fast_ns(); + + delta = now - __this_cpu_read(last_timestamp); + if (delta < watchdog_hrtimer_sample_threshold) { + /* + * If ktime is jiffies based, a stalled timer would prevent + * jiffies from being incremented and the filter would look + * at a stale timestamp and never trigger. + */ + if (__this_cpu_inc_return(nmi_rearmed) < 10) + return false; + } + __this_cpu_write(nmi_rearmed, 0); + __this_cpu_write(last_timestamp, now); + return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ + return true; +} +#endif + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } + if (!watchdog_check_timestamp()) + return; + /* check for a hardlockup * This is done by making sure our timer interrupt * is incrementing. The timer interrupt should have diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 98fe715522e8d1..c617b9d1d6cb68 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -797,6 +797,13 @@ config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR +# +# Enables a timestamp based low pass filter to compensate for perf based +# hard lockup detection which runs too fast due to turbo modes. +# +config HARDLOCKUP_CHECK_TIMESTAMP + bool + # # arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard # lockup detector rather than the perf based detector. 
From c005390374957baacbc38eef96ea360559510aa7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Aug 2017 12:24:47 +0200 Subject: [PATCH 113/154] blk-mq-pci: add a fallback when pci_irq_get_affinity returns NULL While pci_irq_get_affinity should never fail for SMP kernel that implement the affinity mapping, it will always return NULL in the UP case, so provide a fallback mapping of all queues to CPU 0 in that case. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-pci.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index 0c3354cf355287..76944e3271bf34 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -36,12 +36,18 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev) for (queue = 0; queue < set->nr_hw_queues; queue++) { mask = pci_irq_get_affinity(pdev, queue); if (!mask) - return -EINVAL; + goto fallback; for_each_cpu(cpu, mask) set->mq_map[cpu] = queue; } return 0; + +fallback: + WARN_ON_ONCE(set->nr_hw_queues > 1); + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; + return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); From acc8b31665b4cc17b35c4fa445427f7e2f6dc86b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 18 Aug 2017 10:10:43 +0200 Subject: [PATCH 114/154] net: sched: fix p_filter_chain check in tcf_chain_flush The dereference before check is wrong and leads to an oops when p_filter_chain is NULL. The check needs to be done on the pointer to prevent NULL dereference. Fixes: f93e1cdcf42c ("net/sched: fix filter flushing") Signed-off-by: Jiri Pirko Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 39da0c5801c908..9fd44c22134783 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -205,7 +205,7 @@ static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp; - if (*chain->p_filter_chain) + if (chain->p_filter_chain) RCU_INIT_POINTER(*chain->p_filter_chain, NULL); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); From eac2c68d663effb077210218788952b5a0c1f60e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Aug 2017 12:11:50 +0100 Subject: [PATCH 115/154] nfp: fix infinite loop on umapping cleanup The while loop that performs the dma page unmapping never decrements index counter f and hence loops forever. Fix this with a pre-decrement on f. Detected by CoverityScan, CID#1357309 ("Infinite loop") Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs") Signed-off-by: Colin Ian King Acked-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 4631ca8b8eb278..9f77ce038a4a33 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -908,8 +908,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev) return NETDEV_TX_OK; err_unmap: - --f; - while (f >= 0) { + while (--f >= 0) { frag = &skb_shinfo(skb)->frags[f]; dma_unmap_page(dp->dev, tx_ring->txbufs[wr_idx].dma_addr, skb_frag_size(frag), DMA_TO_DEVICE); From a120d9ab65354727559b9db75ded8071b7ef19e2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Aug 2017 14:12:06 +0100 Subject: [PATCH 116/154] netxen: fix incorrect loop counter decrement The loop counter k is currently being decremented from zero which is incorrect. Fix this by incrementing k instead Detected by CoverityScan, CID#401847 ("Infinite loop") Fixes: 83f18a557c6d ("netxen_nic: fw dump support") Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c index 66ff15d08bad4e..0a66389c06c252 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c @@ -2311,7 +2311,7 @@ netxen_md_rdqueue(struct netxen_adapter *adapter, loop_cnt++) { NX_WR_DUMP_REG(select_addr, adapter->ahw.pci_base0, queue_id); read_addr = queueEntry->read_addr; - for (k = 0; k < read_cnt; k--) { + for (k = 0; k < read_cnt; k++) { NX_RD_DUMP_REG(read_addr, adapter->ahw.pci_base0, &read_value); *data_buff++ = read_value; From 2110ba58303f0c2a03360c5f81fbe67ed312e7b9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 18 Aug 2017 17:11:06 +0200 Subject: [PATCH 117/154] bpf, doc: improve sysctl knob description Current context speaking of tcpdump filters is out of date these days, so lets improve the sysctl description for the BPF knobs a bit. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 37 ++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index b9c3c60780104e..d7c2b88b92ae80 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -35,23 +35,32 @@ Table : Subdirectories in /proc/sys/net bpf_jit_enable -------------- -This enables Berkeley Packet Filter Just in Time compiler. - -There are two flavors of JIT, the new eBPF JIT supported on: +This enables the BPF Just in Time (JIT) compiler. BPF is a flexible +and efficient infrastructure allowing to execute bytecode at various +hook points. It is used in a number of Linux kernel subsystems such +as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints) +and security (e.g. seccomp). 
LLVM has a BPF back end that can compile +restricted C into a sequence of BPF instructions. After program load +through bpf(2) and passing a verifier in the kernel, a JIT will then +translate these BPF proglets into native CPU instructions. There are +two flavors of JITs, the newer eBPF JIT currently supported on: - x86_64 - arm64 - ppc64 - sparc64 - mips64 -And the older cBPF JIT supported on: +And the older cBPF JIT supported on the following archs: - arm - mips - ppc - sparc -The BPF JIT provides a framework to speed packet filtering, the one used by -tcpdump/libpcap for example. +eBPF JITs are a superset of cBPF JITs, meaning the kernel will +migrate cBPF instructions into eBPF instructions and then JIT +compile them transparently. Older cBPF JITs can only translate +tcpdump filters, seccomp rules, etc, but not mentioned eBPF +programs loaded through bpf(2). Values : 0 - disable the JIT (default value) @@ -61,9 +70,9 @@ Values : bpf_jit_harden -------------- -This enables hardening for the Berkeley Packet Filter Just in Time compiler. -Supported are eBPF JIT backends. Enabling hardening trades off performance, -but can mitigate JIT spraying. +This enables hardening for the BPF JIT compiler. Supported are eBPF +JIT backends. Enabling hardening trades off performance, but can +mitigate JIT spraying. Values : 0 - disable JIT hardening (default value) 1 - enable JIT hardening for unprivileged users only @@ -72,11 +81,11 @@ Values : bpf_jit_kallsyms ---------------- -When Berkeley Packet Filter Just in Time compiler is enabled, then compiled -images are unknown addresses to the kernel, meaning they neither show up in -traces nor in /proc/kallsyms. This enables export of these addresses, which -can be used for debugging/tracing. If bpf_jit_harden is enabled, this feature -is disabled. +When BPF JIT compiler is enabled, then compiled images are unknown +addresses to the kernel, meaning they neither show up in traces nor +in /proc/kallsyms. 
This enables export of these addresses, which can +be used for debugging/tracing. If bpf_jit_harden is enabled, this +feature is disabled. Values : 0 - disable JIT kallsyms export (default value) 1 - enable JIT kallsyms export for privileged users only From a0917e0bc6efc05834c0c1eafebd579a9c75e6e9 Mon Sep 17 00:00:00 2001 From: Matthew Dawson Date: Fri, 18 Aug 2017 15:04:54 -0400 Subject: [PATCH 118/154] datagram: When peeking datagrams with offset < 0 don't skip empty skbs Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove headers from UDP packets before queueing"), when udp packets are being peeked the requested extra offset is always 0 as there is no need to skip the udp header. However, when the offset is 0 and the next skb is of length 0, it is only returned once. The behaviour can be seen with the following python script: from socket import *; f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); f.bind(('::', 0)); addr=('::1', f.getsockname()[1]); g.sendto(b'', addr) g.sendto(b'b', addr) print(f.recvfrom(10, MSG_PEEK)); print(f.recvfrom(10, MSG_PEEK)); Where the expected output should be the empty string twice. Instead, make sk_peek_offset return negative values, and pass those values to __skb_try_recv_datagram/__skb_try_recv_from_queue. If the passed offset to __skb_try_recv_from_queue is negative, the checked skb is never skipped. __skb_try_recv_from_queue will then ensure the offset is reset back to 0 if a peek is requested without an offset, unless no packets are found. Also simplify the if condition in __skb_try_recv_from_queue. If _off is greater than 0, and off is greater than or equal to skb->len, then (_off || skb->len) must always be true assuming skb->len >= 0 is always true. Also remove a redundant check around a call to sk_peek_offset in af_unix.c, as it double checked if MSG_PEEK was set in the flags. 
V2: - Moved the negative fixup into __skb_try_recv_from_queue, and remove now redundant checks - Fix peeking in udp{,v6}_recvmsg to report the right value when the offset is 0 V3: - Marked new branch in __skb_try_recv_from_queue as unlikely. Signed-off-by: Matthew Dawson Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 4 +--- net/core/datagram.c | 12 +++++++++--- net/ipv4/udp.c | 3 ++- net/ipv6/udp.c | 3 ++- net/unix/af_unix.c | 5 +---- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 7c0632c7e87043..aeeec62992ca7d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -507,9 +507,7 @@ int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { - s32 off = READ_ONCE(sk->sk_peek_off); - if (off >= 0) - return off; + return READ_ONCE(sk->sk_peek_off); } return 0; diff --git a/net/core/datagram.c b/net/core/datagram.c index ee5647bd91b3f3..a21ca8dee5eadc 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, int *peeked, int *off, int *err, struct sk_buff **last) { + bool peek_at_off = false; struct sk_buff *skb; - int _off = *off; + int _off = 0; + + if (unlikely(flags & MSG_PEEK && *off >= 0)) { + peek_at_off = true; + _off = *off; + } *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { - if (_off >= skb->len && (skb->len || _off || - skb->peeked)) { + if (peek_at_off && _off >= skb->len && + (_off || skb->peeked)) { _off -= skb->len; continue; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a7c804f73990a0..cd1d044a7fa580 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1574,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + 
peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 578142b7ca3e6e..20039c8501eb97 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -362,7 +362,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7b52a380d710d2..be8982b4f8c00b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, */ mutex_lock(&u->iolock); - if (flags & MSG_PEEK) - skip = sk_peek_offset(sk, flags); - else - skip = 0; + skip = max(sk_peek_offset(sk, flags), 0); do { int chunk; From 739f79fc9db1b38f96b5a5109b247a650fbebf6d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 18 Aug 2017 15:15:48 -0700 Subject: [PATCH 119/154] mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback() Jaegeuk and Brad report a NULL pointer crash when writeback ending tries to update the memcg stats: BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0 IP: test_clear_page_writeback+0x12e/0x2c0 [...] 
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0 Call Trace: end_page_writeback+0x47/0x70 f2fs_write_end_io+0x76/0x180 [f2fs] bio_endio+0x9f/0x120 blk_update_request+0xa8/0x2f0 scsi_end_request+0x39/0x1d0 scsi_io_completion+0x211/0x690 scsi_finish_command+0xd9/0x120 scsi_softirq_done+0x127/0x150 __blk_mq_complete_request_remote+0x13/0x20 flush_smp_call_function_queue+0x56/0x110 generic_smp_call_function_single_interrupt+0x13/0x30 smp_call_function_single_interrupt+0x27/0x40 call_function_single_interrupt+0x89/0x90 RIP: 0010:native_safe_halt+0x6/0x10 (gdb) l *(test_clear_page_writeback+0x12e) 0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619). 614 mod_node_page_state(page_pgdat(page), idx, val); 615 if (mem_cgroup_disabled() || !page->mem_cgroup) 616 return; 617 mod_memcg_state(page->mem_cgroup, idx, val); 618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; 619 this_cpu_add(pn->lruvec_stat->count[idx], val); 620 } 621 622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 623 gfp_t gfp_mask, The issue is that writeback doesn't hold a page reference and the page might get freed after PG_writeback is cleared (and the mapping is unlocked) in test_clear_page_writeback(). The stat functions looking up the page's node or zone are safe, as those attributes are static across allocation and free cycles. But page->mem_cgroup is not, and it will get cleared if we race with truncation or migration. It appears this race window has been around for a while, but less likely to trigger when the memcg stats were updated first thing after PG_writeback is cleared. Recent changes reshuffled this code to update the global node stats before the memcg ones, though, stretching the race window out to an extent where people can reproduce the problem. Update test_clear_page_writeback() to look up and pin page->mem_cgroup before clearing PG_writeback, then not use that pointer afterward. 
It is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()") but leaves the pageref-holding callsites that aren't affected alone. Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()") Signed-off-by: Johannes Weiner Reported-by: Jaegeuk Kim Tested-by: Jaegeuk Kim Reported-by: Bradley Bolen Tested-by: Brad Bolen Cc: Vladimir Davydov Cc: Michal Hocko Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 +++++++-- mm/memcontrol.c | 43 +++++++++++++++++++++++++++----------- mm/page-writeback.c | 15 ++++++++++--- 3 files changed, 51 insertions(+), 17 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3914e3dd61680a..9b15a4bcfa77dc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -484,7 +484,8 @@ bool mem_cgroup_oom_synchronize(bool wait); extern int do_swap_account; #endif -void lock_page_memcg(struct page *page); +struct mem_cgroup *lock_page_memcg(struct page *page); +void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, @@ -809,7 +810,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void lock_page_memcg(struct page *page) +static inline struct mem_cgroup *lock_page_memcg(struct page *page) +{ + return NULL; +} + +static inline void __unlock_page_memcg(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3df3c04d73ab08..e09741af816f8a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1611,9 +1611,13 @@ bool mem_cgroup_oom_synchronize(bool handle) * @page: the page * * This function protects unlocked LRU pages from being moved to - * another cgroup and stabilizes their page->mem_cgroup binding. + * another cgroup. + * + * It ensures lifetime of the returned memcg. 
Caller is responsible + * for the lifetime of the page; __unlock_page_memcg() is available + * when @page might get freed inside the locked section. */ -void lock_page_memcg(struct page *page) +struct mem_cgroup *lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - */ + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page itself from being freed. E.g. writeback + * doesn't hold a page reference and relies on PG_writeback to + * keep off truncation, migration and so forth. + */ rcu_read_lock(); if (mem_cgroup_disabled()) - return; + return NULL; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return; + return NULL; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1649,18 +1659,18 @@ void lock_page_memcg(struct page *page) memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return; + return memcg; } EXPORT_SYMBOL(lock_page_memcg); /** - * unlock_page_memcg - unlock a page->mem_cgroup binding - * @page: the page + * __unlock_page_memcg - unlock and unpin a memcg + * @memcg: the memcg + * + * Unlock and unpin a memcg returned by lock_page_memcg(). 
*/ -void unlock_page_memcg(struct page *page) +void __unlock_page_memcg(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg = page->mem_cgroup; - if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page) rcu_read_unlock(); } + +/** + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @page: the page + */ +void unlock_page_memcg(struct page *page) +{ + __unlock_page_memcg(page->mem_cgroup); +} EXPORT_SYMBOL(unlock_page_memcg); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96e93b214d317b..bf050ab025b76a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; int ret; - lock_page_memcg(page); + memcg = lock_page_memcg(page); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } + /* + * NOTE: Page might be free now! Writeback doesn't hold a page + * reference on its own, it relies on truncation to wait for + * the clearing of PG_writeback. The below can only access + * page state that is static across allocation cycles. 
+ */ if (ret) { - dec_lruvec_page_state(page, NR_WRITEBACK); + dec_lruvec_state(lruvec, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } - unlock_page_memcg(page); + __unlock_page_memcg(memcg); return ret; } From 92e5aae457787d0bc6b255200d2fb116edf69794 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 18 Aug 2017 15:15:51 -0700 Subject: [PATCH 120/154] kernel/watchdog: fix Kconfig constraints for perf hardlockup watchdog Commit 05a4a9527931 ("kernel/watchdog: split up config options") lost the perf-based hardlockup detector's dependency on PERF_EVENTS, which can result in broken builds with some powerpc configurations. Restore the dependency. Add it in for x86 too, despite x86 always selecting PERF_EVENTS it seems reasonable to make the dependency explicit. Link: http://lkml.kernel.org/r/20170810114452.6673-1-npiggin@gmail.com Fixes: 05a4a9527931 ("kernel/watchdog: split up config options") Signed-off-by: Nicholas Piggin Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 36f858c37ca70b..81b0031f909f6d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -199,7 +199,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9ef6..29a1bf85e5077f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -163,7 +163,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select 
HAVE_PERF_EVENTS_NMI - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API From 8ada92799ec4de00f4bc0f10b1ededa256c1ab22 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:15:55 -0700 Subject: [PATCH 121/154] wait: add wait_event_killable_timeout() These are the few pending fixes I have queued up for v4.13-final. One is a generic regression fix for recursive loops on kmod and the other one is a trivial print out correction. During the v4.13 development we assumed that recursive kmod loops were no longer possible. Clearly that is not true. The regression fix makes use of a new killable wait. We use a killable wait to be paranoid in how signals might be sent to modprobe and only accept a proper SIGKILL. The signal will only be available to userspace to issue *iff* a thread has already entered a wait state, and that happens only if we've already throttled after 50 kmod threads have been hit. Note that although it may seem excessive to trigger a failure after 5 seconds if all kmod threads remain busy, prior to the series of changes that went into v4.13 we would actually *always* fatally fail any request which came in if the limit was already reached. The new waiting implemented in v4.13 actually gives us *more* breathing room -- the wait for 5 seconds is a wait for *any* kmod thread to finish. We give up and fail *iff* no kmod thread has finished and they're *all* running straight for 5 consecutive seconds. If 50 kmod threads are running consecutively for 5 seconds something else must be really bad. Recursive loops with kmod are bad but they're also hard to implement properly as a selftest without currently fooling current userspace tools like kmod [1].
For instance kmod will complain when you run depmod if it finds a recursive loop with symbol dependency between modules; as such, this type of recursive loop cannot go upstream as the modules_install target will fail after running depmod. These tests already exist on userspace kmod upstream though (refer to the testsuite/module-playground/mod-loop-*.c files). The same is not true if request_module() is used though, or worse if aliases are used. Likewise the issue with 64-bit kernels booting 32-bit userspace without a binfmt handler built-in is also currently not detected and proactively avoided by userspace kmod tools, or kconfig for all architectures. Although we could complain in the kernel when some of these individual recursive issues creep up, proactively avoiding these situations in userspace at build time is what we should keep striving for. Lastly, since recursive loops could happen with kmod it may mean recursive loops may also be possible with other kernel usermode helpers, this should be investigated and long term if we can come up with a more sensible generic solution even better! [0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/log/?h=20170809-kmod-for-v4.13-final [1] https://git.kernel.org/pub/scm/utils/kernel/kmod/kmod.git This patch (of 3): This wait is similar to wait_event_interruptible_timeout() but only accepts SIGKILL interrupt signal. Other signals are ignored. Link: http://lkml.kernel.org/r/20170809234635.13443-2-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Kees Cook Cc: Dmitry Torokhov Cc: Jessica Yu Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Miroslav Benes Cc: Josh Poimboeuf Cc: "Eric W.
Biederman" Cc: Shuah Khan Cc: Matt Redfearn Cc: Dan Carpenter Cc: Colin Ian King Cc: Daniel Mentz Cc: David Binderman Cc: Matt Redfearn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/include/linux/wait.h b/include/linux/wait.h index 5b74e36c0ca896..dc19880c02f5eb 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -757,6 +757,43 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_killable_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout(condition), \ + TASK_KILLABLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_KILLABLE) until the + * @condition evaluates to true or a kill signal is received. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was + * interrupted by a kill signal. + * + * Only kill signals interrupt this process. 
+ */ +#define wait_event_killable_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_killable_timeout(wq_head, \ + condition, timeout); \ + __ret; \ +}) + #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ From 2ba293c9e7db150943f06b12d3eb7213e7fae624 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:15:58 -0700 Subject: [PATCH 122/154] kmod: fix wait on recursive loop Recursive loops with module loading were previously handled in kmod by restricting the number of modprobe calls to 50 and if that limit was breached request_module() would return an error and a user would see the following on their kernel dmesg: request_module: runaway loop modprobe binfmt-464c Starting init:/sbin/init exists but couldn't execute it (error -8) This issue could happen for instance when a 64-bit kernel boots a 32-bit userspace on some architectures and has no 32-bit binary format handlers. This is visible, for instance, when a CONFIG_MODULES enabled 64-bit MIPS kernel boots into an o32 root filesystem and the binfmt handler for o32 binaries is not built-in. After commit 6d7964a722af ("kmod: throttle kmod thread limit") we now don't have any visible signs of an error and the kernel just waits for the loop to end somehow. Although this *particular* recursive loop could also be addressed by doing a sanity check on search_binary_handler() and disallowing a modular binfmt to be required for modprobe, a generic solution for any recursive kernel kmod issues is still needed. This should catch these loops. We can investigate each loop and address each one separately as they come in, this however puts a stop gap for them as before. Link: http://lkml.kernel.org/r/20170809234635.13443-3-mcgrof@kernel.org Fixes: 6d7964a722af ("kmod: throttle kmod thread limit") Signed-off-by: Luis R.
Rodriguez Reported-by: Matt Redfearn Tested-by: Matt Redfearn Cc: "Eric W. Biederman" Cc: Colin Ian King Cc: Dan Carpenter Cc: Daniel Mentz Cc: David Binderman Cc: Dmitry Torokhov Cc: Ingo Molnar Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Michal Marek Cc: Miroslav Benes Cc: Peter Zijlstra (Intel) Cc: Petr Mladek Cc: Rusty Russell Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/kernel/kmod.c b/kernel/kmod.c index 6d016c5d97c839..2f37acde640b6c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -70,6 +70,18 @@ static DECLARE_RWSEM(umhelper_sem); static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); +/* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. + * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 + /* modprobe_path is set via /proc/sys. */ @@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...) 
pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", atomic_read(&kmod_concurrent_max), MAX_KMOD_CONCURRENT, module_name); - wait_event_interruptible(kmod_wq, - atomic_dec_if_positive(&kmod_concurrent_max) >= 0); + ret = wait_event_killable_timeout(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0, + MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (!ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return -ETIME; + } else if (ret == -ERESTARTSYS) { + pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); + return ret; + } } trace_module_request(module_name, wait, _RET_IP_); From 768dc4e48420955518974d8486c1b00ec05e7274 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:16:02 -0700 Subject: [PATCH 123/154] test_kmod: fix description for -s -and -c parameters The descriptions were reversed, correct this. Link: http://lkml.kernel.org/r/20170809234635.13443-4-mcgrof@kernel.org Fixes: 64b671204afd71 ("test_sysctl: add generic script to expand on tests") Signed-off-by: Luis R. Rodriguez Reported-by: Daniel Mentz Cc: "Eric W. 
Biederman" Cc: Colin Ian King Cc: Dan Carpenter Cc: David Binderman Cc: Dmitry Torokhov Cc: Ingo Molnar Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Matt Redfearn Cc: Matt Redfearn Cc: Michal Marek Cc: Miroslav Benes Cc: Peter Zijlstra (Intel) Cc: Petr Mladek Cc: Rusty Russell Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/kmod/kmod.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh index 8cecae9a8bca13..7956ea3be6675f 100755 --- a/tools/testing/selftests/kmod/kmod.sh +++ b/tools/testing/selftests/kmod/kmod.sh @@ -473,8 +473,8 @@ usage() echo " all Runs all tests (default)" echo " -t Run test ID the number amount of times is recommended" echo " -w Watch test ID run until it runs into an error" - echo " -c Run test ID once" - echo " -s Run test ID x test-count number of times" + echo " -s Run test ID once" + echo " -c Run test ID x test-count number of times" echo " -l List all test ID list" echo " -h|--help Help" echo From 3010f876500f9ba921afaeccec30c45ca6584dc8 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 18 Aug 2017 15:16:05 -0700 Subject: [PATCH 124/154] mm: discard memblock data later There is existing use after free bug when deferred struct pages are enabled: The memblock_add() allocates memory for the memory array if more than 128 entries are needed. See comment in e820__memblock_setup(): * The bootstrap memblock region count maximum is 128 entries * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries * than that - so allow memblock resizing. 
This memblock memory is freed here: free_low_memory_core_early() We access the freed memblock.memory later in boot when deferred pages are initialized in this path: deferred_init_memmap() for_each_mem_pfn_range() __next_mem_pfn_range() type = &memblock.memory; One possible explanation for why this use-after-free hasn't been hit before is that the limit of INIT_MEMBLOCK_REGIONS has never been exceeded at least on systems where deferred struct pages were enabled. Tested by reducing INIT_MEMBLOCK_REGIONS down to 4 from the current 128, and verifying in qemu that this code is getting executed and that the freed pages are sane. Link: http://lkml.kernel.org/r/1502485554-318703-2-git-send-email-pasha.tatashin@oracle.com Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in parallel with kswapd") Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Acked-by: Michal Hocko Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 6 ++++-- mm/memblock.c | 38 +++++++++++++++++--------------------- mm/nobootmem.c | 16 ---------------- mm/page_alloc.c | 4 ++++ 4 files changed, 25 insertions(+), 39 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 77d427974f5756..bae11c7e7bf319 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -61,6 +61,7 @@ extern int memblock_debug; #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata +void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock @@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); -phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); -phys_addr_t
get_allocated_memblock_memory_regions_info(phys_addr_t *addr); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); @@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, phys_addr_t *out_end); +void __memblock_free_early(phys_addr_t base, phys_addr_t size); +void __memblock_free_late(phys_addr_t base, phys_addr_t size); + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. diff --git a/mm/memblock.c b/mm/memblock.c index 2cb25fe4452c27..bf14aea6ab709d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - -phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( - phys_addr_t *addr) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - *addr = __pa(memblock.reserved.regions); - - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.reserved.max); -} - -phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( - phys_addr_t *addr) +/** + * Discard memory and reserved arrays if they were allocated + */ +void __init memblock_discard(void) { - if (memblock.memory.regions == memblock_memory_init_regions) - return 0; + phys_addr_t addr, size; - *addr = __pa(memblock.memory.regions); + if (memblock.reserved.regions != memblock_reserved_init_regions) { + addr = __pa(memblock.reserved.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); + __memblock_free_late(addr, size); + } - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.memory.max); + if (memblock.memory.regions != memblock_memory_init_regions) { + addr = __pa(memblock.memory.regions); + size =
PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); + __memblock_free_late(addr, size); + } +} - #endif /** diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 36454d0f96ee6b..3637809a18d04f 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void) NULL) count += __free_memory_core(start, end); -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - { - phys_addr_t size; - - /* Free memblock.reserved array if it was allocated */ - size = get_allocated_memblock_reserved_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - - /* Free memblock.memory array if it was allocated */ - size = get_allocated_memblock_memory_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - } -#endif - return count; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d00f746c2fd96..1bad301820c7a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1584,6 +1584,10 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ + memblock_discard(); +#endif for_each_populated_zone(zone) set_zone_contiguous(zone); From f6ba488073fe8159851fe398cc3c5ee383bb4c7a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 18 Aug 2017 15:16:08 -0700 Subject: [PATCH 125/154] slub: fix per memcg cache leak on css offline To avoid a possible deadlock, sysfs_slab_remove() schedules an asynchronous work to delete sysfs entries corresponding to the kmem cache. To ensure the cache isn't freed before the work function is called, it takes a reference to the cache kobject. The reference is supposed to be released by the work function. However, the work function (sysfs_slab_remove_workfn()) does nothing in case the cache sysfs entry has already been deleted, leaking the kobject and the corresponding cache.
This may happen on a per memcg cache destruction, because sysfs entries of a per memcg cache are deleted on memcg offline if the cache is empty (see __kmemcg_cache_deactivate()). The kmemleak report looks like this: unreferenced object 0xffff9f798a79f540 (size 32): comm "kworker/1:4", pid 15416, jiffies 4307432429 (age 28687.554s) hex dump (first 32 bytes): 6b 6d 61 6c 6c 6f 63 2d 31 36 28 31 35 39 39 3a kmalloc-16(1599: 6e 65 77 72 6f 6f 74 29 00 23 6b c0 ff ff ff ff newroot).#k..... backtrace: kmemleak_alloc+0x4a/0xa0 __kmalloc_track_caller+0x148/0x2c0 kvasprintf+0x66/0xd0 kasprintf+0x49/0x70 memcg_create_kmem_cache+0xe6/0x160 memcg_kmem_cache_create_func+0x20/0x110 process_one_work+0x205/0x5d0 worker_thread+0x4e/0x3a0 kthread+0x109/0x140 ret_from_fork+0x2a/0x40 unreferenced object 0xffff9f79b6136840 (size 416): comm "kworker/1:4", pid 15416, jiffies 4307432429 (age 28687.573s) hex dump (first 32 bytes): 40 fb 80 c2 3e 33 00 00 00 00 00 40 00 00 00 00 @...>3.....@.... 00 00 00 00 00 00 00 00 10 00 00 00 10 00 00 00 ................ backtrace: kmemleak_alloc+0x4a/0xa0 kmem_cache_alloc+0x128/0x280 create_cache+0x3b/0x1e0 memcg_create_kmem_cache+0x118/0x160 memcg_kmem_cache_create_func+0x20/0x110 process_one_work+0x205/0x5d0 worker_thread+0x4e/0x3a0 kthread+0x109/0x140 ret_from_fork+0x2a/0x40 Fix the leak by adding the missing call to kobject_put() to sysfs_slab_remove_workfn(). 
Link: http://lkml.kernel.org/r/20170812181134.25027-1-vdavydov.dev@gmail.com Fixes: 3b7b314053d02 ("slub: make sysfs file removal asynchronous") Signed-off-by: Vladimir Davydov Reported-by: Andrei Vagin Tested-by: Andrei Vagin Acked-by: Tejun Heo Acked-by: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: [4.12.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 1d3f9835f4eabe..e8b4e31162cae8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5642,13 +5642,14 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) * A cache is never shut down before deactivation is * complete, so no need to worry about synchronization. */ - return; + goto out; #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); +out: kobject_put(&s->kobj); } From 5b53a6ea886700a128b697a6fe8375340dea2c30 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 18 Aug 2017 15:16:12 -0700 Subject: [PATCH 126/154] mm: fix double mmap_sem unlock on MMF_UNSTABLE enforced SIGBUS Tetsuo Handa has noticed that MMF_UNSTABLE SIGBUS path in handle_mm_fault causes a lockdep splat Out of memory: Kill process 1056 (a.out) score 603 or sacrifice child Killed process 1056 (a.out) total-vm:4268108kB, anon-rss:2246048kB, file-rss:0kB, shmem-rss:0kB a.out (1169) used greatest stack depth: 11664 bytes left DEBUG_LOCKS_WARN_ON(depth <= 0) ------------[ cut here ]------------ WARNING: CPU: 6 PID: 1339 at kernel/locking/lockdep.c:3617 lock_release+0x172/0x1e0 CPU: 6 PID: 1339 Comm: a.out Not tainted 4.13.0-rc3-next-20170803+ #142 Hardware name: VMware, Inc. 
VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 RIP: 0010:lock_release+0x172/0x1e0 Call Trace: up_read+0x1a/0x40 __do_page_fault+0x28e/0x4c0 do_page_fault+0x30/0x80 page_fault+0x28/0x30 The reason is that the page fault path might have dropped the mmap_sem and returned with VM_FAULT_RETRY. MMF_UNSTABLE check however rewrites the error path to VM_FAULT_SIGBUS and we always expect mmap_sem taken in that path. Fix this by taking mmap_sem when VM_FAULT_RETRY is held in the MMF_UNSTABLE path. We cannot simply add VM_FAULT_SIGBUS to the existing error code because all arch specific page fault handlers and g-u-p would have to learn a new error code combination. Link: http://lkml.kernel.org/r/20170807113839.16695-2-mhocko@kernel.org Fixes: 3f70dc38cec2 ("mm: make sure that kthreads will not refault oom reaped memory") Reported-by: Tetsuo Handa Signed-off-by: Michal Hocko Acked-by: David Rientjes Cc: Andrea Argangeli Cc: "Kirill A. Shutemov" Cc: Oleg Nesterov Cc: Wenwei Tao Cc: [4.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index e158f7ac67300b..c717b5bcc80e2c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3910,8 +3910,18 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * further. 
*/ if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) - && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) + && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) { + + /* + * We are going to enforce SIGBUS but the PF path might have + * dropped the mmap_sem already so take it again so that + * we do not break expectations of all arch specific PF paths + * and g-u-p + */ + if (ret & VM_FAULT_RETRY) + down_read(&vma->vm_mm->mmap_sem); ret = VM_FAULT_SIGBUS; + } return ret; } From 6b31d5955cb29a51c5baffee382f213d75e98fb8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 18 Aug 2017 15:16:15 -0700 Subject: [PATCH 127/154] mm, oom: fix potential data corruption when oom_reaper races with writer Wenwei Tao has noticed that our current assumption that the oom victim is dying and never doing any visible changes after it dies, and so the oom_reaper can tear it down, is not entirely true. __task_will_free_mem consider a task dying when SIGNAL_GROUP_EXIT is set but do_group_exit sends SIGKILL to all threads _after_ the flag is set. So there is a race window when some threads won't have fatal_signal_pending while the oom_reaper could start unmapping the address space. Moreover some paths might not check for fatal signals before each PF/g-u-p/copy_from_user. We already have a protection for oom_reaper vs. PF races by checking MMF_UNSTABLE. This has been, however, checked only for kernel threads (use_mm users) which can outlive the oom victim. A simple fix would be to extend the current check in handle_mm_fault for all tasks but that wouldn't be sufficient because the current check assumes that a kernel thread would bail out after EFAULT from get_user*/copy_from_user and never re-read the same address which would succeed because the PF path has established page tables already. This seems to be the case for the only existing use_mm user currently (virtio driver) but it is rather fragile in general. 
This is even more fragile in general for more complex paths such as generic_perform_write which can re-read the same address more times (e.g. iov_iter_copy_from_user_atomic to fail and then iov_iter_fault_in_readable on retry). Therefore we have to implement MMF_UNSTABLE protection in a robust way and never make a potentially corrupted content visible. That requires to hook deeper into the PF path and check for the flag _every time_ before a pte for anonymous memory is established (that means all !VM_SHARED mappings). The corruption can be triggered artificially (http://lkml.kernel.org/r/201708040646.v746kkhC024636@www262.sakura.ne.jp) but there doesn't seem to be any real life bug report. The race window should be quite tight to trigger most of the time. Link: http://lkml.kernel.org/r/20170807113839.16695-3-mhocko@kernel.org Fixes: aac453635549 ("mm, oom: introduce oom reaper") Signed-off-by: Michal Hocko Reported-by: Wenwei Tao Tested-by: Tetsuo Handa Cc: "Kirill A. Shutemov" Cc: Andrea Argangeli Cc: David Rientjes Cc: Oleg Nesterov Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 22 ++++++++++++++++++++++ mm/huge_memory.c | 30 +++++++++++++++++++++-------- mm/memory.c | 46 ++++++++++++++++++++------------------------- 3 files changed, 64 insertions(+), 34 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 8a266e2be5a63a..76aac4ce39bcf5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -6,6 +6,8 @@ #include #include #include +#include /* MMF_* */ +#include /* VM_FAULT* */ struct zonelist; struct notifier_block; @@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Checks whether a page fault on the given mm is still reliable. + * This is no longer true if the oom reaper started to reap the + * address space which is reflected by MMF_UNSTABLE flag set in + * the mm. 
At that moment any !shared mapping would lose the content + * and could cause a memory corruption (zero pages instead of the + * original content). + * + * User should call this before establishing a page table entry for + * a !shared mapping and under the proper page table lock. + * + * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise. + */ +static inline int check_stable_address_space(struct mm_struct *mm) +{ + if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + return VM_FAULT_SIGBUS; + return 0; +} + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 216114f6ef0b7f..90731e3b7e589e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + int ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto release; } clear_huge_page(page, haddr, HPAGE_PMD_NR); @@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - pte_free(vma->vm_mm, pgtable); + goto unlock_release; } else { pmd_t entry; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { int ret; @@ -610,6 
+612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + mem_cgroup_cancel_charge(page, memcg, true); + put_page(page); + return ret; + } /* @@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = 0; set = false; if (pmd_none(*vmf->pmd)) { - if (userfaultfd_missing(vma)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) { + spin_unlock(vmf->ptl); + } else if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); diff --git a/mm/memory.c b/mm/memory.c index c717b5bcc80e2c..fe2fba27ded2fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; + int ret = 0; pte_t entry; /* File mapping without ->vm_ops ? 
*/ @@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) goto unlock; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!pte_none(*vmf->pte)) goto release; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2978,7 +2987,7 @@ static int do_anonymous_page(struct vm_fault *vmf) update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; + return ret; release: mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, int finish_fault(struct vm_fault *vmf) { struct page *page; - int ret; + int ret = 0; /* Did we COW the page? */ if ((vmf->flags & FAULT_FLAG_WRITE) && @@ -3260,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf) page = vmf->cow_page; else page = vmf->page; - ret = alloc_set_pte(vmf, vmf->memcg, page); + + /* + * check even for read faults because we might have lost our CoWed + * page + */ + if (!(vmf->vma->vm_flags & VM_SHARED)) + ret = check_stable_address_space(vmf->vma->vm_mm); + if (!ret) + ret = alloc_set_pte(vmf, vmf->memcg, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3900,29 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } - /* - * This mm has been already reaped by the oom reaper and so the - * refault cannot be trusted in general. Anonymous refaults would - * lose data and give a zero page instead e.g. 
This is especially - * problem for use_mm() because regular tasks will just die and - * the corrupted data will not be visible anywhere while kthread - * will outlive the oom victim and potentially propagate the date - * further. - */ - if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) - && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) { - - /* - * We are going to enforce SIGBUS but the PF path might have - * dropped the mmap_sem already so take it again so that - * we do not break expectations of all arch specific PF paths - * and g-u-p - */ - if (ret & VM_FAULT_RETRY) - down_read(&vma->vm_mm->mmap_sem); - ret = VM_FAULT_SIGBUS; - } - return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); From eb61b5911bdc923875cde99eb25203a0e2b06d43 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Fri, 18 Aug 2017 15:16:18 -0700 Subject: [PATCH 128/154] signal: don't remove SIGNAL_UNKILLABLE for traced tasks. When forcing a signal, SIGNAL_UNKILLABLE is removed to prevent recursive faults, but this is undesirable when tracing. For example, debugging an init process (whether global or namespace), hitting a breakpoint and SIGTRAP will force SIGTRAP and then remove SIGNAL_UNKILLABLE. Everything continues fine, but then once debugging has finished, the init process is left killable which is unlikely what the user expects, resulting in either an accidentally killed init or an init that stops reaping zombies. 
Link: http://lkml.kernel.org/r/20170815112806.10728-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index 7e33f8c583e64c..ed804a470dcd15 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } - if (action->sa.sa_handler == SIG_DFL) + /* + * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect + * debugging to leave init killable. + */ + if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); From da094e42848e3c36feaa3b5271e53983fd45424f Mon Sep 17 00:00:00 2001 From: Prakash Gupta Date: Fri, 18 Aug 2017 15:16:21 -0700 Subject: [PATCH 129/154] mm/cma_debug.c: fix stack corruption due to sprintf usage name[] in cma_debugfs_add_one() can only accommodate 16 chars including NULL to store sprintf output. It's common for cma device name to be larger than 15 chars. This can cause stack corruption. If the gcc stack protector is turned on, this can cause a panic due to stack corruption. Below is one example trace: Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffff8e69a75730 Call trace: dump_backtrace+0x0/0x2c4 show_stack+0x20/0x28 dump_stack+0xb8/0xf4 panic+0x154/0x2b0 print_tainted+0x0/0xc0 cma_debugfs_init+0x274/0x290 do_one_initcall+0x5c/0x168 kernel_init_freeable+0x1c8/0x280 Fix the short sprintf buffer in cma_debugfs_add_one() by using scnprintf() instead of sprintf(). 
Link: http://lkml.kernel.org/r/1502446217-21840-1-git-send-email-guptap@codeaurora.org Fixes: f318dd083c81 ("cma: Store a name in the cma structure") Signed-off-by: Prakash Gupta Acked-by: Laura Abbott Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 595b757bef7272..c03ccbc405a066 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) char name[16]; int u32s; - sprintf(name, "cma-%s", cma->name); + scnprintf(name, sizeof(name), "cma-%s", cma->name); tmp = debugfs_create_dir(name, cma_debugfs_root); From 73223e4e2e3867ebf033a5a8eb2e5df0158ccc99 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 18 Aug 2017 15:16:24 -0700 Subject: [PATCH 130/154] mm/mempolicy: fix use after free when calling get_mempolicy I hit a use after free issue when executing trinity and reproduced it with KASAN enabled. The related call trace is as follows. 
BUG: KASan: use after free in SyS_get_mempolicy+0x3c8/0x960 at addr ffff8801f582d766 Read of size 2 by task syz-executor1/798 INFO: Allocated in mpol_new.part.2+0x74/0x160 age=3 cpu=1 pid=799 __slab_alloc+0x768/0x970 kmem_cache_alloc+0x2e7/0x450 mpol_new.part.2+0x74/0x160 mpol_new+0x66/0x80 SyS_mbind+0x267/0x9f0 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x2b/0x40 age=4 cpu=1 pid=799 __slab_free+0x495/0x8e0 kmem_cache_free+0x2f3/0x4c0 __mpol_put+0x2b/0x40 SyS_mbind+0x383/0x9f0 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0009cb8dc0 objects=23 used=8 fp=0xffff8801f582de40 flags=0x200000000004080 INFO: Object 0xffff8801f582d760 @offset=5984 fp=0xffff8801f582d600 Bytes b4 ffff8801f582d750: ae 01 ff ff 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ Object ffff8801f582d760: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk Object ffff8801f582d770: 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkk. Redzone ffff8801f582d778: bb bb bb bb bb bb bb bb ........ Padding ffff8801f582d8b8: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ Memory state around the buggy address: ffff8801f582d600: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8801f582d680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff8801f582d700: fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb fc !shared memory policy is not protected against parallel removal by other thread which is normally protected by the mmap_sem. do_get_mempolicy, however, drops the lock midway while we can still access it later. Early premature up_read is a historical artifact from times when put_user was called in this path see https://lwn.net/Articles/124754/ but that is gone since 8bccd85ffbaf ("[PATCH] Implement sys_* do_* layering in the memory policy layer."). but when we have the the current mempolicy ref count model. The issue was introduced accordingly. Fix the issue by removing the premature release. 
Link: http://lkml.kernel.org/r/1502950924-27521-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Acked-by: Michal Hocko Cc: Minchan Kim Cc: Vlastimil Babka Cc: David Rientjes Cc: Mel Gorman Cc: [2.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d911fa5cb2a73f..618ab125228bae 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -861,11 +861,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy |= (pol->flags & MPOL_MODE_FLAGS); } - if (vma) { - up_read(&current->mm->mmap_sem); - vma = NULL; - } - err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { From 704b862f9efd6d4c87a8d0a344dda19bda9c6b69 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 18 Aug 2017 15:16:27 -0700 Subject: [PATCH 131/154] mm/vmalloc.c: don't unconditonally use __GFP_HIGHMEM Commit 19809c2da28a ("mm, vmalloc: use __GFP_HIGHMEM implicitly") added use of __GFP_HIGHMEM for allocations. vmalloc_32 may use GFP_DMA/GFP_DMA32 which does not play nice with __GFP_HIGHMEM and will trigger a BUG in gfp_zone. Only add __GFP_HIGHMEM if we aren't using GFP_DMA/GFP_DMA32. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1482249 Link: http://lkml.kernel.org/r/20170816220705.31374-1-labbott@redhat.com Fixes: 19809c2da28a ("mm, vmalloc: use __GFP_HIGHMEM implicitly") Signed-off-by: Laura Abbott Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: "Kirill A. 
Shutemov" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8698c1c86c4dbe..a47e3894c77564 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1671,7 +1671,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? + 0 : + __GFP_HIGHMEM; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1679,7 +1682,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, + pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, PAGE_KERNEL, node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); @@ -1700,9 +1703,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask); + page = alloc_page(alloc_mask|highmem_mask); else - page = alloc_pages_node(node, alloc_mask, 0); + page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1710,7 +1713,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfpflags_allow_blocking(gfp_mask)) + if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } From c715b72c1ba406f133217b509044c38d8e714a37 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 18 Aug 
2017 15:16:31 -0700 Subject: [PATCH 132/154] mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes Moving the x86_64 and arm64 PIE base from 0x555555554000 to 0x000100000000 broke AddressSanitizer. This is a partial revert of: eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE") 02445990a96e ("arm64: move ELF_ET_DYN_BASE to 4GB / 4MB") The AddressSanitizer tool has hard-coded expectations about where executable mappings are loaded. The motivation for changing the PIE base in the above commits was to avoid the Stack-Clash CVEs that allowed executable mappings to get too close to heap and stack. This was mainly a problem on 32-bit, but the 64-bit bases were moved too, in an effort to proactively protect those systems (proofs of concept do exist that show 64-bit collisions, but other recent changes to fix stack accounting and setuid behaviors will minimize the impact). The new 32-bit PIE base is fine for ASan (since it matches the ET_EXEC base), so only the 64-bit PIE base needs to be reverted to let x86 and arm64 ASan binaries run again. Future changes to the 64-bit PIE base on these architectures can be made optional once a more dynamic method for dealing with AddressSanitizer is found. (e.g. always loading PIE into the mmap region for marked binaries.) Link: http://lkml.kernel.org/r/20170807201542.GA21271@beast Fixes: eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE") Fixes: 02445990a96e ("arm64: move ELF_ET_DYN_BASE to 4GB / 4MB") Signed-off-by: Kees Cook Reported-by: Kostya Serebryany Acked-by: Will Deacon Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/elf.h | 4 ++-- arch/x86/include/asm/elf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index acae781f7359ec..3288c2b3673149 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -114,10 +114,10 @@ /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ -#define ELF_ET_DYN_BASE 0x100000000UL +#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1c18d83d3f094d..9aeb91935ce023 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -247,11 +247,11 @@ extern int force_personality32; /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ - 0x100000000UL) + (TASK_SIZE / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, From ff244c6b29b176f3f448bc75e55df297225e1b3a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Aug 2017 13:39:56 -0700 Subject: [PATCH 133/154] tun: handle register_netdevice() failures properly syzkaller reported a double free [1], caused by the fact that tun driver was not updated properly when priv_destructor was added. When/if register_netdevice() fails, priv_destructor() must have been called already. 
[1] BUG: KASAN: double-free or invalid-free in selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023 CPU: 0 PID: 2919 Comm: syzkaller227220 Not tainted 4.13.0-rc4+ #23 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x7f/0x260 mm/kasan/report.c:252 kasan_report_double_free+0x55/0x80 mm/kasan/report.c:333 kasan_slab_free+0xa0/0xc0 mm/kasan/kasan.c:514 __cache_free mm/slab.c:3503 [inline] kfree+0xd3/0x260 mm/slab.c:3820 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023 security_tun_dev_free_security+0x48/0x80 security/security.c:1512 tun_set_iff drivers/net/tun.c:1884 [inline] __tun_chr_ioctl+0x2ce6/0x3d50 drivers/net/tun.c:2064 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x443ff9 RSP: 002b:00007ffc34271f68 EFLAGS: 00000217 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000004002e0 RCX: 0000000000443ff9 RDX: 0000000020533000 RSI: 00000000400454ca RDI: 0000000000000003 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000217 R12: 0000000000401ce0 R13: 0000000000401d70 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 2919: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xaa/0xd0 mm/kasan/kasan.c:551 kmem_cache_alloc_trace+0x101/0x6f0 mm/slab.c:3627 kmalloc include/linux/slab.h:493 [inline] kzalloc include/linux/slab.h:666 [inline] selinux_tun_dev_alloc_security+0x49/0x170 security/selinux/hooks.c:5012 security_tun_dev_alloc_security+0x6d/0xa0 security/security.c:1506 tun_set_iff 
drivers/net/tun.c:1839 [inline] __tun_chr_ioctl+0x1730/0x3d50 drivers/net/tun.c:2064 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 2919: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x6e/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] kfree+0xd3/0x260 mm/slab.c:3820 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023 security_tun_dev_free_security+0x48/0x80 security/security.c:1512 tun_free_netdev+0x13b/0x1b0 drivers/net/tun.c:1563 register_netdevice+0x8d0/0xee0 net/core/dev.c:7605 tun_set_iff drivers/net/tun.c:1859 [inline] __tun_chr_ioctl+0x1caf/0x3d50 drivers/net/tun.c:2064 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe The buggy address belongs to the object at ffff8801d2843b40 which belongs to the cache kmalloc-32 of size 32 The buggy address is located 0 bytes inside of 32-byte region [ffff8801d2843b40, ffff8801d2843b60) The buggy address belongs to the page: page:ffffea000660cea8 count:1 mapcount:0 mapping:ffff8801d2843000 index:0xffff8801d2843fc1 flags: 0x200000000000100(slab) raw: 0200000000000100 ffff8801d2843000 ffff8801d2843fc1 000000010000003f raw: ffffea0006626a40 ffffea00066141a0 ffff8801dbc00100 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801d2843a00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc ffff8801d2843a80: 00 00 00 fc fc fc fc fc fb fb fb fb fc fc fc fc >ffff8801d2843b00: 00 00 00 00 fc fc fc fc fb fb fb fb fc fc fc fc ^ ffff8801d2843b80: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc 
fc ffff8801d2843c00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc ================================================================== Fixes: cf124db566e6 ("net: Fix inconsistent teardown and release of private netdev state.") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 32ad87345f5798..0a2c0a42283f78 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1879,6 +1879,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) err_detach: tun_detach_all(dev); + /* register_netdevice() already called tun_free_netdev() */ + goto err_free_dev; + err_free_flow: tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); From 5bfd37b4de5c98e86b12bd13be5aa46c7484a125 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 09:41:54 -0700 Subject: [PATCH 134/154] tipc: fix use-after-free syszkaller reported use-after-free in tipc [1] When msg->rep skb is freed, set the pointer to NULL, so that caller does not free it again. 
[1] ================================================================== BUG: KASAN: use-after-free in skb_push+0xd4/0xe0 net/core/skbuff.c:1466 Read of size 8 at addr ffff8801c6e71e90 by task syz-executor5/4115 CPU: 1 PID: 4115 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x24e/0x340 mm/kasan/report.c:409 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430 skb_push+0xd4/0xe0 net/core/skbuff.c:1466 tipc_nl_compat_recv+0x833/0x18f0 net/tipc/netlink_compat.c:1209 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 sock_write_iter+0x31a/0x5d0 net/socket.c:898 call_write_iter include/linux/fs.h:1743 [inline] new_sync_write fs/read_write.c:457 [inline] __vfs_write+0x684/0x970 fs/read_write.c:470 vfs_write+0x189/0x510 fs/read_write.c:518 SYSC_write fs/read_write.c:565 [inline] SyS_write+0xef/0x220 fs/read_write.c:557 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512e9 RSP: 002b:00007f3bc8184c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004512e9 RDX: 0000000000000020 RSI: 0000000020fdb000 RDI: 0000000000000006 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b5e76 R13: 00007f3bc8184b48 R14: 
00000000004b5e86 R15: 0000000000000000 Allocated by task 4115: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc_node+0x13d/0x750 mm/slab.c:3651 __alloc_skb+0xf1/0x740 net/core/skbuff.c:219 alloc_skb include/linux/skbuff.h:903 [inline] tipc_tlv_alloc+0x26/0xb0 net/tipc/netlink_compat.c:148 tipc_nl_compat_dumpit+0xf2/0x3c0 net/tipc/netlink_compat.c:248 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline] tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 sock_write_iter+0x31a/0x5d0 net/socket.c:898 call_write_iter include/linux/fs.h:1743 [inline] new_sync_write fs/read_write.c:457 [inline] __vfs_write+0x684/0x970 fs/read_write.c:470 vfs_write+0x189/0x510 fs/read_write.c:518 SYSC_write fs/read_write.c:565 [inline] SyS_write+0xef/0x220 fs/read_write.c:557 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 4115: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] kmem_cache_free+0x77/0x280 mm/slab.c:3763 kfree_skbmem+0x1a1/0x1d0 net/core/skbuff.c:622 __kfree_skb net/core/skbuff.c:682 [inline] kfree_skb+0x165/0x4c0 net/core/skbuff.c:699 tipc_nl_compat_dumpit+0x36a/0x3c0 net/tipc/netlink_compat.c:260 
tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline] tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 sock_write_iter+0x31a/0x5d0 net/socket.c:898 call_write_iter include/linux/fs.h:1743 [inline] new_sync_write fs/read_write.c:457 [inline] __vfs_write+0x684/0x970 fs/read_write.c:470 vfs_write+0x189/0x510 fs/read_write.c:518 SYSC_write fs/read_write.c:565 [inline] SyS_write+0xef/0x220 fs/read_write.c:557 entry_SYSCALL_64_fastpath+0x1f/0xbe The buggy address belongs to the object at ffff8801c6e71dc0 which belongs to the cache skbuff_head_cache of size 224 The buggy address is located 208 bytes inside of 224-byte region [ffff8801c6e71dc0, ffff8801c6e71ea0) The buggy address belongs to the page: page:ffffea00071b9c40 count:1 mapcount:0 mapping:ffff8801c6e71000 index:0x0 flags: 0x200000000000100(slab) raw: 0200000000000100 ffff8801c6e71000 0000000000000000 000000010000000c raw: ffffea0007224a20 ffff8801d98caf48 ffff8801d9e79040 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801c6e71d80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ffff8801c6e71e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8801c6e71e80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff8801c6e71f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8801c6e71f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Signed-off-by: Eric Dumazet Reported-by: Dmitry 
Vyukov Cc: Jon Maloy Cc: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 9bfe886ab33080..750949dfc1d7d9 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -258,13 +258,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, arg = nlmsg_new(0, GFP_KERNEL); if (!arg) { kfree_skb(msg->rep); + msg->rep = NULL; return -ENOMEM; } err = __tipc_nl_compat_dumpit(cmd, msg, arg); - if (err) + if (err) { kfree_skb(msg->rep); - + msg->rep = NULL; + } kfree_skb(arg); return err; From 15339e441ec46fbc3bf3486bb1ae4845b0f1bb8d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 16 Aug 2017 20:16:40 +0200 Subject: [PATCH 135/154] sctp: fully initialize the IPv6 address in sctp_v6_to_addr() KMSAN reported use of uninitialized sctp_addr->v4.sin_addr.s_addr and sctp_addr->v6.sin6_scope_id in sctp_v6_cmp_addr() (see below). Make sure all fields of an IPv6 address are initialized, which guarantees that the IPv4 fields are also initialized. 
================================================================== BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x172/0x1c0 lib/dump_stack.c:42 is_logbuf_locked mm/kmsan/kmsan.c:59 [inline] kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938 native_save_fl arch/x86/include/asm/irqflags.h:18 [inline] arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline] arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline] __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467 sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651 sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg net/socket.c:643 [inline] SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 entry_SYSCALL_64_fastpath+0x13/0x94 RIP: 0033:0x44b479 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000 origin description: ----dst_saddr@sctp_v6_get_dst local variable created at: sk_fullsock include/net/sock.h:2321 [inline] inet6_sk include/linux/ipv6.h:309 [inline] sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 ================================================================== BUG: KMSAN: use of uninitialized 
memory in sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x172/0x1c0 lib/dump_stack.c:42 is_logbuf_locked mm/kmsan/kmsan.c:59 [inline] kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938 native_save_fl arch/x86/include/asm/irqflags.h:18 [inline] arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline] arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline] __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467 sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651 sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg net/socket.c:643 [inline] SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 entry_SYSCALL_64_fastpath+0x13/0x94 RIP: 0033:0x44b479 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000 origin description: ----dst_saddr@sctp_v6_get_dst local variable created at: sk_fullsock include/net/sock.h:2321 [inline] inet6_sk include/linux/ipv6.h:309 [inline] sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 ================================================================== Signed-off-by: Alexander Potapenko Reviewed-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sctp/ipv6.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 2a186b201ad2c2..a4b6ffb6149541 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -512,7 +512,9 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, { addr->sa.sa_family = AF_INET6; addr->v6.sin6_port = port; + addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = *saddr; + addr->v6.sin6_scope_id = 0; } /* Compare addresses exactly. From 383143f31d7d3525a1dbff733d52fff917f82f15 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 16 Aug 2017 11:18:09 -0700 Subject: [PATCH 136/154] ipv6: reset fn->rr_ptr when replacing route syzkaller reported the following use-after-free issue in rt6_select(): BUG: KASAN: use-after-free in rt6_select net/ipv6/route.c:755 [inline] at addr ffff8800bc6994e8 BUG: KASAN: use-after-free in ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 at addr ffff8800bc6994e8 Read of size 4 by task syz-executor1/439628 CPU: 0 PID: 439628 Comm: syz-executor1 Not tainted 4.3.5+ #8 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 0000000000000000 ffff88018fe435b0 ffffffff81ca384d ffff8801d3588c00 ffff8800bc699380 ffff8800bc699500 dffffc0000000000 ffff8801d40a47c0 ffff88018fe435d8 ffffffff81735751 ffff88018fe43660 ffff8800bc699380 Call Trace: [] __dump_stack lib/dump_stack.c:15 [inline] [] dump_stack+0xc1/0x124 lib/dump_stack.c:51 sctp: [Deprecated]: syz-executor0 (pid 439615) Use of struct sctp_assoc_value in delayed_ack socket option. 
Use struct sctp_sack_info instead [] kasan_object_err+0x21/0x70 mm/kasan/report.c:158 [] print_address_description mm/kasan/report.c:196 [inline] [] kasan_report_error+0x1b4/0x4a0 mm/kasan/report.c:285 [] kasan_report mm/kasan/report.c:305 [inline] [] __asan_report_load4_noabort+0x43/0x50 mm/kasan/report.c:325 [] rt6_select net/ipv6/route.c:755 [inline] [] ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 [] ip6_pol_route_output+0x81/0xb0 net/ipv6/route.c:1203 [] fib6_rule_action+0x1f0/0x680 net/ipv6/fib6_rules.c:95 [] fib_rules_lookup+0x2a6/0x7a0 net/core/fib_rules.c:223 [] fib6_rule_lookup+0xd0/0x250 net/ipv6/fib6_rules.c:41 [] ip6_route_output+0x1d6/0x2c0 net/ipv6/route.c:1224 [] ip6_dst_lookup_tail+0x4d2/0x890 net/ipv6/ip6_output.c:943 [] ip6_dst_lookup_flow+0x9a/0x250 net/ipv6/ip6_output.c:1079 [] ip6_datagram_dst_update+0x538/0xd40 net/ipv6/datagram.c:91 [] __ip6_datagram_connect net/ipv6/datagram.c:251 [inline] [] ip6_datagram_connect+0x518/0xe50 net/ipv6/datagram.c:272 [] ip6_datagram_connect_v6_only+0x63/0x90 net/ipv6/datagram.c:284 [] inet_dgram_connect+0x170/0x1f0 net/ipv4/af_inet.c:564 [] SYSC_connect+0x1a7/0x2f0 net/socket.c:1582 [] SyS_connect+0x29/0x30 net/socket.c:1563 [] entry_SYSCALL_64_fastpath+0x12/0x17 Object at ffff8800bc699380, in cache ip6_dst_cache size: 384 The root cause of it is that in fib6_add_rt2node(), when it replaces an existing route with the new one, it does not update fn->rr_ptr. This commit resets fn->rr_ptr to NULL when it points to a route which is replaced in fib6_add_rt2node(). Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/ip6_fib.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ebb299cf72b7e8..e6f878067c6834 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -914,6 +914,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, } nsiblings = iter->rt6i_nsiblings; fib6_purge_rt(iter, fn, info->nl_net); + if (fn->rr_ptr == iter) + fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { @@ -926,6 +928,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; fib6_purge_rt(iter, fn, info->nl_net); + if (fn->rr_ptr == iter) + fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; } else { From bc3aae2bbac46dd894c89db5d5e98f7f0ef9e205 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 16 Aug 2017 12:38:52 -0700 Subject: [PATCH 137/154] net: check and errout if res->fi is NULL when RTM_F_FIB_MATCH is set Syzkaller hit 'general protection fault in fib_dump_info' bug on commit 4.13-rc5.. 
Guilty file: net/ipv4/fib_semantics.c kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 0 PID: 2808 Comm: syz-executor0 Not tainted 4.13.0-rc5 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff880078562700 task.stack: ffff880078110000 RIP: 0010:fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314 RSP: 0018:ffff880078117010 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 00000000000000fe RCX: 0000000000000002 RDX: 0000000000000006 RSI: ffff880078117084 RDI: 0000000000000030 RBP: ffff880078117268 R08: 000000000000000c R09: ffff8800780d80c8 R10: 0000000058d629b4 R11: 0000000067fce681 R12: 0000000000000000 R13: ffff8800784bd540 R14: ffff8800780d80b5 R15: ffff8800780d80a4 FS: 00000000022fa940(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004387d0 CR3: 0000000079135000 CR4: 00000000000006f0 Call Trace: inet_rtm_getroute+0xc89/0x1f50 net/ipv4/route.c:2766 rtnetlink_rcv_msg+0x288/0x680 net/core/rtnetlink.c:4217 netlink_rcv_skb+0x340/0x470 net/netlink/af_netlink.c:2397 rtnetlink_rcv+0x28/0x30 net/core/rtnetlink.c:4223 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4c4/0x6e0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0x8c4/0xca0 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 ___sys_sendmsg+0x779/0x8d0 net/socket.c:2035 __sys_sendmsg+0xd1/0x170 net/socket.c:2069 SYSC_sendmsg net/socket.c:2080 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2076 entry_SYSCALL_64_fastpath+0x1a/0xa5 RIP: 0033:0x4512e9 RSP: 002b:00007ffc75584cc8 EFLAGS: 00000216 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00000000004512e9 RDX: 0000000000000000 RSI: 0000000020f2cfc8 RDI: 0000000000000003 RBP: 000000000000000e R08: 0000000000000000 R09: 0000000000000000 R10: 
0000000000000000 R11: 0000000000000216 R12: fffffffffffffffe R13: 0000000000718000 R14: 0000000020c44ff0 R15: 0000000000000000 Code: 00 0f b6 8d ec fd ff ff 48 8b 85 f0 fd ff ff 88 48 17 48 8b 45 28 48 8d 78 30 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e cb 0c 00 00 48 8b 45 28 44 RIP: fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314 RSP: ffff880078117010 ---[ end trace 254a7af28348f88b ]--- This patch adds a res->fi NULL check. example run: $ip route get 0.0.0.0 iif virt1-0 broadcast 0.0.0.0 dev lo cache iif virt1-0 $ip route get 0.0.0.0 iif virt1-0 fibmatch RTNETLINK answers: No route to host Reported-by: idaifish Reported-by: Dmitry Vyukov Fixes: b61798130f1b ("net: ipv4: RTM_GETROUTE: return matched fib result when requested") Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv4/route.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fe877a4a72b1ec..2331de20ca505d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2763,14 +2763,21 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = rt->rt_table_id; - if (rtm->rtm_flags & RTM_F_FIB_MATCH) + if (rtm->rtm_flags & RTM_F_FIB_MATCH) { + if (!res.fi) { + err = fib_props[res.type].error; + if (!err) + err = -EHOSTUNREACH; + goto errout_free; + } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, table_id, rt->rt_type, res.prefix, res.prefixlen, fl4.flowi4_tos, res.fi, 0); - else + } else { err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); + } if (err < 0) goto errout_free; From cdbeb633ca71a02b7b63bfeb94994bf4e1a0b894 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 16 Aug 2017 17:53:36 -0400 Subject: [PATCH 138/154] tcp: when rearming RTO, if RTO time is in past then fire RTO ASAP In some situations 
tcp_send_loss_probe() can realize that it's unable to send a loss probe (TLP), and falls back to calling tcp_rearm_rto() to schedule an RTO timer. In such cases, sometimes tcp_rearm_rto() realizes that the RTO was eligible to fire immediately or at some point in the past (delta_us <= 0). Previously in such cases tcp_rearm_rto() was scheduling such "overdue" RTOs to happen at now + icsk_rto, which caused needless delays of hundreds of milliseconds (and non-linear behavior that made reproducible testing difficult). This commit changes the logic to schedule "overdue" RTOs ASAP, rather than at now + icsk_rto. Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Suggested-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53de1424c13cda..bab7f0493098c6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3009,8 +3009,7 @@ void tcp_rearm_rto(struct sock *sk) /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ - if (delta_us > 0) - rto = usecs_to_jiffies(delta_us); + rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); From b6f6d56c91f5261c55edef2df300698c4486b669 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 17 Aug 2017 13:06:14 +0200 Subject: [PATCH 139/154] PCI: Allow PCI express root ports to find themselves If the pci_find_pcie_root_port() function is called on a root port itself, return the root port rather than NULL. This effectively reverts commit 0e405232871d6 ("PCI: fix oops when try to find Root Port for a PCI device") which added an extra check that would now be redundant. 
Fixes: a99b646afa8a ("PCI: Disable PCIe Relaxed Ordering if unsupported") Fixes: c56d4450eb68 ("PCI: Turn off Request Attributes to avoid Chelsio T5 Completion erratum") Signed-off-by: Thierry Reding Acked-by: Bjorn Helgaas Tested-by: Shawn Lin Tested-by: Michael Ellerman Signed-off-by: David S. Miller --- drivers/pci/pci.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index da5570cf5c6a4e..fdf65a6c13f625 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -514,7 +514,7 @@ EXPORT_SYMBOL(pci_find_resource); */ struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev) { - struct pci_dev *bridge, *highest_pcie_bridge = NULL; + struct pci_dev *bridge, *highest_pcie_bridge = dev; bridge = pci_upstream_bridge(dev); while (bridge && pci_is_pcie(bridge)) { @@ -522,11 +522,10 @@ struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev) bridge = pci_upstream_bridge(bridge); } - if (highest_pcie_bridge && - pci_pcie_type(highest_pcie_bridge) == PCI_EXP_TYPE_ROOT_PORT) - return highest_pcie_bridge; + if (pci_pcie_type(highest_pcie_bridge) != PCI_EXP_TYPE_ROOT_PORT) + return NULL; - return NULL; + return highest_pcie_bridge; } EXPORT_SYMBOL(pci_find_pcie_root_port); From ca3d89a3ebe79367bd41b6b8ba37664478ae2dba Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Thu, 17 Aug 2017 18:29:52 +0300 Subject: [PATCH 140/154] net/mlx4_core: Enable 4K UAR if SRIOV module parameter is not enabled enable_4k_uar module parameter was added in patch cited below to address the backward compatibility issue in SRIOV when the VM has system's PAGE_SIZE uar implementation and the Hypervisor has 4k uar implementation. The above compatibility issue does not exist in the non SRIOV case. In this patch, we always enable 4k uar implementation if SRIOV is not enabled on mlx4's supported cards. 
Fixes: 76e39ccf9c36 ("net/mlx4_core: Fix backward compatibility on VFs") Signed-off-by: Huy Nguyen Reviewed-by: Daniel Jurgens Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 09b9bc17bce998..5fe5cdc5135776 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -432,7 +432,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) /* Virtual PCI function needs to determine UAR page size from * firmware. Only master PCI function can set the uar page size */ - if (enable_4k_uar) + if (enable_4k_uar || !dev->persist->num_vfs) dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT; else dev->uar_page_shift = PAGE_SHIFT; @@ -2277,7 +2277,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev) dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1; - if (enable_4k_uar) { + if (enable_4k_uar || !dev->persist->num_vfs) { init_hca.log_uar_sz = ilog2(dev->caps.num_uars) + PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT; init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12; From b024d949a3c24255a7ef1a470420eb478949aa4c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 17 Aug 2017 23:14:58 +0100 Subject: [PATCH 141/154] irda: do not leak uninitialized list.dev to userspace list.dev has not been initialized and so the copy_to_user is copying data from the stack back to user space which is a potential information leak. Fix this by ensuring all of list is initialized to zero. Detected by CoverityScan, CID#1357894 ("Uninitialized scalar variable") Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- net/irda/af_irda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 2e6990f8b80b6b..23fa7c8b09a586 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -2213,7 +2213,7 @@ static int irda_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct irda_sock *self = irda_sk(sk); - struct irda_device_list list; + struct irda_device_list list = { 0 }; struct irda_device_info *discoveries; struct irda_ias_set * ias_opt; /* IAS get/query params */ struct ias_object * ias_obj; /* Object in IAS */ From 9a19bad70cf16b0cdf3576efda7deb490e7aa529 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 18 Aug 2017 00:19:42 +0100 Subject: [PATCH 142/154] rxrpc: Fix oops when discarding a preallocated service call rxrpc_service_prealloc_one() doesn't set the socket pointer on any new call it preallocates, but does add it to the rxrpc net namespace call list. This, however, causes rxrpc_put_call() to oops when the call is discarded when the socket is closed. rxrpc_put_call() needs the socket to be able to reach the namespace so that it can use a lock held therein. Fix this by setting a call's socket pointer immediately before discarding it. 
This can be triggered by unloading the kafs module, resulting in an oops like the following: BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 IP: rxrpc_put_call+0x1e2/0x32d PGD 0 P4D 0 Oops: 0000 [#1] SMP Modules linked in: kafs(E-) CPU: 3 PID: 3037 Comm: rmmod Tainted: G E 4.12.0-fscache+ #213 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 task: ffff8803fc92e2c0 task.stack: ffff8803fef74000 RIP: 0010:rxrpc_put_call+0x1e2/0x32d RSP: 0018:ffff8803fef77e08 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8803fab99ac0 RCX: 000000000000000f RDX: ffffffff81c50a40 RSI: 000000000000000c RDI: ffff8803fc92ea88 RBP: ffff8803fef77e30 R08: ffff8803fc87b941 R09: ffffffff82946d20 R10: ffff8803fef77d10 R11: 00000000000076fc R12: 0000000000000005 R13: ffff8803fab99c20 R14: 0000000000000001 R15: ffffffff816c6aee FS: 00007f915a059700(0000) GS:ffff88041fb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000030 CR3: 00000003fef39000 CR4: 00000000001406e0 Call Trace: rxrpc_discard_prealloc+0x325/0x341 rxrpc_listen+0xf9/0x146 kernel_listen+0xb/0xd afs_close_socket+0x3e/0x173 [kafs] afs_exit+0x1f/0x57 [kafs] SyS_delete_module+0x10f/0x19a do_syscall_64+0x8a/0x149 entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing") Signed-off-by: David Howells Signed-off-by: David S. 
Miller --- net/rxrpc/call_accept.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index dd30d74824b0de..ec3383f97d4c3e 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -223,6 +223,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->call_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; + call->socket = rx; if (rx->discard_new_call) { _debug("discard %lx", call->user_call_ID); rx->discard_new_call(call, call->user_call_ID); From 4f8a881acc9d1adaf1e552349a0b1df28933a04c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Aug 2017 11:01:36 +0800 Subject: [PATCH 143/154] net: sched: fix NULL pointer dereference when action calls some targets As we know in some target's checkentry it may dereference par.entryinfo to check entry stuff inside. But when sched action calls xt_check_target, par.entryinfo is set with NULL. It would cause kernel panic when calling some targets. It can be reproduced with: # tc qd add dev eth1 ingress handle ffff: # tc filter add dev eth1 parent ffff: u32 match u32 0 0 action xt \ -j ECN --ecn-tcp-remove It could also crash kernel when using target CLUSTERIP or TPROXY. By now there's no proper value for par.entryinfo in ipt_init_target, but it can not be set with NULL. This patch is to avoid all these panics by setting it with an ipt_entry obj with all members = 0. Note that this issue has been there since the very beginning. Signed-off-by: Xin Long Acked-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- net/sched/act_ipt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d516ba8178b809..541707802a2380 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -41,6 +41,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t, { struct xt_tgchk_param par; struct xt_target *target; + struct ipt_entry e = {}; int ret = 0; target = xt_request_find_target(AF_INET, t->u.user.name, @@ -52,6 +53,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t, memset(&par, 0, sizeof(par)); par.net = net; par.table = table; + par.entryinfo = &e; par.target = target; par.targinfo = t->data; par.hook_mask = hook; From 8fbbe2d7cc478d1544f41f2271787c993c23a4f6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 19 Aug 2017 12:57:51 +0300 Subject: [PATCH 144/154] genirq/ipi: Fixup checks against nr_cpu_ids Valid CPU ids are [0, nr_cpu_ids-1] inclusive. Fixes: 3b8e29a82dd1 ("genirq: Implement ipi_send_mask/single()") Fixes: f9bce791ae2a ("genirq: Add a new function to get IPI reverse mapping") Signed-off-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170819095751.GB27864@avx2 --- kernel/irq/ipi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 1a9abc1c8ea004..259a22aa9934cf 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) struct irq_data *data = irq_get_irq_data(irq); struct cpumask *ipimask = data ? 
irq_data_get_affinity_mask(data) : NULL; - if (!data || !ipimask || cpu > nr_cpu_ids) + if (!data || !ipimask || cpu >= nr_cpu_ids) return INVALID_HWIRQ; if (!cpumask_test_cpu(cpu, ipimask)) @@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, if (!chip->ipi_send_single && !chip->ipi_send_mask) return -EINVAL; - if (cpu > nr_cpu_ids) + if (cpu >= nr_cpu_ids) return -EINVAL; if (dest) { From 197e7e521384a23b9e585178f3f11c9fa08274b9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Aug 2017 13:26:27 -0700 Subject: [PATCH 145/154] Sanitize 'move_pages()' permission checks The 'move_pages()' system call was introduced long long ago with the same permission checks as for sending a signal (except using CAP_SYS_NICE instead of CAP_KILL for the overriding capability). That turns out to not be a great choice - while the system call really only moves physical page allocations around (and you need other capabilities to do a lot of it), you can check the return value to map out some of the virtual address choices and defeat ASLR of a binary that still shares your uid. So change the access checks to the more common 'ptrace_may_access()' model instead. This tightens the access checks for the uid, and also effectively changes the CAP_SYS_NICE check to CAP_SYS_PTRACE, but it's unlikely that anybody really _uses_ this legacy system call any more (we have better NUMA placement models these days), so I expect nobody to notice. Famous last words. Reported-by: Otto Ebeling Acked-by: Eric W. 
Biederman Cc: Willy Tarreau Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/migrate.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d68a41da6abb07..e84eeb4e43566c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -1652,7 +1653,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, const int __user *, nodes, int __user *, status, int, flags) { - const struct cred *cred = current_cred(), *tcred; struct task_struct *task; struct mm_struct *mm; int err; @@ -1676,14 +1676,9 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, /* * Check if this process has the right to modify the specified - * process. The right exists if the process has administrative - * capabilities, superuser privileges or the same - * userid as the target process. + * process. Use the regular "ptrace_may_access()" checks. */ - tcred = __task_cred(task); - if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && - !capable(CAP_SYS_NICE)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out; From 14ccee78fc82f5512908f4424f541549a5705b89 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Aug 2017 14:13:52 -0700 Subject: [PATCH 146/154] Linux 4.13-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 90da7bdc3f4552..235826f957411e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Fearless Coyote # *DOCUMENTATION* From d4dd2d75a26ef07cadc2949efeea9fabc2a5c299 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 21 Aug 2017 00:26:03 +0200 Subject: [PATCH 147/154] bpf, doc: also add s390x as arch to sysctl description Looks like this was 
accidentally missed, so still add s390x as supported eBPF JIT arch to bpf_jit_enable. Fixes: 014cd0a368dc ("bpf: Update sysctl documentation to list all supported architectures") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index d7c2b88b92ae80..28596e03220b55 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -49,6 +49,7 @@ two flavors of JITs, the newer eBPF JIT currently supported on: - ppc64 - sparc64 - mips64 + - s390x And the older cBPF JIT supported on the following archs: - arm From 5a78449810b06c3bc5fcd002d52e1a64f9bb397e Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Mon, 21 Aug 2017 08:52:54 +1200 Subject: [PATCH 148/154] switchdev: documentation: minor typo fixes Two typos in switchdev.txt Signed-off-by: Chris Packham Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 3e7b946dea2778..5e40e1f68873b0 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -228,7 +228,7 @@ Learning on the device port should be enabled, as well as learning_sync: bridge link set dev DEV learning on self bridge link set dev DEV learning_sync on self -Learning_sync attribute enables syncing of the learned/forgotton FDB entry to +Learning_sync attribute enables syncing of the learned/forgotten FDB entry to the bridge's FDB. It's possible, but not optimal, to enable learning on the device port and on the bridge port, and disable learning_sync. @@ -245,7 +245,7 @@ the responsibility of the port driver/device to age out these entries. 
If the port device supports ageing, when the FDB entry expires, it will notify the driver which in turn will notify the bridge with SWITCHDEV_FDB_DEL. If the device does not support ageing, the driver can simulate ageing using a -garbage collection timer to monitor FBD entries. Expired entries will be +garbage collection timer to monitor FDB entries. Expired entries will be notified to the bridge using SWITCHDEV_FDB_DEL. See rocker driver for example of driver running ageing timer. From 49bf4b36fdee4db3c8bc0507a8413a93a8c711cf Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Sun, 20 Aug 2017 21:48:14 +0200 Subject: [PATCH 149/154] tools lib bpf: improve warning Signed-off-by: Eric Leblond Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- tools/lib/bpf/libbpf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1a2c07eb7795bb..8c67a90dbd8229 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -879,7 +879,8 @@ bpf_object__create_maps(struct bpf_object *obj) size_t j; int err = *pfd; - pr_warning("failed to create map: %s\n", + pr_warning("failed to create map (name: '%s'): %s\n", + obj->maps[i].name, strerror(errno)); for (j = 0; j < i; j++) zclose(obj->maps[j].fd); From 68a66d149a8c78ec6720f268597302883e48e9fa Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 19 Aug 2017 15:37:07 +0300 Subject: [PATCH 150/154] net_sched: fix order of queue length updates in qdisc_replace() It is important to call qdisc_tree_reduce_backlog() after changing queue length. Parent qdisc should deactivate class in ->qlen_notify() called from qdisc_tree_reduce_backlog() but this happens only if qdisc->q.qlen is zero. Missed class deactivations lead to crashes/warnings at picking packets from empty qdisc and corrupting state at reactivating this class in future. 
Signed-off-by: Konstantin Khlebnikov Fixes: 86a7996cc8a0 ("net_sched: introduce qdisc_replace() helper") Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 1c123e2b241579..67f815e5d52517 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -806,8 +806,11 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, old = *pold; *pold = new; if (old != NULL) { - qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog); + unsigned int qlen = old->q.qlen; + unsigned int backlog = old->qstats.backlog; + qdisc_reset(old); + qdisc_tree_reduce_backlog(old, qlen, backlog); } sch_tree_unlock(sch); From 348a4002729ccab8b888b38cbc099efa2f2a2036 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 18 Aug 2017 17:14:49 -0700 Subject: [PATCH 151/154] ipv6: repair fib6 tree in failure case In fib6_add(), it is possible that fib6_add_1() picks an intermediate node and sets the node's fn->leaf to NULL in order to add this new route. However, if fib6_add_rt2node() fails to add the new route for some reason, fn->leaf will be left as NULL and could potentially cause crash when fn->leaf is accessed in fib6_locate(). This patch makes sure fib6_repair_tree() is called to properly repair fn->leaf in the above failure case. 
Here is the syzkaller reported general protection fault in fib6_locate: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 0 PID: 40937 Comm: syz-executor3 Not tainted Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d7d64100 ti: ffff8801d01a0000 task.ti: ffff8801d01a0000 RIP: 0010:[] [] __ipv6_prefix_equal64_half include/net/ipv6.h:475 [inline] RIP: 0010:[] [] ipv6_prefix_equal include/net/ipv6.h:492 [inline] RIP: 0010:[] [] fib6_locate_1 net/ipv6/ip6_fib.c:1210 [inline] RIP: 0010:[] [] fib6_locate+0x281/0x3c0 net/ipv6/ip6_fib.c:1233 RSP: 0018:ffff8801d01a36a8 EFLAGS: 00010202 RAX: 0000000000000020 RBX: ffff8801bc790e00 RCX: ffffc90002983000 RDX: 0000000000001219 RSI: ffff8801d01a37a0 RDI: 0000000000000100 RBP: ffff8801d01a36f0 R08: 00000000000000ff R09: 0000000000000000 R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001 R13: dffffc0000000000 R14: ffff8801d01a37a0 R15: 0000000000000000 FS: 00007f6afd68c700(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004c6340 CR3: 00000000ba41f000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: ffff8801d01a37a8 ffff8801d01a3780 ffffed003a0346f5 0000000c82a23ea0 ffff8800b7bd7700 ffff8801d01a3780 ffff8800b6a1c940 ffffffff82a23ea0 ffff8801d01a3920 ffff8801d01a3748 ffffffff82a223d6 ffff8801d7d64988 Call Trace: [] ip6_route_del+0x106/0x570 net/ipv6/route.c:2109 [] inet6_rtm_delroute+0xfd/0x100 net/ipv6/route.c:3075 [] rtnetlink_rcv_msg+0x549/0x7a0 net/core/rtnetlink.c:3450 [] netlink_rcv_skb+0x141/0x370 net/netlink/af_netlink.c:2281 [] rtnetlink_rcv+0x2f/0x40 net/core/rtnetlink.c:3456 [] netlink_unicast_kernel net/netlink/af_netlink.c:1206 [inline] [] 
netlink_unicast+0x518/0x750 net/netlink/af_netlink.c:1232 [] netlink_sendmsg+0x8ce/0xc30 net/netlink/af_netlink.c:1778 [] sock_sendmsg_nosec net/socket.c:609 [inline] [] sock_sendmsg+0xcf/0x110 net/socket.c:619 [] sock_write_iter+0x222/0x3a0 net/socket.c:834 [] new_sync_write+0x1dd/0x2b0 fs/read_write.c:478 [] __vfs_write+0xe4/0x110 fs/read_write.c:491 [] vfs_write+0x178/0x4b0 fs/read_write.c:538 [] SYSC_write fs/read_write.c:585 [inline] [] SyS_write+0xd9/0x1b0 fs/read_write.c:577 [] entry_SYSCALL_64_fastpath+0x12/0x17 Note: there is no "Fixes" tag as this seems to be a bug introduced very early. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e6f878067c6834..5cc0ea0381981b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1018,7 +1018,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Create subtree root node */ sfn = node_alloc(); if (!sfn) - goto st_failure; + goto failure; sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); @@ -1035,12 +1035,12 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (IS_ERR(sn)) { /* If it is failed, discard just allocated - root, and then (in st_failure) stale node + root, and then (in failure) stale node in main tree. 
*/ node_free(sfn); err = PTR_ERR(sn); - goto st_failure; + goto failure; } /* Now link new subtree to main tree */ @@ -1055,7 +1055,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (IS_ERR(sn)) { err = PTR_ERR(sn); - goto st_failure; + goto failure; } } @@ -1096,18 +1096,17 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, atomic_inc(&pn->leaf->rt6i_ref); } #endif - /* Always release dst as dst->__refcnt is guaranteed - * to be taken before entering this function - */ - dst_release_immediate(&rt->dst); + goto failure; } return err; -#ifdef CONFIG_IPV6_SUBTREES - /* Subtree creation failed, probably main tree node - is orphan. If it is, shoot it. +failure: + /* fn->leaf could be NULL if fn is an intermediate node and we + * failed to add the new route to it in both subtree creation + * failure and fib6_add_rt2node() failure case. + * In both cases, fib6_repair_tree() should be called to fix + * fn->leaf. */ -st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); /* Always release dst as dst->__refcnt is guaranteed @@ -1115,7 +1114,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, */ dst_release_immediate(&rt->dst); return err; -#endif } /* From 03619844d81d7459874e88034f0f36d959f0e2df Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 25 Jul 2017 18:44:04 +0200 Subject: [PATCH 152/154] rtc: ds1307: fix regmap config Current max_register setting breaks reading nvram on certain chips and also reading the standard registers on RX8130 where register map starts at 0x10. 
Signed-off-by: Heiner Kallweit Fixes: 11e5890b5342 "rtc: ds1307: convert driver to regmap" Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1307.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 4fac49e55d473e..4b43aa62fbc789 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -1301,7 +1301,6 @@ static void ds1307_clks_register(struct ds1307 *ds1307) static const struct regmap_config regmap_config = { .reg_bits = 8, .val_bits = 8, - .max_register = 0x12, }; static int ds1307_probe(struct i2c_client *client, From dd1c1f2f2028a7b851f701fc6a8ebe39dcb95e7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2017 17:35:02 +0200 Subject: [PATCH 153/154] pids: make task_tgid_nr_ns() safe This was reported many times, and this was even mentioned in commit 52ee2dfdd4f5 ("pids: refactor vnr/nr_ns helpers to make them safe") but somehow nobody bothered to fix the obvious problem: task_tgid_nr_ns() is not safe because task->group_leader points to nowhere after the exiting task passes exit_notify(), rcu_read_lock() can not help. We really need to change __unhash_process() to nullify group_leader, parent, and real_parent, but this needs some cleanups. Until then we can turn task_tgid_nr_ns() into another user of __task_pid_nr_ns() and fix the problem. 
Reported-by: Troy Kensinger Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- include/linux/pid.h | 4 +++- include/linux/sched.h | 51 +++++++++++++++++++++++-------------------- kernel/pid.c | 11 ++++------ 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 4d179316e43108..719582744a2e87 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -8,7 +8,9 @@ enum pid_type PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_SID, - PIDTYPE_MAX + PIDTYPE_MAX, + /* only valid to __task_pid_nr_ns() */ + __PIDTYPE_TGID }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2e7..c05ac5f5aa034d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1163,13 +1163,6 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk) return tsk->tgid; } -extern pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return pid_vnr(task_tgid(tsk)); -} - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. 
@@ -1185,23 +1178,6 @@ static inline int pid_alive(const struct task_struct *p) return p->pids[PIDTYPE_PID].pid != NULL; } -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; - - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} - -static inline pid_t task_ppid_nr(const struct task_struct *tsk) -{ - return task_ppid_nr_ns(tsk, &init_pid_ns); -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); @@ -1223,6 +1199,33 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { diff --git a/kernel/pid.c b/kernel/pid.c index c69c30d827e5a2..020dedbdf066bc 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), 
ns); } rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); From 2dc77533f1e495788d73ffa4bee4323b2646d2bb Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Sun, 13 Aug 2017 23:14:58 +0200 Subject: [PATCH 154/154] sparc: kernel/pcic: silence gcc 7.x warning in pcibios_fixup_bus() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building the kernel for Sparc using gcc 7.x, the build fails with: arch/sparc/kernel/pcic.c: In function ‘pcibios_fixup_bus’: arch/sparc/kernel/pcic.c:647:8: error: ‘cmd’ may be used uninitialized in this function [-Werror=maybe-uninitialized] cmd |= PCI_COMMAND_IO; ^~ The simplified code looks like this: unsigned int cmd; [...] pcic_read_config(dev->bus, dev->devfn, PCI_COMMAND, 2, &cmd); [...] cmd |= PCI_COMMAND_IO; I.e, the code assumes that pcic_read_config() will always initialize cmd. But it's not the case. Looking at pcic_read_config(), if bus->number is != 0 or if the size is not one of 1, 2 or 4, *val will not be initialized. As a simple fix, we initialize cmd to zero at the beginning of pcibios_fixup_bus. Signed-off-by: Thomas Petazzoni Signed-off-by: David S. 
Miller --- arch/sparc/kernel/pcic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index a38787b8432208..732af9a9f6ddef 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -602,7 +602,7 @@ void pcibios_fixup_bus(struct pci_bus *bus) { struct pci_dev *dev; int i, has_io, has_mem; - unsigned int cmd; + unsigned int cmd = 0; struct linux_pcic *pcic; /* struct linux_pbm_info* pbm = &pcic->pbm; */ int node;