From bbb07af4cdfd0c154db4c636927bd262f71c6401 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 30 Sep 2014 12:03:47 +0300 Subject: [PATCH 1/4] net/mlx4_core: Don't disable SRIOV if there are active VFs When unloading the host driver while there are VFs active on VMs, the PF driver disabled sriov anyway, causing kernel crashes. We now leave SRIOV enabled, to avoid that. When the driver is reloaded, __mlx4_init_one is invoked on the PF. It now checks to see if SRIOV is already enabled on the PF -- and if so does not enable sriov again. Signed-off-by: Tal Alon Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 28 +++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 1f10023af1db..4e9857b409f3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2272,6 +2272,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) unsigned total_vfs = 0; int sriov_initialized = 0; unsigned int i; + int existing_vfs = 0; pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); @@ -2431,7 +2432,15 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) err = 0; } else { atomic_inc(&pf_loading); - err = pci_enable_sriov(pdev, total_vfs); + existing_vfs = pci_num_vf(pdev); + if (existing_vfs) { + err = 0; + if (existing_vfs != total_vfs) + mlx4_err(dev, "SR-IOV was already enabled, but with num_vfs (%d) different than requested (%d)\n", + existing_vfs, total_vfs); + } else { + err = pci_enable_sriov(pdev, total_vfs); + } if (err) { mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d)\n", err); @@ -2645,7 +2654,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) mlx4_cmd_cleanup(dev); err_sriov: - if (dev->flags & MLX4_FLAG_SRIOV) + if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) pci_disable_sriov(pdev); err_rel_own: @@ -2693,16 +2702,21 @@ static void __mlx4_remove_one(struct pci_dev *pdev) struct mlx4_priv *priv = mlx4_priv(dev); int pci_dev_data; int p; + int active_vfs = 0; if (priv->removed) return; pci_dev_data = priv->pci_dev_data; - /* in SRIOV it is not allowed to unload the pf's - * driver while there are alive vf's */ - if (mlx4_is_master(dev) && mlx4_how_many_lives_vf(dev)) - pr_warn("Removing PF when there are assigned VF's !!!\n"); + /* Disabling SR-IOV is not allowed while there are active vf's */ + if (mlx4_is_master(dev)) { + active_vfs = mlx4_how_many_lives_vf(dev); + if (active_vfs) { + pr_warn("Removing PF when there are active VF's !!\n"); + pr_warn("Will not disable SR-IOV.\n"); + } + } mlx4_stop_sense(dev); mlx4_unregister_device(dev); @@ -2745,7 +2759,7 @@ static void __mlx4_remove_one(struct pci_dev *pdev) if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); - if (dev->flags & MLX4_FLAG_SRIOV) { + if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) { mlx4_warn(dev, "Disabling SR-IOV\n"); pci_disable_sriov(pdev); dev->num_vfs = 0; From e1c00e10e92c04aa637126db2e59b092bd4878f8 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Tue, 30 Sep 2014 12:03:48 +0300 Subject: [PATCH 2/4] net/mlx4_core: New init and exit flow for mlx4_core In the new flow, we separate the pci initialization and teardown from the initialization and teardown of the other resources. __mlx4_init_one handles the pci resources initialization. It then calls mlx4_load_one to initialize the remainder of the resources. When removing a device, mlx4_remove_one is invoked. However, now mlx4_remove_one calls mlx4_unload_one to free all the resources except the pci resources. When mlx4_unload_one returns, mlx4_remove_one then frees the pci resources. The above separation will allow us to implement 'reset flow' in the future. It will also enable more EQs for VFs and is a pre-step to the modern API to enable/disable SRIOV. Also added nvfs; an integer array of size MLX4_MAX_PORTS + 1; to the mlx4_dev struct. This new field is used to avoid parsing the num_vfs module parameter each time the mlx4_restart_one is called. Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 393 ++++++++++++---------- include/linux/mlx4/device.h | 1 + 2 files changed, 220 insertions(+), 174 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 4e9857b409f3..f6c32a947185 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2259,116 +2259,18 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) iounmap(owner); } -static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) +static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, + int total_vfs, int *nvfs, struct mlx4_priv *priv) { - struct mlx4_priv *priv; struct mlx4_dev *dev; + unsigned sum = 0; int err; int port; - int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; - int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0}; - const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = { - {2, 0, 0}, {0, 1, 2}, {0, 1, 2} }; - unsigned total_vfs = 0; - int sriov_initialized = 0; - unsigned int i; + int i; int existing_vfs = 0; - pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); - - err = pci_enable_device(pdev); - if (err) { - dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); - return err; - } - - /* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS - * per port, we must limit the number of VFs to 63 (since their are - * 128 MACs) - */ - for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && i < num_vfs_argc; - total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) { - nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i]; - if (nvfs[i] < 0) { - dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); - return -EINVAL; - } - } - for (i = 0; i < sizeof(prb_vf)/sizeof(prb_vf[0]) && i < probe_vfs_argc; - i++) { - prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i]; - if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) { - dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n"); - return -EINVAL; - } - } - if (total_vfs >= MLX4_MAX_NUM_VF) { - dev_err(&pdev->dev, - "Requested more VF's (%d) than allowed (%d)\n", - total_vfs, MLX4_MAX_NUM_VF - 1); - return -EINVAL; - } - - for (i = 0; i < MLX4_MAX_PORTS; i++) { - if (nvfs[i] + nvfs[2] >= MLX4_MAX_NUM_VF_P_PORT) { - dev_err(&pdev->dev, - "Requested more VF's (%d) for port (%d) than allowed (%d)\n", - nvfs[i] + nvfs[2], i + 1, - MLX4_MAX_NUM_VF_P_PORT - 1); - return -EINVAL; - } - } - - - /* - * Check for BARs. - */ - if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && - !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { - dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n", - pci_dev_data, pci_resource_flags(pdev, 0)); - err = -ENODEV; - goto err_disable_pdev; - } - if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { - dev_err(&pdev->dev, "Missing UAR, aborting\n"); - err = -ENODEV; - goto err_disable_pdev; - } - - err = pci_request_regions(pdev, DRV_NAME); - if (err) { - dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); - goto err_disable_pdev; - } - - pci_set_master(pdev); + dev = &priv->dev; - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) { - dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); - goto err_release_regions; - } - } - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) { - dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n"); - goto err_release_regions; - } - } - - /* Allow large DMA segments, up to the firmware limit of 1 GB */ - dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); - - dev = pci_get_drvdata(pdev); - priv = mlx4_priv(dev); - dev->pdev = pdev; INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); @@ -2382,28 +2284,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) dev->rev_id = pdev->revision; dev->numa_node = dev_to_node(&pdev->dev); + /* Detect if this device is a virtual function */ if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { - /* When acting as pf, we normally skip vfs unless explicitly - * requested to probe them. */ - if (total_vfs) { - unsigned vfs_offset = 0; - for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && - vfs_offset + nvfs[i] < extended_func_num(pdev); - vfs_offset += nvfs[i], i++) - ; - if (i == sizeof(nvfs)/sizeof(nvfs[0])) { - err = -ENODEV; - goto err_free_dev; - } - if ((extended_func_num(pdev) - vfs_offset) - > prb_vf[i]) { - mlx4_warn(dev, "Skipping virtual function:%d\n", - extended_func_num(pdev)); - err = -ENODEV; - goto err_free_dev; - } - } mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); dev->flags |= MLX4_FLAG_SLAVE; } else { @@ -2413,11 +2296,10 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) err = mlx4_get_ownership(dev); if (err) { if (err < 0) - goto err_free_dev; + return err; else { mlx4_warn(dev, "Multiple PFs not yet supported - Skipping PF\n"); - err = -EINVAL; - goto err_free_dev; + return -EINVAL; } } @@ -2429,7 +2311,8 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) GFP_KERNEL); if (NULL == dev->dev_vfs) { mlx4_err(dev, "Failed to allocate memory for VFs\n"); - err = 0; + err = -ENOMEM; + goto err_free_own; } else { atomic_inc(&pf_loading); existing_vfs = pci_num_vf(pdev); @@ -2445,13 +2328,11 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d)\n", err); atomic_dec(&pf_loading); - err = 0; } else { mlx4_warn(dev, "Running in master mode\n"); dev->flags |= MLX4_FLAG_SRIOV | MLX4_FLAG_MASTER; dev->num_vfs = total_vfs; - sriov_initialized = 1; } } } @@ -2467,7 +2348,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) err = mlx4_reset(dev); if (err) { mlx4_err(dev, "Failed to reset HCA, aborting\n"); - goto err_rel_own; + goto err_sriov; } } @@ -2517,34 +2398,46 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) /* In master functions, the communication channel must be initialized * after obtaining its address from fw */ if (mlx4_is_master(dev)) { - unsigned sum = 0; - err = mlx4_multi_func_init(dev); - if (err) { - mlx4_err(dev, "Failed to init master mfunc interface, aborting\n"); + int ib_ports = 0; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_ports++; + + if (ib_ports && + (num_vfs_argc > 1 || probe_vfs_argc > 1)) { + mlx4_err(dev, + "Invalid syntax of num_vfs/probe_vfs with IB port - single port VFs syntax is only supported when all ports are configured as ethernet\n"); + err = -EINVAL; + goto err_close; + } + if (dev->caps.num_ports < 2 && + num_vfs_argc > 1) { + err = -EINVAL; + mlx4_err(dev, + "Error: Trying to configure VFs on port 2, but HCA has only %d physical ports\n", + dev->caps.num_ports); goto err_close; } - if (sriov_initialized) { - int ib_ports = 0; - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) - ib_ports++; + memcpy(dev->nvfs, nvfs, sizeof(dev->nvfs)); - if (ib_ports && - (num_vfs_argc > 1 || probe_vfs_argc > 1)) { - mlx4_err(dev, - "Invalid syntax of num_vfs/probe_vfs with IB port - single port VFs syntax is only supported when all ports are configured as ethernet\n"); - err = -EINVAL; - goto err_master_mfunc; - } - for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]); i++) { - unsigned j; - for (j = 0; j < nvfs[i]; ++sum, ++j) { - dev->dev_vfs[sum].min_port = - i < 2 ? i + 1 : 1; - dev->dev_vfs[sum].n_ports = i < 2 ? 1 : - dev->caps.num_ports; - } + for (i = 0; i < sizeof(dev->nvfs)/sizeof(dev->nvfs[0]); i++) { + unsigned j; + + for (j = 0; j < dev->nvfs[i]; ++sum, ++j) { + dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1; + dev->dev_vfs[sum].n_ports = i < 2 ? 1 : + dev->caps.num_ports; } } + + /* In master functions, the communication channel + * must be initialized after obtaining its address from fw + */ + err = mlx4_multi_func_init(dev); + if (err) { + mlx4_err(dev, "Failed to init master mfunc interface, aborting.\n"); + goto err_close; + } } err = mlx4_alloc_eq_table(dev); @@ -2565,7 +2458,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) if (!mlx4_is_slave(dev)) { err = mlx4_init_steering(dev); if (err) - goto err_free_eq; + goto err_disable_msix; } err = mlx4_setup_hca(dev); @@ -2625,6 +2518,10 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) if (!mlx4_is_slave(dev)) mlx4_clear_steering(dev); +err_disable_msix: + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + err_free_eq: mlx4_free_eq_table(dev); @@ -2641,9 +2538,6 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) } err_close: - if (dev->flags & MLX4_FLAG_MSI_X) - pci_disable_msix(pdev); - mlx4_close_hca(dev); err_mfunc: @@ -2657,17 +2551,151 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) pci_disable_sriov(pdev); -err_rel_own: - if (!mlx4_is_slave(dev)) - mlx4_free_ownership(dev); - if (mlx4_is_master(dev) && dev->num_vfs) atomic_dec(&pf_loading); kfree(priv->dev.dev_vfs); -err_free_dev: - kfree(priv); +err_free_own: + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + return err; +} + +static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, + struct mlx4_priv *priv) +{ + int err; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = { + {2, 0, 0}, {0, 1, 2}, {0, 1, 2} }; + unsigned total_vfs = 0; + unsigned int i; + + pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); + return err; + } + + /* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS + * per port, we must limit the number of VFs to 63 (since their are + * 128 MACs) + */ + for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && i < num_vfs_argc; + total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) { + nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i]; + if (nvfs[i] < 0) { + dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + for (i = 0; i < sizeof(prb_vf)/sizeof(prb_vf[0]) && i < probe_vfs_argc; + i++) { + prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i]; + if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) { + dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + if (total_vfs >= MLX4_MAX_NUM_VF) { + dev_err(&pdev->dev, + "Requested more VF's (%d) than allowed (%d)\n", + total_vfs, MLX4_MAX_NUM_VF - 1); + err = -EINVAL; + goto err_disable_pdev; + } + + for (i = 0; i < MLX4_MAX_PORTS; i++) { + if (nvfs[i] + nvfs[2] >= MLX4_MAX_NUM_VF_P_PORT) { + dev_err(&pdev->dev, + "Requested more VF's (%d) for port (%d) than allowed (%d)\n", + nvfs[i] + nvfs[2], i + 1, + MLX4_MAX_NUM_VF_P_PORT - 1); + err = -EINVAL; + goto err_disable_pdev; + } + } + + /* Check for BARs. */ + if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && + !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n", + pci_dev_data, pci_resource_flags(pdev, 0)); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing UAR, aborting\n"); + err = -ENODEV; + goto err_disable_pdev; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + + /* Allow large DMA segments, up to the firmware limit of 1 GB */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + /* Detect if this device is a virtual function */ + if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { + /* When acting as pf, we normally skip vfs unless explicitly + * requested to probe them. + */ + if (total_vfs) { + unsigned vfs_offset = 0; + + for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && + vfs_offset + nvfs[i] < extended_func_num(pdev); + vfs_offset += nvfs[i], i++) + ; + if (i == sizeof(nvfs)/sizeof(nvfs[0])) { + err = -ENODEV; + goto err_release_regions; + } + if ((extended_func_num(pdev) - vfs_offset) + > prb_vf[i]) { + dev_warn(&pdev->dev, "Skipping virtual function:%d\n", + extended_func_num(pdev)); + err = -ENODEV; + goto err_release_regions; + } + } + } + + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) + goto err_release_regions; + return 0; err_release_regions: pci_release_regions(pdev); @@ -2682,6 +2710,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { struct mlx4_priv *priv; struct mlx4_dev *dev; + int ret; printk_once(KERN_INFO "%s", mlx4_version); @@ -2690,13 +2719,18 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; dev = &priv->dev; + dev->pdev = pdev; pci_set_drvdata(pdev, dev); priv->pci_dev_data = id->driver_data; - return __mlx4_init_one(pdev, id->driver_data); + ret = __mlx4_init_one(pdev, id->driver_data, priv); + if (ret) + kfree(priv); + + return ret; } -static void __mlx4_remove_one(struct pci_dev *pdev) +static void mlx4_unload_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); @@ -2775,8 +2809,6 @@ static void __mlx4_remove_one(struct pci_dev *pdev) kfree(dev->caps.qp1_proxy); kfree(dev->dev_vfs); - pci_release_regions(pdev); - pci_disable_device(pdev); memset(priv, 0, sizeof(*priv)); priv->pci_dev_data = pci_dev_data; priv->removed = 1; @@ -2787,7 +2819,9 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); - __mlx4_remove_one(pdev); + mlx4_unload_one(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); kfree(priv); pci_set_drvdata(pdev, NULL); } @@ -2796,11 +2830,22 @@ int mlx4_restart_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); - int pci_dev_data; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int pci_dev_data, err, total_vfs; pci_dev_data = priv->pci_dev_data; - __mlx4_remove_one(pdev); - return __mlx4_init_one(pdev, pci_dev_data); + total_vfs = dev->num_vfs; + memcpy(nvfs, dev->nvfs, sizeof(dev->nvfs)); + + mlx4_unload_one(pdev); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) { + mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n", + __func__, pci_name(pdev), err); + return err; + } + + return err; } static const struct pci_device_id mlx4_pci_table[] = { @@ -2854,7 +2899,7 @@ MODULE_DEVICE_TABLE(pci, mlx4_pci_table); static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { - __mlx4_remove_one(pdev); + mlx4_unload_one(pdev); return state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; @@ -2866,7 +2911,7 @@ static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) struct mlx4_priv *priv = mlx4_priv(dev); int ret; - ret = __mlx4_init_one(pdev, priv->pci_dev_data); + ret = __mlx4_init_one(pdev, priv->pci_dev_data, priv); return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } @@ -2880,7 +2925,7 @@ static struct pci_driver mlx4_driver = { .name = DRV_NAME, .id_table = mlx4_pci_table, .probe = mlx4_init_one, - .shutdown = __mlx4_remove_one, + .shutdown = mlx4_unload_one, .remove = mlx4_remove_one, .err_handler = &mlx4_err_handler, }; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 03b5608a4329..b2f8ab9a57c4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -707,6 +707,7 @@ struct mlx4_dev { u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; struct mlx4_vf_dev *dev_vfs; + int nvfs[MLX4_MAX_PORTS + 1]; }; struct mlx4_eqe { From a7401b9cf342775921f7b8eb2f9cedcaf004a929 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 30 Sep 2014 12:03:49 +0300 Subject: [PATCH 3/4] net/mlx4_core: Protect QUERY_PORT wrapper from untrusted guests The function mlx4_QUERY_PORT_wrapper implements only the QUERY_PORT "general" case (opcode modifier = 0). Verify that the opcode modifier is zero, and also that the input modifier contains only the port number in bits 0..7 (all other bits should be zero). Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 13b2e4a51ef4..2e88a235e26b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -982,8 +982,13 @@ int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, if (port < 0) return -EINVAL; - vhcr->in_modifier = (vhcr->in_modifier & ~0xFF) | - (port & 0xFF); + /* Protect against untrusted guests: enforce that this is the + * QUERY_PORT general query. + */ + if (vhcr->op_modifier || vhcr->in_modifier & ~0xFF) + return -EINVAL; + + vhcr->in_modifier = port; err = mlx4_cmd_box(dev, 0, outbox->dma, vhcr->in_modifier, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, From 1daa4303b4caceda802949b1b188442870193764 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 30 Sep 2014 12:03:50 +0300 Subject: [PATCH 4/4] net/mlx4_core: Deprecate error message at ConnectX-2 cards startup to debug ConnectX2 HCAs have max_mtu=4k and max_vl=8 vls. However, if you specify a 4K mtu, the max_vl supported for 4K is 4 vls. The driver at startup attempts to set a 4K mtu using the max_vl value obtained from QUERY_PORT. Since the max_vl value is 8 vls (which is supported up to 2K mtu size), the first attempt to set the mtl/vl port value will fail, generating the following error message in the log: mlx4_core 0000:06:00.0: command 0xc failed: fw status = 0x40 The driver then tries again, using mtu=4k, vls=4, and this succeeds. Since we do not want to have this error message always displayed at driver start when there are ConnectX2 HCAs on the host, we deprecate the error message for this specific command/input_modifier/opcode_modifier/fw-status to be debug. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 02a2e90d581a..436c82f64304 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -580,8 +580,18 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, err = context->result; if (err) { - mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", - op, context->fw_status); + /* Since we do not want to have this error message always + * displayed at driver start when there are ConnectX2 HCAs + * on the host, we deprecate the error message for this + * specific command/input_mod/opcode_mod/fw-status to be debug. + */ + if (op == MLX4_CMD_SET_PORT && in_modifier == 1 && + op_modifier == 0 && context->fw_status == CMD_STAT_BAD_SIZE) + mlx4_dbg(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + else + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); goto out; }