From 7d5d408c77cee95d1380511de46b7a4c8dc2211d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 8 Feb 2008 09:50:08 +0900
Subject: [PATCH 01/66] [SCSI] advansys: fix overrun_buf aligned bug

struct asc_dvc_var needs overrun buffer to be placed on an 8 byte
boundary. advansys defines struct asc_dvc_var:

struct asc_dvc_var {
	...
	uchar overrun_buf[ASC_OVERRUN_BSIZE] __aligned(8);

The problem is that struct asc_dvc_var is placed on
shost->hostdata. So if the hostdata is not on an 8 byte boundary, the
advansys crashes. The hostdata is placed on a sizeof(unsigned long)
boundary so the 8 byte boundary is not garanteed with x86_32.

With 2.6.23 and 2.6.24, the hostdata is on an 8 byte boundary by
chance, but with the current git, it's not.

This patch removes overrun_buf static array and use kzalloc.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/advansys.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c
index ccef891d642f..3c2d6888bb8c 100644
--- a/drivers/scsi/advansys.c
+++ b/drivers/scsi/advansys.c
@@ -566,7 +566,7 @@ typedef struct asc_dvc_var {
 	ASC_SCSI_BIT_ID_TYPE unit_not_ready;
 	ASC_SCSI_BIT_ID_TYPE queue_full_or_busy;
 	ASC_SCSI_BIT_ID_TYPE start_motor;
-	uchar overrun_buf[ASC_OVERRUN_BSIZE] __aligned(8);
+	uchar *overrun_buf;
 	dma_addr_t overrun_dma;
 	uchar scsi_reset_wait;
 	uchar chip_no;
@@ -13833,6 +13833,12 @@ static int __devinit advansys_board_found(struct Scsi_Host *shost,
 	 */
 	if (ASC_NARROW_BOARD(boardp)) {
 		ASC_DBG(2, "AscInitAsc1000Driver()\n");
+
+		asc_dvc_varp->overrun_buf = kzalloc(ASC_OVERRUN_BSIZE, GFP_KERNEL);
+		if (!asc_dvc_varp->overrun_buf) {
+			ret = -ENOMEM;
+			goto err_free_wide_mem;
+		}
 		warn_code = AscInitAsc1000Driver(asc_dvc_varp);
 
 		if (warn_code || asc_dvc_varp->err_code) {
@@ -13840,8 +13846,10 @@ static int __devinit advansys_board_found(struct Scsi_Host *shost,
 					"warn 0x%x, error 0x%x\n",
 					asc_dvc_varp->init_state, warn_code,
 					asc_dvc_varp->err_code);
-			if (asc_dvc_varp->err_code)
+			if (asc_dvc_varp->err_code) {
 				ret = -ENODEV;
+				kfree(asc_dvc_varp->overrun_buf);
+			}
 		}
 	} else {
 		if (advansys_wide_init_chip(shost))
@@ -13894,6 +13902,7 @@ static int advansys_release(struct Scsi_Host *shost)
 		dma_unmap_single(board->dev,
 					board->dvc_var.asc_dvc_var.overrun_dma,
 					ASC_OVERRUN_BSIZE, DMA_FROM_DEVICE);
+		kfree(board->dvc_var.asc_dvc_var.overrun_buf);
 	} else {
 		iounmap(board->ioremap_addr);
 		advansys_wide_free_mem(board);

From 90a95af85f22c82f87e5fb714bac7ee06673b0ff Mon Sep 17 00:00:00 2001
From: Thomas Horsten <thomas@horsten.com>
Date: Mon, 4 Feb 2008 23:53:18 -0800
Subject: [PATCH 02/66] [SCSI] MegaRAID driver management char device moved to
 misc

The MegaRAID driver's common management module (megaraid_mm.c) creates a
char device used by the management tool "megarc" from LSI Logic (and
possibly other management tools).

In 2.6 with udev, this device doesn't get created because it is not
registered in sysfs.

I first fixed this by registering a class "megaraid_mm", but realized that
this should probably be moved to misc devices, instead of taking up a char
major.  This is because only 1 device is used, even if there are multiple
adapters - the minor is never used (the adapter info is in the ioctl block
sent to the driver, not detected based on the minor number as one might
think).  So it is a complete waste to have an entire major taken by this.

So it now uses a misc device which I named "megadev0" (the name that megarc
expects), and has a dynamic minor (previoulsy a dynamic major was used).

I have tested this on my own system with the megarc tool, and it works just
as fine as before (only now the device gets created correctly by udev).

Acked-by: "Patro, Sumant" <Sumant.Patro@lsi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/megaraid/megaraid_mm.c | 20 +++++++++++++-------
 drivers/scsi/megaraid/megaraid_mm.h |  1 +
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/megaraid/megaraid_mm.c b/drivers/scsi/megaraid/megaraid_mm.c
index b6587a6d8486..0ad215e27b83 100644
--- a/drivers/scsi/megaraid/megaraid_mm.c
+++ b/drivers/scsi/megaraid/megaraid_mm.c
@@ -59,7 +59,6 @@ EXPORT_SYMBOL(mraid_mm_register_adp);
 EXPORT_SYMBOL(mraid_mm_unregister_adp);
 EXPORT_SYMBOL(mraid_mm_adapter_app_handle);
 
-static int majorno;
 static uint32_t drvr_ver	= 0x02200207;
 
 static int adapters_count_g;
@@ -76,6 +75,12 @@ static const struct file_operations lsi_fops = {
 	.owner	= THIS_MODULE,
 };
 
+static struct miscdevice megaraid_mm_dev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name   = "megadev0",
+	.fops   = &lsi_fops,
+};
+
 /**
  * mraid_mm_open - open routine for char node interface
  * @inode	: unused
@@ -1184,15 +1189,16 @@ mraid_mm_teardown_dma_pools(mraid_mmadp_t *adp)
 static int __init
 mraid_mm_init(void)
 {
+	int err;
+
 	// Announce the driver version
 	con_log(CL_ANN, (KERN_INFO "megaraid cmm: %s %s\n",
 		LSI_COMMON_MOD_VERSION, LSI_COMMON_MOD_EXT_VERSION));
 
-	majorno = register_chrdev(0, "megadev", &lsi_fops);
-
-	if (majorno < 0) {
-		con_log(CL_ANN, ("megaraid cmm: cannot get major\n"));
-		return majorno;
+	err = misc_register(&megaraid_mm_dev);
+	if (err < 0) {
+		con_log(CL_ANN, ("megaraid cmm: cannot register misc device\n"));
+		return err;
 	}
 
 	init_waitqueue_head(&wait_q);
@@ -1230,7 +1236,7 @@ mraid_mm_exit(void)
 {
 	con_log(CL_DLEVEL1 , ("exiting common mod\n"));
 
-	unregister_chrdev(majorno, "megadev");
+	misc_deregister(&megaraid_mm_dev);
 }
 
 module_init(mraid_mm_init);
diff --git a/drivers/scsi/megaraid/megaraid_mm.h b/drivers/scsi/megaraid/megaraid_mm.h
index c8762b2b8ed1..55b425c0a654 100644
--- a/drivers/scsi/megaraid/megaraid_mm.h
+++ b/drivers/scsi/megaraid/megaraid_mm.h
@@ -22,6 +22,7 @@
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
 #include <linux/list.h>
+#include <linux/miscdevice.h>
 
 #include "mbox_defs.h"
 #include "megaraid_ioctl.h"

From 07df8afa0dd54b8b89ad8aa93994c0d55a4a5921 Mon Sep 17 00:00:00 2001
From: "Prakash, Sathya" <sathya.prakash@lsi.com>
Date: Fri, 8 Feb 2008 16:35:40 +0530
Subject: [PATCH 03/66] [SCSI] mpt fusion: Avoid racing when mptsas and mptcl
 module are loaded in parallel

This patch sets the IOC pointer in drvrdata of pcidev before adding
the IOC into the list of IOCs. Without this patch the driver oops when
the mptsas and mptctl modules are loaded in parallel.

Signed-off-by: Sathya Prakash <sathya.prakash@lsi.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/message/fusion/mptbase.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index 425f60c21fdd..d381c38ccbaf 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -1658,6 +1658,9 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id)
 	ioc->active = 0;
 	CHIPREG_WRITE32(&ioc->chip->IntStatus, 0);
 
+	/* Set IOC ptr in the pcidev's driver data. */
+	pci_set_drvdata(ioc->pcidev, ioc);
+
 	/* Set lookup ptr. */
 	list_add_tail(&ioc->list, &ioc_list);
 
@@ -1999,7 +2002,6 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
 			irq_allocated = 1;
 			ioc->pci_irq = ioc->pcidev->irq;
 			pci_set_master(ioc->pcidev);		/* ?? */
-			pci_set_drvdata(ioc->pcidev, ioc);
 			dprintk(ioc, printk(MYIOC_s_INFO_FMT "installed at interrupt "
 			    "%d\n", ioc->name, ioc->pcidev->irq));
 		}

From 8ef2224707996aede1808f40116cd467b7c8d549 Mon Sep 17 00:00:00 2001
From: "Salyzyn, Mark" <Mark_Salyzyn@adaptec.com>
Date: Fri, 8 Feb 2008 05:48:22 -0800
Subject: [PATCH 04/66] [SCSI] aacraid: add optional MSI support

Added support for MSI utilizing the aacraid.msi=1 parameter. This
patch adds some localized or like-minded janitor fixes. Since the
default is disabled, there is no impact on the code paths unless the
customer wishes to experiment with the MSI performance.

Signed-off-by: Mark Salyzyn <aacraid@adaptec.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/aacraid/aachba.c  | 56 +++++++++++++++++++++++++---------
 drivers/scsi/aacraid/aacraid.h |  2 ++
 drivers/scsi/aacraid/linit.c   | 32 ++++++++++---------
 drivers/scsi/aacraid/rx.c      |  5 ++-
 drivers/scsi/aacraid/sa.c      |  5 +--
 5 files changed, 67 insertions(+), 33 deletions(-)

diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index bfd0e64964ac..f8d2ae301283 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -144,51 +144,77 @@ static char *aac_get_status_string(u32 status);
  */
 
 static int nondasd = -1;
-static int aac_cache = 0;
+static int aac_cache;
 static int dacmode = -1;
-
+int aac_msi;
 int aac_commit = -1;
 int startup_timeout = 180;
 int aif_timeout = 120;
 
 module_param(nondasd, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(nondasd, "Control scanning of hba for nondasd devices. 0=off, 1=on");
+MODULE_PARM_DESC(nondasd, "Control scanning of hba for nondasd devices."
+	" 0=off, 1=on");
 module_param_named(cache, aac_cache, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(cache, "Disable Queue Flush commands:\n\tbit 0 - Disable FUA in WRITE SCSI commands\n\tbit 1 - Disable SYNCHRONIZE_CACHE SCSI command\n\tbit 2 - Disable only if Battery not protecting Cache");
+MODULE_PARM_DESC(cache, "Disable Queue Flush commands:\n"
+	"\tbit 0 - Disable FUA in WRITE SCSI commands\n"
+	"\tbit 1 - Disable SYNCHRONIZE_CACHE SCSI command\n"
+	"\tbit 2 - Disable only if Battery not protecting Cache");
 module_param(dacmode, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(dacmode, "Control whether dma addressing is using 64 bit DAC. 0=off, 1=on");
+MODULE_PARM_DESC(dacmode, "Control whether dma addressing is using 64 bit DAC."
+	" 0=off, 1=on");
 module_param_named(commit, aac_commit, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(commit, "Control whether a COMMIT_CONFIG is issued to the adapter for foreign arrays.\nThis is typically needed in systems that do not have a BIOS. 0=off, 1=on");
+MODULE_PARM_DESC(commit, "Control whether a COMMIT_CONFIG is issued to the"
+	" adapter for foreign arrays.\n"
+	"This is typically needed in systems that do not have a BIOS."
+	" 0=off, 1=on");
+module_param_named(msi, aac_msi, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(msi, "IRQ handling."
+	" 0=PIC(default), 1=MSI, 2=MSI-X(unsupported, uses MSI)");
 module_param(startup_timeout, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(startup_timeout, "The duration of time in seconds to wait for adapter to have it's kernel up and\nrunning. This is typically adjusted for large systems that do not have a BIOS.");
+MODULE_PARM_DESC(startup_timeout, "The duration of time in seconds to wait for"
+	" adapter to have it's kernel up and\n"
+	"running. This is typically adjusted for large systems that do not"
+	" have a BIOS.");
 module_param(aif_timeout, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(aif_timeout, "The duration of time in seconds to wait for applications to pick up AIFs before\nderegistering them. This is typically adjusted for heavily burdened systems.");
+MODULE_PARM_DESC(aif_timeout, "The duration of time in seconds to wait for"
+	" applications to pick up AIFs before\n"
+	"deregistering them. This is typically adjusted for heavily burdened"
+	" systems.");
 
 int numacb = -1;
 module_param(numacb, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(numacb, "Request a limit to the number of adapter control blocks (FIB) allocated. Valid values are 512 and down. Default is to use suggestion from Firmware.");
+MODULE_PARM_DESC(numacb, "Request a limit to the number of adapter control"
+	" blocks (FIB) allocated. Valid values are 512 and down. Default is"
+	" to use suggestion from Firmware.");
 
 int acbsize = -1;
 module_param(acbsize, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(acbsize, "Request a specific adapter control block (FIB) size. Valid values are 512, 2048, 4096 and 8192. Default is to use suggestion from Firmware.");
+MODULE_PARM_DESC(acbsize, "Request a specific adapter control block (FIB)"
+	" size. Valid values are 512, 2048, 4096 and 8192. Default is to use"
+	" suggestion from Firmware.");
 
 int update_interval = 30 * 60;
 module_param(update_interval, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(update_interval, "Interval in seconds between time sync updates issued to adapter.");
+MODULE_PARM_DESC(update_interval, "Interval in seconds between time sync"
+	" updates issued to adapter.");
 
 int check_interval = 24 * 60 * 60;
 module_param(check_interval, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(check_interval, "Interval in seconds between adapter health checks.");
+MODULE_PARM_DESC(check_interval, "Interval in seconds between adapter health"
+	" checks.");
 
 int aac_check_reset = 1;
 module_param_named(check_reset, aac_check_reset, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(aac_check_reset, "If adapter fails health check, reset the adapter. a value of -1 forces the reset to adapters programmed to ignore it.");
+MODULE_PARM_DESC(aac_check_reset, "If adapter fails health check, reset the"
+	" adapter. a value of -1 forces the reset to adapters programmed to"
+	" ignore it.");
 
 int expose_physicals = -1;
 module_param(expose_physicals, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(expose_physicals, "Expose physical components of the arrays. -1=protect 0=off, 1=on");
+MODULE_PARM_DESC(expose_physicals, "Expose physical components of the arrays."
+	" -1=protect 0=off, 1=on");
 
-int aac_reset_devices = 0;
+int aac_reset_devices;
 module_param_named(reset_devices, aac_reset_devices, int, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(reset_devices, "Force an adapter reset at initialization.");
 
diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index 3195d29f2177..ace0b751c131 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -1026,6 +1026,7 @@ struct aac_dev
 	u8			raw_io_64;
 	u8			printf_enabled;
 	u8			in_reset;
+	u8			msi;
 };
 
 #define aac_adapter_interrupt(dev) \
@@ -1881,6 +1882,7 @@ extern int startup_timeout;
 extern int aif_timeout;
 extern int expose_physicals;
 extern int aac_reset_devices;
+extern int aac_msi;
 extern int aac_commit;
 extern int update_interval;
 extern int check_interval;
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index e80d2a0c46af..7232e7326ded 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -275,9 +275,9 @@ static const char *aac_info(struct Scsi_Host *shost)
 
 /**
  *	aac_get_driver_ident
- * 	@devtype: index into lookup table
+ *	@devtype: index into lookup table
  *
- * 	Returns a pointer to the entry in the driver lookup table.
+ *	Returns a pointer to the entry in the driver lookup table.
  */
 
 struct aac_driver_ident* aac_get_driver_ident(int devtype)
@@ -1004,32 +1004,32 @@ static const struct file_operations aac_cfg_fops = {
 
 static struct scsi_host_template aac_driver_template = {
 	.module				= THIS_MODULE,
-	.name           		= "AAC",
+	.name				= "AAC",
 	.proc_name			= AAC_DRIVERNAME,
-	.info           		= aac_info,
-	.ioctl          		= aac_ioctl,
+	.info				= aac_info,
+	.ioctl				= aac_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl			= aac_compat_ioctl,
 #endif
-	.queuecommand   		= aac_queuecommand,
-	.bios_param     		= aac_biosparm,
+	.queuecommand			= aac_queuecommand,
+	.bios_param			= aac_biosparm,
 	.shost_attrs			= aac_attrs,
 	.slave_configure		= aac_slave_configure,
 	.change_queue_depth		= aac_change_queue_depth,
 	.sdev_attrs			= aac_dev_attrs,
 	.eh_abort_handler		= aac_eh_abort,
 	.eh_host_reset_handler		= aac_eh_reset,
-	.can_queue      		= AAC_NUM_IO_FIB,
-	.this_id        		= MAXIMUM_NUM_CONTAINERS,
-	.sg_tablesize   		= 16,
-	.max_sectors    		= 128,
+	.can_queue			= AAC_NUM_IO_FIB,
+	.this_id			= MAXIMUM_NUM_CONTAINERS,
+	.sg_tablesize			= 16,
+	.max_sectors			= 128,
 #if (AAC_NUM_IO_FIB > 256)
 	.cmd_per_lun			= 256,
 #else
-	.cmd_per_lun    		= AAC_NUM_IO_FIB,
+	.cmd_per_lun			= AAC_NUM_IO_FIB,
 #endif
 	.use_clustering			= ENABLE_CLUSTERING,
-	.emulated                       = 1,
+	.emulated			= 1,
 };
 
 static void __aac_shutdown(struct aac_dev * aac)
@@ -1039,6 +1039,8 @@ static void __aac_shutdown(struct aac_dev * aac)
 	aac_send_shutdown(aac);
 	aac_adapter_disable_int(aac);
 	free_irq(aac->pdev->irq, aac);
+	if (aac->msi)
+		pci_disable_msi(aac->pdev);
 }
 
 static int __devinit aac_probe_one(struct pci_dev *pdev,
@@ -1254,7 +1256,7 @@ static struct pci_driver aac_pci_driver = {
 	.id_table	= aac_pci_tbl,
 	.probe		= aac_probe_one,
 	.remove		= __devexit_p(aac_remove_one),
-	.shutdown 	= aac_shutdown,
+	.shutdown	= aac_shutdown,
 };
 
 static int __init aac_init(void)
@@ -1271,7 +1273,7 @@ static int __init aac_init(void)
 	aac_cfg_major = register_chrdev( 0, "aac", &aac_cfg_fops);
 	if (aac_cfg_major < 0) {
 		printk(KERN_WARNING
-		       "aacraid: unable to register \"aac\" device.\n");
+			"aacraid: unable to register \"aac\" device.\n");
 	}
 
 	return 0;
diff --git a/drivers/scsi/aacraid/rx.c b/drivers/scsi/aacraid/rx.c
index a08bbf1fd76c..1f18b83e1e02 100644
--- a/drivers/scsi/aacraid/rx.c
+++ b/drivers/scsi/aacraid/rx.c
@@ -625,8 +625,11 @@ int _aac_rx_init(struct aac_dev *dev)
 	if (aac_init_adapter(dev) == NULL)
 		goto error_iounmap;
 	aac_adapter_comm(dev, dev->comm_interface);
-	if (request_irq(dev->scsi_host_ptr->irq, dev->a_ops.adapter_intr,
+	dev->msi = aac_msi && !pci_enable_msi(dev->pdev);
+	if (request_irq(dev->pdev->irq, dev->a_ops.adapter_intr,
 			IRQF_SHARED|IRQF_DISABLED, "aacraid", dev) < 0) {
+		if (dev->msi)
+			pci_disable_msi(dev->pdev);
 		printk(KERN_ERR "%s%d: Interrupt unavailable.\n",
 			name, instance);
 		goto error_iounmap;
diff --git a/drivers/scsi/aacraid/sa.c b/drivers/scsi/aacraid/sa.c
index 85b91bc578c9..cfc3410ec073 100644
--- a/drivers/scsi/aacraid/sa.c
+++ b/drivers/scsi/aacraid/sa.c
@@ -31,6 +31,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/pci.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
@@ -385,7 +386,7 @@ int aac_sa_init(struct aac_dev *dev)
 
 	if(aac_init_adapter(dev) == NULL)
 		goto error_irq;
-	if (request_irq(dev->scsi_host_ptr->irq, dev->a_ops.adapter_intr,
+	if (request_irq(dev->pdev->irq, dev->a_ops.adapter_intr,
 			IRQF_SHARED|IRQF_DISABLED,
 			"aacraid", (void *)dev ) < 0) {
 		printk(KERN_WARNING "%s%d: Interrupt unavailable.\n",
@@ -403,7 +404,7 @@ int aac_sa_init(struct aac_dev *dev)
 
 error_irq:
 	aac_sa_disable_interrupt(dev);
-	free_irq(dev->scsi_host_ptr->irq, (void *)dev);
+	free_irq(dev->pdev->irq, (void *)dev);
 
 error_iounmap:
 

From 2f7ecc55b37ef9f0208360e64d8b9d2313df8ce6 Mon Sep 17 00:00:00 2001
From: "Salyzyn, Mark" <Mark_Salyzyn@adaptec.com>
Date: Fri, 8 Feb 2008 08:36:23 -0800
Subject: [PATCH 05/66] [SCSI] aacraid: ignore adapter reset check polarity

The Adapter's Ignore Reset flag and insmod parameter boolean polarity
is incorrect in the driver.

Signed-off-by: Mark Salyzyn <aacraid@adaptec.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/aacraid/aachba.c  | 14 ++++++++------
 drivers/scsi/aacraid/commsup.c |  2 +-
 drivers/scsi/aacraid/linit.c   |  2 +-
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index f8d2ae301283..c05092fd3a9d 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -1341,7 +1341,7 @@ int aac_get_adapter_info(struct aac_dev* dev)
 			  (int)sizeof(dev->supplement_adapter_info.VpdInfo.Tsid),
 			  dev->supplement_adapter_info.VpdInfo.Tsid);
 		}
-		if (!aac_check_reset || ((aac_check_reset != 1) &&
+		if (!aac_check_reset || ((aac_check_reset == 1) &&
 		  (dev->supplement_adapter_info.SupportedOptions2 &
 		  AAC_OPTION_IGNORE_RESET))) {
 			printk(KERN_INFO "%s%d: Reset Adapter Ignored\n",
@@ -1379,13 +1379,14 @@ int aac_get_adapter_info(struct aac_dev* dev)
 
 	if (nondasd != -1)
 		dev->nondasd_support = (nondasd!=0);
-	if(dev->nondasd_support != 0) {
+	if (dev->nondasd_support && !dev->in_reset)
 		printk(KERN_INFO "%s%d: Non-DASD support enabled.\n",dev->name, dev->id);
-	}
 
 	dev->dac_support = 0;
 	if( (sizeof(dma_addr_t) > 4) && (dev->adapter_info.options & AAC_OPT_SGMAP_HOST64)){
-		printk(KERN_INFO "%s%d: 64bit support enabled.\n", dev->name, dev->id);
+		if (!dev->in_reset)
+			printk(KERN_INFO "%s%d: 64bit support enabled.\n",
+				dev->name, dev->id);
 		dev->dac_support = 1;
 	}
 
@@ -1395,8 +1396,9 @@ int aac_get_adapter_info(struct aac_dev* dev)
 	if(dev->dac_support != 0) {
 		if (!pci_set_dma_mask(dev->pdev, DMA_64BIT_MASK) &&
 			!pci_set_consistent_dma_mask(dev->pdev, DMA_64BIT_MASK)) {
-			printk(KERN_INFO"%s%d: 64 Bit DAC enabled\n",
-				dev->name, dev->id);
+			if (!dev->in_reset)
+				printk(KERN_INFO"%s%d: 64 Bit DAC enabled\n",
+					dev->name, dev->id);
 		} else if (!pci_set_dma_mask(dev->pdev, DMA_32BIT_MASK) &&
 			!pci_set_consistent_dma_mask(dev->pdev, DMA_32BIT_MASK)) {
 			printk(KERN_INFO"%s%d: DMA mask set failed, 64 Bit DAC disabled\n",
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index 81b36923e0ef..47434499e82b 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -1458,7 +1458,7 @@ int aac_check_health(struct aac_dev * aac)
 
 	printk(KERN_ERR "%s: Host adapter BLINK LED 0x%x\n", aac->name, BlinkLED);
 
-	if (!aac_check_reset || ((aac_check_reset != 1) &&
+	if (!aac_check_reset || ((aac_check_reset == 1) &&
 		(aac->supplement_adapter_info.SupportedOptions2 &
 			AAC_OPTION_IGNORE_RESET)))
 		goto out;
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 7232e7326ded..fdfbad0903fd 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -641,7 +641,7 @@ static int aac_eh_reset(struct scsi_cmnd* cmd)
 	   AAC_OPTION_MU_RESET) &&
 	  aac_check_reset &&
 	  ((aac_check_reset != 1) ||
-	   (aac->supplement_adapter_info.SupportedOptions2 &
+	   !(aac->supplement_adapter_info.SupportedOptions2 &
 	    AAC_OPTION_IGNORE_RESET)))
 		aac_reset_adapter(aac, 2); /* Bypass wait for command quiesce */
 	return SUCCESS; /* Cause an immediate retry of the command with a ten second delay after successful tur */

From e78d5b8f1e73ab82f3fd041d05824cfee7d83a2c Mon Sep 17 00:00:00 2001
From: "Prakash, Sathya" <sathya.prakash@lsi.com>
Date: Fri, 8 Feb 2008 22:05:35 +0530
Subject: [PATCH 06/66] [SCSI] mpt fusion: Request I/O resources only when
 required

This patch modifies the I/O resource allocation behavior of FUSION
driver.  The current version of driver allocates the I/O resources
even if they are not required and this creates trouble in low resource
environments.  This driver now uses
pci_enable_device_mem/pci_enable_device functions to differentiate the
resource allocations.

Signed-off-by: Sathya Prakash <sathya.prakash@lsi.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/message/fusion/mptbase.c | 50 +++++++++++++++++++++++++++-----
 drivers/message/fusion/mptbase.h |  1 +
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index d381c38ccbaf..bfda731696f7 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -1470,9 +1470,6 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (mpt_debug_level)
 		printk(KERN_INFO MYNAM ": mpt_debug_level=%xh\n", mpt_debug_level);
 
-	if (pci_enable_device(pdev))
-		return r;
-
 	ioc = kzalloc(sizeof(MPT_ADAPTER), GFP_ATOMIC);
 	if (ioc == NULL) {
 		printk(KERN_ERR MYNAM ": ERROR - Insufficient memory to add adapter!\n");
@@ -1482,6 +1479,20 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id)
 	ioc->id = mpt_ids++;
 	sprintf(ioc->name, "ioc%d", ioc->id);
 
+	ioc->bars = pci_select_bars(pdev, IORESOURCE_MEM);
+	if (pci_enable_device_mem(pdev)) {
+		kfree(ioc);
+		printk(MYIOC_s_ERR_FMT "pci_enable_device_mem() "
+		       "failed\n", ioc->name);
+		return r;
+	}
+	if (pci_request_selected_regions(pdev, ioc->bars, "mpt")) {
+		kfree(ioc);
+		printk(MYIOC_s_ERR_FMT "pci_request_selected_regions() with "
+		       "MEM failed\n", ioc->name);
+		return r;
+	}
+
 	dinitprintk(ioc, printk(MYIOC_s_INFO_FMT ": mpt_adapter_install\n", ioc->name));
 
 	if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) {
@@ -1794,6 +1805,7 @@ mpt_suspend(struct pci_dev *pdev, pm_message_t state)
 	CHIPREG_WRITE32(&ioc->chip->IntStatus, 0);
 
 	pci_disable_device(pdev);
+	pci_release_selected_regions(pdev, ioc->bars);
 	pci_set_power_state(pdev, device_state);
 
 	return 0;
@@ -1810,7 +1822,6 @@ mpt_resume(struct pci_dev *pdev)
 	MPT_ADAPTER *ioc = pci_get_drvdata(pdev);
 	u32 device_state = pdev->current_state;
 	int recovery_state;
-	int err;
 
 	printk(MYIOC_s_INFO_FMT
 	"pci-resume: pdev=0x%p, slot=%s, Previous operating state [D%d]\n",
@@ -1818,9 +1829,18 @@ mpt_resume(struct pci_dev *pdev)
 
 	pci_set_power_state(pdev, 0);
 	pci_restore_state(pdev);
-	err = pci_enable_device(pdev);
-	if (err)
-		return err;
+	if (ioc->facts.Flags & MPI_IOCFACTS_FLAGS_FW_DOWNLOAD_BOOT) {
+		ioc->bars = pci_select_bars(ioc->pcidev, IORESOURCE_MEM |
+			IORESOURCE_IO);
+		if (pci_enable_device(pdev))
+			return 0;
+	} else {
+		ioc->bars = pci_select_bars(pdev, IORESOURCE_MEM);
+		if (pci_enable_device_mem(pdev))
+			return 0;
+	}
+	if (pci_request_selected_regions(pdev, ioc->bars, "mpt"))
+		return 0;
 
 	/* enable interrupts */
 	CHIPREG_WRITE32(&ioc->chip->IntMask, MPI_HIM_DIM);
@@ -1881,6 +1901,7 @@ mpt_signal_reset(u8 index, MPT_ADAPTER *ioc, int reset_phase)
  *		-2 if READY but IOCFacts Failed
  *		-3 if READY but PrimeIOCFifos Failed
  *		-4 if READY but IOCInit Failed
+ *		-5 if failed to enable_device and/or request_selected_regions
  */
 static int
 mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
@@ -1979,6 +2000,18 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
 		}
 	}
 
+	if ((ret == 0) && (reason == MPT_HOSTEVENT_IOC_BRINGUP) &&
+	    (ioc->facts.Flags & MPI_IOCFACTS_FLAGS_FW_DOWNLOAD_BOOT)) {
+		pci_release_selected_regions(ioc->pcidev, ioc->bars);
+		ioc->bars = pci_select_bars(ioc->pcidev, IORESOURCE_MEM |
+		    IORESOURCE_IO);
+		if (pci_enable_device(ioc->pcidev))
+			return -5;
+		if (pci_request_selected_regions(ioc->pcidev, ioc->bars,
+			"mpt"))
+			return -5;
+	}
+
 	/*
 	 * Device is reset now. It must have de-asserted the interrupt line
 	 * (if it was asserted) and it should be safe to register for the
@@ -2383,6 +2416,9 @@ mpt_adapter_dispose(MPT_ADAPTER *ioc)
 		ioc->memmap = NULL;
 	}
 
+	pci_disable_device(ioc->pcidev);
+	pci_release_selected_regions(ioc->pcidev, ioc->bars);
+
 #if defined(CONFIG_MTRR) && 0
 	if (ioc->mtrr_reg > 0) {
 		mtrr_del(ioc->mtrr_reg, 0, 0);
diff --git a/drivers/message/fusion/mptbase.h b/drivers/message/fusion/mptbase.h
index b49b706c0020..d83ea96fe135 100644
--- a/drivers/message/fusion/mptbase.h
+++ b/drivers/message/fusion/mptbase.h
@@ -629,6 +629,7 @@ typedef struct _MPT_ADAPTER
 	dma_addr_t		HostPageBuffer_dma;
 	int			 mtrr_reg;
 	struct pci_dev		*pcidev;	/* struct pci_dev pointer */
+	int			bars;		/* bitmask of BAR's that must be configured */
 	u8			__iomem *memmap;	/* mmap address */
 	struct Scsi_Host	*sh;		/* Scsi Host pointer */
 	SpiCfgData		spi_data;	/* Scsi config. data */

From 95f6fb578970c9dbfcaa436ff98d2f3c6bdea953 Mon Sep 17 00:00:00 2001
From: "Salyzyn, Mark" <Mark_Salyzyn@adaptec.com>
Date: Fri, 8 Feb 2008 09:01:34 -0800
Subject: [PATCH 07/66] [SCSI] aacraid: informational sysfs value corrections

Some sysfs problems reported. The serial number on late model
controllers was truncated. Non-DASD devices (tapes and CDROMs) were
showing up as JBOD in the level report on the physical channel.

Signed-off-by: Mark Salyzyn <aacraid@adaptec.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/aacraid/linit.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index fdfbad0903fd..ae5f74fb62d5 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -494,13 +494,14 @@ static int aac_change_queue_depth(struct scsi_device *sdev, int depth)
 
 static ssize_t aac_show_raid_level(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	struct scsi_device * sdev = to_scsi_device(dev);
+	struct scsi_device *sdev = to_scsi_device(dev);
+	struct aac_dev *aac = (struct aac_dev *)(sdev->host->hostdata);
 	if (sdev_channel(sdev) != CONTAINER_CHANNEL)
 		return snprintf(buf, PAGE_SIZE, sdev->no_uld_attach
-		  ? "Hidden\n" : "JBOD");
+		  ? "Hidden\n" :
+		  ((aac->jbod && (sdev->type == TYPE_DISK)) ? "JBOD\n" : ""));
 	return snprintf(buf, PAGE_SIZE, "%s\n",
-	  get_container_type(((struct aac_dev *)(sdev->host->hostdata))
-	    ->fsa_dev[sdev_id(sdev)].type));
+	  get_container_type(aac->fsa_dev[sdev_id(sdev)].type));
 }
 
 static struct device_attribute aac_raid_level_attr = {
@@ -860,8 +861,8 @@ ssize_t aac_show_serial_number(struct class_device *class_dev, char *buf)
 		  le32_to_cpu(dev->adapter_info.serial[0]));
 	if (len &&
 	  !memcmp(&dev->supplement_adapter_info.MfgPcbaSerialNo[
-	    sizeof(dev->supplement_adapter_info.MfgPcbaSerialNo)+2-len],
-	  buf, len))
+	    sizeof(dev->supplement_adapter_info.MfgPcbaSerialNo)-len],
+	  buf, len-1))
 		len = snprintf(buf, PAGE_SIZE, "%.*s\n",
 		  (int)sizeof(dev->supplement_adapter_info.MfgPcbaSerialNo),
 		  dev->supplement_adapter_info.MfgPcbaSerialNo);

From 7c46c20aef185c3782d28c5111dcf1df88bbab32 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@sun.com>
Date: Sun, 10 Feb 2008 23:25:25 -0800
Subject: [PATCH 08/66] [SCSI] ses: fix memory leaks

fix leaking with scomp leaking when failing. Also free page10 on
driver removal and remove one extra space.

Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/ses.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c
index 2a6e4f472eaa..a57fed47b39d 100644
--- a/drivers/scsi/ses.c
+++ b/drivers/scsi/ses.c
@@ -416,11 +416,11 @@ static int ses_intf_add(struct class_device *cdev,
 	int i, j, types, len, components = 0;
 	int err = -ENOMEM;
 	struct enclosure_device *edev;
-	struct ses_component *scomp;
+	struct ses_component *scomp = NULL;
 
 	if (!scsi_device_enclosure(sdev)) {
 		/* not an enclosure, but might be in one */
-		edev = 	enclosure_find(&sdev->host->shost_gendev);
+		edev = enclosure_find(&sdev->host->shost_gendev);
 		if (edev) {
 			ses_match_to_enclosure(edev, sdev);
 			class_device_put(&edev->cdev);
@@ -456,9 +456,6 @@ static int ses_intf_add(struct class_device *cdev,
 	if (!buf)
 		goto err_free;
 
-	ses_dev->page1 = buf;
-	ses_dev->page1_len = len;
-
 	result = ses_recv_diag(sdev, 1, buf, len);
 	if (result)
 		goto recv_failed;
@@ -473,6 +470,9 @@ static int ses_intf_add(struct class_device *cdev,
 		    type_ptr[0] == ENCLOSURE_COMPONENT_ARRAY_DEVICE)
 			components += type_ptr[1];
 	}
+	ses_dev->page1 = buf;
+	ses_dev->page1_len = len;
+	buf = NULL;
 
 	result = ses_recv_diag(sdev, 2, hdr_buf, INIT_ALLOC_SIZE);
 	if (result)
@@ -489,6 +489,7 @@ static int ses_intf_add(struct class_device *cdev,
 		goto recv_failed;
 	ses_dev->page2 = buf;
 	ses_dev->page2_len = len;
+	buf = NULL;
 
 	/* The additional information page --- allows us
 	 * to match up the devices */
@@ -506,11 +507,12 @@ static int ses_intf_add(struct class_device *cdev,
 		goto recv_failed;
 	ses_dev->page10 = buf;
 	ses_dev->page10_len = len;
+	buf = NULL;
 
  no_page10:
-	scomp = kmalloc(sizeof(struct ses_component) * components, GFP_KERNEL);
+	scomp = kzalloc(sizeof(struct ses_component) * components, GFP_KERNEL);
 	if (!scomp)
-		goto  err_free;
+		goto err_free;
 
 	edev = enclosure_register(cdev->dev, sdev->sdev_gendev.bus_id,
 				  components, &ses_enclosure_callbacks);
@@ -521,10 +523,9 @@ static int ses_intf_add(struct class_device *cdev,
 
 	edev->scratch = ses_dev;
 	for (i = 0; i < components; i++)
-		edev->component[i].scratch = scomp++;
+		edev->component[i].scratch = scomp + i;
 
 	/* Page 7 for the descriptors is optional */
-	buf = NULL;
 	result = ses_recv_diag(sdev, 7, hdr_buf, INIT_ALLOC_SIZE);
 	if (result)
 		goto simple_populate;
@@ -532,6 +533,8 @@ static int ses_intf_add(struct class_device *cdev,
 	len = (hdr_buf[2] << 8) + hdr_buf[3] + 4;
 	/* add 1 for trailing '\0' we'll use */
 	buf = kzalloc(len + 1, GFP_KERNEL);
+	if (!buf)
+		goto simple_populate;
 	result = ses_recv_diag(sdev, 7, buf, len);
 	if (result) {
  simple_populate:
@@ -598,6 +601,7 @@ static int ses_intf_add(struct class_device *cdev,
 	err = -ENODEV;
  err_free:
 	kfree(buf);
+	kfree(scomp);
 	kfree(ses_dev->page10);
 	kfree(ses_dev->page2);
 	kfree(ses_dev->page1);
@@ -630,6 +634,7 @@ static void ses_intf_remove(struct class_device *cdev,
 	ses_dev = edev->scratch;
 	edev->scratch = NULL;
 
+	kfree(ses_dev->page10);
 	kfree(ses_dev->page1);
 	kfree(ses_dev->page2);
 	kfree(ses_dev);

From ccf9ea91aba0d3b8145900ec02f6edf03dda708c Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 10 Sep 2007 22:39:11 +0300
Subject: [PATCH 09/66] [SCSI] fas216: Use scsi_eh API for REQUEST_SENSE
 invocation

Use new scsi_eh_prep/restor_cmnd() for synchronous REQUEST_SENSE
invocation.  This also converts the driver to the new accessor based
scatterlist implementation.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Tested-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/arm/fas216.c | 16 +++-------------
 drivers/scsi/arm/fas216.h |  3 +++
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c
index fb5f20284389..a715632e19d4 100644
--- a/drivers/scsi/arm/fas216.c
+++ b/drivers/scsi/arm/fas216.c
@@ -2018,6 +2018,7 @@ static void fas216_rq_sns_done(FAS216_Info *info, struct scsi_cmnd *SCpnt,
 	 * the upper layers to process.  This would have been set
 	 * correctly by fas216_std_done.
 	 */
+	scsi_eh_restore_cmnd(SCpnt, &info->ses);
 	SCpnt->scsi_done(SCpnt);
 }
 
@@ -2103,23 +2104,12 @@ fas216_std_done(FAS216_Info *info, struct scsi_cmnd *SCpnt, unsigned int result)
 	if (SCpnt->cmnd[0] == REQUEST_SENSE)
 		goto done;
 
+	scsi_eh_prep_cmnd(SCpnt, &info->ses, NULL, 0, ~0);
 	fas216_log_target(info, LOG_CONNECT, SCpnt->device->id,
 			  "requesting sense");
-	memset(SCpnt->cmnd, 0, sizeof (SCpnt->cmnd));
-	SCpnt->cmnd[0] = REQUEST_SENSE;
-	SCpnt->cmnd[1] = SCpnt->device->lun << 5;
-	SCpnt->cmnd[4] = sizeof(SCpnt->sense_buffer);
-	SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
-	SCpnt->SCp.buffer = NULL;
-	SCpnt->SCp.buffers_residual = 0;
-	SCpnt->SCp.ptr = (char *)SCpnt->sense_buffer;
-	SCpnt->SCp.this_residual = sizeof(SCpnt->sense_buffer);
-	SCpnt->SCp.phase = sizeof(SCpnt->sense_buffer);
+	init_SCp(SCpnt);
 	SCpnt->SCp.Message = 0;
 	SCpnt->SCp.Status = 0;
-	SCpnt->request_bufflen = sizeof(SCpnt->sense_buffer);
-	SCpnt->sc_data_direction = DMA_FROM_DEVICE;
-	SCpnt->use_sg = 0;
 	SCpnt->tag = 0;
 	SCpnt->host_scribble = (void *)fas216_rq_sns_done;
 
diff --git a/drivers/scsi/arm/fas216.h b/drivers/scsi/arm/fas216.h
index 00e5f055afdc..3e73e264972e 100644
--- a/drivers/scsi/arm/fas216.h
+++ b/drivers/scsi/arm/fas216.h
@@ -16,6 +16,8 @@
 #define NO_IRQ 255
 #endif
 
+#include <scsi/scsi_eh.h>
+
 #include "queue.h"
 #include "msgqueue.h"
 
@@ -311,6 +313,7 @@ typedef struct {
 
 	/* miscellaneous */
 	int			internal_done;		/* flag to indicate request done */
+	struct scsi_eh_save	*ses;		/* holds request sense restore info */
 	unsigned long		magic_end;
 } FAS216_Info;
 

From 90b0c41829450d60da388edcd346c5b31371e7be Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Wed, 6 Feb 2008 15:38:33 +0200
Subject: [PATCH 10/66] [SCSI] aic94xx: fix ABORT_TASK define conflict

include/scsi/scsi.h as a definition:
#define ABORT_TASK          0x0d

on the other hand drivers/scsi/aic94xx/aic94xx_sas.h has:
#define ABORT_TASK              0x03

rename the latter to SCB_ABORT_TASK

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/aic94xx/aic94xx_sas.h | 2 +-
 drivers/scsi/aic94xx/aic94xx_tmf.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/aic94xx/aic94xx_sas.h b/drivers/scsi/aic94xx/aic94xx_sas.h
index fa7c5290257d..912e6b755f74 100644
--- a/drivers/scsi/aic94xx/aic94xx_sas.h
+++ b/drivers/scsi/aic94xx/aic94xx_sas.h
@@ -292,7 +292,7 @@ struct scb_header {
 #define INITIATE_SSP_TASK       0x00
 #define INITIATE_LONG_SSP_TASK  0x01
 #define INITIATE_BIDIR_SSP_TASK 0x02
-#define ABORT_TASK              0x03
+#define SCB_ABORT_TASK          0x03
 #define INITIATE_SSP_TMF        0x04
 #define SSP_TARG_GET_DATA       0x05
 #define SSP_TARG_GET_DATA_GOOD  0x06
diff --git a/drivers/scsi/aic94xx/aic94xx_tmf.c b/drivers/scsi/aic94xx/aic94xx_tmf.c
index 87b2f6e6adfe..b52124f3d3ac 100644
--- a/drivers/scsi/aic94xx/aic94xx_tmf.c
+++ b/drivers/scsi/aic94xx/aic94xx_tmf.c
@@ -369,7 +369,7 @@ int asd_abort_task(struct sas_task *task)
 		return -ENOMEM;
 	scb = ascb->scb;
 
-	scb->header.opcode = ABORT_TASK;
+	scb->header.opcode = SCB_ABORT_TASK;
 
 	switch (task->task_proto) {
 	case SAS_PROTOCOL_SATA:

From 4660c8ed5aaed99d82785499f034a8cc9199866d Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Sun, 10 Feb 2008 09:42:46 -0600
Subject: [PATCH 11/66] [SCSI] update SG_ALL to avoid causing chaining

Since the sg chaining patches went in, our current value of 255 for
SG_ALL excites chaining on some drivers which cannot support it (and
would thus oops).  Redefine SG_ALL to mean no sg table size
preference, but use the single allocation (non chained) limit.  This
also helps for drivers that use it to size an internal table.

We'll do an opt in system later where truly chaining supporting
drivers can define their sg_tablesize to be anything up to
SCSI_MAX_SG_CHAIN_ELEMENTS.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 include/scsi/scsi_host.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index d1299e999723..530ff4c553f8 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -6,6 +6,7 @@
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
+#include <scsi/scsi.h>
 
 struct request_queue;
 struct block_device;
@@ -25,12 +26,15 @@ struct blk_queue_tags;
  * NONE: Self evident.	Host adapter is not capable of scatter-gather.
  * ALL:	 Means that the host adapter module can do scatter-gather,
  *	 and that there is no limit to the size of the table to which
- *	 we scatter/gather data.
+ *	 we scatter/gather data.  The value we set here is the maximum
+ *	 single element sglist.  To use chained sglists, the adapter
+ *	 has to set a value beyond ALL (and correctly use the chain
+ *	 handling API.
  * Anything else:  Indicates the maximum number of chains that can be
  *	 used in one scatter-gather request.
  */
 #define SG_NONE 0
-#define SG_ALL 0xff
+#define SG_ALL	SCSI_MAX_SG_SEGMENTS
 
 #define MODE_UNKNOWN 0x00
 #define MODE_INITIATOR 0x01

From e47c9093531d3406a8ae38acca4ce207ef70cc0e Mon Sep 17 00:00:00 2001
From: James Smart <James.Smart@Emulex.Com>
Date: Fri, 8 Feb 2008 18:49:26 -0500
Subject: [PATCH 12/66] [SCSI] lpfc 8.2.5 : Correct ndlp referencing issues

Correct ndlp referencing issues:
- Fix ndlp kref issues due to race conditions between threads
- Fix cancel els delay retry event which missed an ndlp reference count

Signed-off-by: James Smart <james.smart@emulex.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/lpfc/lpfc.h           |   2 +
 drivers/scsi/lpfc/lpfc_attr.c      |   5 +-
 drivers/scsi/lpfc/lpfc_crtn.h      |   6 +-
 drivers/scsi/lpfc/lpfc_ct.c        |  39 +++-
 drivers/scsi/lpfc/lpfc_disc.h      |  33 +++-
 drivers/scsi/lpfc/lpfc_els.c       | 237 ++++++++++++++++-------
 drivers/scsi/lpfc/lpfc_hbadisc.c   | 296 +++++++++++++++++++++++++----
 drivers/scsi/lpfc/lpfc_init.c      |  41 +++-
 drivers/scsi/lpfc/lpfc_nportdisc.c |  39 +++-
 drivers/scsi/lpfc/lpfc_scsi.c      |   4 +-
 drivers/scsi/lpfc/lpfc_vport.c     |  70 ++++++-
 11 files changed, 630 insertions(+), 142 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h
index 83567b9755b4..572c525ad2b5 100644
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -595,6 +595,8 @@ struct lpfc_hba {
 	unsigned long last_completion_time;
 	struct timer_list hb_tmofunc;
 	uint8_t hb_outstanding;
+	/* ndlp reference management */
+	spinlock_t ndlp_lock;
 	/*
 	 * Following bit will be set for all buffer tags which are not
 	 * associated with any HBQ.
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index 4bae4a2ed2f1..ef061d97a222 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -1191,7 +1191,7 @@ lpfc_update_rport_devloss_tmo(struct lpfc_vport *vport)
 	shost = lpfc_shost_from_vport(vport);
 	spin_lock_irq(shost->host_lock);
 	list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp)
-		if (ndlp->rport)
+		if (NLP_CHK_NODE_ACT(ndlp) && ndlp->rport)
 			ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo;
 	spin_unlock_irq(shost->host_lock);
 }
@@ -2384,7 +2384,8 @@ lpfc_get_node_by_target(struct scsi_target *starget)
 	spin_lock_irq(shost->host_lock);
 	/* Search for this, mapped, target ID */
 	list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) {
-		if (ndlp->nlp_state == NLP_STE_MAPPED_NODE &&
+		if (NLP_CHK_NODE_ACT(ndlp) &&
+		    ndlp->nlp_state == NLP_STE_MAPPED_NODE &&
 		    starget->id == ndlp->nlp_sid) {
 			spin_unlock_irq(shost->host_lock);
 			return ndlp;
diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h
index 50fcb7c930bc..848d97744b4d 100644
--- a/drivers/scsi/lpfc/lpfc_crtn.h
+++ b/drivers/scsi/lpfc/lpfc_crtn.h
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  *                                                                 *
@@ -53,7 +53,11 @@ void lpfc_mbx_cmpl_dflt_rpi(struct lpfc_hba *, LPFC_MBOXQ_t *);
 void lpfc_mbx_cmpl_fabric_reg_login(struct lpfc_hba *, LPFC_MBOXQ_t *);
 void lpfc_mbx_cmpl_ns_reg_login(struct lpfc_hba *, LPFC_MBOXQ_t *);
 void lpfc_mbx_cmpl_fdmi_reg_login(struct lpfc_hba *, LPFC_MBOXQ_t *);
+void lpfc_enqueue_node(struct lpfc_vport *, struct lpfc_nodelist *);
 void lpfc_dequeue_node(struct lpfc_vport *, struct lpfc_nodelist *);
+void lpfc_disable_node(struct lpfc_vport *, struct lpfc_nodelist *);
+struct lpfc_nodelist *lpfc_enable_node(struct lpfc_vport *,
+					struct lpfc_nodelist *, int);
 void lpfc_nlp_set_state(struct lpfc_vport *, struct lpfc_nodelist *, int);
 void lpfc_drop_node(struct lpfc_vport *, struct lpfc_nodelist *);
 void lpfc_set_disctmo(struct lpfc_vport *);
diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c
index 92441ce610ed..aea8d33f6d09 100644
--- a/drivers/scsi/lpfc/lpfc_ct.c
+++ b/drivers/scsi/lpfc/lpfc_ct.c
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  *                                                                 *
@@ -294,7 +294,7 @@ lpfc_gen_req(struct lpfc_vport *vport, struct lpfc_dmabuf *bmp,
 	/* Save for completion so we can release these resources */
 	geniocb->context1 = (uint8_t *) inp;
 	geniocb->context2 = (uint8_t *) outp;
-	geniocb->context_un.ndlp = ndlp;
+	geniocb->context_un.ndlp = lpfc_nlp_get(ndlp);
 
 	/* Fill in payload, bp points to frame payload */
 	icmd->ulpCommand = CMD_GEN_REQUEST64_CR;
@@ -489,8 +489,10 @@ lpfc_ns_rsp(struct lpfc_vport *vport, struct lpfc_dmabuf *mp, uint32_t Size)
 						 */
 						ndlp = lpfc_findnode_did(vport,
 							Did);
-						if (ndlp && (ndlp->nlp_type &
-							NLP_FCP_TARGET))
+						if (ndlp &&
+						    NLP_CHK_NODE_ACT(ndlp)
+						    && (ndlp->nlp_type &
+						     NLP_FCP_TARGET))
 							lpfc_setup_disc_node
 								(vport, Did);
 						else if (lpfc_ns_cmd(vport,
@@ -1064,7 +1066,8 @@ lpfc_ns_cmd(struct lpfc_vport *vport, int cmdcode,
 	int rc = 0;
 
 	ndlp = lpfc_findnode_did(vport, NameServer_DID);
-	if (ndlp == NULL || ndlp->nlp_state != NLP_STE_UNMAPPED_NODE) {
+	if (!ndlp || !NLP_CHK_NODE_ACT(ndlp)
+	    || ndlp->nlp_state != NLP_STE_UNMAPPED_NODE) {
 		rc=1;
 		goto ns_cmd_exit;
 	}
@@ -1213,8 +1216,9 @@ lpfc_ns_cmd(struct lpfc_vport *vport, int cmdcode,
 		cmpl = lpfc_cmpl_ct_cmd_rff_id;
 		break;
 	}
-	lpfc_nlp_get(ndlp);
-
+	/* The lpfc_ct_cmd/lpfc_get_req shall increment ndlp reference count
+	 * to hold ndlp reference for the corresponding callback function.
+	 */
 	if (!lpfc_ct_cmd(vport, mp, bmp, ndlp, cmpl, rsp_size, retry)) {
 		/* On success, The cmpl function will free the buffers */
 		lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_CT,
@@ -1222,9 +1226,13 @@ lpfc_ns_cmd(struct lpfc_vport *vport, int cmdcode,
 			cmdcode, ndlp->nlp_DID, 0);
 		return 0;
 	}
-
 	rc=6;
+
+	/* Decrement ndlp reference count to release ndlp reference held
+	 * for the failed command's callback function.
+	 */
 	lpfc_nlp_put(ndlp);
+
 	lpfc_mbuf_free(phba, bmp->virt, bmp->phys);
 ns_cmd_free_bmp:
 	kfree(bmp);
@@ -1271,6 +1279,9 @@ lpfc_cmpl_ct_cmd_fdmi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 	}
 
 	ndlp = lpfc_findnode_did(vport, FDMI_DID);
+	if (!ndlp || !NLP_CHK_NODE_ACT(ndlp))
+		goto fail_out;
+
 	if (fdmi_rsp == be16_to_cpu(SLI_CT_RESPONSE_FS_RJT)) {
 		/* FDMI rsp failed */
 		lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY,
@@ -1294,6 +1305,8 @@ lpfc_cmpl_ct_cmd_fdmi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		lpfc_fdmi_cmd(vport, ndlp, SLI_MGMT_RHBA);
 		break;
 	}
+
+fail_out:
 	lpfc_ct_free_iocb(phba, cmdiocb);
 	return;
 }
@@ -1650,12 +1663,18 @@ lpfc_fdmi_cmd(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, int cmdcode)
 	bpl->tus.w = le32_to_cpu(bpl->tus.w);
 
 	cmpl = lpfc_cmpl_ct_cmd_fdmi;
-	lpfc_nlp_get(ndlp);
 
+	/* The lpfc_ct_cmd/lpfc_get_req shall increment ndlp reference count
+	 * to hold ndlp reference for the corresponding callback function.
+	 */
 	if (!lpfc_ct_cmd(vport, mp, bmp, ndlp, cmpl, FC_MAX_NS_RSP, 0))
 		return 0;
 
+	/* Decrement ndlp reference count to release ndlp reference held
+	 * for the failed command's callback function.
+	 */
 	lpfc_nlp_put(ndlp);
+
 	lpfc_mbuf_free(phba, bmp->virt, bmp->phys);
 fdmi_cmd_free_bmp:
 	kfree(bmp);
@@ -1698,7 +1717,7 @@ lpfc_fdmi_timeout_handler(struct lpfc_vport *vport)
 	struct lpfc_nodelist *ndlp;
 
 	ndlp = lpfc_findnode_did(vport, FDMI_DID);
-	if (ndlp) {
+	if (ndlp && NLP_CHK_NODE_ACT(ndlp)) {
 		if (init_utsname()->nodename[0] != '\0')
 			lpfc_fdmi_cmd(vport, ndlp, SLI_MGMT_DHBA);
 		else
diff --git a/drivers/scsi/lpfc/lpfc_disc.h b/drivers/scsi/lpfc/lpfc_disc.h
index cfe81c50529a..8eb007309599 100644
--- a/drivers/scsi/lpfc/lpfc_disc.h
+++ b/drivers/scsi/lpfc/lpfc_disc.h
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  *                                                                 *
@@ -73,6 +73,12 @@ struct lpfc_nodelist {
 	uint8_t         nlp_fcp_info;	        /* class info, bits 0-3 */
 #define NLP_FCP_2_DEVICE   0x10			/* FCP-2 device */
 
+	uint16_t        nlp_usg_map;	/* ndlp management usage bitmap */
+#define NLP_USG_NODE_ACT_BIT	0x1	/* Indicate ndlp is actively used */
+#define NLP_USG_IACT_REQ_BIT	0x2	/* Request to inactivate ndlp */
+#define NLP_USG_FREE_REQ_BIT	0x4	/* Request to invoke ndlp memory free */
+#define NLP_USG_FREE_ACK_BIT	0x8	/* Indicate ndlp memory free invoked */
+
 	struct timer_list   nlp_delayfunc;	/* Used for delayed ELS cmds */
 	struct fc_rport *rport;			/* Corresponding FC transport
 						   port structure */
@@ -105,6 +111,31 @@ struct lpfc_nodelist {
 #define NLP_NODEV_REMOVE   0x8000000	/* Defer removal till discovery ends */
 #define NLP_TARGET_REMOVE  0x10000000   /* Target remove in process */
 
+/* ndlp usage management macros */
+#define NLP_CHK_NODE_ACT(ndlp)		(((ndlp)->nlp_usg_map \
+						& NLP_USG_NODE_ACT_BIT) \
+					&& \
+					!((ndlp)->nlp_usg_map \
+						& NLP_USG_FREE_ACK_BIT))
+#define NLP_SET_NODE_ACT(ndlp)		((ndlp)->nlp_usg_map \
+						|= NLP_USG_NODE_ACT_BIT)
+#define NLP_INT_NODE_ACT(ndlp)		((ndlp)->nlp_usg_map \
+						= NLP_USG_NODE_ACT_BIT)
+#define NLP_CLR_NODE_ACT(ndlp)		((ndlp)->nlp_usg_map \
+						&= ~NLP_USG_NODE_ACT_BIT)
+#define NLP_CHK_IACT_REQ(ndlp)          ((ndlp)->nlp_usg_map \
+						& NLP_USG_IACT_REQ_BIT)
+#define NLP_SET_IACT_REQ(ndlp)          ((ndlp)->nlp_usg_map \
+						|= NLP_USG_IACT_REQ_BIT)
+#define NLP_CHK_FREE_REQ(ndlp)		((ndlp)->nlp_usg_map \
+						& NLP_USG_FREE_REQ_BIT)
+#define NLP_SET_FREE_REQ(ndlp)		((ndlp)->nlp_usg_map \
+						|= NLP_USG_FREE_REQ_BIT)
+#define NLP_CHK_FREE_ACK(ndlp)		((ndlp)->nlp_usg_map \
+						& NLP_USG_FREE_ACK_BIT)
+#define NLP_SET_FREE_ACK(ndlp)		((ndlp)->nlp_usg_map \
+						|= NLP_USG_FREE_ACK_BIT)
+
 /* There are 4 different double linked lists nodelist entries can reside on.
  * The Port Login (PLOGI) list and Address Discovery (ADISC) list are used
  * when Link Up discovery or Registered State Change Notification (RSCN)
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index c6b739dc6bc3..39268e6a1a05 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
@@ -113,6 +113,7 @@ lpfc_prep_els_iocb(struct lpfc_vport *vport, uint8_t expectRsp,
 
 	if (elsiocb == NULL)
 		return NULL;
+
 	icmd = &elsiocb->iocb;
 
 	/* fill in BDEs for command */
@@ -134,9 +135,8 @@ lpfc_prep_els_iocb(struct lpfc_vport *vport, uint8_t expectRsp,
 		if (!prsp || !prsp->virt)
 			goto els_iocb_free_prsp_exit;
 		INIT_LIST_HEAD(&prsp->list);
-	} else {
+	} else
 		prsp = NULL;
-	}
 
 	/* Allocate buffer for Buffer ptr list */
 	pbuflist = kmalloc(sizeof(struct lpfc_dmabuf), GFP_KERNEL);
@@ -246,7 +246,7 @@ lpfc_issue_fabric_reglogin(struct lpfc_vport *vport)
 
 	sp = &phba->fc_fabparam;
 	ndlp = lpfc_findnode_did(vport, Fabric_DID);
-	if (!ndlp) {
+	if (!ndlp || !NLP_CHK_NODE_ACT(ndlp)) {
 		err = 1;
 		goto fail;
 	}
@@ -282,6 +282,9 @@ lpfc_issue_fabric_reglogin(struct lpfc_vport *vport)
 
 	mbox->mbox_cmpl = lpfc_mbx_cmpl_fabric_reg_login;
 	mbox->vport = vport;
+	/* increment the reference count on ndlp to hold reference
+	 * for the callback routine.
+	 */
 	mbox->context2 = lpfc_nlp_get(ndlp);
 
 	rc = lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT);
@@ -293,6 +296,9 @@ lpfc_issue_fabric_reglogin(struct lpfc_vport *vport)
 	return 0;
 
 fail_issue_reg_login:
+	/* decrement the reference count on ndlp just incremented
+	 * for the failed mbox command.
+	 */
 	lpfc_nlp_put(ndlp);
 	mp = (struct lpfc_dmabuf *) mbox->context1;
 	lpfc_mbuf_free(phba, mp->virt, mp->phys);
@@ -381,6 +387,8 @@ lpfc_cmpl_els_flogi_fabric(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 		 */
 		list_for_each_entry_safe(np, next_np,
 					&vport->fc_nodes, nlp_listp) {
+			if (!NLP_CHK_NODE_ACT(ndlp))
+				continue;
 			if ((np->nlp_state != NLP_STE_NPR_NODE) ||
 				   !(np->nlp_flag & NLP_NPR_ADISC))
 				continue;
@@ -456,6 +464,9 @@ lpfc_cmpl_els_flogi_nport(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 			mempool_free(mbox, phba->mbox_mem_pool);
 			goto fail;
 		}
+		/* Decrement ndlp reference count indicating that ndlp can be
+		 * safely released when other references to it are done.
+		 */
 		lpfc_nlp_put(ndlp);
 
 		ndlp = lpfc_findnode_did(vport, PT2PT_RemoteID);
@@ -467,22 +478,29 @@ lpfc_cmpl_els_flogi_nport(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 			ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
 			if (!ndlp)
 				goto fail;
-
 			lpfc_nlp_init(vport, ndlp, PT2PT_RemoteID);
+		} else if (!NLP_CHK_NODE_ACT(ndlp)) {
+			ndlp = lpfc_enable_node(vport, ndlp,
+						NLP_STE_UNUSED_NODE);
+			if(!ndlp)
+				goto fail;
 		}
 
 		memcpy(&ndlp->nlp_portname, &sp->portName,
 		       sizeof(struct lpfc_name));
 		memcpy(&ndlp->nlp_nodename, &sp->nodeName,
 		       sizeof(struct lpfc_name));
+		/* Set state will put ndlp onto node list if not already done */
 		lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
 		spin_lock_irq(shost->host_lock);
 		ndlp->nlp_flag |= NLP_NPR_2B_DISC;
 		spin_unlock_irq(shost->host_lock);
-	} else {
-		/* This side will wait for the PLOGI */
+	} else
+		/* This side will wait for the PLOGI, decrement ndlp reference
+		 * count indicating that ndlp can be released when other
+		 * references to it are done.
+		 */
 		lpfc_nlp_put(ndlp);
-	}
 
 	/* If we are pt2pt with another NPort, force NPIV off! */
 	phba->sli3_options &= ~LPFC_SLI3_NPIV_ENABLED;
@@ -728,16 +746,21 @@ lpfc_initial_flogi(struct lpfc_vport *vport)
 		if (!ndlp)
 			return 0;
 		lpfc_nlp_init(vport, ndlp, Fabric_DID);
-	} else {
-		lpfc_dequeue_node(vport, ndlp);
+		/* Put ndlp onto node list */
+		lpfc_enqueue_node(vport, ndlp);
+	} else if (!NLP_CHK_NODE_ACT(ndlp)) {
+		/* re-setup ndlp without removing from node list */
+		ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
+		if (!ndlp)
+			return 0;
 	}
 
-	if (lpfc_issue_els_flogi(vport, ndlp, 0)) {
+	if (lpfc_issue_els_flogi(vport, ndlp, 0))
 		/* This decrement of reference count to node shall kick off
 		 * the release of the node.
 		 */
 		lpfc_nlp_put(ndlp);
-	}
+
 	return 1;
 }
 
@@ -755,9 +778,15 @@ lpfc_initial_fdisc(struct lpfc_vport *vport)
 		if (!ndlp)
 			return 0;
 		lpfc_nlp_init(vport, ndlp, Fabric_DID);
-	} else {
-		lpfc_dequeue_node(vport, ndlp);
+		/* Put ndlp onto node list */
+		lpfc_enqueue_node(vport, ndlp);
+	} else if (!NLP_CHK_NODE_ACT(ndlp)) {
+		/* re-setup ndlp without removing from node list */
+		ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
+		if (!ndlp)
+			return 0;
 	}
+
 	if (lpfc_issue_els_fdisc(vport, ndlp, 0)) {
 		/* decrement node reference count to trigger the release of
 		 * the node.
@@ -816,7 +845,7 @@ lpfc_plogi_confirm_nport(struct lpfc_hba *phba, uint32_t *prsp,
 	 */
 	new_ndlp = lpfc_findnode_wwpn(vport, &sp->portName);
 
-	if (new_ndlp == ndlp)
+	if (new_ndlp == ndlp && NLP_CHK_NODE_ACT(new_ndlp))
 		return ndlp;
 
 	if (!new_ndlp) {
@@ -827,8 +856,12 @@ lpfc_plogi_confirm_nport(struct lpfc_hba *phba, uint32_t *prsp,
 		new_ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_ATOMIC);
 		if (!new_ndlp)
 			return ndlp;
-
 		lpfc_nlp_init(vport, new_ndlp, ndlp->nlp_DID);
+	} else if (!NLP_CHK_NODE_ACT(new_ndlp)) {
+		new_ndlp = lpfc_enable_node(vport, new_ndlp,
+						NLP_STE_UNUSED_NODE);
+		if (!new_ndlp)
+			return ndlp;
 	}
 
 	lpfc_unreg_rpi(vport, new_ndlp);
@@ -839,6 +872,7 @@ lpfc_plogi_confirm_nport(struct lpfc_hba *phba, uint32_t *prsp,
 		new_ndlp->nlp_flag |= NLP_NPR_2B_DISC;
 	ndlp->nlp_flag &= ~NLP_NPR_2B_DISC;
 
+	/* Set state will put new_ndlp on to node list if not already done */
 	lpfc_nlp_set_state(vport, new_ndlp, ndlp->nlp_state);
 
 	/* Move this back to NPR state */
@@ -912,7 +946,7 @@ lpfc_cmpl_els_plogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		irsp->un.elsreq64.remoteID);
 
 	ndlp = lpfc_findnode_did(vport, irsp->un.elsreq64.remoteID);
-	if (!ndlp) {
+	if (!ndlp || !NLP_CHK_NODE_ACT(ndlp)) {
 		lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS,
 				 "0136 PLOGI completes to NPort x%x "
 				 "with no ndlp. Data: x%x x%x x%x\n",
@@ -962,12 +996,11 @@ lpfc_cmpl_els_plogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		}
 		/* PLOGI failed */
 		/* Do not call DSM for lpfc_els_abort'ed ELS cmds */
-		if (lpfc_error_lost_link(irsp)) {
+		if (lpfc_error_lost_link(irsp))
 			rc = NLP_STE_FREED_NODE;
-		} else {
+		else
 			rc = lpfc_disc_state_machine(vport, ndlp, cmdiocb,
 						     NLP_EVT_CMPL_PLOGI);
-		}
 	} else {
 		/* Good status, call state machine */
 		prsp = list_entry(((struct lpfc_dmabuf *)
@@ -1015,8 +1048,10 @@ lpfc_issue_els_plogi(struct lpfc_vport *vport, uint32_t did, uint8_t retry)
 	pring = &psli->ring[LPFC_ELS_RING];	/* ELS ring */
 
 	ndlp = lpfc_findnode_did(vport, did);
-	/* If ndlp if not NULL, we will bump the reference count on it */
+	if (ndlp && !NLP_CHK_NODE_ACT(ndlp))
+		ndlp = NULL;
 
+	/* If ndlp is not NULL, we will bump the reference count on it */
 	cmdsize = (sizeof(uint32_t) + sizeof(struct serv_parm));
 	elsiocb = lpfc_prep_els_iocb(vport, 1, cmdsize, retry, ndlp, did,
 				     ELS_CMD_PLOGI);
@@ -1097,18 +1132,15 @@ lpfc_cmpl_els_prli(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		}
 		/* PRLI failed */
 		/* Do not call DSM for lpfc_els_abort'ed ELS cmds */
-		if (lpfc_error_lost_link(irsp)) {
+		if (lpfc_error_lost_link(irsp))
 			goto out;
-		} else {
+		else
 			lpfc_disc_state_machine(vport, ndlp, cmdiocb,
 						NLP_EVT_CMPL_PRLI);
-		}
-	} else {
+	} else
 		/* Good status, call state machine */
 		lpfc_disc_state_machine(vport, ndlp, cmdiocb,
 					NLP_EVT_CMPL_PRLI);
-	}
-
 out:
 	lpfc_els_free_iocb(phba, cmdiocb);
 	return;
@@ -1275,15 +1307,13 @@ lpfc_cmpl_els_adisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		}
 		/* ADISC failed */
 		/* Do not call DSM for lpfc_els_abort'ed ELS cmds */
-		if (!lpfc_error_lost_link(irsp)) {
+		if (!lpfc_error_lost_link(irsp))
 			lpfc_disc_state_machine(vport, ndlp, cmdiocb,
 						NLP_EVT_CMPL_ADISC);
-		}
-	} else {
+	} else
 		/* Good status, call state machine */
 		lpfc_disc_state_machine(vport, ndlp, cmdiocb,
 					NLP_EVT_CMPL_ADISC);
-	}
 
 	if (disc && vport->num_disc_nodes) {
 		/* Check to see if there are more ADISCs to be sent */
@@ -1443,14 +1473,12 @@ lpfc_cmpl_els_logo(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		else
 			lpfc_disc_state_machine(vport, ndlp, cmdiocb,
 						NLP_EVT_CMPL_LOGO);
-	} else {
+	} else
 		/* Good status, call state machine.
 		 * This will unregister the rpi if needed.
 		 */
 		lpfc_disc_state_machine(vport, ndlp, cmdiocb,
 					NLP_EVT_CMPL_LOGO);
-	}
-
 out:
 	lpfc_els_free_iocb(phba, cmdiocb);
 	return;
@@ -1556,11 +1584,19 @@ lpfc_issue_els_scr(struct lpfc_vport *vport, uint32_t nportid, uint8_t retry)
 	psli = &phba->sli;
 	pring = &psli->ring[LPFC_ELS_RING];	/* ELS ring */
 	cmdsize = (sizeof(uint32_t) + sizeof(SCR));
-	ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
-	if (!ndlp)
-		return 1;
 
-	lpfc_nlp_init(vport, ndlp, nportid);
+	ndlp = lpfc_findnode_did(vport, nportid);
+	if (!ndlp) {
+		ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+		if (!ndlp)
+			return 1;
+		lpfc_nlp_init(vport, ndlp, nportid);
+		lpfc_enqueue_node(vport, ndlp);
+	} else if (!NLP_CHK_NODE_ACT(ndlp)) {
+		ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
+		if (!ndlp)
+			return 1;
+	}
 
 	elsiocb = lpfc_prep_els_iocb(vport, 1, cmdsize, retry, ndlp,
 				     ndlp->nlp_DID, ELS_CMD_SCR);
@@ -1623,11 +1659,19 @@ lpfc_issue_els_farpr(struct lpfc_vport *vport, uint32_t nportid, uint8_t retry)
 	psli = &phba->sli;
 	pring = &psli->ring[LPFC_ELS_RING];	/* ELS ring */
 	cmdsize = (sizeof(uint32_t) + sizeof(FARP));
-	ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
-	if (!ndlp)
-		return 1;
 
-	lpfc_nlp_init(vport, ndlp, nportid);
+	ndlp = lpfc_findnode_did(vport, nportid);
+	if (!ndlp) {
+		ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+		if (!ndlp)
+			return 1;
+		lpfc_nlp_init(vport, ndlp, nportid);
+		lpfc_enqueue_node(vport, ndlp);
+	} else if (!NLP_CHK_NODE_ACT(ndlp)) {
+		ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
+		if (!ndlp)
+			return 1;
+	}
 
 	elsiocb = lpfc_prep_els_iocb(vport, 1, cmdsize, retry, ndlp,
 				     ndlp->nlp_DID, ELS_CMD_RNID);
@@ -1657,7 +1701,7 @@ lpfc_issue_els_farpr(struct lpfc_vport *vport, uint32_t nportid, uint8_t retry)
 	memcpy(&fp->RportName, &vport->fc_portname, sizeof(struct lpfc_name));
 	memcpy(&fp->RnodeName, &vport->fc_nodename, sizeof(struct lpfc_name));
 	ondlp = lpfc_findnode_did(vport, nportid);
-	if (ondlp) {
+	if (ondlp && NLP_CHK_NODE_ACT(ondlp)) {
 		memcpy(&fp->OportName, &ondlp->nlp_portname,
 		       sizeof(struct lpfc_name));
 		memcpy(&fp->OnodeName, &ondlp->nlp_nodename,
@@ -1690,6 +1734,7 @@ void
 lpfc_cancel_retry_delay_tmo(struct lpfc_vport *vport, struct lpfc_nodelist *nlp)
 {
 	struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
+	struct lpfc_work_evt *evtp;
 
 	spin_lock_irq(shost->host_lock);
 	nlp->nlp_flag &= ~NLP_DELAY_TMO;
@@ -1697,8 +1742,12 @@ lpfc_cancel_retry_delay_tmo(struct lpfc_vport *vport, struct lpfc_nodelist *nlp)
 	del_timer_sync(&nlp->nlp_delayfunc);
 	nlp->nlp_last_elscmd = 0;
 
-	if (!list_empty(&nlp->els_retry_evt.evt_listp))
+	if (!list_empty(&nlp->els_retry_evt.evt_listp)) {
 		list_del_init(&nlp->els_retry_evt.evt_listp);
+		/* Decrement nlp reference count held for the delayed retry */
+		evtp = &nlp->els_retry_evt;
+		lpfc_nlp_put((struct lpfc_nodelist *)evtp->evt_arg1);
+	}
 
 	if (nlp->nlp_flag & NLP_NPR_2B_DISC) {
 		spin_lock_irq(shost->host_lock);
@@ -1842,13 +1891,14 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		cmd = *elscmd++;
 	}
 
-	if (ndlp)
+	if (ndlp && NLP_CHK_NODE_ACT(ndlp))
 		did = ndlp->nlp_DID;
 	else {
 		/* We should only hit this case for retrying PLOGI */
 		did = irsp->un.elsreq64.remoteID;
 		ndlp = lpfc_findnode_did(vport, did);
-		if (!ndlp && (cmd != ELS_CMD_PLOGI))
+		if ((!ndlp || !NLP_CHK_NODE_ACT(ndlp))
+		    && (cmd != ELS_CMD_PLOGI))
 			return 1;
 	}
 
@@ -2322,6 +2372,9 @@ lpfc_cmpl_els_rsp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		if ((rspiocb->iocb.ulpStatus == 0)
 		    && (ndlp->nlp_flag & NLP_ACC_REGLOGIN)) {
 			lpfc_unreg_rpi(vport, ndlp);
+			/* Increment reference count to ndlp to hold the
+			 * reference to ndlp for the callback function.
+			 */
 			mbox->context2 = lpfc_nlp_get(ndlp);
 			mbox->vport = vport;
 			if (ndlp->nlp_flag & NLP_RM_DFLT_RPI) {
@@ -2335,9 +2388,13 @@ lpfc_cmpl_els_rsp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 					   NLP_STE_REG_LOGIN_ISSUE);
 			}
 			if (lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT)
-			    != MBX_NOT_FINISHED) {
+			    != MBX_NOT_FINISHED)
 				goto out;
-			}
+			else
+				/* Decrement the ndlp reference count we
+				 * set for this failed mailbox command.
+				 */
+				lpfc_nlp_put(ndlp);
 
 			/* ELS rsp: Cannot issue reg_login for <NPortid> */
 			lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS,
@@ -2796,6 +2853,8 @@ lpfc_els_disc_adisc(struct lpfc_vport *vport)
 
 	/* go thru NPR nodes and issue any remaining ELS ADISCs */
 	list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes, nlp_listp) {
+		if (!NLP_CHK_NODE_ACT(ndlp))
+			continue;
 		if (ndlp->nlp_state == NLP_STE_NPR_NODE &&
 		    (ndlp->nlp_flag & NLP_NPR_2B_DISC) != 0 &&
 		    (ndlp->nlp_flag & NLP_NPR_ADISC) != 0) {
@@ -2833,6 +2892,8 @@ lpfc_els_disc_plogi(struct lpfc_vport *vport)
 
 	/* go thru NPR nodes and issue any remaining ELS PLOGIs */
 	list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes, nlp_listp) {
+		if (!NLP_CHK_NODE_ACT(ndlp))
+			continue;
 		if (ndlp->nlp_state == NLP_STE_NPR_NODE &&
 		    (ndlp->nlp_flag & NLP_NPR_2B_DISC) != 0 &&
 		    (ndlp->nlp_flag & NLP_DELAY_TMO) == 0 &&
@@ -2943,7 +3004,8 @@ lpfc_rscn_recovery_check(struct lpfc_vport *vport)
 	 */
 
 	list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) {
-		if (ndlp->nlp_state == NLP_STE_UNUSED_NODE ||
+		if (!NLP_CHK_NODE_ACT(ndlp) ||
+		    ndlp->nlp_state == NLP_STE_UNUSED_NODE ||
 		    lpfc_rscn_payload_check(vport, ndlp->nlp_DID) == 0)
 			continue;
 
@@ -3145,7 +3207,8 @@ lpfc_els_handle_rscn(struct lpfc_vport *vport)
 	vport->num_disc_nodes = 0;
 
 	ndlp = lpfc_findnode_did(vport, NameServer_DID);
-	if (ndlp && ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) {
+	if (ndlp && NLP_CHK_NODE_ACT(ndlp)
+	    && ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) {
 		/* Good ndlp, issue CT Request to NameServer */
 		if (lpfc_ns_cmd(vport, SLI_CTNS_GID_FT, 0, 0) == 0)
 			/* Wait for NameServer query cmpl before we can
@@ -3155,25 +3218,35 @@ lpfc_els_handle_rscn(struct lpfc_vport *vport)
 		/* If login to NameServer does not exist, issue one */
 		/* Good status, issue PLOGI to NameServer */
 		ndlp = lpfc_findnode_did(vport, NameServer_DID);
-		if (ndlp)
+		if (ndlp && NLP_CHK_NODE_ACT(ndlp))
 			/* Wait for NameServer login cmpl before we can
 			   continue */
 			return 1;
 
-		ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
-		if (!ndlp) {
-			lpfc_els_flush_rscn(vport);
-			return 0;
+		if (ndlp) {
+			ndlp = lpfc_enable_node(vport, ndlp,
+						NLP_STE_PLOGI_ISSUE);
+			if (!ndlp) {
+				lpfc_els_flush_rscn(vport);
+				return 0;
+			}
+			ndlp->nlp_prev_state = NLP_STE_UNUSED_NODE;
 		} else {
+			ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+			if (!ndlp) {
+				lpfc_els_flush_rscn(vport);
+				return 0;
+			}
 			lpfc_nlp_init(vport, ndlp, NameServer_DID);
-			ndlp->nlp_type |= NLP_FABRIC;
 			ndlp->nlp_prev_state = ndlp->nlp_state;
 			lpfc_nlp_set_state(vport, ndlp, NLP_STE_PLOGI_ISSUE);
-			lpfc_issue_els_plogi(vport, NameServer_DID, 0);
-			/* Wait for NameServer login cmpl before we can
-			   continue */
-			return 1;
 		}
+		ndlp->nlp_type |= NLP_FABRIC;
+		lpfc_issue_els_plogi(vport, NameServer_DID, 0);
+		/* Wait for NameServer login cmpl before we can
+		 * continue
+		 */
+		return 1;
 	}
 
 	lpfc_els_flush_rscn(vport);
@@ -3672,6 +3745,8 @@ lpfc_els_rcv_fan(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 
 			list_for_each_entry_safe(ndlp, next_ndlp,
 						 &vport->fc_nodes, nlp_listp) {
+				if (!NLP_CHK_NODE_ACT(ndlp))
+					continue;
 				if (ndlp->nlp_state != NLP_STE_NPR_NODE)
 					continue;
 				if (ndlp->nlp_type & NLP_FABRIC) {
@@ -3697,6 +3772,8 @@ lpfc_els_rcv_fan(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 		 */
 		list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes,
 					 nlp_listp) {
+			if (!NLP_CHK_NODE_ACT(ndlp))
+				continue;
 			if (ndlp->nlp_state != NLP_STE_NPR_NODE)
 				continue;
 
@@ -3936,7 +4013,7 @@ lpfc_els_unsol_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
 	uint32_t cmd, did, newnode, rjt_err = 0;
 	IOCB_t *icmd = &elsiocb->iocb;
 
-	if (vport == NULL || elsiocb->context2 == NULL)
+	if (!vport || !(elsiocb->context2))
 		goto dropit;
 
 	newnode = 0;
@@ -3971,14 +4048,20 @@ lpfc_els_unsol_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
 		lpfc_nlp_init(vport, ndlp, did);
 		lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
 		newnode = 1;
-		if ((did & Fabric_DID_MASK) == Fabric_DID_MASK) {
+		if ((did & Fabric_DID_MASK) == Fabric_DID_MASK)
 			ndlp->nlp_type |= NLP_FABRIC;
+	} else {
+		if (!NLP_CHK_NODE_ACT(ndlp)) {
+			ndlp = lpfc_enable_node(vport, ndlp,
+						NLP_STE_UNUSED_NODE);
+			if (!ndlp)
+				goto dropit;
 		}
-	}
-	else {
 		if (ndlp->nlp_state == NLP_STE_UNUSED_NODE) {
 			/* This is simular to the new node path */
-			lpfc_nlp_get(ndlp);
+			ndlp = lpfc_nlp_get(ndlp);
+			if (!ndlp)
+				goto dropit;
 			lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
 			newnode = 1;
 		}
@@ -3987,6 +4070,7 @@ lpfc_els_unsol_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
 	phba->fc_stat.elsRcvFrame++;
 	if (elsiocb->context1)
 		lpfc_nlp_put(elsiocb->context1);
+
 	elsiocb->context1 = lpfc_nlp_get(ndlp);
 	elsiocb->vport = vport;
 
@@ -4314,6 +4398,18 @@ lpfc_do_scr_ns_plogi(struct lpfc_hba *phba, struct lpfc_vport *vport)
 		}
 		lpfc_nlp_init(vport, ndlp, NameServer_DID);
 		ndlp->nlp_type |= NLP_FABRIC;
+	} else if (!NLP_CHK_NODE_ACT(ndlp)) {
+		ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
+		if (!ndlp) {
+			if (phba->fc_topology == TOPOLOGY_LOOP) {
+				lpfc_disc_start(vport);
+				return;
+			}
+			lpfc_vport_set_state(vport, FC_VPORT_FAILED);
+			lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS,
+					"0348 NameServer login: node freed\n");
+			return;
+		}
 	}
 
 	lpfc_nlp_set_state(vport, ndlp, NLP_STE_PLOGI_ISSUE);
@@ -4471,7 +4567,6 @@ lpfc_cmpl_els_fdisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 				 irsp->ulpStatus, irsp->un.ulpWord[4]);
 		if (vport->fc_vport->vport_state == FC_VPORT_INITIALIZING)
 			lpfc_vport_set_state(vport, FC_VPORT_FAILED);
-
 		lpfc_nlp_put(ndlp);
 		/* giving up on FDISC. Cancel discovery timer */
 		lpfc_can_disctmo(vport);
@@ -4492,8 +4587,9 @@ lpfc_cmpl_els_fdisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 			 */
 			list_for_each_entry_safe(np, next_np,
 				&vport->fc_nodes, nlp_listp) {
-				if (np->nlp_state != NLP_STE_NPR_NODE
-				   || !(np->nlp_flag & NLP_NPR_ADISC))
+				if (!NLP_CHK_NODE_ACT(ndlp) ||
+				    (np->nlp_state != NLP_STE_NPR_NODE) ||
+				    !(np->nlp_flag & NLP_NPR_ADISC))
 					continue;
 				spin_lock_irq(shost->host_lock);
 				np->nlp_flag &= ~NLP_NPR_ADISC;
@@ -4599,6 +4695,8 @@ lpfc_cmpl_els_npiv_logo(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 {
 	struct lpfc_vport *vport = cmdiocb->vport;
 	IOCB_t *irsp;
+	struct lpfc_nodelist *ndlp;
+	ndlp = (struct lpfc_nodelist *)cmdiocb->context1;
 
 	irsp = &rspiocb->iocb;
 	lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_ELS_CMD,
@@ -4607,6 +4705,9 @@ lpfc_cmpl_els_npiv_logo(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 
 	lpfc_els_free_iocb(phba, cmdiocb);
 	vport->unreg_vpi_cmpl = VPORT_ERROR;
+
+	/* Trigger the release of the ndlp after logo */
+	lpfc_nlp_put(ndlp);
 }
 
 int
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index dc042bd97baa..1ee3e62c78a7 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
@@ -272,9 +272,8 @@ lpfc_dev_loss_tmo_handler(struct lpfc_nodelist *ndlp)
 	if (!(vport->load_flag & FC_UNLOADING) &&
 	    !(ndlp->nlp_flag & NLP_DELAY_TMO) &&
 	    !(ndlp->nlp_flag & NLP_NPR_2B_DISC) &&
-	    (ndlp->nlp_state != NLP_STE_UNMAPPED_NODE)) {
+	    (ndlp->nlp_state != NLP_STE_UNMAPPED_NODE))
 		lpfc_disc_state_machine(vport, ndlp, NULL, NLP_EVT_DEVICE_RM);
-	}
 }
 
 
@@ -566,9 +565,10 @@ lpfc_cleanup_rpis(struct lpfc_vport *vport, int remove)
 	int  rc;
 
 	list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes, nlp_listp) {
+		if (!NLP_CHK_NODE_ACT(ndlp))
+			continue;
 		if (ndlp->nlp_state == NLP_STE_UNUSED_NODE)
 			continue;
-
 		if ((phba->sli3_options & LPFC_SLI3_VPORT_TEARDOWN) ||
 			((vport->port_type == LPFC_NPIV_PORT) &&
 			(ndlp->nlp_DID == NameServer_DID)))
@@ -684,20 +684,21 @@ lpfc_linkup_cleanup_nodes(struct lpfc_vport *vport)
 	struct lpfc_nodelist *ndlp;
 
 	list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) {
+		if (!NLP_CHK_NODE_ACT(ndlp))
+			continue;
 		if (ndlp->nlp_state == NLP_STE_UNUSED_NODE)
 			continue;
-
 		if (ndlp->nlp_type & NLP_FABRIC) {
-				/* On Linkup its safe to clean up the ndlp
-				 * from Fabric connections.
-				 */
+			/* On Linkup its safe to clean up the ndlp
+			 * from Fabric connections.
+			 */
 			if (ndlp->nlp_DID != Fabric_DID)
 				lpfc_unreg_rpi(vport, ndlp);
 			lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
 		} else if (!(ndlp->nlp_flag & NLP_NPR_ADISC)) {
-				/* Fail outstanding IO now since device is
-				 * marked for PLOGI.
-				 */
+			/* Fail outstanding IO now since device is
+			 * marked for PLOGI.
+			 */
 			lpfc_unreg_rpi(vport, ndlp);
 		}
 	}
@@ -1305,7 +1306,6 @@ lpfc_mbx_cmpl_fabric_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 		lpfc_mbuf_free(phba, mp->virt, mp->phys);
 		kfree(mp);
 		mempool_free(pmb, phba->mbox_mem_pool);
-		lpfc_nlp_put(ndlp);
 
 		if (phba->fc_topology == TOPOLOGY_LOOP) {
 			/* FLOGI failed, use loop map to make discovery list */
@@ -1313,6 +1313,10 @@ lpfc_mbx_cmpl_fabric_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 
 			/* Start discovery */
 			lpfc_disc_start(vport);
+			/* Decrement the reference count to ndlp after the
+			 * reference to the ndlp are done.
+			 */
+			lpfc_nlp_put(ndlp);
 			return;
 		}
 
@@ -1320,6 +1324,10 @@ lpfc_mbx_cmpl_fabric_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 		lpfc_printf_vlog(vport, KERN_ERR, LOG_MBOX,
 				 "0258 Register Fabric login error: 0x%x\n",
 				 mb->mbxStatus);
+		/* Decrement the reference count to ndlp after the reference
+		 * to the ndlp are done.
+		 */
+		lpfc_nlp_put(ndlp);
 		return;
 	}
 
@@ -1327,8 +1335,6 @@ lpfc_mbx_cmpl_fabric_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 	ndlp->nlp_type |= NLP_FABRIC;
 	lpfc_nlp_set_state(vport, ndlp, NLP_STE_UNMAPPED_NODE);
 
-	lpfc_nlp_put(ndlp);	/* Drop the reference from the mbox */
-
 	if (vport->port_state == LPFC_FABRIC_CFG_LINK) {
 		vports = lpfc_create_vport_work_array(phba);
 		if (vports != NULL)
@@ -1356,6 +1362,11 @@ lpfc_mbx_cmpl_fabric_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 	lpfc_mbuf_free(phba, mp->virt, mp->phys);
 	kfree(mp);
 	mempool_free(pmb, phba->mbox_mem_pool);
+
+	/* Drop the reference count from the mbox at the end after
+	 * all the current reference to the ndlp have been done.
+	 */
+	lpfc_nlp_put(ndlp);
 	return;
 }
 
@@ -1463,9 +1474,8 @@ lpfc_register_remote_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 	 * registered the port.
 	 */
 	if (ndlp->rport && ndlp->rport->dd_data &&
-	    ((struct lpfc_rport_data *) ndlp->rport->dd_data)->pnode == ndlp) {
+	    ((struct lpfc_rport_data *) ndlp->rport->dd_data)->pnode == ndlp)
 		lpfc_nlp_put(ndlp);
-	}
 
 	lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_RPORT,
 		"rport add:       did:x%x flg:x%x type x%x",
@@ -1659,6 +1669,18 @@ lpfc_nlp_set_state(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 	lpfc_nlp_state_cleanup(vport, ndlp, old_state, state);
 }
 
+void
+lpfc_enqueue_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
+{
+	struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
+
+	if (list_empty(&ndlp->nlp_listp)) {
+		spin_lock_irq(shost->host_lock);
+		list_add_tail(&ndlp->nlp_listp, &vport->fc_nodes);
+		spin_unlock_irq(shost->host_lock);
+	}
+}
+
 void
 lpfc_dequeue_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 {
@@ -1672,7 +1694,80 @@ lpfc_dequeue_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 	list_del_init(&ndlp->nlp_listp);
 	spin_unlock_irq(shost->host_lock);
 	lpfc_nlp_state_cleanup(vport, ndlp, ndlp->nlp_state,
-			       NLP_STE_UNUSED_NODE);
+				NLP_STE_UNUSED_NODE);
+}
+
+void
+lpfc_disable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
+{
+	if ((ndlp->nlp_flag & NLP_DELAY_TMO) != 0)
+		lpfc_cancel_retry_delay_tmo(vport, ndlp);
+	if (ndlp->nlp_state && !list_empty(&ndlp->nlp_listp))
+		lpfc_nlp_counters(vport, ndlp->nlp_state, -1);
+	lpfc_nlp_state_cleanup(vport, ndlp, ndlp->nlp_state,
+				NLP_STE_UNUSED_NODE);
+}
+
+struct lpfc_nodelist *
+lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
+		 int state)
+{
+	struct lpfc_hba *phba = vport->phba;
+	uint32_t did;
+	unsigned long flags;
+
+	if (!ndlp)
+		return NULL;
+
+	spin_lock_irqsave(&phba->ndlp_lock, flags);
+	/* The ndlp should not be in memory free mode */
+	if (NLP_CHK_FREE_REQ(ndlp)) {
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
+				"0277 lpfc_enable_node: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		return NULL;
+	}
+	/* The ndlp should not already be in active mode */
+	if (NLP_CHK_NODE_ACT(ndlp)) {
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
+				"0278 lpfc_enable_node: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		return NULL;
+	}
+
+	/* Keep the original DID */
+	did = ndlp->nlp_DID;
+
+	/* re-initialize ndlp except of ndlp linked list pointer */
+	memset((((char *)ndlp) + sizeof (struct list_head)), 0,
+		sizeof (struct lpfc_nodelist) - sizeof (struct list_head));
+	INIT_LIST_HEAD(&ndlp->els_retry_evt.evt_listp);
+	INIT_LIST_HEAD(&ndlp->dev_loss_evt.evt_listp);
+	init_timer(&ndlp->nlp_delayfunc);
+	ndlp->nlp_delayfunc.function = lpfc_els_retry_delay;
+	ndlp->nlp_delayfunc.data = (unsigned long)ndlp;
+	ndlp->nlp_DID = did;
+	ndlp->vport = vport;
+	ndlp->nlp_sid = NLP_NO_SID;
+	/* ndlp management re-initialize */
+	kref_init(&ndlp->kref);
+	NLP_INT_NODE_ACT(ndlp);
+
+	spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+
+	if (state != NLP_STE_UNUSED_NODE)
+		lpfc_nlp_set_state(vport, ndlp, state);
+
+	lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_NODE,
+		"node enable:       did:x%x",
+		ndlp->nlp_DID, 0, 0);
+	return ndlp;
 }
 
 void
@@ -1972,7 +2067,21 @@ lpfc_cleanup_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 			 "Data: x%x x%x x%x\n",
 			 ndlp->nlp_DID, ndlp->nlp_flag,
 			 ndlp->nlp_state, ndlp->nlp_rpi);
-	lpfc_dequeue_node(vport, ndlp);
+	if (NLP_CHK_FREE_REQ(ndlp)) {
+		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
+				"0280 lpfc_cleanup_node: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		lpfc_dequeue_node(vport, ndlp);
+	} else {
+		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
+				"0281 lpfc_cleanup_node: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		lpfc_disable_node(vport, ndlp);
+	}
 
 	/* cleanup any ndlp on mbox q waiting for reglogin cmpl */
 	if ((mb = phba->sli.mbox_active)) {
@@ -1994,12 +2103,16 @@ lpfc_cleanup_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 			}
 			list_del(&mb->list);
 			mempool_free(mb, phba->mbox_mem_pool);
-			lpfc_nlp_put(ndlp);
+			/* We shall not invoke the lpfc_nlp_put to decrement
+			 * the ndlp reference count as we are in the process
+			 * of lpfc_nlp_release.
+			 */
 		}
 	}
 	spin_unlock_irq(&phba->hbalock);
 
-	lpfc_els_abort(phba,ndlp);
+	lpfc_els_abort(phba, ndlp);
+
 	spin_lock_irq(shost->host_lock);
 	ndlp->nlp_flag &= ~NLP_DELAY_TMO;
 	spin_unlock_irq(shost->host_lock);
@@ -2057,7 +2170,6 @@ lpfc_nlp_remove(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 			}
 		}
 	}
-
 	lpfc_cleanup_node(vport, ndlp);
 
 	/*
@@ -2182,7 +2294,16 @@ lpfc_setup_disc_node(struct lpfc_vport *vport, uint32_t did)
 		ndlp->nlp_flag |= NLP_NPR_2B_DISC;
 		spin_unlock_irq(shost->host_lock);
 		return ndlp;
+	} else if (!NLP_CHK_NODE_ACT(ndlp)) {
+		ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_NPR_NODE);
+		if (!ndlp)
+			return NULL;
+		spin_lock_irq(shost->host_lock);
+		ndlp->nlp_flag |= NLP_NPR_2B_DISC;
+		spin_unlock_irq(shost->host_lock);
+		return ndlp;
 	}
+
 	if (vport->fc_flag & FC_RSCN_MODE) {
 		if (lpfc_rscn_payload_check(vport, did)) {
 			/* If we've already recieved a PLOGI from this NPort
@@ -2485,6 +2606,8 @@ lpfc_disc_flush_list(struct lpfc_vport *vport)
 	if (vport->fc_plogi_cnt || vport->fc_adisc_cnt) {
 		list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes,
 					 nlp_listp) {
+			if (!NLP_CHK_NODE_ACT(ndlp))
+				continue;
 			if (ndlp->nlp_state == NLP_STE_PLOGI_ISSUE ||
 			    ndlp->nlp_state == NLP_STE_ADISC_ISSUE) {
 				lpfc_free_tx(phba, ndlp);
@@ -2572,6 +2695,8 @@ lpfc_disc_timeout_handler(struct lpfc_vport *vport)
 		/* Start discovery by sending FLOGI, clean up old rpis */
 		list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes,
 					 nlp_listp) {
+			if (!NLP_CHK_NODE_ACT(ndlp))
+				continue;
 			if (ndlp->nlp_state != NLP_STE_NPR_NODE)
 				continue;
 			if (ndlp->nlp_type & NLP_FABRIC) {
@@ -2618,7 +2743,7 @@ lpfc_disc_timeout_handler(struct lpfc_vport *vport)
 				 "NameServer login\n");
 		/* Next look for NameServer ndlp */
 		ndlp = lpfc_findnode_did(vport, NameServer_DID);
-		if (ndlp)
+		if (ndlp && NLP_CHK_NODE_ACT(ndlp))
 			lpfc_els_abort(phba, ndlp);
 
 		/* ReStart discovery */
@@ -2897,6 +3022,7 @@ lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 	ndlp->nlp_sid = NLP_NO_SID;
 	INIT_LIST_HEAD(&ndlp->nlp_listp);
 	kref_init(&ndlp->kref);
+	NLP_INT_NODE_ACT(ndlp);
 
 	lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_NODE,
 		"node init:       did:x%x",
@@ -2911,6 +3037,8 @@ lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 static void
 lpfc_nlp_release(struct kref *kref)
 {
+	struct lpfc_hba *phba;
+	unsigned long flags;
 	struct lpfc_nodelist *ndlp = container_of(kref, struct lpfc_nodelist,
 						  kref);
 
@@ -2918,8 +3046,24 @@ lpfc_nlp_release(struct kref *kref)
 		"node release:    did:x%x flg:x%x type:x%x",
 		ndlp->nlp_DID, ndlp->nlp_flag, ndlp->nlp_type);
 
+	lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE,
+			"0279 lpfc_nlp_release: ndlp:x%p "
+			"usgmap:x%x refcnt:%d\n",
+			(void *)ndlp, ndlp->nlp_usg_map,
+			atomic_read(&ndlp->kref.refcount));
+
+	/* remove ndlp from action. */
 	lpfc_nlp_remove(ndlp->vport, ndlp);
-	mempool_free(ndlp, ndlp->vport->phba->nlp_mem_pool);
+
+	/* clear the ndlp active flag for all release cases */
+	phba = ndlp->vport->phba;
+	spin_lock_irqsave(&phba->ndlp_lock, flags);
+	NLP_CLR_NODE_ACT(ndlp);
+	spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+
+	/* free ndlp memory for final ndlp release */
+	if (NLP_CHK_FREE_REQ(ndlp))
+		mempool_free(ndlp, ndlp->vport->phba->nlp_mem_pool);
 }
 
 /* This routine bumps the reference count for a ndlp structure to ensure
@@ -2929,37 +3073,108 @@ lpfc_nlp_release(struct kref *kref)
 struct lpfc_nodelist *
 lpfc_nlp_get(struct lpfc_nodelist *ndlp)
 {
+	struct lpfc_hba *phba;
+	unsigned long flags;
+
 	if (ndlp) {
 		lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
 			"node get:        did:x%x flg:x%x refcnt:x%x",
 			ndlp->nlp_DID, ndlp->nlp_flag,
 			atomic_read(&ndlp->kref.refcount));
-		kref_get(&ndlp->kref);
+		/* The check of ndlp usage to prevent incrementing the
+		 * ndlp reference count that is in the process of being
+		 * released.
+		 */
+		phba = ndlp->vport->phba;
+		spin_lock_irqsave(&phba->ndlp_lock, flags);
+		if (!NLP_CHK_NODE_ACT(ndlp) || NLP_CHK_FREE_ACK(ndlp)) {
+			spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+			lpfc_printf_vlog(ndlp->vport, KERN_WARNING, LOG_NODE,
+				"0276 lpfc_nlp_get: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+			return NULL;
+		} else
+			kref_get(&ndlp->kref);
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
 	}
 	return ndlp;
 }
 
-
 /* This routine decrements the reference count for a ndlp structure. If the
- * count goes to 0, this indicates the the associated nodelist should be freed.
+ * count goes to 0, this indicates the the associated nodelist should be
+ * freed. Returning 1 indicates the ndlp resource has been released; on the
+ * other hand, returning 0 indicates the ndlp resource has not been released
+ * yet.
  */
 int
 lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 {
-	if (ndlp) {
-		lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
-		"node put:        did:x%x flg:x%x refcnt:x%x",
-			ndlp->nlp_DID, ndlp->nlp_flag,
-			atomic_read(&ndlp->kref.refcount));
+	struct lpfc_hba *phba;
+	unsigned long flags;
+
+	if (!ndlp)
+		return 1;
+
+	lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
+	"node put:        did:x%x flg:x%x refcnt:x%x",
+		ndlp->nlp_DID, ndlp->nlp_flag,
+		atomic_read(&ndlp->kref.refcount));
+	phba = ndlp->vport->phba;
+	spin_lock_irqsave(&phba->ndlp_lock, flags);
+	/* Check the ndlp memory free acknowledge flag to avoid the
+	 * possible race condition that kref_put got invoked again
+	 * after previous one has done ndlp memory free.
+	 */
+	if (NLP_CHK_FREE_ACK(ndlp)) {
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+		lpfc_printf_vlog(ndlp->vport, KERN_WARNING, LOG_NODE,
+				"0274 lpfc_nlp_put: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		return 1;
+	}
+	/* Check the ndlp inactivate log flag to avoid the possible
+	 * race condition that kref_put got invoked again after ndlp
+	 * is already in inactivating state.
+	 */
+	if (NLP_CHK_IACT_REQ(ndlp)) {
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+		lpfc_printf_vlog(ndlp->vport, KERN_WARNING, LOG_NODE,
+				"0275 lpfc_nlp_put: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		return 1;
 	}
-	return ndlp ? kref_put(&ndlp->kref, lpfc_nlp_release) : 0;
+	/* For last put, mark the ndlp usage flags to make sure no
+	 * other kref_get and kref_put on the same ndlp shall get
+	 * in between the process when the final kref_put has been
+	 * invoked on this ndlp.
+	 */
+	if (atomic_read(&ndlp->kref.refcount) == 1) {
+		/* Indicate ndlp is put to inactive state. */
+		NLP_SET_IACT_REQ(ndlp);
+		/* Acknowledge ndlp memory free has been seen. */
+		if (NLP_CHK_FREE_REQ(ndlp))
+			NLP_SET_FREE_ACK(ndlp);
+	}
+	spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+	/* Note, the kref_put returns 1 when decrementing a reference
+	 * count that was 1, it invokes the release callback function,
+	 * but it still left the reference count as 1 (not actually
+	 * performs the last decrementation). Otherwise, it actually
+	 * decrements the reference count and returns 0.
+	 */
+	return kref_put(&ndlp->kref, lpfc_nlp_release);
 }
 
 /* This routine free's the specified nodelist if it is not in use
- * by any other discovery thread. This routine returns 1 if the ndlp
- * is not being used by anyone and has been freed. A return value of
- * 0 indicates it is being used by another discovery thread and the
- * refcount is left unchanged.
+ * by any other discovery thread. This routine returns 1 if the
+ * ndlp has been freed. A return value of 0 indicates the ndlp is
+ * not yet been released.
  */
 int
 lpfc_nlp_not_used(struct lpfc_nodelist *ndlp)
@@ -2968,11 +3183,8 @@ lpfc_nlp_not_used(struct lpfc_nodelist *ndlp)
 		"node not used:   did:x%x flg:x%x refcnt:x%x",
 		ndlp->nlp_DID, ndlp->nlp_flag,
 		atomic_read(&ndlp->kref.refcount));
-
-	if (atomic_read(&ndlp->kref.refcount) == 1) {
-		lpfc_nlp_put(ndlp);
-		return 1;
-	}
+	if (atomic_read(&ndlp->kref.refcount) == 1)
+		if (lpfc_nlp_put(ndlp))
+			return 1;
 	return 0;
 }
-
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 6cfeba7454d4..e0363bef6d29 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
@@ -1422,9 +1422,32 @@ lpfc_cleanup(struct lpfc_vport *vport)
 		lpfc_port_link_failure(vport);
 
 	list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes, nlp_listp) {
+		if (!NLP_CHK_NODE_ACT(ndlp)) {
+			ndlp = lpfc_enable_node(vport, ndlp,
+						NLP_STE_UNUSED_NODE);
+			if (!ndlp)
+				continue;
+			spin_lock_irq(&phba->ndlp_lock);
+			NLP_SET_FREE_REQ(ndlp);
+			spin_unlock_irq(&phba->ndlp_lock);
+			/* Trigger the release of the ndlp memory */
+			lpfc_nlp_put(ndlp);
+			continue;
+		}
+		spin_lock_irq(&phba->ndlp_lock);
+		if (NLP_CHK_FREE_REQ(ndlp)) {
+			/* The ndlp should not be in memory free mode already */
+			spin_unlock_irq(&phba->ndlp_lock);
+			continue;
+		} else
+			/* Indicate request for freeing ndlp memory */
+			NLP_SET_FREE_REQ(ndlp);
+		spin_unlock_irq(&phba->ndlp_lock);
+
 		if (ndlp->nlp_type & NLP_FABRIC)
 			lpfc_disc_state_machine(vport, ndlp, NULL,
 					NLP_EVT_DEVICE_RECOVERY);
+
 		lpfc_disc_state_machine(vport, ndlp, NULL,
 					     NLP_EVT_DEVICE_RM);
 	}
@@ -1438,6 +1461,17 @@ lpfc_cleanup(struct lpfc_vport *vport)
 		if (i++ > 3000) {
 			lpfc_printf_vlog(vport, KERN_ERR, LOG_DISCOVERY,
 				"0233 Nodelist not empty\n");
+			list_for_each_entry_safe(ndlp, next_ndlp,
+						&vport->fc_nodes, nlp_listp) {
+				lpfc_printf_vlog(ndlp->vport, KERN_ERR,
+						LOG_NODE,
+						"0282: did:x%x ndlp:x%p "
+						"usgmap:x%x refcnt:%d\n",
+						ndlp->nlp_DID, (void *)ndlp,
+						ndlp->nlp_usg_map,
+						atomic_read(
+							&ndlp->kref.refcount));
+			}
 			break;
 		}
 
@@ -1586,6 +1620,8 @@ lpfc_offline_prep(struct lpfc_hba * phba)
 			list_for_each_entry_safe(ndlp, next_ndlp,
 						 &vports[i]->fc_nodes,
 						 nlp_listp) {
+				if (!NLP_CHK_NODE_ACT(ndlp))
+					continue;
 				if (ndlp->nlp_state == NLP_STE_UNUSED_NODE)
 					continue;
 				if (ndlp->nlp_type & NLP_FABRIC) {
@@ -1905,6 +1941,9 @@ lpfc_pci_probe_one(struct pci_dev *pdev, const struct pci_device_id *pid)
 
 	spin_lock_init(&phba->hbalock);
 
+	/* Initialize ndlp management spinlock */
+	spin_lock_init(&phba->ndlp_lock);
+
 	phba->pcidev = pdev;
 
 	/* Assign an unused board number */
diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c
index 4a0e3406e37a..01d548375811 100644
--- a/drivers/scsi/lpfc/lpfc_nportdisc.c
+++ b/drivers/scsi/lpfc/lpfc_nportdisc.c
@@ -1,7 +1,7 @@
  /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
@@ -249,6 +249,7 @@ lpfc_rcv_plogi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 	struct Scsi_Host   *shost = lpfc_shost_from_vport(vport);
 	struct lpfc_hba    *phba = vport->phba;
 	struct lpfc_dmabuf *pcmd;
+	struct lpfc_work_evt *evtp;
 	uint32_t *lp;
 	IOCB_t *icmd;
 	struct serv_parm *sp;
@@ -435,8 +436,14 @@ lpfc_rcv_plogi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 		del_timer_sync(&ndlp->nlp_delayfunc);
 		ndlp->nlp_last_elscmd = 0;
 
-		if (!list_empty(&ndlp->els_retry_evt.evt_listp))
+		if (!list_empty(&ndlp->els_retry_evt.evt_listp)) {
 			list_del_init(&ndlp->els_retry_evt.evt_listp);
+			/* Decrement ndlp reference count held for the
+			 * delayed retry
+			 */
+			evtp = &ndlp->els_retry_evt;
+			lpfc_nlp_put((struct lpfc_nodelist *)evtp->evt_arg1);
+		}
 
 		if (ndlp->nlp_flag & NLP_NPR_2B_DISC) {
 			spin_lock_irq(shost->host_lock);
@@ -656,7 +663,7 @@ lpfc_disc_illegal(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 		  void *arg, uint32_t evt)
 {
 	lpfc_printf_vlog(vport, KERN_ERR, LOG_DISCOVERY,
-			 "0253 Illegal State Transition: node x%x "
+			 "0271 Illegal State Transition: node x%x "
 			 "event x%x, state x%x Data: x%x x%x\n",
 			 ndlp->nlp_DID, evt, ndlp->nlp_state, ndlp->nlp_rpi,
 			 ndlp->nlp_flag);
@@ -674,7 +681,7 @@ lpfc_cmpl_plogi_illegal(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 	 */
 	if (!(ndlp->nlp_flag & NLP_RCV_PLOGI)) {
 		lpfc_printf_vlog(vport, KERN_ERR, LOG_DISCOVERY,
-			 "0253 Illegal State Transition: node x%x "
+			 "0272 Illegal State Transition: node x%x "
 			 "event x%x, state x%x Data: x%x x%x\n",
 			 ndlp->nlp_DID, evt, ndlp->nlp_state, ndlp->nlp_rpi,
 			 ndlp->nlp_flag);
@@ -2144,8 +2151,11 @@ lpfc_disc_state_machine(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 	uint32_t cur_state, rc;
 	uint32_t(*func) (struct lpfc_vport *, struct lpfc_nodelist *, void *,
 			 uint32_t);
+	uint32_t got_ndlp = 0;
+
+	if (lpfc_nlp_get(ndlp))
+		got_ndlp = 1;
 
-	lpfc_nlp_get(ndlp);
 	cur_state = ndlp->nlp_state;
 
 	/* DSM in event <evt> on NPort <nlp_DID> in state <cur_state> */
@@ -2162,15 +2172,24 @@ lpfc_disc_state_machine(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 	rc = (func) (vport, ndlp, arg, evt);
 
 	/* DSM out state <rc> on NPort <nlp_DID> */
-	lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY,
+	if (got_ndlp) {
+		lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY,
 			 "0212 DSM out state %d on NPort x%x Data: x%x\n",
 			 rc, ndlp->nlp_DID, ndlp->nlp_flag);
 
-	lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_DSM,
-		 "DSM out:         ste:%d did:x%x flg:x%x",
-		rc, ndlp->nlp_DID, ndlp->nlp_flag);
+		lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_DSM,
+			"DSM out:         ste:%d did:x%x flg:x%x",
+			rc, ndlp->nlp_DID, ndlp->nlp_flag);
+		/* Decrement the ndlp reference count held for this function */
+		lpfc_nlp_put(ndlp);
+	} else {
+		lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY,
+			"0212 DSM out state %d on NPort free\n", rc);
 
-	lpfc_nlp_put(ndlp);
+		lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_DSM,
+			"DSM out:         ste:%d did:x%x flg:x%x",
+			rc, 0, 0);
+	}
 
 	return rc;
 }
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index fc5c3a42b05a..70255c11d3ad 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
@@ -1283,6 +1283,8 @@ lpfc_bus_reset_handler(struct scsi_cmnd *cmnd)
 		match = 0;
 		spin_lock_irq(shost->host_lock);
 		list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) {
+			if (!NLP_CHK_NODE_ACT(ndlp))
+				continue;
 			if (ndlp->nlp_state == NLP_STE_MAPPED_NODE &&
 			    i == ndlp->nlp_sid &&
 			    ndlp->rport) {
diff --git a/drivers/scsi/lpfc/lpfc_vport.c b/drivers/scsi/lpfc/lpfc_vport.c
index 9fad7663c117..86d05beb00b8 100644
--- a/drivers/scsi/lpfc/lpfc_vport.c
+++ b/drivers/scsi/lpfc/lpfc_vport.c
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2006 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
@@ -327,7 +327,8 @@ lpfc_vport_create(struct fc_vport *fc_vport, bool disable)
 	 * up and ready to FDISC.
 	 */
 	ndlp = lpfc_findnode_did(phba->pport, Fabric_DID);
-	if (ndlp && ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) {
+	if (ndlp && NLP_CHK_NODE_ACT(ndlp) &&
+	    ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) {
 		if (phba->link_flag & LS_NPIV_FAB_SUPPORTED) {
 			lpfc_set_disctmo(vport);
 			lpfc_initial_fdisc(vport);
@@ -358,7 +359,8 @@ disable_vport(struct fc_vport *fc_vport)
 	long timeout;
 
 	ndlp = lpfc_findnode_did(vport, Fabric_DID);
-	if (ndlp && phba->link_state >= LPFC_LINK_UP) {
+	if (ndlp && NLP_CHK_NODE_ACT(ndlp)
+	    && phba->link_state >= LPFC_LINK_UP) {
 		vport->unreg_vpi_cmpl = VPORT_INVAL;
 		timeout = msecs_to_jiffies(phba->fc_ratov * 2000);
 		if (!lpfc_issue_els_npiv_logo(vport, ndlp))
@@ -372,6 +374,8 @@ disable_vport(struct fc_vport *fc_vport)
 	 * calling lpfc_cleanup_rpis(vport, 1)
 	 */
 	list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes, nlp_listp) {
+		if (!NLP_CHK_NODE_ACT(ndlp))
+			continue;
 		if (ndlp->nlp_state == NLP_STE_UNUSED_NODE)
 			continue;
 		lpfc_disc_state_machine(vport, ndlp, NULL,
@@ -414,7 +418,8 @@ enable_vport(struct fc_vport *fc_vport)
 	 * up and ready to FDISC.
 	 */
 	ndlp = lpfc_findnode_did(phba->pport, Fabric_DID);
-	if (ndlp && ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) {
+	if (ndlp && NLP_CHK_NODE_ACT(ndlp)
+	    && ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) {
 		if (phba->link_flag & LS_NPIV_FAB_SUPPORTED) {
 			lpfc_set_disctmo(vport);
 			lpfc_initial_fdisc(vport);
@@ -498,7 +503,41 @@ lpfc_vport_delete(struct fc_vport *fc_vport)
 	scsi_remove_host(lpfc_shost_from_vport(vport));
 
 	ndlp = lpfc_findnode_did(phba->pport, Fabric_DID);
-	if (ndlp && ndlp->nlp_state == NLP_STE_UNMAPPED_NODE &&
+
+	/* In case of driver unload, we shall not perform fabric logo as the
+	 * worker thread already stopped at this stage and, in this case, we
+	 * can safely skip the fabric logo.
+	 */
+	if (phba->pport->load_flag & FC_UNLOADING) {
+		if (ndlp && NLP_CHK_NODE_ACT(ndlp) &&
+		    ndlp->nlp_state == NLP_STE_UNMAPPED_NODE &&
+		    phba->link_state >= LPFC_LINK_UP) {
+			/* First look for the Fabric ndlp */
+			ndlp = lpfc_findnode_did(vport, Fabric_DID);
+			if (!ndlp)
+				goto skip_logo;
+			else if (!NLP_CHK_NODE_ACT(ndlp)) {
+				ndlp = lpfc_enable_node(vport, ndlp,
+							NLP_STE_UNUSED_NODE);
+				if (!ndlp)
+					goto skip_logo;
+			}
+			/* Remove ndlp from vport npld list */
+			lpfc_dequeue_node(vport, ndlp);
+
+			/* Indicate free memory when release */
+			spin_lock_irq(&phba->ndlp_lock);
+			NLP_SET_FREE_REQ(ndlp);
+			spin_unlock_irq(&phba->ndlp_lock);
+			/* Kick off release ndlp when it can be safely done */
+			lpfc_nlp_put(ndlp);
+		}
+		goto skip_logo;
+	}
+
+	/* Otherwise, we will perform fabric logo as needed */
+	if (ndlp && NLP_CHK_NODE_ACT(ndlp) &&
+	    ndlp->nlp_state == NLP_STE_UNMAPPED_NODE &&
 	    phba->link_state >= LPFC_LINK_UP) {
 		if (vport->cfg_enable_da_id) {
 			timeout = msecs_to_jiffies(phba->fc_ratov * 2000);
@@ -519,8 +558,27 @@ lpfc_vport_delete(struct fc_vport *fc_vport)
 			if (!ndlp)
 				goto skip_logo;
 			lpfc_nlp_init(vport, ndlp, Fabric_DID);
+			/* Indicate free memory when release */
+			NLP_SET_FREE_REQ(ndlp);
 		} else {
+			if (!NLP_CHK_NODE_ACT(ndlp))
+				ndlp = lpfc_enable_node(vport, ndlp,
+						NLP_STE_UNUSED_NODE);
+				if (!ndlp)
+					goto skip_logo;
+
+			/* Remove ndlp from vport npld list */
 			lpfc_dequeue_node(vport, ndlp);
+			spin_lock_irq(&phba->ndlp_lock);
+			if (!NLP_CHK_FREE_REQ(ndlp))
+				/* Indicate free memory when release */
+				NLP_SET_FREE_REQ(ndlp);
+			else {
+				/* Skip this if ndlp is already in free mode */
+				spin_unlock_irq(&phba->ndlp_lock);
+				goto skip_logo;
+			}
+			spin_unlock_irq(&phba->ndlp_lock);
 		}
 		vport->unreg_vpi_cmpl = VPORT_INVAL;
 		timeout = msecs_to_jiffies(phba->fc_ratov * 2000);
@@ -534,9 +592,9 @@ lpfc_vport_delete(struct fc_vport *fc_vport)
 	lpfc_sli_host_down(vport);
 
 	lpfc_stop_vport_timers(vport);
-	lpfc_unreg_all_rpis(vport);
 
 	if (!(phba->pport->load_flag & FC_UNLOADING)) {
+		lpfc_unreg_all_rpis(vport);
 		lpfc_unreg_default_rpis(vport);
 		/*
 		 * Completion of unreg_vpi (lpfc_mbx_cmpl_unreg_vpi)

From 1b32f6aa9935ab88eac0d608a4b06369f5d9064a Mon Sep 17 00:00:00 2001
From: James Smart <James.Smart@Emulex.Com>
Date: Fri, 8 Feb 2008 18:49:39 -0500
Subject: [PATCH 13/66] [SCSI] lpfc 8.2.5 : Miscellaneous Fixes

Miscellaneous fixes:
- Fix ERRATT flag which was overlapping
- Allow RESTART mbx commands through when stopped.
- Accept incoming PLOGI when connected to an N_Port.
- Fix NPort to NPort pt2pt problems: ADISC and reg_vpi issues
- Fix vport unloading error that erroneously cleaned up RSCN buffers
- Fix memory leak during repeated unloads - in mbox handling
- Fix link bounce vs FLOGI race conditions

Signed-off-by: James Smart <james.smart@emulex.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/lpfc/lpfc.h           |  4 ++--
 drivers/scsi/lpfc/lpfc_attr.c      |  8 +++++---
 drivers/scsi/lpfc/lpfc_els.c       | 14 +++++++++++---
 drivers/scsi/lpfc/lpfc_hbadisc.c   | 15 ++-------------
 drivers/scsi/lpfc/lpfc_init.c      | 13 +++++++++++--
 drivers/scsi/lpfc/lpfc_nportdisc.c | 16 +++++++++-------
 drivers/scsi/lpfc/lpfc_sli.c       |  4 +---
 7 files changed, 41 insertions(+), 33 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h
index 572c525ad2b5..cbe07d79bd12 100644
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
@@ -409,7 +409,7 @@ struct lpfc_hba {
 					/* This flag is set while issuing */
 					/* INIT_LINK mailbox command */
 #define LS_NPIV_FAB_SUPPORTED 0x2	/* Fabric supports NPIV */
-#define LS_IGNORE_ERATT       0x3	/* intr handler should ignore ERATT */
+#define LS_IGNORE_ERATT       0x4	/* intr handler should ignore ERATT */
 
 	struct lpfc_sli2_slim *slim2p;
 	struct lpfc_dmabuf hbqslimp;
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index ef061d97a222..fc48e40c4d29 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -1946,11 +1946,13 @@ sysfs_mbox_read(struct kobject *kobj, struct bin_attribute *bin_attr,
 		}
 
 		/* If HBA encountered an error attention, allow only DUMP
-		 * mailbox command until the HBA is restarted.
+		 * or RESTART mailbox commands until the HBA is restarted.
 		 */
 		if ((phba->pport->stopped) &&
-			(phba->sysfs_mbox.mbox->mb.mbxCommand
-				!= MBX_DUMP_MEMORY)) {
+			(phba->sysfs_mbox.mbox->mb.mbxCommand !=
+				MBX_DUMP_MEMORY &&
+			 phba->sysfs_mbox.mbox->mb.mbxCommand !=
+				MBX_RESTART)) {
 			sysfs_mbox_idle(phba);
 			spin_unlock_irq(&phba->hbalock);
 			return -EPERM;
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index 39268e6a1a05..60afc8028ff5 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -2046,7 +2046,8 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		retry = 1;
 
 	if ((cmd == ELS_CMD_FLOGI) &&
-	    (phba->fc_topology != TOPOLOGY_LOOP)) {
+	    (phba->fc_topology != TOPOLOGY_LOOP) &&
+	    !lpfc_error_lost_link(irsp)) {
 		/* FLOGI retry policy */
 		retry = 1;
 		maxretry = 48;
@@ -4091,8 +4092,15 @@ lpfc_els_unsol_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
 		ndlp = lpfc_plogi_confirm_nport(phba, payload, ndlp);
 
 		if (vport->port_state < LPFC_DISC_AUTH) {
-			rjt_err = LSRJT_UNABLE_TPC;
-			break;
+			if (!(phba->pport->fc_flag & FC_PT2PT) ||
+				(phba->pport->fc_flag & FC_PT2PT_PLOGI)) {
+				rjt_err = LSRJT_UNABLE_TPC;
+				break;
+			}
+			/* We get here, and drop thru, if we are PT2PT with
+			 * another NPort and the other side has initiated
+			 * the PLOGI before responding to our FLOGI.
+			 */
 		}
 
 		shost = lpfc_shost_from_vport(vport);
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index 1ee3e62c78a7..25892671bfb0 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -800,21 +800,9 @@ lpfc_mbx_cmpl_clear_la(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 	writel(control, phba->HCregaddr);
 	readl(phba->HCregaddr); /* flush */
 	spin_unlock_irq(&phba->hbalock);
+	mempool_free(pmb, phba->mbox_mem_pool);
 	return;
 
-	vport->num_disc_nodes = 0;
-	/* go thru NPR nodes and issue ELS PLOGIs */
-	if (vport->fc_npr_cnt)
-		lpfc_els_disc_plogi(vport);
-
-	if (!vport->num_disc_nodes) {
-		spin_lock_irq(shost->host_lock);
-		vport->fc_flag &= ~FC_NDISC_ACTIVE;
-		spin_unlock_irq(shost->host_lock);
-	}
-
-	vport->port_state = LPFC_VPORT_READY;
-
 out:
 	/* Device Discovery completes */
 	lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY,
@@ -2484,6 +2472,7 @@ lpfc_disc_start(struct lpfc_vport *vport)
 	 * continue discovery.
 	 */
 	if ((phba->sli3_options & LPFC_SLI3_NPIV_ENABLED) &&
+	    !(vport->fc_flag & FC_PT2PT) &&
 	    !(vport->fc_flag & FC_RSCN_MODE)) {
 		lpfc_issue_reg_vpi(phba, vport);
 		return;
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index e0363bef6d29..99141545c25e 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -461,12 +461,21 @@ lpfc_config_port_post(struct lpfc_hba *phba)
 int
 lpfc_hba_down_prep(struct lpfc_hba *phba)
 {
+	struct lpfc_vport **vports;
+	int i;
 	/* Disable interrupts */
 	writel(0, phba->HCregaddr);
 	readl(phba->HCregaddr); /* flush */
 
-	lpfc_cleanup_discovery_resources(phba->pport);
-	return 0;
+	if (phba->pport->load_flag & FC_UNLOADING)
+		lpfc_cleanup_discovery_resources(phba->pport);
+	else {
+		vports = lpfc_create_vport_work_array(phba);
+		if (vports != NULL)
+			for(i = 0; i <= phba->max_vpi && vports[i] != NULL; i++)
+				lpfc_cleanup_discovery_resources(vports[i]);
+		lpfc_destroy_vport_work_array(phba, vports);
+	}	return 0;
 }
 
 /************************************************************************/
diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c
index 01d548375811..d513813f6697 100644
--- a/drivers/scsi/lpfc/lpfc_nportdisc.c
+++ b/drivers/scsi/lpfc/lpfc_nportdisc.c
@@ -645,13 +645,15 @@ lpfc_disc_set_adisc(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 		return 0;
 	}
 
-	/* Check config parameter use-adisc or FCP-2 */
-	if ((vport->cfg_use_adisc && (vport->fc_flag & FC_RSCN_MODE)) ||
-	    ndlp->nlp_fcp_info & NLP_FCP_2_DEVICE) {
-		spin_lock_irq(shost->host_lock);
-		ndlp->nlp_flag |= NLP_NPR_ADISC;
-		spin_unlock_irq(shost->host_lock);
-		return 1;
+	if (!(vport->fc_flag & FC_PT2PT)) {
+		/* Check config parameter use-adisc or FCP-2 */
+		if ((vport->cfg_use_adisc && (vport->fc_flag & FC_RSCN_MODE)) ||
+		    ndlp->nlp_fcp_info & NLP_FCP_2_DEVICE) {
+			spin_lock_irq(shost->host_lock);
+			ndlp->nlp_flag |= NLP_NPR_ADISC;
+			spin_unlock_irq(shost->host_lock);
+			return 1;
+		}
 	}
 	ndlp->nlp_flag &= ~NLP_NPR_ADISC;
 	lpfc_unreg_rpi(vport, ndlp);
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index fdd01e384e36..456b8ec7753b 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
@@ -2404,9 +2404,7 @@ lpfc_do_config_port(struct lpfc_hba *phba, int sli_mode)
 	if ((pmb->mb.un.varCfgPort.sli_mode == 3) &&
 		(!pmb->mb.un.varCfgPort.cMA)) {
 		rc = -ENXIO;
-		goto do_prep_failed;
 	}
-	return rc;
 
 do_prep_failed:
 	mempool_free(pmb, phba->mbox_mem_pool);

From db2378e09151c855e8f92c1b4b2fb4fc5cd8cb40 Mon Sep 17 00:00:00 2001
From: James Smart <James.Smart@Emulex.Com>
Date: Fri, 8 Feb 2008 18:49:51 -0500
Subject: [PATCH 14/66] [SCSI] lpfc 8.2.5 : Add MSI-X single message support

Signed-off-by: James Smart <james.smart@emulex.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/lpfc/lpfc.h      | 10 +++-
 drivers/scsi/lpfc/lpfc_attr.c |  6 ++-
 drivers/scsi/lpfc/lpfc_init.c | 97 ++++++++++++++++++++++++++++-------
 3 files changed, 91 insertions(+), 22 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h
index cbe07d79bd12..10f983b479c6 100644
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -392,6 +392,13 @@ enum hba_temp_state {
 	HBA_OVER_TEMP
 };
 
+enum intr_type_t {
+	NONE = 0,
+	INTx,
+	MSI,
+	MSIX,
+};
+
 struct lpfc_hba {
 	struct lpfc_sli sli;
 	uint32_t sli_rev;		/* SLI2 or SLI3 */
@@ -555,7 +562,8 @@ struct lpfc_hba {
 	mempool_t *nlp_mem_pool;
 
 	struct fc_host_statistics link_stats;
-	uint8_t using_msi;
+	enum intr_type_t intr_type;
+	struct msix_entry msix_entries[1];
 
 	struct list_head port_list;
 	struct lpfc_vport *pport;	/* physical lpfc_vport pointer */
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index fc48e40c4d29..b12a841703ca 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -1592,9 +1592,11 @@ LPFC_ATTR_RW(poll_tmo, 10, 1, 255,
 #		support this feature
 #       0  = MSI disabled (default)
 #       1  = MSI enabled
-# Value range is [0,1]. Default value is 0.
+#	2  = MSI-X enabled
+# Value range is [0,2]. Default value is 0.
 */
-LPFC_ATTR_R(use_msi, 0, 0, 1, "Use Message Signaled Interrupts, if possible");
+LPFC_ATTR_R(use_msi, 0, 0, 2, "Use Message Signaled Interrupts (1) or "
+	    "MSI-X (2), if possible");
 
 /*
 # lpfc_enable_hba_reset: Allow or prevent HBA resets to the hardware.
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 99141545c25e..a087524acf41 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -1924,6 +1924,42 @@ void lpfc_host_attrib_init(struct Scsi_Host *shost)
 	spin_unlock_irq(shost->host_lock);
 }
 
+static int
+lpfc_enable_msix(struct lpfc_hba *phba)
+{
+	int error;
+
+	phba->msix_entries[0].entry = 0;
+	phba->msix_entries[0].vector = 0;
+
+	error = pci_enable_msix(phba->pcidev, phba->msix_entries,
+				ARRAY_SIZE(phba->msix_entries));
+	if (error) {
+		lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
+				"0420 Enable MSI-X failed (%d), continuing "
+				"with MSI\n", error);
+		pci_disable_msix(phba->pcidev);
+		return error;
+	}
+
+	error =	request_irq(phba->msix_entries[0].vector, lpfc_intr_handler, 0,
+			    LPFC_DRIVER_NAME, phba);
+	if (error) {
+		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+				"0421 MSI-X request_irq failed (%d), "
+				"continuing with MSI\n", error);
+		pci_disable_msix(phba->pcidev);
+	}
+	return error;
+}
+
+static void
+lpfc_disable_msix(struct lpfc_hba *phba)
+{
+	free_irq(phba->msix_entries[0].vector, phba);
+	pci_disable_msix(phba->pcidev);
+}
+
 static int __devinit
 lpfc_pci_probe_one(struct pci_dev *pdev, const struct pci_device_id *pid)
 {
@@ -2125,24 +2161,36 @@ lpfc_pci_probe_one(struct pci_dev *pdev, const struct pci_device_id *pid)
 	lpfc_debugfs_initialize(vport);
 
 	pci_set_drvdata(pdev, shost);
+	phba->intr_type = NONE;
 
-	if (phba->cfg_use_msi) {
+	if (phba->cfg_use_msi == 2) {
+		error = lpfc_enable_msix(phba);
+		if (!error)
+			phba->intr_type = MSIX;
+	}
+
+	/* Fallback to MSI if MSI-X initialization failed */
+	if (phba->cfg_use_msi >= 1 && phba->intr_type == NONE) {
 		retval = pci_enable_msi(phba->pcidev);
 		if (!retval)
-			phba->using_msi = 1;
+			phba->intr_type = MSI;
 		else
 			lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
 					"0452 Enable MSI failed, continuing "
 					"with IRQ\n");
 	}
 
-	retval = request_irq(phba->pcidev->irq, lpfc_intr_handler, IRQF_SHARED,
-			    LPFC_DRIVER_NAME, phba);
-	if (retval) {
-		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
-			"0451 Enable interrupt handler failed\n");
-		error = retval;
-		goto out_disable_msi;
+	/* MSI-X is the only case the doesn't need to call request_irq */
+	if (phba->intr_type != MSIX) {
+		retval = request_irq(phba->pcidev->irq, lpfc_intr_handler,
+				     IRQF_SHARED, LPFC_DRIVER_NAME, phba);
+		if (retval) {
+			lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "0451 Enable "
+					"interrupt handler failed\n");
+			error = retval;
+			goto out_disable_msi;
+		} else if (phba->intr_type != MSI)
+			phba->intr_type = INTx;
 	}
 
 	phba->MBslimaddr = phba->slim_memmap_p;
@@ -2187,9 +2235,14 @@ lpfc_pci_probe_one(struct pci_dev *pdev, const struct pci_device_id *pid)
 out_free_irq:
 	lpfc_stop_phba_timers(phba);
 	phba->pport->work_port_events = 0;
-	free_irq(phba->pcidev->irq, phba);
+
+	if (phba->intr_type == MSIX)
+		lpfc_disable_msix(phba);
+	else
+		free_irq(phba->pcidev->irq, phba);
+
 out_disable_msi:
-	if (phba->using_msi)
+	if (phba->intr_type == MSI)
 		pci_disable_msi(phba->pcidev);
 	destroy_port(vport);
 out_kthread_stop:
@@ -2262,10 +2315,13 @@ lpfc_pci_remove_one(struct pci_dev *pdev)
 
 	lpfc_debugfs_terminate(vport);
 
-	/* Release the irq reservation */
-	free_irq(phba->pcidev->irq, phba);
-	if (phba->using_msi)
-		pci_disable_msi(phba->pcidev);
+	if (phba->intr_type == MSIX)
+		lpfc_disable_msix(phba);
+	else {
+		free_irq(phba->pcidev->irq, phba);
+		if (phba->intr_type == MSI)
+			pci_disable_msi(phba->pcidev);
+	}
 
 	pci_set_drvdata(pdev, NULL);
 	scsi_host_put(shost);
@@ -2324,10 +2380,13 @@ static pci_ers_result_t lpfc_io_error_detected(struct pci_dev *pdev,
 	pring = &psli->ring[psli->fcp_ring];
 	lpfc_sli_abort_iocb_ring(phba, pring);
 
-	/* Release the irq reservation */
-	free_irq(phba->pcidev->irq, phba);
-	if (phba->using_msi)
-		pci_disable_msi(phba->pcidev);
+	if (phba->intr_type == MSIX)
+		lpfc_disable_msix(phba);
+	else {
+		free_irq(phba->pcidev->irq, phba);
+		if (phba->intr_type == MSI)
+			pci_disable_msi(phba->pcidev);
+	}
 
 	/* Request a slot reset. */
 	return PCI_ERS_RESULT_NEED_RESET;

From 7f5f3d0d02aa2f124e764aee5c775589ce72fd42 Mon Sep 17 00:00:00 2001
From: James Smart <James.Smart@Emulex.Com>
Date: Fri, 8 Feb 2008 18:50:14 -0500
Subject: [PATCH 15/66] [SCSI] lpfc 8.2.5 : Miscellaneous discovery Fixes

Miscellaneous discovery fixes:
- Flush RSCN buffers on vports when reseting HBA.
- Fix incorrect FLOGI after vport reg failed
- Fix a potential fabric ELS race condition
- Fix handling of failed PLOGI command under high lip rates
- Fix FDISC handling
- Fix debug logging for npiv handling

Signed-off-by: James Smart <james.smart@emulex.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/lpfc/lpfc.h        |   1 +
 drivers/scsi/lpfc/lpfc_ct.c     |   2 +-
 drivers/scsi/lpfc/lpfc_disc.h   |  33 ++++----
 drivers/scsi/lpfc/lpfc_els.c    | 133 ++++++++++++++++++++++----------
 drivers/scsi/lpfc/lpfc_hw.h     |   1 +
 drivers/scsi/lpfc/lpfc_init.c   |   5 +-
 drivers/scsi/lpfc/lpfc_logmsg.h |  10 ++-
 drivers/scsi/lpfc/lpfc_mem.c    |   4 +-
 drivers/scsi/lpfc/lpfc_sli.c    |   4 +-
 9 files changed, 127 insertions(+), 66 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h
index 10f983b479c6..6c178f1c8786 100644
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -307,6 +307,7 @@ struct lpfc_vport {
 
 	uint32_t fc_nlp_cnt;	/* outstanding NODELIST requests */
 	uint32_t fc_rscn_id_cnt;	/* count of RSCNs payloads in list */
+	uint32_t fc_rscn_flush;		/* flag use of fc_rscn_id_list */
 	struct lpfc_dmabuf *fc_rscn_id_list[FC_MAX_HOLD_RSCN];
 	struct lpfc_name fc_nodename;	/* fc nodename */
 	struct lpfc_name fc_portname;	/* fc portname */
diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c
index aea8d33f6d09..3d0ccd9b341d 100644
--- a/drivers/scsi/lpfc/lpfc_ct.c
+++ b/drivers/scsi/lpfc/lpfc_ct.c
@@ -775,7 +775,7 @@ lpfc_cmpl_ct_cmd_gff_id(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 				 "0267 NameServer GFF Rsp "
 				 "x%x Error (%d %d) Data: x%x x%x\n",
 				 did, irsp->ulpStatus, irsp->un.ulpWord[4],
-				 vport->fc_flag, vport->fc_rscn_id_cnt)
+				 vport->fc_flag, vport->fc_rscn_id_cnt);
 	}
 
 	/* This is a target port, unregistered port, or the GFF_ID failed */
diff --git a/drivers/scsi/lpfc/lpfc_disc.h b/drivers/scsi/lpfc/lpfc_disc.h
index 8eb007309599..2db0b74b6fad 100644
--- a/drivers/scsi/lpfc/lpfc_disc.h
+++ b/drivers/scsi/lpfc/lpfc_disc.h
@@ -91,25 +91,26 @@ struct lpfc_nodelist {
 };
 
 /* Defines for nlp_flag (uint32) */
-#define NLP_PLOGI_SND      0x20		/* sent PLOGI request for this entry */
-#define NLP_PRLI_SND       0x40		/* sent PRLI request for this entry */
-#define NLP_ADISC_SND      0x80		/* sent ADISC request for this entry */
-#define NLP_LOGO_SND       0x100	/* sent LOGO request for this entry */
-#define NLP_RNID_SND       0x400	/* sent RNID request for this entry */
-#define NLP_ELS_SND_MASK   0x7e0	/* sent ELS request for this entry */
-#define NLP_DEFER_RM       0x10000	/* Remove this ndlp if no longer used */
-#define NLP_DELAY_TMO      0x20000	/* delay timeout is running for node */
-#define NLP_NPR_2B_DISC    0x40000	/* node is included in num_disc_nodes */
-#define NLP_RCV_PLOGI      0x80000	/* Rcv'ed PLOGI from remote system */
-#define NLP_LOGO_ACC       0x100000	/* Process LOGO after ACC completes */
-#define NLP_TGT_NO_SCSIID  0x200000	/* good PRLI but no binding for scsid */
-#define NLP_ACC_REGLOGIN   0x1000000	/* Issue Reg Login after successful
+#define NLP_PLOGI_SND      0x00000020	/* sent PLOGI request for this entry */
+#define NLP_PRLI_SND       0x00000040	/* sent PRLI request for this entry */
+#define NLP_ADISC_SND      0x00000080	/* sent ADISC request for this entry */
+#define NLP_LOGO_SND       0x00000100	/* sent LOGO request for this entry */
+#define NLP_RNID_SND       0x00000400	/* sent RNID request for this entry */
+#define NLP_ELS_SND_MASK   0x000007e0	/* sent ELS request for this entry */
+#define NLP_DEFER_RM       0x00010000	/* Remove this ndlp if no longer used */
+#define NLP_DELAY_TMO      0x00020000	/* delay timeout is running for node */
+#define NLP_NPR_2B_DISC    0x00040000	/* node is included in num_disc_nodes */
+#define NLP_RCV_PLOGI      0x00080000	/* Rcv'ed PLOGI from remote system */
+#define NLP_LOGO_ACC       0x00100000	/* Process LOGO after ACC completes */
+#define NLP_TGT_NO_SCSIID  0x00200000	/* good PRLI but no binding for scsid */
+#define NLP_ACC_REGLOGIN   0x01000000	/* Issue Reg Login after successful
 					   ACC */
-#define NLP_NPR_ADISC      0x2000000	/* Issue ADISC when dq'ed from
+#define NLP_NPR_ADISC      0x02000000	/* Issue ADISC when dq'ed from
 					   NPR list */
-#define NLP_RM_DFLT_RPI    0x4000000	/* need to remove leftover dflt RPI */
-#define NLP_NODEV_REMOVE   0x8000000	/* Defer removal till discovery ends */
+#define NLP_RM_DFLT_RPI    0x04000000	/* need to remove leftover dflt RPI */
+#define NLP_NODEV_REMOVE   0x08000000	/* Defer removal till discovery ends */
 #define NLP_TARGET_REMOVE  0x10000000   /* Target remove in process */
+#define NLP_SC_REQ         0x20000000	/* Target requires authentication */
 
 /* ndlp usage management macros */
 #define NLP_CHK_NODE_ACT(ndlp)		(((ndlp)->nlp_usg_map \
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index 60afc8028ff5..cbb68a942255 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -1920,18 +1920,15 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 			break;
 
 		case IOERR_ILLEGAL_COMMAND:
-			if ((phba->sli3_options & LPFC_SLI3_VPORT_TEARDOWN) &&
-			    (cmd == ELS_CMD_FDISC)) {
-				lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS,
-						 "0124 FDISC failed (3/6) "
-						 "retrying...\n");
-				lpfc_mbx_unreg_vpi(vport);
-				retry = 1;
-				/* FDISC retry policy */
-				maxretry = 48;
-				if (cmdiocb->retry >= 32)
-					delay = 1000;
-			}
+			lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS,
+					 "0124 Retry illegal cmd x%x "
+					 "retry:x%x delay:x%x\n",
+					 cmd, cmdiocb->retry, delay);
+			retry = 1;
+			/* All command's retry policy */
+			maxretry = 8;
+			if (cmdiocb->retry > 2)
+				delay = 1000;
 			break;
 
 		case IOERR_NO_RESOURCES:
@@ -2017,6 +2014,17 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 			break;
 
 		case LSRJT_LOGICAL_ERR:
+			/* There are some cases where switches return this
+			 * error when they are not ready and should be returning
+			 * Logical Busy. We should delay every time.
+			 */
+			if (cmd == ELS_CMD_FDISC &&
+			    stat.un.b.lsRjtRsnCodeExp == LSEXP_PORT_LOGIN_REQ) {
+				maxretry = 3;
+				delay = 1000;
+				retry = 1;
+				break;
+			}
 		case LSRJT_PROTOCOL_ERR:
 			if ((phba->sli3_options & LPFC_SLI3_NPIV_ENABLED) &&
 			  (cmd == ELS_CMD_FDISC) &&
@@ -2931,6 +2939,16 @@ lpfc_els_flush_rscn(struct lpfc_vport *vport)
 	struct lpfc_hba  *phba = vport->phba;
 	int i;
 
+	spin_lock_irq(shost->host_lock);
+	if (vport->fc_rscn_flush) {
+		/* Another thread is walking fc_rscn_id_list on this vport */
+		spin_unlock_irq(shost->host_lock);
+		return;
+	}
+	/* Indicate we are walking lpfc_els_flush_rscn on this vport */
+	vport->fc_rscn_flush = 1;
+	spin_unlock_irq(shost->host_lock);
+
 	for (i = 0; i < vport->fc_rscn_id_cnt; i++) {
 		lpfc_in_buf_free(phba, vport->fc_rscn_id_list[i]);
 		vport->fc_rscn_id_list[i] = NULL;
@@ -2940,6 +2958,8 @@ lpfc_els_flush_rscn(struct lpfc_vport *vport)
 	vport->fc_flag &= ~(FC_RSCN_MODE | FC_RSCN_DISCOVERY);
 	spin_unlock_irq(shost->host_lock);
 	lpfc_can_disctmo(vport);
+	/* Indicate we are done walking this fc_rscn_id_list */
+	vport->fc_rscn_flush = 0;
 }
 
 int
@@ -2949,6 +2969,7 @@ lpfc_rscn_payload_check(struct lpfc_vport *vport, uint32_t did)
 	D_ID rscn_did;
 	uint32_t *lp;
 	uint32_t payload_len, i;
+	struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
 
 	ns_did.un.word = did;
 
@@ -2960,6 +2981,15 @@ lpfc_rscn_payload_check(struct lpfc_vport *vport, uint32_t did)
 	if (vport->fc_flag & FC_RSCN_DISCOVERY)
 		return did;
 
+	spin_lock_irq(shost->host_lock);
+	if (vport->fc_rscn_flush) {
+		/* Another thread is walking fc_rscn_id_list on this vport */
+		spin_unlock_irq(shost->host_lock);
+		return 0;
+	}
+	/* Indicate we are walking fc_rscn_id_list on this vport */
+	vport->fc_rscn_flush = 1;
+	spin_unlock_irq(shost->host_lock);
 	for (i = 0; i < vport->fc_rscn_id_cnt; i++) {
 		lp = vport->fc_rscn_id_list[i]->virt;
 		payload_len = be32_to_cpu(*lp++ & ~ELS_CMD_MASK);
@@ -2970,16 +3000,16 @@ lpfc_rscn_payload_check(struct lpfc_vport *vport, uint32_t did)
 			switch (rscn_did.un.b.resv) {
 			case 0:	/* Single N_Port ID effected */
 				if (ns_did.un.word == rscn_did.un.word)
-					return did;
+					goto return_did_out;
 				break;
 			case 1:	/* Whole N_Port Area effected */
 				if ((ns_did.un.b.domain == rscn_did.un.b.domain)
 				    && (ns_did.un.b.area == rscn_did.un.b.area))
-					return did;
+					goto return_did_out;
 				break;
 			case 2:	/* Whole N_Port Domain effected */
 				if (ns_did.un.b.domain == rscn_did.un.b.domain)
-					return did;
+					goto return_did_out;
 				break;
 			default:
 				/* Unknown Identifier in RSCN node */
@@ -2988,11 +3018,17 @@ lpfc_rscn_payload_check(struct lpfc_vport *vport, uint32_t did)
 						 "RSCN payload Data: x%x\n",
 						 rscn_did.un.word);
 			case 3:	/* Whole Fabric effected */
-				return did;
+				goto return_did_out;
 			}
 		}
 	}
+	/* Indicate we are done with walking fc_rscn_id_list on this vport */
+	vport->fc_rscn_flush = 0;
 	return 0;
+return_did_out:
+	/* Indicate we are done with walking fc_rscn_id_list on this vport */
+	vport->fc_rscn_flush = 0;
+	return did;
 }
 
 static int
@@ -3034,7 +3070,7 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 	uint32_t *lp, *datap;
 	IOCB_t *icmd;
 	uint32_t payload_len, length, nportid, *cmd;
-	int rscn_cnt = vport->fc_rscn_id_cnt;
+	int rscn_cnt;
 	int rscn_id = 0, hba_id = 0;
 	int i;
 
@@ -3047,7 +3083,8 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 	/* RSCN received */
 	lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY,
 			 "0214 RSCN received Data: x%x x%x x%x x%x\n",
-			 vport->fc_flag, payload_len, *lp, rscn_cnt);
+			 vport->fc_flag, payload_len, *lp,
+			 vport->fc_rscn_id_cnt);
 	for (i = 0; i < payload_len/sizeof(uint32_t); i++)
 		fc_host_post_event(shost, fc_get_event_number(),
 			FCH_EVT_RSCN, lp[i]);
@@ -3085,7 +3122,7 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 					 "0214 Ignore RSCN "
 					 "Data: x%x x%x x%x x%x\n",
 					 vport->fc_flag, payload_len,
-					 *lp, rscn_cnt);
+					 *lp, vport->fc_rscn_id_cnt);
 			lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_ELS_UNSOL,
 				"RCV RSCN vport:  did:x%x/ste:x%x flg:x%x",
 				ndlp->nlp_DID, vport->port_state,
@@ -3097,6 +3134,18 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 		}
 	}
 
+	spin_lock_irq(shost->host_lock);
+	if (vport->fc_rscn_flush) {
+		/* Another thread is walking fc_rscn_id_list on this vport */
+		spin_unlock_irq(shost->host_lock);
+		vport->fc_flag |= FC_RSCN_DISCOVERY;
+		return 0;
+	}
+	/* Indicate we are walking fc_rscn_id_list on this vport */
+	vport->fc_rscn_flush = 1;
+	spin_unlock_irq(shost->host_lock);
+	/* Get the array count after sucessfully have the token */
+	rscn_cnt = vport->fc_rscn_id_cnt;
 	/* If we are already processing an RSCN, save the received
 	 * RSCN payload buffer, cmdiocb->context2 to process later.
 	 */
@@ -3118,7 +3167,7 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 			if ((rscn_cnt) &&
 			    (payload_len + length <= LPFC_BPL_SIZE)) {
 				*cmd &= ELS_CMD_MASK;
-				*cmd |= be32_to_cpu(payload_len + length);
+				*cmd |= cpu_to_be32(payload_len + length);
 				memcpy(((uint8_t *)cmd) + length, lp,
 				       payload_len);
 			} else {
@@ -3129,7 +3178,6 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 				 */
 				cmdiocb->context2 = NULL;
 			}
-
 			/* Deferred RSCN */
 			lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY,
 					 "0235 Deferred RSCN "
@@ -3146,9 +3194,10 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 					 vport->fc_rscn_id_cnt, vport->fc_flag,
 					 vport->port_state);
 		}
+		/* Indicate we are done walking fc_rscn_id_list on this vport */
+		vport->fc_rscn_flush = 0;
 		/* Send back ACC */
 		lpfc_els_rsp_acc(vport, ELS_CMD_ACC, cmdiocb, ndlp, NULL);
-
 		/* send RECOVERY event for ALL nodes that match RSCN payload */
 		lpfc_rscn_recovery_check(vport);
 		spin_lock_irq(shost->host_lock);
@@ -3156,7 +3205,6 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 		spin_unlock_irq(shost->host_lock);
 		return 0;
 	}
-
 	lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_ELS_UNSOL,
 		"RCV RSCN:        did:x%x/ste:x%x flg:x%x",
 		ndlp->nlp_DID, vport->port_state, ndlp->nlp_flag);
@@ -3165,20 +3213,18 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb,
 	vport->fc_flag |= FC_RSCN_MODE;
 	spin_unlock_irq(shost->host_lock);
 	vport->fc_rscn_id_list[vport->fc_rscn_id_cnt++] = pcmd;
+	/* Indicate we are done walking fc_rscn_id_list on this vport */
+	vport->fc_rscn_flush = 0;
 	/*
 	 * If we zero, cmdiocb->context2, the calling routine will
 	 * not try to free it.
 	 */
 	cmdiocb->context2 = NULL;
-
 	lpfc_set_disctmo(vport);
-
 	/* Send back ACC */
 	lpfc_els_rsp_acc(vport, ELS_CMD_ACC, cmdiocb, ndlp, NULL);
-
 	/* send RECOVERY event for ALL nodes that match RSCN payload */
 	lpfc_rscn_recovery_check(vport);
-
 	return lpfc_els_handle_rscn(vport);
 }
 
@@ -4343,15 +4389,15 @@ lpfc_els_unsol_event(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
 			vport = lpfc_find_vport_by_vpid(phba, vpi);
 		}
 	}
-				/* If there are no BDEs associated
-				 * with this IOCB, there is nothing to do.
-				 */
+	/* If there are no BDEs associated
+	 * with this IOCB, there is nothing to do.
+	 */
 	if (icmd->ulpBdeCount == 0)
 		return;
 
-				/* type of ELS cmd is first 32bit word
-				 * in packet
-				 */
+	/* type of ELS cmd is first 32bit word
+	 * in packet
+	 */
 	if (phba->sli3_options & LPFC_SLI3_HBQ_ENABLED) {
 		elsiocb->context2 = bdeBuf1;
 	} else {
@@ -4464,6 +4510,7 @@ lpfc_cmpl_reg_new_vport(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 		switch (mb->mbxStatus) {
 		case 0x11:	/* unsupported feature */
 		case 0x9603:	/* max_vpi exceeded */
+		case 0x9602:	/* Link event since CLEAR_LA */
 			/* giving up on vport registration */
 			lpfc_vport_set_state(vport, FC_VPORT_FAILED);
 			spin_lock_irq(shost->host_lock);
@@ -4477,7 +4524,10 @@ lpfc_cmpl_reg_new_vport(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 			spin_lock_irq(shost->host_lock);
 			vport->fc_flag |= FC_VPORT_NEEDS_REG_VPI;
 			spin_unlock_irq(shost->host_lock);
-			lpfc_initial_fdisc(vport);
+			if (vport->port_type == LPFC_PHYSICAL_PORT)
+				lpfc_initial_flogi(vport);
+			else
+				lpfc_initial_fdisc(vport);
 			break;
 		}
 
@@ -4795,11 +4845,12 @@ lpfc_resume_fabric_iocbs(struct lpfc_hba *phba)
 repeat:
 	iocb = NULL;
 	spin_lock_irqsave(&phba->hbalock, iflags);
-				/* Post any pending iocb to the SLI layer */
+	/* Post any pending iocb to the SLI layer */
 	if (atomic_read(&phba->fabric_iocb_count) == 0) {
 		list_remove_head(&phba->fabric_iocb_list, iocb, typeof(*iocb),
 				 list);
 		if (iocb)
+			/* Increment fabric iocb count to hold the position */
 			atomic_inc(&phba->fabric_iocb_count);
 	}
 	spin_unlock_irqrestore(&phba->hbalock, iflags);
@@ -4846,9 +4897,7 @@ lpfc_block_fabric_iocbs(struct lpfc_hba *phba)
 	int blocked;
 
 	blocked = test_and_set_bit(FABRIC_COMANDS_BLOCKED, &phba->bit_flags);
-				/* Start a timer to unblock fabric
-				 * iocbs after 100ms
-				 */
+	/* Start a timer to unblock fabric iocbs after 100ms */
 	if (!blocked)
 		mod_timer(&phba->fabric_block_timer, jiffies + HZ/10 );
 
@@ -4896,8 +4945,8 @@ lpfc_cmpl_fabric_iocb(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 
 	atomic_dec(&phba->fabric_iocb_count);
 	if (!test_bit(FABRIC_COMANDS_BLOCKED, &phba->bit_flags)) {
-				/* Post any pending iocbs to HBA */
-		    lpfc_resume_fabric_iocbs(phba);
+		/* Post any pending iocbs to HBA */
+		lpfc_resume_fabric_iocbs(phba);
 	}
 }
 
@@ -4916,6 +4965,9 @@ lpfc_issue_fabric_iocb(struct lpfc_hba *phba, struct lpfc_iocbq *iocb)
 	ready = atomic_read(&phba->fabric_iocb_count) == 0 &&
 		!test_bit(FABRIC_COMANDS_BLOCKED, &phba->bit_flags);
 
+	if (ready)
+		/* Increment fabric iocb count to hold the position */
+		atomic_inc(&phba->fabric_iocb_count);
 	spin_unlock_irqrestore(&phba->hbalock, iflags);
 	if (ready) {
 		iocb->fabric_iocb_cmpl = iocb->iocb_cmpl;
@@ -4926,7 +4978,6 @@ lpfc_issue_fabric_iocb(struct lpfc_hba *phba, struct lpfc_iocbq *iocb)
 			"Fabric sched2:   ste:x%x",
 			iocb->vport->port_state, 0, 0);
 
-		atomic_inc(&phba->fabric_iocb_count);
 		ret = lpfc_sli_issue_iocb(phba, pring, iocb, 0);
 
 		if (ret == IOCB_ERROR) {
diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h
index 041f83e7634a..543ed3ca8b76 100644
--- a/drivers/scsi/lpfc/lpfc_hw.h
+++ b/drivers/scsi/lpfc/lpfc_hw.h
@@ -581,6 +581,7 @@ struct ls_rjt {	/* Structure is in Big Endian format */
 #define LSEXP_INVALID_O_SID     0x15
 #define LSEXP_INVALID_OX_RX     0x17
 #define LSEXP_CMD_IN_PROGRESS   0x19
+#define LSEXP_PORT_LOGIN_REQ    0x1E
 #define LSEXP_INVALID_NPORT_ID  0x1F
 #define LSEXP_INVALID_SEQ_ID    0x21
 #define LSEXP_INVALID_XCHG      0x23
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index a087524acf41..7a5ce3355808 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -475,7 +475,8 @@ lpfc_hba_down_prep(struct lpfc_hba *phba)
 			for(i = 0; i <= phba->max_vpi && vports[i] != NULL; i++)
 				lpfc_cleanup_discovery_resources(vports[i]);
 		lpfc_destroy_vport_work_array(phba, vports);
-	}	return 0;
+	}
+	return 0;
 }
 
 /************************************************************************/
@@ -1740,9 +1741,9 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev)
 
 	vport = (struct lpfc_vport *) shost->hostdata;
 	vport->phba = phba;
-
 	vport->load_flag |= FC_LOADING;
 	vport->fc_flag |= FC_VPORT_NEEDS_REG_VPI;
+	vport->fc_rscn_flush = 0;
 
 	lpfc_get_vport_cfgparam(vport);
 	shost->unique_id = instance;
diff --git a/drivers/scsi/lpfc/lpfc_logmsg.h b/drivers/scsi/lpfc/lpfc_logmsg.h
index c5841d7565f7..39fd2b843bec 100644
--- a/drivers/scsi/lpfc/lpfc_logmsg.h
+++ b/drivers/scsi/lpfc/lpfc_logmsg.h
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2005 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  *                                                                 *
@@ -35,11 +35,15 @@
 #define LOG_ALL_MSG                   0xffff	/* LOG all messages */
 
 #define lpfc_printf_vlog(vport, level, mask, fmt, arg...) \
+	do { \
 	{ if (((mask) &(vport)->cfg_log_verbose) || (level[1] <= '3')) \
 		dev_printk(level, &((vport)->phba->pcidev)->dev, "%d:(%d):" \
-			   fmt, (vport)->phba->brd_no, vport->vpi, ##arg); }
+			   fmt, (vport)->phba->brd_no, vport->vpi, ##arg); } \
+	} while (0)
 
 #define lpfc_printf_log(phba, level, mask, fmt, arg...) \
+	do { \
 	{ if (((mask) &(phba)->pport->cfg_log_verbose) || (level[1] <= '3')) \
 		dev_printk(level, &((phba)->pcidev)->dev, "%d:" \
-			   fmt, phba->brd_no, ##arg); }
+			   fmt, phba->brd_no, ##arg); } \
+	} while (0)
diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c
index 6dc5ab8d6716..27448c98c07a 100644
--- a/drivers/scsi/lpfc/lpfc_mem.c
+++ b/drivers/scsi/lpfc/lpfc_mem.c
@@ -265,6 +265,9 @@ lpfc_in_buf_free(struct lpfc_hba *phba, struct lpfc_dmabuf *mp)
 {
 	struct hbq_dmabuf *hbq_entry;
 
+	if (!mp)
+		return;
+
 	if (phba->sli3_options & LPFC_SLI3_HBQ_ENABLED) {
 		hbq_entry = container_of(mp, struct hbq_dmabuf, dbuf);
 		if (hbq_entry->tag == -1) {
@@ -279,4 +282,3 @@ lpfc_in_buf_free(struct lpfc_hba *phba, struct lpfc_dmabuf *mp)
 	}
 	return;
 }
-
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 456b8ec7753b..c8c5b48baa66 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -2623,14 +2623,14 @@ lpfc_sli_issue_mbox(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmbox, uint32_t flag)
 		spin_unlock_irqrestore(&phba->hbalock, drvr_flag);
 
 		/* Mbox command <mbxCommand> cannot issue */
-		LOG_MBOX_CANNOT_ISSUE_DATA(phba, pmbox, psli, flag)
+		LOG_MBOX_CANNOT_ISSUE_DATA(phba, pmbox, psli, flag);
 		return MBX_NOT_FINISHED;
 	}
 
 	if (mb->mbxCommand != MBX_KILL_BOARD && flag & MBX_NOWAIT &&
 	    !(readl(phba->HCregaddr) & HC_MBINT_ENA)) {
 		spin_unlock_irqrestore(&phba->hbalock, drvr_flag);
-		LOG_MBOX_CANNOT_ISSUE_DATA(phba, pmbox, psli, flag)
+		LOG_MBOX_CANNOT_ISSUE_DATA(phba, pmbox, psli, flag);
 		return MBX_NOT_FINISHED;
 	}
 

From 3163f725a5d071eea1830bbbfab78cfe3fc9baaf Mon Sep 17 00:00:00 2001
From: James Smart <James.Smart@Emulex.Com>
Date: Fri, 8 Feb 2008 18:50:25 -0500
Subject: [PATCH 16/66] [SCSI] lpfc 8.2.5 : Fix buffer leaks

Fix buffer leaks:
- HBQ dma buffer leak at dma_pool_destroy when unloading driver
- Fix missing buffer free in slow ring buffer handling

Signed-off-by: James Smart <james.smart@emulex.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/lpfc/lpfc.h         |  2 +
 drivers/scsi/lpfc/lpfc_hbadisc.c | 17 ++++--
 drivers/scsi/lpfc/lpfc_hw.h      | 17 +++++-
 drivers/scsi/lpfc/lpfc_init.c    |  2 +
 drivers/scsi/lpfc/lpfc_mem.c     |  9 +++
 drivers/scsi/lpfc/lpfc_sli.c     | 97 +++++++++++++++++++++++++++++++-
 6 files changed, 137 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h
index 6c178f1c8786..2ab2d24dcc15 100644
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -495,6 +495,8 @@ struct lpfc_hba {
 	wait_queue_head_t    *work_wait;
 	struct task_struct   *worker_thread;
 
+	uint32_t hbq_in_use;		/* HBQs in use flag */
+	struct list_head hbqbuf_in_list;  /* in-fly hbq buffer list */
 	uint32_t hbq_count;	        /* Count of configured HBQs */
 	struct hbq_s hbqs[LPFC_MAX_HBQS]; /* local copy of hbq indicies  */
 
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index 25892671bfb0..bd572d6b60af 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -629,9 +629,8 @@ lpfc_linkdown(struct lpfc_hba *phba)
 	LPFC_MBOXQ_t          *mb;
 	int i;
 
-	if (phba->link_state == LPFC_LINK_DOWN) {
+	if (phba->link_state == LPFC_LINK_DOWN)
 		return 0;
-	}
 	spin_lock_irq(&phba->hbalock);
 	if (phba->link_state > LPFC_LINK_DOWN) {
 		phba->link_state = LPFC_LINK_DOWN;
@@ -1122,7 +1121,7 @@ lpfc_mbx_cmpl_read_la(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 	if (la->attType == AT_LINK_UP) {
 		phba->fc_stat.LinkUp++;
 		if (phba->link_flag & LS_LOOPBACK_MODE) {
-			lpfc_printf_log(phba, KERN_INFO, LOG_LINK_EVENT,
+			lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
 					"1306 Link Up Event in loop back mode "
 					"x%x received Data: x%x x%x x%x x%x\n",
 					la->eventTag, phba->fc_eventTag,
@@ -1139,11 +1138,21 @@ lpfc_mbx_cmpl_read_la(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 		lpfc_mbx_process_link_up(phba, la);
 	} else {
 		phba->fc_stat.LinkDown++;
-		lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
+		if (phba->link_flag & LS_LOOPBACK_MODE) {
+			lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
+				"1308 Link Down Event in loop back mode "
+				"x%x received "
+				"Data: x%x x%x x%x\n",
+				la->eventTag, phba->fc_eventTag,
+				phba->pport->port_state, vport->fc_flag);
+		}
+		else {
+			lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
 				"1305 Link Down Event x%x received "
 				"Data: x%x x%x x%x\n",
 				la->eventTag, phba->fc_eventTag,
 				phba->pport->port_state, vport->fc_flag);
+		}
 		lpfc_mbx_issue_link_down(phba);
 	}
 
diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h
index 543ed3ca8b76..7773b949aa7c 100644
--- a/drivers/scsi/lpfc/lpfc_hw.h
+++ b/drivers/scsi/lpfc/lpfc_hw.h
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  *                                                                 *
@@ -1377,11 +1377,26 @@ typedef struct {		/* FireFly BIU registers */
 #define CMD_QUE_XRI64_CX	0xB3
 #define CMD_IOCB_RCV_SEQ64_CX	0xB5
 #define CMD_IOCB_RCV_ELS64_CX	0xB7
+#define CMD_IOCB_RET_XRI64_CX	0xB9
 #define CMD_IOCB_RCV_CONT64_CX	0xBB
 
 #define CMD_GEN_REQUEST64_CR    0xC2
 #define CMD_GEN_REQUEST64_CX    0xC3
 
+/* Unhandled SLI-3 Commands */
+#define CMD_IOCB_XMIT_MSEQ64_CR		0xB0
+#define CMD_IOCB_XMIT_MSEQ64_CX		0xB1
+#define CMD_IOCB_RCV_SEQ_LIST64_CX	0xC1
+#define CMD_IOCB_RCV_ELS_LIST64_CX	0xCD
+#define CMD_IOCB_CLOSE_EXTENDED_CN	0xB6
+#define CMD_IOCB_ABORT_EXTENDED_CN	0xBA
+#define CMD_IOCB_RET_HBQE64_CN		0xCA
+#define CMD_IOCB_FCP_IBIDIR64_CR	0xAC
+#define CMD_IOCB_FCP_IBIDIR64_CX	0xAD
+#define CMD_IOCB_FCP_ITASKMGT64_CX	0xAF
+#define CMD_IOCB_LOGENTRY_CN		0x94
+#define CMD_IOCB_LOGENTRY_ASYNC_CN	0x96
+
 #define CMD_MAX_IOCB_CMD        0xE6
 #define CMD_IOCB_MASK           0xff
 
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 7a5ce3355808..22843751c2ca 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -2087,6 +2087,8 @@ lpfc_pci_probe_one(struct pci_dev *pdev, const struct pci_device_id *pid)
 
 	memset(phba->hbqslimp.virt, 0, lpfc_sli_hbq_size());
 
+	INIT_LIST_HEAD(&phba->hbqbuf_in_list);
+
 	/* Initialize the SLI Layer to run with lpfc HBAs. */
 	lpfc_sli_setup(phba);
 	lpfc_sli_queue_setup(phba);
diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c
index 27448c98c07a..3c0cebc71800 100644
--- a/drivers/scsi/lpfc/lpfc_mem.c
+++ b/drivers/scsi/lpfc/lpfc_mem.c
@@ -264,18 +264,27 @@ void
 lpfc_in_buf_free(struct lpfc_hba *phba, struct lpfc_dmabuf *mp)
 {
 	struct hbq_dmabuf *hbq_entry;
+	unsigned long flags;
 
 	if (!mp)
 		return;
 
 	if (phba->sli3_options & LPFC_SLI3_HBQ_ENABLED) {
+		/* Check whether HBQ is still in use */
+		spin_lock_irqsave(&phba->hbalock, flags);
+		if (!phba->hbq_in_use) {
+			spin_unlock_irqrestore(&phba->hbalock, flags);
+			return;
+		}
 		hbq_entry = container_of(mp, struct hbq_dmabuf, dbuf);
+		list_del(&hbq_entry->dbuf.list);
 		if (hbq_entry->tag == -1) {
 			(phba->hbqs[LPFC_ELS_HBQ].hbq_free_buffer)
 				(phba, hbq_entry);
 		} else {
 			lpfc_sli_free_hbq(phba, hbq_entry);
 		}
+		spin_unlock_irqrestore(&phba->hbalock, flags);
 	} else {
 		lpfc_mbuf_free(phba, mp->virt, mp->phys);
 		kfree(mp);
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index c8c5b48baa66..f53206411cd8 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -203,8 +203,25 @@ lpfc_sli_iocb_cmd_type(uint8_t iocb_cmnd)
 	case CMD_IOCB_RCV_SEQ64_CX:
 	case CMD_IOCB_RCV_ELS64_CX:
 	case CMD_IOCB_RCV_CONT64_CX:
+	case CMD_IOCB_RET_XRI64_CX:
 		type = LPFC_UNSOL_IOCB;
 		break;
+	case CMD_IOCB_XMIT_MSEQ64_CR:
+	case CMD_IOCB_XMIT_MSEQ64_CX:
+	case CMD_IOCB_RCV_SEQ_LIST64_CX:
+	case CMD_IOCB_RCV_ELS_LIST64_CX:
+	case CMD_IOCB_CLOSE_EXTENDED_CN:
+	case CMD_IOCB_ABORT_EXTENDED_CN:
+	case CMD_IOCB_RET_HBQE64_CN:
+	case CMD_IOCB_FCP_IBIDIR64_CR:
+	case CMD_IOCB_FCP_IBIDIR64_CX:
+	case CMD_IOCB_FCP_ITASKMGT64_CX:
+	case CMD_IOCB_LOGENTRY_CN:
+	case CMD_IOCB_LOGENTRY_ASYNC_CN:
+		printk("%s - Unhandled SLI-3 Command x%x\n",
+				__FUNCTION__, iocb_cmnd);
+		type = LPFC_UNKNOWN_IOCB;
+		break;
 	default:
 		type = LPFC_UNKNOWN_IOCB;
 		break;
@@ -529,10 +546,13 @@ lpfc_sli_hbqbuf_free_all(struct lpfc_hba *phba)
 {
 	struct lpfc_dmabuf *dmabuf, *next_dmabuf;
 	struct hbq_dmabuf *hbq_buf;
+	unsigned long flags;
 	int i, hbq_count;
+	uint32_t hbqno;
 
 	hbq_count = lpfc_sli_hbq_count();
 	/* Return all memory used by all HBQs */
+	spin_lock_irqsave(&phba->hbalock, flags);
 	for (i = 0; i < hbq_count; ++i) {
 		list_for_each_entry_safe(dmabuf, next_dmabuf,
 				&phba->hbqs[i].hbq_buffer_list, list) {
@@ -542,6 +562,28 @@ lpfc_sli_hbqbuf_free_all(struct lpfc_hba *phba)
 		}
 		phba->hbqs[i].buffer_count = 0;
 	}
+	/* Return all HBQ buffer that are in-fly */
+	list_for_each_entry_safe(dmabuf, next_dmabuf,
+			&phba->hbqbuf_in_list, list) {
+		hbq_buf = container_of(dmabuf, struct hbq_dmabuf, dbuf);
+		list_del(&hbq_buf->dbuf.list);
+		if (hbq_buf->tag == -1) {
+			(phba->hbqs[LPFC_ELS_HBQ].hbq_free_buffer)
+				(phba, hbq_buf);
+		} else {
+			hbqno = hbq_buf->tag >> 16;
+			if (hbqno >= LPFC_MAX_HBQS)
+				(phba->hbqs[LPFC_ELS_HBQ].hbq_free_buffer)
+					(phba, hbq_buf);
+			else
+				(phba->hbqs[hbqno].hbq_free_buffer)(phba,
+					hbq_buf);
+		}
+	}
+
+	/* Mark the HBQs not in use */
+	phba->hbq_in_use = 0;
+	spin_unlock_irqrestore(&phba->hbalock, flags);
 }
 
 static struct lpfc_hbq_entry *
@@ -603,6 +645,7 @@ static int
 lpfc_sli_hbqbuf_fill_hbqs(struct lpfc_hba *phba, uint32_t hbqno, uint32_t count)
 {
 	uint32_t i, start, end;
+	unsigned long flags;
 	struct hbq_dmabuf *hbq_buffer;
 
 	if (!phba->hbqs[hbqno].hbq_alloc_buffer) {
@@ -615,6 +658,13 @@ lpfc_sli_hbqbuf_fill_hbqs(struct lpfc_hba *phba, uint32_t hbqno, uint32_t count)
 		end = lpfc_hbq_defs[hbqno]->entry_count;
 	}
 
+	/* Check whether HBQ is still in use */
+	spin_lock_irqsave(&phba->hbalock, flags);
+	if (!phba->hbq_in_use) {
+		spin_unlock_irqrestore(&phba->hbalock, flags);
+		return 0;
+	}
+
 	/* Populate HBQ entries */
 	for (i = start; i < end; i++) {
 		hbq_buffer = (phba->hbqs[hbqno].hbq_alloc_buffer)(phba);
@@ -626,6 +676,8 @@ lpfc_sli_hbqbuf_fill_hbqs(struct lpfc_hba *phba, uint32_t hbqno, uint32_t count)
 		else
 			(phba->hbqs[hbqno].hbq_free_buffer)(phba, hbq_buffer);
 	}
+
+	spin_unlock_irqrestore(&phba->hbalock, flags);
 	return 0;
 }
 
@@ -910,16 +962,29 @@ lpfc_sli_replace_hbqbuff(struct lpfc_hba *phba, uint32_t tag)
 	uint32_t hbqno;
 	void *virt;		/* virtual address ptr */
 	dma_addr_t phys;	/* mapped address */
+	unsigned long flags;
+
+	/* Check whether HBQ is still in use */
+	spin_lock_irqsave(&phba->hbalock, flags);
+	if (!phba->hbq_in_use) {
+		spin_unlock_irqrestore(&phba->hbalock, flags);
+		return NULL;
+	}
 
 	hbq_entry = lpfc_sli_hbqbuf_find(phba, tag);
-	if (hbq_entry == NULL)
+	if (hbq_entry == NULL) {
+		spin_unlock_irqrestore(&phba->hbalock, flags);
 		return NULL;
+	}
 	list_del(&hbq_entry->dbuf.list);
 
 	hbqno = tag >> 16;
 	new_hbq_entry = (phba->hbqs[hbqno].hbq_alloc_buffer)(phba);
-	if (new_hbq_entry == NULL)
+	if (new_hbq_entry == NULL) {
+		list_add_tail(&hbq_entry->dbuf.list, &phba->hbqbuf_in_list);
+		spin_unlock_irqrestore(&phba->hbalock, flags);
 		return &hbq_entry->dbuf;
+	}
 	new_hbq_entry->tag = -1;
 	phys = new_hbq_entry->dbuf.phys;
 	virt = new_hbq_entry->dbuf.virt;
@@ -928,6 +993,9 @@ lpfc_sli_replace_hbqbuff(struct lpfc_hba *phba, uint32_t tag)
 	hbq_entry->dbuf.phys = phys;
 	hbq_entry->dbuf.virt = virt;
 	lpfc_sli_free_hbq(phba, hbq_entry);
+	list_add_tail(&new_hbq_entry->dbuf.list, &phba->hbqbuf_in_list);
+	spin_unlock_irqrestore(&phba->hbalock, flags);
+
 	return &new_hbq_entry->dbuf;
 }
 
@@ -951,6 +1019,7 @@ lpfc_sli_process_unsol_iocb(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
 	uint32_t           Rctl, Type;
 	uint32_t           match, i;
 	struct lpfc_iocbq *iocbq;
+	struct lpfc_dmabuf *dmzbuf;
 
 	match = 0;
 	irsp = &(saveq->iocb);
@@ -972,6 +1041,29 @@ lpfc_sli_process_unsol_iocb(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
 		return 1;
 	}
 
+	if ((irsp->ulpCommand == CMD_IOCB_RET_XRI64_CX) &&
+		(phba->sli3_options & LPFC_SLI3_HBQ_ENABLED)) {
+		if (irsp->ulpBdeCount > 0) {
+			dmzbuf = lpfc_sli_get_buff(phba, pring,
+					irsp->un.ulpWord[3]);
+			lpfc_in_buf_free(phba, dmzbuf);
+		}
+
+		if (irsp->ulpBdeCount > 1) {
+			dmzbuf = lpfc_sli_get_buff(phba, pring,
+					irsp->unsli3.sli3Words[3]);
+			lpfc_in_buf_free(phba, dmzbuf);
+		}
+
+		if (irsp->ulpBdeCount > 2) {
+			dmzbuf = lpfc_sli_get_buff(phba, pring,
+				irsp->unsli3.sli3Words[7]);
+			lpfc_in_buf_free(phba, dmzbuf);
+		}
+
+		return 1;
+	}
+
 	if (phba->sli3_options & LPFC_SLI3_HBQ_ENABLED) {
 		if (irsp->ulpBdeCount != 0) {
 			saveq->context2 = lpfc_sli_get_buff(phba, pring,
@@ -2293,6 +2385,7 @@ lpfc_sli_hbq_setup(struct lpfc_hba *phba)
 
 	/* Initialize the struct lpfc_sli_hbq structure for each hbq */
 	phba->link_state = LPFC_INIT_MBX_CMDS;
+	phba->hbq_in_use = 1;
 
 	hbq_entry_index = 0;
 	for (hbqno = 0; hbqno < hbq_count; ++hbqno) {

From e390bc0a26ba522f008a1f9479097f1c6fc0189c Mon Sep 17 00:00:00 2001
From: James Smart <James.Smart@Emulex.Com>
Date: Fri, 8 Feb 2008 18:50:36 -0500
Subject: [PATCH 17/66] [SCSI] lpfc 8.2.5 : Update lpfc driver version to 8.2.5

Update lpfc driver version to 8.2.5

Signed-off-by: James Smart <james.smart@emulex.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/lpfc/lpfc_version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h
index 4b633d39a82a..ca540d1d041e 100644
--- a/drivers/scsi/lpfc/lpfc_version.h
+++ b/drivers/scsi/lpfc/lpfc_version.h
@@ -18,7 +18,7 @@
  * included with this package.                                     *
  *******************************************************************/
 
-#define LPFC_DRIVER_VERSION "8.2.4"
+#define LPFC_DRIVER_VERSION "8.2.5"
 
 #define LPFC_DRIVER_NAME "lpfc"
 

From c958d767dc79250583902a382275961b5da91a4d Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 11 Feb 2008 16:18:55 -0600
Subject: [PATCH 18/66] [SCSI] sym53c416: fix module parameters

It looks like there's been a bug in the module parameter setup forever.
The upshot doesn't really matter, because even if no parameters are ever
set, we just call sym53c416_setup() three times, but the zero values in
the arrays eventually cause nothing to happen.  Unfortunately gcc has
started to notice this now too:

drivers/scsi/sym53c416.c: In function 'sym53c416_detect':
drivers/scsi/sym53c416.c:624: warning: the address of 'sym53c416' will always evaluate as 'true'
drivers/scsi/sym53c416.c:630: warning: the address of 'sym53c416_1' will always evaluate as 'true'
drivers/scsi/sym53c416.c:636: warning: the address of 'sym53c416_2' will always evaluate as 'true'
drivers/scsi/sym53c416.c:642: warning: the address of 'sym53c416_3' will always evaluate as 'true'

So fix this longstanding bug to keep gcc quiet.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/sym53c416.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c
index 6325901e5093..f7d279542fa5 100644
--- a/drivers/scsi/sym53c416.c
+++ b/drivers/scsi/sym53c416.c
@@ -187,10 +187,10 @@
 #define sym53c416_base_2 sym53c416_2
 #define sym53c416_base_3 sym53c416_3
 
-static unsigned int sym53c416_base[2] = {0,0};
-static unsigned int sym53c416_base_1[2] = {0,0};
-static unsigned int sym53c416_base_2[2] = {0,0};
-static unsigned int sym53c416_base_3[2] = {0,0};
+static unsigned int sym53c416_base[2];
+static unsigned int sym53c416_base_1[2];
+static unsigned int sym53c416_base_2[2];
+static unsigned int sym53c416_base_3[2];
 
 #endif
 
@@ -621,25 +621,25 @@ int __init sym53c416_detect(struct scsi_host_template *tpnt)
 	int ints[3];
 
 	ints[0] = 2;
-	if(sym53c416_base)
+	if(sym53c416_base[0])
 	{
 		ints[1] = sym53c416_base[0];
 		ints[2] = sym53c416_base[1];
 		sym53c416_setup(NULL, ints);
 	}
-	if(sym53c416_base_1)
+	if(sym53c416_base_1[0])
 	{
 		ints[1] = sym53c416_base_1[0];
 		ints[2] = sym53c416_base_1[1];
 		sym53c416_setup(NULL, ints);
 	}
-	if(sym53c416_base_2)
+	if(sym53c416_base_2[0])
 	{
 		ints[1] = sym53c416_base_2[0];
 		ints[2] = sym53c416_base_2[1];
 		sym53c416_setup(NULL, ints);
 	}
-	if(sym53c416_base_3)
+	if(sym53c416_base_3[0])
 	{
 		ints[1] = sym53c416_base_3[0];
 		ints[2] = sym53c416_base_3[1];

From c98aa86df3169e5d6275305376043134caa69831 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 12 Feb 2008 13:52:37 -0800
Subject: [PATCH 19/66] timeconst.pl: correct reversal of USEC_TO_HZ and
 HZ_TO_USEC

The USEC_TO_HZ and HZ_TO_USEC constant sets were mislabelled, with
seriously incorrect results.  This among other things manifested
itself as cpufreq not working when a tickless kernel was configured.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Tested-by: Carlos R. Mafra <crmafra@ift.unesp.br>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/timeconst.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index 62b1287932ed..41468035473c 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -339,7 +339,7 @@ ($@)
 	print "\n";
 
 	foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
-		      'USEC_TO_HZ','HZ_TO_USEC') {
+		      'HZ_TO_USEC','USEC_TO_HZ') {
 		foreach $bit (32, 64) {
 			foreach $suf ('MUL', 'ADJ', 'SHR') {
 				printf "#define %-23s %s\n",

From 720a2592cf1b9af86f30c44e8d89348826c03372 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 Feb 2008 15:45:36 +0100
Subject: [PATCH 20/66] hrtimer: more hrtimer_init_sleeper() fallout.

Missed an instance...

  futex_lock_pi()
    hrtimer_init_sleeper()
    rt_mutex_timed_lock()
      rt_mutex_timed_fastlock()
        rt_mutex_slowlock()
          hrtimer_start()

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rtmutex.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 0deef71ff8d2..6522ae5b14a2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	set_current_state(state);
 
 	/* Setup the timer, when timeout != NULL */
-	if (unlikely(timeout))
+	if (unlikely(timeout)) {
 		hrtimer_start(&timeout->timer, timeout->timer.expires,
 			      HRTIMER_MODE_ABS);
+		if (!hrtimer_active(&timeout->timer))
+			timeout->task = NULL;
+	}
 
 	for (;;) {
 		/* Try to acquire the lock: */

From 8ed3699682be75fd68281239c202ad3830f9c72d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 Feb 2008 15:45:39 +0100
Subject: [PATCH 21/66] sched: fair-group: separate tg->shares from
 task_group_lock

On Mon, 2008-02-11 at 15:09 +0300, Denis V. Lunev wrote:
> BUG: sleeping function called from invalid context
> at /home/den/src/linux-netns26/kernel/mutex.c:209
> in_atomic():1, irqs_disabled():0
> no locks held by swapper/0.
> Pid: 0, comm: swapper Not tainted 2.6.24 #304
>
> Call Trace:
>  <IRQ>  [<ffffffff80252d1e>] ? __debug_show_held_locks+0x15/0x27
>  [<ffffffff8022c2a8>] __might_sleep+0xc0/0xdf
>  [<ffffffff8049f1df>] mutex_lock_nested+0x28/0x2a9
>  [<ffffffff80231294>] sched_destroy_group+0x18/0xea
>  [<ffffffff8023e835>] sched_destroy_user+0xd/0xf
>  [<ffffffff8023e8c1>] free_uid+0x8a/0xab
>  [<ffffffff80233e24>] __put_task_struct+0x3f/0xd3
>  [<ffffffff80236708>] delayed_put_task_struct+0x23/0x25
>  [<ffffffff8026fda7>] __rcu_process_callbacks+0x8d/0x215
>  [<ffffffff8026ff52>] rcu_process_callbacks+0x23/0x44
>  [<ffffffff8023a2ae>] __do_softirq+0x79/0xf8
>  [<ffffffff8020f8c3>] ? profile_pc+0x2a/0x67
>  [<ffffffff8020d38c>] call_softirq+0x1c/0x30
>  [<ffffffff8020f689>] do_softirq+0x61/0x9c
>  [<ffffffff8023a233>] irq_exit+0x51/0x53
>  [<ffffffff8021bd1a>] smp_apic_timer_interrupt+0x77/0xad
>  [<ffffffff8020ce3b>] apic_timer_interrupt+0x6b/0x70
>  <EOI>  [<ffffffff8020b0dd>] ? default_idle+0x43/0x76
>  [<ffffffff8020b0db>] ? default_idle+0x41/0x76
>  [<ffffffff8020b09a>] ? default_idle+0x0/0x76
>  [<ffffffff8020b186>] ? cpu_idle+0x76/0x98

separate the tg->shares protection from the task_group lock.

Reported-by: Denis V. Lunev <den@openvz.org>
Tested-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 3eedd5260907..6b02276baaa6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -232,10 +232,10 @@ static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
 static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
 static struct rt_rq *init_rt_rq_p[NR_CPUS];
 
-/* task_group_mutex serializes add/remove of task groups and also changes to
+/* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
  */
-static DEFINE_MUTEX(task_group_mutex);
+static DEFINE_SPINLOCK(task_group_lock);
 
 /* doms_cur_mutex serializes access to doms_cur[] array */
 static DEFINE_MUTEX(doms_cur_mutex);
@@ -295,16 +295,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 	p->rt.parent = task_group(p)->rt_se[cpu];
 }
 
-static inline void lock_task_group_list(void)
-{
-	mutex_lock(&task_group_mutex);
-}
-
-static inline void unlock_task_group_list(void)
-{
-	mutex_unlock(&task_group_mutex);
-}
-
 static inline void lock_doms_cur(void)
 {
 	mutex_lock(&doms_cur_mutex);
@@ -318,8 +308,6 @@ static inline void unlock_doms_cur(void)
 #else
 
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline void lock_task_group_list(void) { }
-static inline void unlock_task_group_list(void) { }
 static inline void lock_doms_cur(void) { }
 static inline void unlock_doms_cur(void) { }
 
@@ -7571,6 +7559,7 @@ struct task_group *sched_create_group(void)
 	struct rt_rq *rt_rq;
 	struct sched_rt_entity *rt_se;
 	struct rq *rq;
+	unsigned long flags;
 	int i;
 
 	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
@@ -7620,7 +7609,7 @@ struct task_group *sched_create_group(void)
 		init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
 	}
 
-	lock_task_group_list();
+	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 		cfs_rq = tg->cfs_rq[i];
@@ -7629,7 +7618,7 @@ struct task_group *sched_create_group(void)
 		list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 	}
 	list_add_rcu(&tg->list, &task_groups);
-	unlock_task_group_list();
+	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	return tg;
 
@@ -7650,9 +7639,10 @@ void sched_destroy_group(struct task_group *tg)
 {
 	struct cfs_rq *cfs_rq = NULL;
 	struct rt_rq *rt_rq = NULL;
+	unsigned long flags;
 	int i;
 
-	lock_task_group_list();
+	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
@@ -7660,7 +7650,7 @@ void sched_destroy_group(struct task_group *tg)
 		list_del_rcu(&rt_rq->leaf_rt_rq_list);
 	}
 	list_del_rcu(&tg->list);
-	unlock_task_group_list();
+	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	BUG_ON(!cfs_rq);
 
@@ -7728,13 +7718,16 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 	}
 }
 
+static DEFINE_MUTEX(shares_mutex);
+
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
 	int i;
 	struct cfs_rq *cfs_rq;
 	struct rq *rq;
+	unsigned long flags;
 
-	lock_task_group_list();
+	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
 		goto done;
 
@@ -7746,10 +7739,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * load_balance_fair) from referring to this group first,
 	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
 	 */
+	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 	}
+	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	/* wait for any ongoing reference to this group to finish */
 	synchronize_sched();
@@ -7769,13 +7764,15 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * Enable load balance activity on this group, by inserting it back on
 	 * each cpu's rq->leaf_cfs_rq_list.
 	 */
+	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 		cfs_rq = tg->cfs_rq[i];
 		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 	}
+	spin_unlock_irqrestore(&task_group_lock, flags);
 done:
-	unlock_task_group_list();
+	mutex_unlock(&shares_mutex);
 	return 0;
 }
 

From 4cf5d77a6eefaa7a464bc34e8cb767356f10fd74 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 Feb 2008 15:45:39 +0100
Subject: [PATCH 22/66] sched: fix incorrect irq lock usage in
 normalize_rt_tasks()

lockdep spotted this bogus irq locking. normalize_rt_tasks() can be called
from hardirq context through sysrq-n

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 6b02276baaa6..88a17c7128c3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7291,7 +7291,7 @@ void normalize_rt_tasks(void)
 	unsigned long flags;
 	struct rq *rq;
 
-	read_lock_irq(&tasklist_lock);
+	read_lock_irqsave(&tasklist_lock, flags);
 	do_each_thread(g, p) {
 		/*
 		 * Only normalize user tasks:
@@ -7317,16 +7317,16 @@ void normalize_rt_tasks(void)
 			continue;
 		}
 
-		spin_lock_irqsave(&p->pi_lock, flags);
+		spin_lock(&p->pi_lock);
 		rq = __task_rq_lock(p);
 
 		normalize_task(rq, p);
 
 		__task_rq_unlock(rq);
-		spin_unlock_irqrestore(&p->pi_lock, flags);
+		spin_unlock(&p->pi_lock);
 	} while_each_thread(g, p);
 
-	read_unlock_irq(&tasklist_lock);
+	read_unlock_irqrestore(&tasklist_lock, flags);
 }
 
 #endif /* CONFIG_MAGIC_SYSRQ */

From 23b0fdfc9299b137bd126e9dc22f62a59dae546d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 Feb 2008 15:45:39 +0100
Subject: [PATCH 23/66] sched: rt-group: deal with PI

Steven mentioned the fun case where a lock holding task will be throttled.

Simple fix: allow groups that have boosted tasks to run anyway.

If a runnable task in a throttled group gets boosted the dequeue/enqueue
done by rt_mutex_setprio() is enough to unthrottle the group.

This is ofcourse not quite correct. Two possible ways forward are:
  - second prio array for boosted tasks
  - boost to a prio ceiling (this would also work for deadline scheduling)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  3 +++
 kernel/sched_rt.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 88a17c7128c3..cecaea67ae9b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -362,6 +362,8 @@ struct rt_rq {
 	u64 rt_time;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	unsigned long rt_nr_boosted;
+
 	struct rq *rq;
 	struct list_head leaf_rt_rq_list;
 	struct task_group *tg;
@@ -7112,6 +7114,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	rt_rq->rt_throttled = 0;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	rt_rq->rt_nr_boosted = 0;
 	rt_rq->rq = rq;
 #endif
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 274b40d7bef2..8d4269381239 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -110,6 +110,23 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
 		dequeue_rt_entity(rt_se);
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+	struct rt_rq *rt_rq = group_rt_rq(rt_se);
+	struct task_struct *p;
+
+	if (rt_rq)
+		return !!rt_rq->rt_nr_boosted;
+
+	p = rt_task_of(rt_se);
+	return p->prio != p->normal_prio;
+}
+
 #else
 
 static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
@@ -149,6 +166,10 @@ static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
 {
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_throttled;
+}
 #endif
 
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -172,7 +193,7 @@ static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
 		return 0;
 
 	if (rt_rq->rt_throttled)
-		return 1;
+		return rt_rq_throttled(rt_rq);
 
 	period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
@@ -183,8 +204,10 @@ static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
 		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
 
-		sched_rt_ratio_dequeue(rt_rq);
-		return 1;
+		if (rt_rq_throttled(rt_rq)) {
+			sched_rt_ratio_dequeue(rt_rq);
+			return 1;
+		}
 	}
 
 	return 0;
@@ -265,6 +288,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted++;
+#endif
 }
 
 static inline
@@ -295,6 +322,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted--;
+
+	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +336,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	struct rt_prio_array *array = &rt_rq->active;
 	struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	if (group_rq && group_rq->rt_throttled)
+	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
 	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -496,7 +529,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 	if (unlikely(!rt_rq->rt_nr_running))
 		return NULL;
 
-	if (sched_rt_ratio_exceeded(rt_rq))
+	if (rt_rq_throttled(rt_rq))
 		return NULL;
 
 	do {

From 9f0c1e560c43327b70998e6c702b2f01321130d9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 Feb 2008 15:45:39 +0100
Subject: [PATCH 24/66] sched: rt-group: interface

Change the rt_ratio interface to rt_runtime_us, to match rt_period_us.
This avoids picking a granularity for the ratio.

Extend the /sys/kernel/uids/<uid>/ interface to allow setting
the group's rt_runtime.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/sched-rt-group.txt |  59 +++++++++++++
 include/linux/sched.h            |   7 +-
 kernel/sched.c                   | 141 ++++++++++++++++++++++++-------
 kernel/sched_rt.c                |  53 +++++-------
 kernel/sysctl.c                  |  32 +++----
 kernel/user.c                    |  28 ++++++
 6 files changed, 242 insertions(+), 78 deletions(-)
 create mode 100644 Documentation/sched-rt-group.txt

diff --git a/Documentation/sched-rt-group.txt b/Documentation/sched-rt-group.txt
new file mode 100644
index 000000000000..1c6332f4543c
--- /dev/null
+++ b/Documentation/sched-rt-group.txt
@@ -0,0 +1,59 @@
+
+
+Real-Time group scheduling.
+
+The problem space:
+
+In order to schedule multiple groups of realtime tasks each group must
+be assigned a fixed portion of the CPU time available. Without a minimum
+guarantee a realtime group can obviously fall short. A fuzzy upper limit
+is of no use since it cannot be relied upon. Which leaves us with just
+the single fixed portion.
+
+CPU time is divided by means of specifying how much time can be spent
+running in a given period. Say a frame fixed realtime renderer must
+deliver 25 frames a second, which yields a period of 0.04s. Now say
+it will also have to play some music and respond to input, leaving it
+with around 80% for the graphics. We can then give this group a runtime
+of 0.8 * 0.04s = 0.032s.
+
+This way the graphics group will have a 0.04s period with a 0.032s runtime
+limit.
+
+Now if the audio thread needs to refill the DMA buffer every 0.005s, but
+needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s
+= 0.00015s.
+
+
+The Interface:
+
+system wide:
+
+/proc/sys/kernel/sched_rt_period_ms
+/proc/sys/kernel/sched_rt_runtime_us
+
+CONFIG_FAIR_USER_SCHED
+
+/sys/kernel/uids/<uid>/cpu_rt_runtime_us
+
+or
+
+CONFIG_FAIR_CGROUP_SCHED
+
+/cgroup/<cgroup>/cpu.rt_runtime_us
+
+[ time is specified in us because the interface is s32; this gives an
+  operating range of ~35m to 1us ]
+
+The period takes values in [ 1, INT_MAX ], runtime in [ -1, INT_MAX - 1 ].
+
+A runtime of -1 specifies runtime == period, ie. no limit.
+
+New groups get the period from /proc/sys/kernel/sched_rt_period_us and
+a runtime of 0.
+
+Settings are constrained to:
+
+   \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
+
+in order to keep the configuration schedulable.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 00e144117326..142eb293f9c4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1541,8 +1541,6 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_rt_period;
-extern unsigned int sysctl_sched_rt_ratio;
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 extern unsigned int sysctl_sched_min_bal_int_shares;
 extern unsigned int sysctl_sched_max_bal_int_shares;
@@ -1552,6 +1550,8 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
 		loff_t *ppos);
 #endif
+extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
 
 extern unsigned int sysctl_sched_compat_yield;
 
@@ -2036,6 +2036,9 @@ extern void sched_destroy_group(struct task_group *tg);
 extern void sched_move_task(struct task_struct *tsk);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 extern unsigned long sched_group_shares(struct task_group *tg);
+extern int sched_group_set_rt_runtime(struct task_group *tg,
+				      long rt_runtime_us);
+extern long sched_group_rt_runtime(struct task_group *tg);
 
 #endif
 
diff --git a/kernel/sched.c b/kernel/sched.c
index cecaea67ae9b..85a5fbff2b00 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -176,7 +176,7 @@ struct task_group {
 	struct sched_rt_entity **rt_se;
 	struct rt_rq **rt_rq;
 
-	unsigned int rt_ratio;
+	u64 rt_runtime;
 
 	/*
 	 * shares assigned to a task group governs how much of cpu bandwidth
@@ -642,19 +642,21 @@ const_debug unsigned int sysctl_sched_features =
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+unsigned int sysctl_sched_rt_period = 1000000;
 
-#define SCHED_RT_FRAC_SHIFT	16
-#define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
 
 /*
- * ratio of time -rt tasks may consume.
- * default: 95%
+ * single value that denotes runtime == period, ie unlimited time.
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+#define RUNTIME_INF	((u64)~0ULL)
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -7187,7 +7189,8 @@ void __init sched_init(void)
 				&per_cpu(init_cfs_rq, i),
 				&per_cpu(init_sched_entity, i), i, 1);
 
-		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+		init_task_group.rt_runtime =
+			sysctl_sched_rt_runtime * NSEC_PER_USEC;
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
@@ -7583,7 +7586,7 @@ struct task_group *sched_create_group(void)
 		goto err;
 
 	tg->shares = NICE_0_LOAD;
-	tg->rt_ratio = 0; /* XXX */
+	tg->rt_runtime = 0;
 
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
@@ -7785,30 +7788,76 @@ unsigned long sched_group_shares(struct task_group *tg)
 }
 
 /*
- * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
+ * Ensure that the real time constraints are schedulable.
  */
-int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+static unsigned long to_ratio(u64 period, u64 runtime)
+{
+	if (runtime == RUNTIME_INF)
+		return 1ULL << 16;
+
+	runtime *= (1ULL << 16);
+	div64_64(runtime, period);
+	return runtime;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
 	struct task_group *tgi;
 	unsigned long total = 0;
+	unsigned long global_ratio =
+		to_ratio(sysctl_sched_rt_period,
+			 sysctl_sched_rt_runtime < 0 ?
+				RUNTIME_INF : sysctl_sched_rt_runtime);
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &task_groups, list)
-		total += tgi->rt_ratio;
-	rcu_read_unlock();
+	list_for_each_entry_rcu(tgi, &task_groups, list) {
+		if (tgi == tg)
+			continue;
 
-	if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
-		return -EINVAL;
+		total += to_ratio(period, tgi->rt_runtime);
+	}
+	rcu_read_unlock();
 
-	tg->rt_ratio = rt_ratio;
-	return 0;
+	return total + to_ratio(period, runtime) < global_ratio;
 }
 
-unsigned long sched_group_rt_ratio(struct task_group *tg)
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
-	return tg->rt_ratio;
+	u64 rt_runtime, rt_period;
+	int err = 0;
+
+	rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
+	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+	if (rt_runtime_us == -1)
+		rt_runtime = rt_period;
+
+	mutex_lock(&rt_constraints_mutex);
+	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
+		err = -EINVAL;
+		goto unlock;
+	}
+	if (rt_runtime_us == -1)
+		rt_runtime = RUNTIME_INF;
+	tg->rt_runtime = rt_runtime;
+ unlock:
+	mutex_unlock(&rt_constraints_mutex);
+
+	return err;
 }
 
+long sched_group_rt_runtime(struct task_group *tg)
+{
+	u64 rt_runtime_us;
+
+	if (tg->rt_runtime == RUNTIME_INF)
+		return -1;
+
+	rt_runtime_us = tg->rt_runtime;
+	do_div(rt_runtime_us, NSEC_PER_USEC);
+	return rt_runtime_us;
+}
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7884,17 +7933,49 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
 	return (u64) tg->shares;
 }
 
-static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
-		u64 rt_ratio_val)
+static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+				struct file *file,
+				const char __user *userbuf,
+				size_t nbytes, loff_t *unused_ppos)
 {
-	return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
+	char buffer[64];
+	int retval = 0;
+	s64 val;
+	char *end;
+
+	if (!nbytes)
+		return -EINVAL;
+	if (nbytes >= sizeof(buffer))
+		return -E2BIG;
+	if (copy_from_user(buffer, userbuf, nbytes))
+		return -EFAULT;
+
+	buffer[nbytes] = 0;     /* nul-terminate */
+
+	/* strip newline if necessary */
+	if (nbytes && (buffer[nbytes-1] == '\n'))
+		buffer[nbytes-1] = 0;
+	val = simple_strtoll(buffer, &end, 0);
+	if (*end)
+		return -EINVAL;
+
+	/* Pass to subsystem */
+	retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+	if (!retval)
+		retval = nbytes;
+	return retval;
 }
 
-static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
+				   struct file *file,
+				   char __user *buf, size_t nbytes,
+				   loff_t *ppos)
 {
-	struct task_group *tg = cgroup_tg(cgrp);
+	char tmp[64];
+	long val = sched_group_rt_runtime(cgroup_tg(cgrp));
+	int len = sprintf(tmp, "%ld\n", val);
 
-	return (u64) tg->rt_ratio;
+	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
 static struct cftype cpu_files[] = {
@@ -7904,9 +7985,9 @@ static struct cftype cpu_files[] = {
 		.write_uint = cpu_shares_write_uint,
 	},
 	{
-		.name = "rt_ratio",
-		.read_uint = cpu_rt_ratio_read_uint,
-		.write_uint = cpu_rt_ratio_write_uint,
+		.name = "rt_runtime_us",
+		.read = cpu_rt_runtime_read,
+		.write = cpu_rt_runtime_write,
 	},
 };
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8d4269381239..35825b28e429 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -57,12 +57,12 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
 	if (!rt_rq->tg)
-		return SCHED_RT_FRAC;
+		return RUNTIME_INF;
 
-	return rt_rq->tg->rt_ratio;
+	return rt_rq->tg->rt_runtime;
 }
 
 #define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 
-static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
 	}
 }
 
-static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
@@ -129,9 +129,12 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
 
 #else
 
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
-	return sysctl_sched_rt_ratio;
+	if (sysctl_sched_rt_runtime == -1)
+		return RUNTIME_INF;
+
+	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
 #define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -158,11 +161,11 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 	return NULL;
 }
 
-static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
 }
 
-static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
 }
 
@@ -184,28 +187,24 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 	return rt_task_of(rt_se)->prio;
 }
 
-static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 {
-	unsigned int rt_ratio = sched_rt_ratio(rt_rq);
-	u64 period, ratio;
+	u64 runtime = sched_rt_runtime(rt_rq);
 
-	if (rt_ratio == SCHED_RT_FRAC)
+	if (runtime == RUNTIME_INF)
 		return 0;
 
 	if (rt_rq->rt_throttled)
 		return rt_rq_throttled(rt_rq);
 
-	period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
-	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-	if (rt_rq->rt_time > ratio) {
+	if (rt_rq->rt_time > runtime) {
 		struct rq *rq = rq_of_rt_rq(rt_rq);
 
 		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
 
 		if (rt_rq_throttled(rt_rq)) {
-			sched_rt_ratio_dequeue(rt_rq);
+			sched_rt_rq_dequeue(rt_rq);
 			return 1;
 		}
 	}
@@ -219,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq)
 	u64 period;
 
 	while (rq->clock > rq->rt_period_expire) {
-		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+		period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 		rq->rt_period_expire += period;
 
 		for_each_leaf_rt_rq(rt_rq, rq) {
-			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+			u64 runtime = sched_rt_runtime(rt_rq);
 
-			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-			if (rt_rq->rt_throttled) {
+			rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
+			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 				rt_rq->rt_throttled = 0;
-				sched_rt_ratio_enqueue(rt_rq);
+				sched_rt_rq_enqueue(rt_rq);
 			}
 		}
 
@@ -262,12 +260,7 @@ static void update_curr_rt(struct rq *rq)
 	cpuacct_charge(curr, delta_exec);
 
 	rt_rq->rt_time += delta_exec;
-	/*
-	 * might make it a tad more accurate:
-	 *
-	 * update_sched_rt_period(rq);
-	 */
-	if (sched_rt_ratio_exceeded(rt_rq))
+	if (sched_rt_runtime_exceeded(rt_rq))
 		resched_task(curr);
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d41ef6b4cf72..924c674b76ea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,22 +311,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_rt_period_ms",
-		.data		= &sysctl_sched_rt_period,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_rt_ratio",
-		.data		= &sysctl_sched_rt_ratio,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 	{
 		.ctl_name       = CTL_UNNUMBERED,
@@ -346,6 +330,22 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_rt_period_us",
+		.data		= &sysctl_sched_rt_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_rt_runtime_us",
+		.data		= &sysctl_sched_rt_runtime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_compat_yield",
diff --git a/kernel/user.c b/kernel/user.c
index 7d7900c5a1fd..9f6d471bfd03 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -164,9 +164,37 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
 static struct kobj_attribute cpu_share_attr =
 	__ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
 
+static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   char *buf)
+{
+	struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+
+	return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
+}
+
+static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    const char *buf, size_t size)
+{
+	struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+	unsigned long rt_runtime;
+	int rc;
+
+	sscanf(buf, "%lu", &rt_runtime);
+
+	rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
+
+	return (rc ? rc : size);
+}
+
+static struct kobj_attribute cpu_rt_runtime_attr =
+	__ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
+
 /* default attributes per uid directory */
 static struct attribute *uids_attributes[] = {
 	&cpu_share_attr.attr,
+	&cpu_rt_runtime_attr.attr,
 	NULL
 };
 

From 052f1dc7eb02300b05170ae341ccd03b76207778 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 Feb 2008 15:45:40 +0100
Subject: [PATCH 25/66] sched: rt-group: make rt groups scheduling configurable

Make the rt group scheduler compile time configurable.
Keep it experimental for now.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cgroup_subsys.h |   2 +-
 include/linux/sched.h         |  11 ++-
 init/Kconfig                  |  23 ++++--
 kernel/sched.c                | 148 ++++++++++++++++++++++++----------
 kernel/sched_rt.c             |  12 +--
 kernel/user.c                 |  22 +++--
 6 files changed, 151 insertions(+), 67 deletions(-)

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 228235c5ae53..ac6aad98b607 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -25,7 +25,7 @@ SUBSYS(ns)
 
 /* */
 
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 SUBSYS(cpu_cgroup)
 #endif
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 142eb293f9c4..b9bb313fe1ae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -590,7 +590,7 @@ struct user_struct {
 	struct hlist_node uidhash_node;
 	uid_t uid;
 
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
 	struct task_group *tg;
 #ifdef CONFIG_SYSFS
 	struct kobject kobj;
@@ -973,7 +973,7 @@ struct sched_rt_entity {
 	unsigned long timeout;
 	int nr_cpus_allowed;
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
 	struct sched_rt_entity	*parent;
 	/* rq on which this entity is (to be) queued: */
 	struct rt_rq		*rt_rq;
@@ -2027,19 +2027,22 @@ extern int sched_mc_power_savings, sched_smt_power_savings;
 
 extern void normalize_rt_tasks(void);
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
 
 extern struct task_group init_task_group;
 
 extern struct task_group *sched_create_group(void);
 extern void sched_destroy_group(struct task_group *tg);
 extern void sched_move_task(struct task_struct *tsk);
+#ifdef CONFIG_FAIR_GROUP_SCHED
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 extern unsigned long sched_group_shares(struct task_group *tg);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
 extern int sched_group_set_rt_runtime(struct task_group *tg,
 				      long rt_runtime_us);
 extern long sched_group_rt_runtime(struct task_group *tg);
-
+#endif
 #endif
 
 #ifdef CONFIG_TASK_XACCT
diff --git a/init/Kconfig b/init/Kconfig
index 824d48cb67bf..dcef8b55011a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -311,25 +311,36 @@ config CPUSETS
 
 	  Say N if unsure.
 
-config FAIR_GROUP_SCHED
-	bool "Fair group CPU scheduler"
+config GROUP_SCHED
+	bool "Group CPU scheduler"
 	default y
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
 	  bandwidth allocation to such task groups.
 
+config FAIR_GROUP_SCHED
+	bool "Group scheduling for SCHED_OTHER"
+	depends on GROUP_SCHED
+	default y
+
+config RT_GROUP_SCHED
+	bool "Group scheduling for SCHED_RR/FIFO"
+	depends on EXPERIMENTAL
+	depends on GROUP_SCHED
+	default n
+
 choice
-	depends on FAIR_GROUP_SCHED
+	depends on GROUP_SCHED
 	prompt "Basis for grouping tasks"
-	default FAIR_USER_SCHED
+	default USER_SCHED
 
-config FAIR_USER_SCHED
+config USER_SCHED
 	bool "user id"
 	help
 	  This option will choose userid as the basis for grouping
 	  tasks, thus providing equal CPU bandwidth to each user.
 
-config FAIR_CGROUP_SCHED
+config CGROUP_SCHED
 	bool "Control groups"
  	depends on CGROUPS
  	help
diff --git a/kernel/sched.c b/kernel/sched.c
index 85a5fbff2b00..5edc549edae8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -155,7 +155,7 @@ struct rt_prio_array {
 	struct list_head queue[MAX_RT_PRIO];
 };
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
 
 #include <linux/cgroup.h>
 
@@ -165,19 +165,16 @@ static LIST_HEAD(task_groups);
 
 /* task group related information */
 struct task_group {
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 	struct cgroup_subsys_state css;
 #endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 
-	struct sched_rt_entity **rt_se;
-	struct rt_rq **rt_rq;
-
-	u64 rt_runtime;
-
 	/*
 	 * shares assigned to a task group governs how much of cpu bandwidth
 	 * is allocated to the group. The more shares a group has, the more is
@@ -213,24 +210,36 @@ struct task_group {
 	 *
 	 */
 	unsigned long shares;
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	struct sched_rt_entity **rt_se;
+	struct rt_rq **rt_rq;
+
+	u64 rt_runtime;
+#endif
 
 	struct rcu_head rcu;
 	struct list_head list;
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 /* Default task group's sched entity on each cpu */
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-
 static struct sched_entity *init_sched_entity_p[NR_CPUS];
 static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
+static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 
 static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
 static struct rt_rq *init_rt_rq_p[NR_CPUS];
+#endif
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -240,6 +249,7 @@ static DEFINE_SPINLOCK(task_group_lock);
 /* doms_cur_mutex serializes access to doms_cur[] array */
 static DEFINE_MUTEX(doms_cur_mutex);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
 /* kernel thread that runs rebalance_shares() periodically */
 static struct task_struct *lb_monitor_task;
@@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused);
 
 static void set_se_shares(struct sched_entity *se, unsigned long shares);
 
+#ifdef CONFIG_USER_SCHED
+# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
+#else
+# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
+#endif
+
+#define MIN_GROUP_SHARES	2
+
+static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+#endif
+
 /* Default task group.
  *	Every task in system belong to this group at bootup.
  */
 struct task_group init_task_group = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	.se	= init_sched_entity_p,
 	.cfs_rq = init_cfs_rq_p,
+#endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
 	.rt_se	= init_sched_rt_entity_p,
 	.rt_rq	= init_rt_rq_p,
-};
-
-#ifdef CONFIG_FAIR_USER_SCHED
-# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
-#else
-# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
-
-#define MIN_GROUP_SHARES	2
-
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+};
 
 /* return group to which a task belongs */
 static inline struct task_group *task_group(struct task_struct *p)
 {
 	struct task_group *tg;
 
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
 	tg = p->user->tg;
-#elif defined(CONFIG_FAIR_CGROUP_SCHED)
+#elif defined(CONFIG_CGROUP_SCHED)
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 				struct task_group, css);
 #else
@@ -288,11 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p)
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 {
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
 	p->se.parent = task_group(p)->se[cpu];
+#endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
 	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
 	p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
 }
 
 static inline void lock_doms_cur(void)
@@ -311,7 +330,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 static inline void lock_doms_cur(void) { }
 static inline void unlock_doms_cur(void) { }
 
-#endif	/* CONFIG_FAIR_GROUP_SCHED */
+#endif	/* CONFIG_GROUP_SCHED */
 
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
@@ -351,7 +370,7 @@ struct cfs_rq {
 struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	int highest_prio; /* highest queued rt task prio */
 #endif
 #ifdef CONFIG_SMP
@@ -361,7 +380,7 @@ struct rt_rq {
 	int rt_throttled;
 	u64 rt_time;
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
 	unsigned long rt_nr_boosted;
 
 	struct rq *rq;
@@ -437,6 +456,8 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
 	struct list_head leaf_rt_rq_list;
 #endif
 
@@ -7104,7 +7125,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	/* delimiter for bitsearch: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	rt_rq->highest_prio = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
@@ -7115,7 +7136,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	rt_rq->rt_time = 0;
 	rt_rq->rt_throttled = 0;
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
 	rt_rq->rt_nr_boosted = 0;
 	rt_rq->rq = rq;
 #endif
@@ -7139,7 +7160,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
 	se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
 	se->parent = NULL;
 }
+#endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
 static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
 		struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
 		int cpu, int add)
@@ -7168,7 +7191,7 @@ void __init sched_init(void)
 	init_defrootdomain();
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
 	list_add(&init_task_group.list, &task_groups);
 #endif
 
@@ -7189,6 +7212,8 @@ void __init sched_init(void)
 				&per_cpu(init_cfs_rq, i),
 				&per_cpu(init_sched_entity, i), i, 1);
 
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
 		init_task_group.rt_runtime =
 			sysctl_sched_rt_runtime * NSEC_PER_USEC;
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
@@ -7381,9 +7406,9 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
 
-#ifdef CONFIG_SMP
+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
 /*
  * distribute shares of all task groups among their schedulable entities,
  * to reflect load distribution across cpus.
@@ -7539,20 +7564,28 @@ static void free_sched_group(struct task_group *tg)
 	int i;
 
 	for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
 		if (tg->se)
 			kfree(tg->se[i]);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
 		if (tg->rt_rq)
 			kfree(tg->rt_rq[i]);
 		if (tg->rt_se)
 			kfree(tg->rt_se[i]);
+#endif
 	}
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	kfree(tg->cfs_rq);
 	kfree(tg->se);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
 	kfree(tg->rt_rq);
 	kfree(tg->rt_se);
+#endif
 	kfree(tg);
 }
 
@@ -7560,10 +7593,14 @@ static void free_sched_group(struct task_group *tg)
 struct task_group *sched_create_group(void)
 {
 	struct task_group *tg;
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
 	struct rt_rq *rt_rq;
 	struct sched_rt_entity *rt_se;
+#endif
 	struct rq *rq;
 	unsigned long flags;
 	int i;
@@ -7572,12 +7609,18 @@ struct task_group *sched_create_group(void)
 	if (!tg)
 		return ERR_PTR(-ENOMEM);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
 	if (!tg->cfs_rq)
 		goto err;
 	tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
 	if (!tg->se)
 		goto err;
+
+	tg->shares = NICE_0_LOAD;
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
 	tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
 	if (!tg->rt_rq)
 		goto err;
@@ -7585,12 +7628,13 @@ struct task_group *sched_create_group(void)
 	if (!tg->rt_se)
 		goto err;
 
-	tg->shares = NICE_0_LOAD;
 	tg->rt_runtime = 0;
+#endif
 
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
 				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 		if (!cfs_rq)
@@ -7601,6 +7645,10 @@ struct task_group *sched_create_group(void)
 		if (!se)
 			goto err;
 
+		init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
 		rt_rq = kmalloc_node(sizeof(struct rt_rq),
 				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 		if (!rt_rq)
@@ -7611,17 +7659,21 @@ struct task_group *sched_create_group(void)
 		if (!rt_se)
 			goto err;
 
-		init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
 		init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
+#endif
 	}
 
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
+#ifdef CONFIG_FAIR_GROUP_SCHED
 		cfs_rq = tg->cfs_rq[i];
 		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
 		rt_rq = tg->rt_rq[i];
 		list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
+#endif
 	}
 	list_add_rcu(&tg->list, &task_groups);
 	spin_unlock_irqrestore(&task_group_lock, flags);
@@ -7643,23 +7695,21 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
 /* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
-	struct cfs_rq *cfs_rq = NULL;
-	struct rt_rq *rt_rq = NULL;
 	unsigned long flags;
 	int i;
 
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i) {
-		cfs_rq = tg->cfs_rq[i];
-		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
-		rt_rq = tg->rt_rq[i];
-		list_del_rcu(&rt_rq->leaf_rt_rq_list);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		list_del_rcu(&tg->cfs_rq[i]->leaf_cfs_rq_list);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		list_del_rcu(&tg->rt_rq[i]->leaf_rt_rq_list);
+#endif
 	}
 	list_del_rcu(&tg->list);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
-	BUG_ON(!cfs_rq);
-
 	/* wait for possible concurrent references to cfs_rqs complete */
 	call_rcu(&tg->rcu, free_sched_group_rcu);
 }
@@ -7699,6 +7749,7 @@ void sched_move_task(struct task_struct *tsk)
 	task_rq_unlock(rq, &flags);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 /* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
@@ -7786,7 +7837,9 @@ unsigned long sched_group_shares(struct task_group *tg)
 {
 	return tg->shares;
 }
+#endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
 /*
  * Ensure that the real time constraints are schedulable.
  */
@@ -7858,9 +7911,10 @@ long sched_group_rt_runtime(struct task_group *tg)
 	do_div(rt_runtime_us, NSEC_PER_USEC);
 	return rt_runtime_us;
 }
-#endif	/* CONFIG_FAIR_GROUP_SCHED */
+#endif
+#endif	/* CONFIG_GROUP_SCHED */
 
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 
 /* return corresponding task_group object of a cgroup */
 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
@@ -7920,6 +7974,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	sched_move_task(tsk);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
 {
@@ -7932,7 +7987,9 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) tg->shares;
 }
+#endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
 				struct file *file,
 				const char __user *userbuf,
@@ -7977,18 +8034,23 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
 
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
+#endif
 
 static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
 		.read_uint = cpu_shares_read_uint,
 		.write_uint = cpu_shares_write_uint,
 	},
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
 		.read = cpu_rt_runtime_read,
 		.write = cpu_rt_runtime_write,
 	},
+#endif
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -8007,7 +8069,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.early_init	= 1,
 };
 
-#endif	/* CONFIG_FAIR_CGROUP_SCHED */
+#endif	/* CONFIG_CGROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_CPUACCT
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 35825b28e429..f54792b175b2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -55,7 +55,7 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 	return !list_empty(&rt_se->run_list);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
 
 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
@@ -177,7 +177,7 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 {
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
 	struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
 	if (rt_rq)
@@ -269,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	if (rt_se_prio(rt_se) < rt_rq->highest_prio)
 		rt_rq->highest_prio = rt_se_prio(rt_se);
 #endif
@@ -281,7 +281,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))
 		rt_rq->rt_nr_boosted++;
 #endif
@@ -293,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running--;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	if (rt_rq->rt_nr_running) {
 		struct rt_prio_array *array;
 
@@ -315,7 +315,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))
 		rt_rq->rt_nr_boosted--;
 
diff --git a/kernel/user.c b/kernel/user.c
index 9f6d471bfd03..7132022a040c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -57,7 +57,7 @@ struct user_struct root_user = {
 	.uid_keyring	= &root_user_keyring,
 	.session_keyring = &root_session_keyring,
 #endif
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
 	.tg		= &init_task_group,
 #endif
 };
@@ -90,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
 	return NULL;
 }
 
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
 
 static void sched_destroy_user(struct user_struct *up)
 {
@@ -113,15 +113,15 @@ static void sched_switch_user(struct task_struct *p)
 	sched_move_task(p);
 }
 
-#else	/* CONFIG_FAIR_USER_SCHED */
+#else	/* CONFIG_USER_SCHED */
 
 static void sched_destroy_user(struct user_struct *up) { }
 static int sched_create_user(struct user_struct *up) { return 0; }
 static void sched_switch_user(struct task_struct *p) { }
 
-#endif	/* CONFIG_FAIR_USER_SCHED */
+#endif	/* CONFIG_USER_SCHED */
 
-#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
+#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
 
 static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
 static DEFINE_MUTEX(uids_mutex);
@@ -137,6 +137,7 @@ static inline void uids_mutex_unlock(void)
 }
 
 /* uid directory attributes */
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static ssize_t cpu_shares_show(struct kobject *kobj,
 			       struct kobj_attribute *attr,
 			       char *buf)
@@ -163,7 +164,9 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
 
 static struct kobj_attribute cpu_share_attr =
 	__ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
+#endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
 static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
 				   struct kobj_attribute *attr,
 				   char *buf)
@@ -190,11 +193,16 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
 
 static struct kobj_attribute cpu_rt_runtime_attr =
 	__ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
+#endif
 
 /* default attributes per uid directory */
 static struct attribute *uids_attributes[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	&cpu_share_attr.attr,
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
 	&cpu_rt_runtime_attr.attr,
+#endif
 	NULL
 };
 
@@ -297,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
 	schedule_work(&up->work);
 }
 
-#else	/* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
+#else	/* CONFIG_USER_SCHED && CONFIG_SYSFS */
 
 int uids_sysfs_init(void) { return 0; }
 static inline int uids_user_create(struct user_struct *up) { return 0; }
@@ -401,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
-			/* This case is not possible when CONFIG_FAIR_USER_SCHED
+			/* This case is not possible when CONFIG_USER_SCHED
 			 * is defined, since we serialize alloc_uid() using
 			 * uids_mutex. Hence no need to call
 			 * sched_destroy_user() or remove_user_sysfs_dir().

From bccbe08a60973c873e6af6fdb9ec11ffb1a6e4de Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 Feb 2008 15:45:40 +0100
Subject: [PATCH 26/66] sched: rt-group: clean up the ifdeffery

Clean up some of the excessive ifdeffery introduces in the last patch.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 210 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 139 insertions(+), 71 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5edc549edae8..d2f4398c5e6f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7559,57 +7559,29 @@ static int load_balance_monitor(void *unused)
 }
 #endif	/* CONFIG_SMP */
 
-static void free_sched_group(struct task_group *tg)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
 
 	for_each_possible_cpu(i) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
 		if (tg->se)
 			kfree(tg->se[i]);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-		if (tg->rt_rq)
-			kfree(tg->rt_rq[i]);
-		if (tg->rt_se)
-			kfree(tg->rt_se[i]);
-#endif
 	}
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	kfree(tg->cfs_rq);
 	kfree(tg->se);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-	kfree(tg->rt_rq);
-	kfree(tg->rt_se);
-#endif
-	kfree(tg);
 }
 
-/* allocate runqueue etc for a new task group */
-struct task_group *sched_create_group(void)
+static int alloc_fair_sched_group(struct task_group *tg)
 {
-	struct task_group *tg;
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se;
-#endif
 	struct rq *rq;
-	unsigned long flags;
 	int i;
 
-	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
-	if (!tg)
-		return ERR_PTR(-ENOMEM);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
 	if (!tg->cfs_rq)
 		goto err;
@@ -7618,23 +7590,10 @@ struct task_group *sched_create_group(void)
 		goto err;
 
 	tg->shares = NICE_0_LOAD;
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
-	if (!tg->rt_rq)
-		goto err;
-	tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
-	if (!tg->rt_se)
-		goto err;
-
-	tg->rt_runtime = 0;
-#endif
 
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
 				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 		if (!cfs_rq)
@@ -7646,9 +7605,78 @@ struct task_group *sched_create_group(void)
 			goto err;
 
 		init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+	}
+
+	return 1;
+
+ err:
+	return 0;
+}
+
+static inline void register_fair_sched_group(struct task_group *tg, int cpu)
+{
+	list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
+			&cpu_rq(cpu)->leaf_cfs_rq_list);
+}
+
+static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+	list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+}
+#else
+static inline void free_fair_sched_group(struct task_group *tg)
+{
+}
+
+static inline int alloc_fair_sched_group(struct task_group *tg)
+{
+	return 1;
+}
+
+static inline void register_fair_sched_group(struct task_group *tg, int cpu)
+{
+}
+
+static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+}
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		if (tg->rt_rq)
+			kfree(tg->rt_rq[i]);
+		if (tg->rt_se)
+			kfree(tg->rt_se[i]);
+	}
+
+	kfree(tg->rt_rq);
+	kfree(tg->rt_se);
+}
+
+static int alloc_rt_sched_group(struct task_group *tg)
+{
+	struct rt_rq *rt_rq;
+	struct sched_rt_entity *rt_se;
+	struct rq *rq;
+	int i;
+
+	tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
+	if (!tg->rt_rq)
+		goto err;
+	tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
+	if (!tg->rt_se)
+		goto err;
+
+	tg->rt_runtime = 0;
+
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+
 		rt_rq = kmalloc_node(sizeof(struct rt_rq),
 				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 		if (!rt_rq)
@@ -7660,20 +7688,71 @@ struct task_group *sched_create_group(void)
 			goto err;
 
 		init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
-#endif
 	}
 
+	return 1;
+
+ err:
+	return 0;
+}
+
+static inline void register_rt_sched_group(struct task_group *tg, int cpu)
+{
+	list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
+			&cpu_rq(cpu)->leaf_rt_rq_list);
+}
+
+static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
+{
+	list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
+}
+#else
+static inline void free_rt_sched_group(struct task_group *tg)
+{
+}
+
+static inline int alloc_rt_sched_group(struct task_group *tg)
+{
+	return 1;
+}
+
+static inline void register_rt_sched_group(struct task_group *tg, int cpu)
+{
+}
+
+static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
+{
+}
+#endif
+
+static void free_sched_group(struct task_group *tg)
+{
+	free_fair_sched_group(tg);
+	free_rt_sched_group(tg);
+	kfree(tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(void)
+{
+	struct task_group *tg;
+	unsigned long flags;
+	int i;
+
+	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+	if (!tg)
+		return ERR_PTR(-ENOMEM);
+
+	if (!alloc_fair_sched_group(tg))
+		goto err;
+
+	if (!alloc_rt_sched_group(tg))
+		goto err;
+
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i) {
-		rq = cpu_rq(i);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		cfs_rq = tg->cfs_rq[i];
-		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-		rt_rq = tg->rt_rq[i];
-		list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
-#endif
+		register_fair_sched_group(tg, i);
+		register_rt_sched_group(tg, i);
 	}
 	list_add_rcu(&tg->list, &task_groups);
 	spin_unlock_irqrestore(&task_group_lock, flags);
@@ -7700,12 +7779,8 @@ void sched_destroy_group(struct task_group *tg)
 
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		list_del_rcu(&tg->cfs_rq[i]->leaf_cfs_rq_list);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-		list_del_rcu(&tg->rt_rq[i]->leaf_rt_rq_list);
-#endif
+		unregister_fair_sched_group(tg, i);
+		unregister_rt_sched_group(tg, i);
 	}
 	list_del_rcu(&tg->list);
 	spin_unlock_irqrestore(&task_group_lock, flags);
@@ -7780,8 +7855,6 @@ static DEFINE_MUTEX(shares_mutex);
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
 	int i;
-	struct cfs_rq *cfs_rq;
-	struct rq *rq;
 	unsigned long flags;
 
 	mutex_lock(&shares_mutex);
@@ -7797,10 +7870,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
 	 */
 	spin_lock_irqsave(&task_group_lock, flags);
-	for_each_possible_cpu(i) {
-		cfs_rq = tg->cfs_rq[i];
-		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
-	}
+	for_each_possible_cpu(i)
+		unregister_fair_sched_group(tg, i);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	/* wait for any ongoing reference to this group to finish */
@@ -7822,11 +7893,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * each cpu's rq->leaf_cfs_rq_list.
 	 */
 	spin_lock_irqsave(&task_group_lock, flags);
-	for_each_possible_cpu(i) {
-		rq = cpu_rq(i);
-		cfs_rq = tg->cfs_rq[i];
-		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
-	}
+	for_each_possible_cpu(i)
+		register_fair_sched_group(tg, i);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 done:
 	mutex_unlock(&shares_mutex);

From b68aa2300cabeb96801369a4bb37a4f19f59ed84 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 Feb 2008 15:45:40 +0100
Subject: [PATCH 27/66] sched: rt-group: refure unrunnable tasks

Refuse to accept or create RT tasks in groups that can't run them.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index d2f4398c5e6f..f28f19e65b59 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4584,6 +4584,15 @@ int sched_setscheduler(struct task_struct *p, int policy,
 			return -EPERM;
 	}
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * Do not allow realtime tasks into groups that have no runtime
+	 * assigned.
+	 */
+	if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
+		return -EPERM;
+#endif
+
 	retval = security_task_setscheduler(p, policy, param);
 	if (retval)
 		return retval;
@@ -8028,9 +8037,15 @@ static int
 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		      struct task_struct *tsk)
 {
+#ifdef CONFIG_RT_GROUP_SCHED
+	/* Don't accept realtime tasks when there is no way for them to run */
+	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
+		return -EINVAL;
+#else
 	/* We don't support RT-tasks being in separate groups */
 	if (tsk->sched_class != &fair_sched_class)
 		return -EINVAL;
+#endif
 
 	return 0;
 }

From 1cdde19109901e8f1194e227d0bcd48caf713323 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 13 Feb 2008 16:20:35 +0100
Subject: [PATCH 28/66] x86: fix sigcontext.h user export

Jakub Jelinek reported that some user-space code that relies on
kernel headers has built dependency on the sigcontext->eip/rip
register names - which have been unified in commit:

  commit 742fa54a62be6a263df14a553bf832724471dfbe
  Author: H. Peter Anvin <hpa@zytor.com>
  Date:   Wed Jan 30 13:30:56 2008 +0100

      x86: use generic register names in struct sigcontext

so give the old layout to user-space. This is not particularly
pretty, but it's an ABI so there's no danger of the two definitions
getting out of sync.

Reported-by: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/sigcontext.h | 66 ++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/include/asm-x86/sigcontext.h b/include/asm-x86/sigcontext.h
index 681deade5f00..d743947f4c77 100644
--- a/include/asm-x86/sigcontext.h
+++ b/include/asm-x86/sigcontext.h
@@ -58,6 +58,7 @@ struct _fpstate {
 
 #define X86_FXSR_MAGIC		0x0000
 
+#ifdef __KERNEL__
 struct sigcontext {
 	unsigned short gs, __gsh;
 	unsigned short fs, __fsh;
@@ -82,6 +83,35 @@ struct sigcontext {
 	unsigned long oldmask;
 	unsigned long cr2;
 };
+#else /* __KERNEL__ */
+/*
+ * User-space might still rely on the old definition:
+ */
+struct sigcontext {
+	unsigned short gs, __gsh;
+	unsigned short fs, __fsh;
+	unsigned short es, __esh;
+	unsigned short ds, __dsh;
+	unsigned long edi;
+	unsigned long esi;
+	unsigned long ebp;
+	unsigned long esp;
+	unsigned long ebx;
+	unsigned long edx;
+	unsigned long ecx;
+	unsigned long eax;
+	unsigned long trapno;
+	unsigned long err;
+	unsigned long eip;
+	unsigned short cs, __csh;
+	unsigned long eflags;
+	unsigned long esp_at_signal;
+	unsigned short ss, __ssh;
+	struct _fpstate __user * fpstate;
+	unsigned long oldmask;
+	unsigned long cr2;
+};
+#endif /* !__KERNEL__ */
 
 #else /* __i386__ */
 
@@ -102,6 +132,7 @@ struct _fpstate {
 	__u32	reserved2[24];
 };
 
+#ifdef __KERNEL__
 struct sigcontext {
 	unsigned long r8;
 	unsigned long r9;
@@ -132,6 +163,41 @@ struct sigcontext {
 	struct _fpstate __user *fpstate;	/* zero when no FPU context */
 	unsigned long reserved1[8];
 };
+#else /* __KERNEL__ */
+/*
+ * User-space might still rely on the old definition:
+ */
+struct sigcontext {
+	unsigned long r8;
+	unsigned long r9;
+	unsigned long r10;
+	unsigned long r11;
+	unsigned long r12;
+	unsigned long r13;
+	unsigned long r14;
+	unsigned long r15;
+	unsigned long rdi;
+	unsigned long rsi;
+	unsigned long rbp;
+	unsigned long rbx;
+	unsigned long rdx;
+	unsigned long rax;
+	unsigned long rcx;
+	unsigned long rsp;
+	unsigned long rip;
+	unsigned long eflags;		/* RFLAGS */
+	unsigned short cs;
+	unsigned short gs;
+	unsigned short fs;
+	unsigned short __pad0;
+	unsigned long err;
+	unsigned long trapno;
+	unsigned long oldmask;
+	unsigned long cr2;
+	struct _fpstate __user *fpstate;	/* zero when no FPU context */
+	unsigned long reserved1[8];
+};
+#endif /* !__KERNEL__ */
 
 #endif /* !__i386__ */
 

From 416e2d63794d4e57774989429e174507801915f2 Mon Sep 17 00:00:00 2001
From: Jody Belka <lists-lkml@pimb.org>
Date: Tue, 12 Feb 2008 23:37:48 +0000
Subject: [PATCH 29/66] x86: fixup machine_ops reboot_{32|64}.c unification
 fallout

When reboot_32.c and reboot_64.c were unified (commit 4d022e35fd...),
the machine_ops code was broken, leading to xen pvops kernels failing
to properly halt/poweroff/reboot etc. This fixes that up.

Signed-off-by: Jody Belka <knew-linux@pimb.org>
Cc: Miguel Boton <mboton@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 46 +++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 5818dc28167d..7fd6ac43e4a1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -326,7 +326,7 @@ static inline void kb_wait(void)
 	}
 }
 
-void machine_emergency_restart(void)
+static void native_machine_emergency_restart(void)
 {
 	int i;
 
@@ -376,7 +376,7 @@ void machine_emergency_restart(void)
 	}
 }
 
-void machine_shutdown(void)
+static void native_machine_shutdown(void)
 {
 	/* Stop the cpus and apics */
 #ifdef CONFIG_SMP
@@ -420,7 +420,7 @@ void machine_shutdown(void)
 #endif
 }
 
-void machine_restart(char *__unused)
+static void native_machine_restart(char *__unused)
 {
 	printk("machine restart\n");
 
@@ -429,11 +429,11 @@ void machine_restart(char *__unused)
 	machine_emergency_restart();
 }
 
-void machine_halt(void)
+static void native_machine_halt(void)
 {
 }
 
-void machine_power_off(void)
+static void native_machine_power_off(void)
 {
 	if (pm_power_off) {
 		if (!reboot_force)
@@ -443,9 +443,35 @@ void machine_power_off(void)
 }
 
 struct machine_ops machine_ops = {
-	.power_off = machine_power_off,
-	.shutdown = machine_shutdown,
-	.emergency_restart = machine_emergency_restart,
-	.restart = machine_restart,
-	.halt = machine_halt
+	.power_off = native_machine_power_off,
+	.shutdown = native_machine_shutdown,
+	.emergency_restart = native_machine_emergency_restart,
+	.restart = native_machine_restart,
+	.halt = native_machine_halt
 };
+
+void machine_power_off(void)
+{
+	machine_ops.power_off();
+}
+
+void machine_shutdown(void)
+{
+	machine_ops.shutdown();
+}
+
+void machine_emergency_restart(void)
+{
+	machine_ops.emergency_restart();
+}
+
+void machine_restart(char *cmd)
+{
+	machine_ops.restart(cmd);
+}
+
+void machine_halt(void)
+{
+	machine_ops.halt();
+}
+

From 37cc8d7f963ba2deec29c9b68716944516a3244f Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Wed, 13 Feb 2008 16:20:35 +0100
Subject: [PATCH 30/66] x86/early_ioremap: don't assume we're using
 swapper_pg_dir

At the early stages of boot, before the kernel pagetable has been
fully initialized, a Xen kernel will still be running off the
Xen-provided pagetables rather than swapper_pg_dir[].  Therefore,
readback cr3 to determine the base of the pagetable rather than
assuming swapper_pg_dir[].

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Tested-by: Jody Belka <knew-linux@pimb.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/ioremap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index a4897a85268a..9f42d7e9c158 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -265,7 +265,9 @@ static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
 
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
-	pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)];
+	/* Don't assume we're using swapper_pg_dir at this point */
+	pgd_t *base = __va(read_cr3());
+	pgd_t *pgd = &base[pgd_index(addr)];
 	pud_t *pud = pud_offset(pgd, addr);
 	pmd_t *pmd = pmd_offset(pud, addr);
 

From 2b5407811db755257ae53c75cc6b312ed5d2ad9e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Wed, 13 Feb 2008 16:20:35 +0100
Subject: [PATCH 31/66] xen: unpin initial Xen pagetable once we're finished
 with it

Unpin the Xen-provided pagetable once we've finished with it, so it
doesn't cause stray references which cause later swapper_pg_dir
pagetable updates to fail.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Tested-by: Jody Belka <knew-linux@pimb.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/xen/enlighten.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index de647bc6e74d..49e5358f481a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -798,6 +798,10 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
 	 * added to the table can be prepared properly for Xen.
 	 */
 	xen_write_cr3(__pa(base));
+
+	/* Unpin initial Xen pagetable */
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
+			  PFN_DOWN(__pa(xen_start_info->pt_base)));
 }
 
 static __init void xen_pagetable_setup_done(pgd_t *base)

From 5d3c8b21e22712137db6bbd246d1bdcbe0a09914 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 13 Feb 2008 16:20:35 +0100
Subject: [PATCH 32/66] x86: CPA: fix gbpages support in
 try_preserve_large_page

[ mingo@elte.hu: while gbpages cannot be enabled on mainline currently,
  keep the code uptodate and this fix is easy enough. ]

Use correct page sizes and masks for GB pages in try_preserve_large_page()

This prevents a boot hang on a GB capable system with CONFIG_DIRECT_GBPAGES
enabled.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/pageattr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 440210a2277d..bd61ed13f9cf 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -275,8 +275,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 		break;
 #ifdef CONFIG_X86_64
 	case PG_LEVEL_1G:
-		psize = PMD_PAGE_SIZE;
-		pmask = PMD_PAGE_MASK;
+		psize = PUD_PAGE_SIZE;
+		pmask = PUD_PAGE_MASK;
 		break;
 #endif
 	default:

From e85f20518bb928667508c22090c85d458e25a4f7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Feb 2008 19:46:48 +0100
Subject: [PATCH 33/66] x86: EFI: fix use of unitialized variable and the cache
 logic

Andi Kleen pointed out that the cache attribute logic is reverse in
efi_enter_virtual_mode(). This problem alone is harmless as we do not
(yet) do cache attribute conflict resolution. (This bug was not present
in the original EFI submission - I introduced it while fixing up rejects.)

While reviewing this code I noticed a second, worse problem: the use of
uninitialized md->virt_addr.

Fix both problems.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/efi.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 32dd62b36ff7..b4d523276f40 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -428,9 +428,6 @@ void __init efi_enter_virtual_mode(void)
 		else
 			va = efi_ioremap(md->phys_addr, size);
 
-		if (md->attribute & EFI_MEMORY_WB)
-			set_memory_uc(md->virt_addr, size);
-
 		md->virt_addr = (u64) (unsigned long) va;
 
 		if (!va) {
@@ -439,6 +436,9 @@ void __init efi_enter_virtual_mode(void)
 			continue;
 		}
 
+		if (!(md->attribute & EFI_MEMORY_WB))
+			set_memory_uc(md->virt_addr, size);
+
 		systab = (u64) (unsigned long) efi_phys.systab;
 		if (md->phys_addr <= systab && systab < end) {
 			systab += md->virt_addr - md->phys_addr;

From 4de0d4a6d173351b023ab2855c3d331146a418e5 Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Wed, 13 Feb 2008 17:22:41 +0800
Subject: [PATCH 34/66] x86: EFI runtime code mapping enhancement

This patch enhances EFI runtime code memory mapping as following:

- Move __supported_pte_mask & _PAGE_NX checking before invoking
  runtime_code_page_mkexec(). This makes it possible for compiler to
  eliminate runtime_code_page_mkexec() on machine without NX support.

- Use set_memory_x/nx in early_mapping_set_exec(). This eliminates the
  duplicated implementation.

This patch has been tested on Intel x86_64 platform with EFI64/32
firmware.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/efi.c    |  6 ++----
 arch/x86/kernel/efi_64.c | 32 +++++++++++++-------------------
 2 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index b4d523276f40..cbdf9bacc575 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -384,9 +384,6 @@ static void __init runtime_code_page_mkexec(void)
 	efi_memory_desc_t *md;
 	void *p;
 
-	if (!(__supported_pte_mask & _PAGE_NX))
-		return;
-
 	/* Make EFI runtime service code area executable */
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
 		md = p;
@@ -476,7 +473,8 @@ void __init efi_enter_virtual_mode(void)
 	efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
 	efi.reset_system = virt_efi_reset_system;
 	efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
-	runtime_code_page_mkexec();
+	if (__supported_pte_mask & _PAGE_NX)
+		runtime_code_page_mkexec();
 	early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
 	memmap.map = NULL;
 }
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 09d5c2330934..d143a1e76b30 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -35,6 +35,7 @@
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
 #include <asm/efi.h>
+#include <asm/cacheflush.h>
 
 static pgd_t save_pgd __initdata;
 static unsigned long efi_flags __initdata;
@@ -43,22 +44,15 @@ static void __init early_mapping_set_exec(unsigned long start,
 					  unsigned long end,
 					  int executable)
 {
-	pte_t *kpte;
-	unsigned int level;
-
-	while (start < end) {
-		kpte = lookup_address((unsigned long)__va(start), &level);
-		BUG_ON(!kpte);
-		if (executable)
-			set_pte(kpte, pte_mkexec(*kpte));
-		else
-			set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \
-					    __supported_pte_mask));
-		if (level == PG_LEVEL_4K)
-			start = (start + PAGE_SIZE) & PAGE_MASK;
-		else
-			start = (start + PMD_SIZE) & PMD_MASK;
-	}
+	unsigned long num_pages;
+
+	start &= PMD_MASK;
+	end = (end + PMD_SIZE - 1) & PMD_MASK;
+	num_pages = (end - start) >> PAGE_SHIFT;
+	if (executable)
+		set_memory_x((unsigned long)__va(start), num_pages);
+	else
+		set_memory_nx((unsigned long)__va(start), num_pages);
 }
 
 static void __init early_runtime_code_mapping_set_exec(int executable)
@@ -74,7 +68,7 @@ static void __init early_runtime_code_mapping_set_exec(int executable)
 		md = p;
 		if (md->type == EFI_RUNTIME_SERVICES_CODE) {
 			unsigned long end;
-			end = md->phys_addr + (md->num_pages << PAGE_SHIFT);
+			end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
 			early_mapping_set_exec(md->phys_addr, end, executable);
 		}
 	}
@@ -84,8 +78,8 @@ void __init efi_call_phys_prelog(void)
 {
 	unsigned long vaddress;
 
-	local_irq_save(efi_flags);
 	early_runtime_code_mapping_set_exec(1);
+	local_irq_save(efi_flags);
 	vaddress = (unsigned long)__va(0x0UL);
 	save_pgd = *pgd_offset_k(0x0UL);
 	set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
@@ -98,9 +92,9 @@ void __init efi_call_phys_epilog(void)
 	 * After the lock is released, the original page table is restored.
 	 */
 	set_pgd(pgd_offset_k(0x0UL), save_pgd);
-	early_runtime_code_mapping_set_exec(0);
 	__flush_tlb_all();
 	local_irq_restore(efi_flags);
+	early_runtime_code_mapping_set_exec(0);
 }
 
 void __init efi_reserve_bootmem(void)

From c2a9cc7e86cf535a4fa14aebf5c3bc3349d09603 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 12 Feb 2008 12:10:27 -0800
Subject: [PATCH 35/66] x86: pit_clockevent can be static

arch/x86/kernel/i8253.c:98:27: warning: symbol 'pit_clockevent' was not declared. Should it be static?

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/i8253.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index ef62b07b2b48..8540abe86ade 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
  * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
  * !using_apic_timer decisions in do_timer_interrupt_hook()
  */
-struct clock_event_device pit_clockevent = {
+static struct clock_event_device pit_clockevent = {
 	.name		= "pit",
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
 	.set_mode	= init_pit_timer,

From 61c92814dc324b541391757062ff02fbf3b08086 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 12 Feb 2008 19:35:22 +0200
Subject: [PATCH 36/66] [SCSI] gdth: scan for scsi devices

The patch: "gdth: switch to modern scsi host registration"

missed one simple fact when moving a way from scsi_module.c.
That is to call scsi_scan_host() on the probed host.
With this the gdth driver from 2.6.24 is again able to
see drives and boot.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Tested-by: Joerg Dorchain <joerg@dorchain.net>
Tested-by: Stefan Priebe <s.priebe@allied-internet.ag>
Tested-by: Jon Chelton <jchelton@ffpglobal.com>
Cc: Stable Tree <stable@kernel.org>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/gdth.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index c82523908c2e..7079fef383ec 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -4836,6 +4836,9 @@ static int __init gdth_isa_probe_one(ulong32 isa_bios)
 	if (error)
 		goto out_free_coal_stat;
 	list_add_tail(&ha->list, &gdth_instances);
+
+	scsi_scan_host(shp);
+
 	return 0;
 
  out_free_coal_stat:
@@ -4963,6 +4966,9 @@ static int __init gdth_eisa_probe_one(ushort eisa_slot)
 	if (error)
 		goto out_free_coal_stat;
 	list_add_tail(&ha->list, &gdth_instances);
+
+	scsi_scan_host(shp);
+
 	return 0;
 
  out_free_ccb_phys:
@@ -5100,6 +5106,9 @@ static int __init gdth_pci_probe_one(gdth_pci_str *pcistr, int ctr)
 	if (error)
 		goto out_free_coal_stat;
 	list_add_tail(&ha->list, &gdth_instances);
+
+	scsi_scan_host(shp);
+
 	return 0;
 
  out_free_coal_stat:

From 99109301d103fbf0de43fc5a580a406c12a501e0 Mon Sep 17 00:00:00 2001
From: Sergio Luis <sergio@larces.uece.br>
Date: Tue, 12 Feb 2008 20:48:03 -0300
Subject: [PATCH 37/66] [SCSI] gdth: update deprecated pci_find_device

Fix compilation warning in gdth.c, which was using the deprecated
pci_find_device.

drivers/scsi/gdth.c:645: warning: 'pci_find_device' is deprecated (declared at include/linux/pci.h:495)

Changing it to use pci_get_device, instead.

Signed-off-by: Sergio Luis <sergio@larces.uece.br>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/scsi/Kconfig | 2 +-
 drivers/scsi/gdth.c  | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index a5f0aaaf0dd4..a7a0813b24cb 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -722,7 +722,7 @@ config SCSI_FD_MCS
 
 config SCSI_GDTH
 	tristate "Intel/ICP (former GDT SCSI Disk Array) RAID Controller support"
-	depends on (ISA || EISA || PCI) && SCSI && ISA_DMA_API && PCI_LEGACY
+	depends on (ISA || EISA || PCI) && SCSI && ISA_DMA_API
 	---help---
 	  Formerly called GDT SCSI Disk Array Controller Support.
 
diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index 7079fef383ec..6d67f5c0eb8e 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -642,12 +642,15 @@ static void __init gdth_search_dev(gdth_pci_str *pcistr, ushort *cnt,
           *cnt, vendor, device));
 
     pdev = NULL;
-    while ((pdev = pci_find_device(vendor, device, pdev)) 
+    while ((pdev = pci_get_device(vendor, device, pdev))
            != NULL) {
         if (pci_enable_device(pdev))
             continue;
-        if (*cnt >= MAXHA)
+        if (*cnt >= MAXHA) {
+            pci_dev_put(pdev);
             return;
+        }
+
         /* GDT PCI controller found, resources are already in pdev */
         pcistr[*cnt].pdev = pdev;
         pcistr[*cnt].irq = pdev->irq;

From e6bafba5b4765a5a252f1b8d31cbf6d2459da337 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 13 Feb 2008 04:03:25 +0000
Subject: [PATCH 38/66] wmi: (!x & y) strikes again

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Carlos Corbacho <carlos@strangeworlds.co.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/wmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/wmi.c b/drivers/acpi/wmi.c
index 457ed3d3f51c..efacc9f8bfe3 100644
--- a/drivers/acpi/wmi.c
+++ b/drivers/acpi/wmi.c
@@ -247,7 +247,7 @@ u32 method_id, const struct acpi_buffer *in, struct acpi_buffer *out)
 	block = &wblock->gblock;
 	handle = wblock->handle;
 
-	if (!block->flags & ACPI_WMI_METHOD)
+	if (!(block->flags & ACPI_WMI_METHOD))
 		return AE_BAD_DATA;
 
 	if (block->instance_count < instance)

From 39ed7adb17bdec8224bd3fae551bb7222e05f35b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 13 Feb 2008 03:53:00 +0000
Subject: [PATCH 39/66] dm-raid1 breakage on 64bit

test_and_set_bit() on address of uint32_t is a Bad Idea(tm)...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/dm-raid1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index edc057f5cdcc..2928ef228101 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -124,7 +124,7 @@ enum dm_raid1_error {
 struct mirror {
 	struct mirror_set *ms;
 	atomic_t error_count;
-	uint32_t error_type;
+	unsigned long error_type;
 	struct dm_dev *dev;
 	sector_t offset;
 };

From 282ea441e003f2886893ab7bb60bfe29399ef7be Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 13 Feb 2008 03:56:59 +0000
Subject: [PATCH 40/66] drivers/memstick/host/tifm_ms.c breakage

writel(sock + ...) that should've been writel(sock->addr + ...)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/memstick/host/tifm_ms.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/memstick/host/tifm_ms.c b/drivers/memstick/host/tifm_ms.c
index f55b71a4337d..4fb24215bd95 100644
--- a/drivers/memstick/host/tifm_ms.c
+++ b/drivers/memstick/host/tifm_ms.c
@@ -282,7 +282,7 @@ static int tifm_ms_issue_cmd(struct tifm_ms *host)
 
 			writel(TIFM_MS_SYS_LATCH
 			       | readl(sock->addr + SOCK_MS_SYSTEM),
-			       sock + SOCK_MS_SYSTEM);
+			       sock->addr + SOCK_MS_SYSTEM);
 			writel(0, sock->addr + SOCK_MS_DATA);
 			dev_dbg(&sock->dev, "writing %x\n", 0);
 

From d897d2b597167586fcf1fb197ad5a1c23332c3e8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 13 Feb 2008 16:10:08 +0000
Subject: [PATCH 41/66] FRV: Fix up parse error in linker script

Fix up parse error in FRV linker script, presumably introduced through changes
to the INIT_TEXT and EXIT_TEXT macros.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/kernel/vmlinux.lds.S | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S
index ef7527b8b0c7..17725a55aed8 100644
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -105,11 +105,9 @@ SECTIONS
 	SCHED_TEXT
 	LOCK_TEXT
 #ifdef CONFIG_DEBUG_INFO
-	*(
 	INIT_TEXT
 	EXIT_TEXT
-	.exitcall.exit
-	)
+	*(.exitcall.exit)
 #endif
 	*(.fixup)
 	*(.gnu.warning)

From 10270d4838bdc493781f5a1cf2e90e9c34c9142f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Wed, 13 Feb 2008 09:56:14 -0800
Subject: [PATCH 42/66] acpi: fix acpi_os_read_pci_configuration() misuse of
 raw_pci_read()

The raw_pci_read() interface (as the raw_pci_ops->read() before it)
unconditionally fills in a 32-bit integer return value regardless of the
size of the operation requested.

So claiming to take a "void *" is wrong, as is passing in a pointer to
just a byte variable.

Noticed by pageexec when enabling -fstack-protector (which needs other
patches too to actually work, but that's a separate issue).

Acked-by: Len Brown <len.brown@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/osl.c      | 16 ++++++++--------
 include/acpi/acpiosxf.h |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 34b3386dedca..15e602377655 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -623,7 +623,7 @@ acpi_os_write_memory(acpi_physical_address phys_addr, u32 value, u32 width)
 
 acpi_status
 acpi_os_read_pci_configuration(struct acpi_pci_id * pci_id, u32 reg,
-			       void *value, u32 width)
+			       u32 *value, u32 width)
 {
 	int result, size;
 
@@ -689,7 +689,6 @@ static void acpi_os_derive_pci_id_2(acpi_handle rhandle,	/* upper bound  */
 	acpi_status status;
 	unsigned long temp;
 	acpi_object_type type;
-	u8 tu8;
 
 	acpi_get_parent(chandle, &handle);
 	if (handle != rhandle) {
@@ -704,6 +703,7 @@ static void acpi_os_derive_pci_id_2(acpi_handle rhandle,	/* upper bound  */
 		    acpi_evaluate_integer(handle, METHOD_NAME__ADR, NULL,
 					  &temp);
 		if (ACPI_SUCCESS(status)) {
+			u32 val;
 			pci_id->device = ACPI_HIWORD(ACPI_LODWORD(temp));
 			pci_id->function = ACPI_LOWORD(ACPI_LODWORD(temp));
 
@@ -712,24 +712,24 @@ static void acpi_os_derive_pci_id_2(acpi_handle rhandle,	/* upper bound  */
 
 			/* any nicer way to get bus number of bridge ? */
 			status =
-			    acpi_os_read_pci_configuration(pci_id, 0x0e, &tu8,
+			    acpi_os_read_pci_configuration(pci_id, 0x0e, &val,
 							   8);
 			if (ACPI_SUCCESS(status)
-			    && ((tu8 & 0x7f) == 1 || (tu8 & 0x7f) == 2)) {
+			    && ((val & 0x7f) == 1 || (val & 0x7f) == 2)) {
 				status =
 				    acpi_os_read_pci_configuration(pci_id, 0x18,
-								   &tu8, 8);
+								   &val, 8);
 				if (!ACPI_SUCCESS(status)) {
 					/* Certainly broken...  FIX ME */
 					return;
 				}
 				*is_bridge = 1;
-				pci_id->bus = tu8;
+				pci_id->bus = val;
 				status =
 				    acpi_os_read_pci_configuration(pci_id, 0x19,
-								   &tu8, 8);
+								   &val, 8);
 				if (ACPI_SUCCESS(status)) {
-					*bus_number = tu8;
+					*bus_number = val;
 				}
 			} else
 				*is_bridge = 0;
diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h
index 022a5fd80c8e..4839f2af94c3 100644
--- a/include/acpi/acpiosxf.h
+++ b/include/acpi/acpiosxf.h
@@ -222,7 +222,7 @@ acpi_os_write_memory(acpi_physical_address address, u32 value, u32 width);
  */
 acpi_status
 acpi_os_read_pci_configuration(struct acpi_pci_id *pci_id,
-			       u32 reg, void *value, u32 width);
+			       u32 reg, u32 *value, u32 width);
 
 acpi_status
 acpi_os_write_pci_configuration(struct acpi_pci_id *pci_id,

From aa02cd2d9bd1e24a230bd66a0a741b984d03915a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 Feb 2008 21:33:16 +0100
Subject: [PATCH 43/66] xtime_lock vs update_process_times

Commit d3d74453c34f8fd87674a8cf5b8a327c68f22e99 ("hrtimer: fixup the
HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback") broke several archs, and since
only Russell bothered to merge the fix, and Greg to ACK his arch, I'm
sending this for merger.

I have confirmation that the Alpha bit results in a booting kernel.
That leaves: blackfin, frv, sh and sparc untested.

The deadlock in question was found by Russell:

  IRQ handle
    -> timer_tick() - xtime seqlock held for write
      -> update_process_times()
        -> run_local_timers()
          -> hrtimer_run_queues()
            -> hrtimer_get_softirq_time() - tries to get a read lock

Now, Thomas assures me the fix is trivial, only do_timer() needs to be
done under the xtime_lock, and update_process_times() can savely be
removed from under it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Greg Ungerer <gerg@uclinux.org>
CC: Richard Henderson <rth@twiddle.net>
CC: Bryan Wu <bryan.wu@analog.com>
CC: David Howells <dhowells@redhat.com>
CC: Paul Mundt <lethal@linux-sh.org>
CC: William Irwin <wli@holomorphy.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/time.c           | 15 ++++++++-------
 arch/blackfin/kernel/time.c        |  8 +++++---
 arch/frv/kernel/time.c             |  6 ++++--
 arch/m68knommu/kernel/time.c       | 12 +++++++-----
 arch/sh/kernel/timers/timer-cmt.c  |  9 ---------
 arch/sh/kernel/timers/timer-mtu2.c |  2 --
 arch/sparc/kernel/pcic.c           |  2 +-
 arch/sparc/kernel/time.c           |  7 +++----
 8 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 1dd50d07693c..75480cab0893 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -119,13 +119,8 @@ irqreturn_t timer_interrupt(int irq, void *dev)
 	state.partial_tick = delta & ((1UL << FIX_SHIFT) - 1); 
 	nticks = delta >> FIX_SHIFT;
 
-	while (nticks > 0) {
-		do_timer(1);
-#ifndef CONFIG_SMP
-		update_process_times(user_mode(get_irq_regs()));
-#endif
-		nticks--;
-	}
+	if (nticks)
+		do_timer(nticks);
 
 	/*
 	 * If we have an externally synchronized Linux clock, then update
@@ -141,6 +136,12 @@ irqreturn_t timer_interrupt(int irq, void *dev)
 	}
 
 	write_sequnlock(&xtime_lock);
+
+#ifndef CONFIG_SMP
+	while (nticks--)
+		update_process_times(user_mode(get_irq_regs()));
+#endif
+
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c
index 5bd64e341df3..9bdc8f99183a 100644
--- a/arch/blackfin/kernel/time.c
+++ b/arch/blackfin/kernel/time.c
@@ -137,9 +137,6 @@ irqreturn_t timer_interrupt(int irq, void *dummy)
 
 	do_timer(1);
 
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(get_irq_regs()));
-#endif
 	profile_tick(CPU_PROFILING);
 
 	/*
@@ -161,6 +158,11 @@ irqreturn_t timer_interrupt(int irq, void *dummy)
 			last_rtc_update = xtime.tv_sec - 600;
 	}
 	write_sequnlock(&xtime_lock);
+
+#ifndef CONFIG_SMP
+	update_process_times(user_mode(get_irq_regs()));
+#endif
+
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index 925fb0199a0f..69f6a4ef5d61 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -63,6 +63,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy)
 	/* last time the cmos clock got updated */
 	static long last_rtc_update = 0;
 
+	profile_tick(CPU_PROFILING);
 	/*
 	 * Here we are in the timer irq handler. We just have irqs locally
 	 * disabled but we don't know if the timer_bh is running on the other
@@ -73,8 +74,6 @@ static irqreturn_t timer_interrupt(int irq, void *dummy)
 	write_seqlock(&xtime_lock);
 
 	do_timer(1);
-	update_process_times(user_mode(get_irq_regs()));
-	profile_tick(CPU_PROFILING);
 
 	/*
 	 * If we have an externally synchronized Linux clock, then update
@@ -99,6 +98,9 @@ static irqreturn_t timer_interrupt(int irq, void *dummy)
 #endif /* CONFIG_HEARTBEAT */
 
 	write_sequnlock(&xtime_lock);
+
+	update_process_times(user_mode(get_irq_regs()));
+
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c
index 89cdbcaeb45f..0ccfb2ad6380 100644
--- a/arch/m68knommu/kernel/time.c
+++ b/arch/m68knommu/kernel/time.c
@@ -42,14 +42,12 @@ irqreturn_t arch_timer_interrupt(int irq, void *dummy)
 	/* last time the cmos clock got updated */
 	static long last_rtc_update=0;
 
+	if (current->pid)
+		profile_tick(CPU_PROFILING);
+
 	write_seqlock(&xtime_lock);
 
 	do_timer(1);
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(get_irq_regs()));
-#endif
-	if (current->pid)
-		profile_tick(CPU_PROFILING);
 
 	/*
 	 * If we have an externally synchronized Linux clock, then update
@@ -67,6 +65,10 @@ irqreturn_t arch_timer_interrupt(int irq, void *dummy)
 	}
 
 	write_sequnlock(&xtime_lock);
+
+#ifndef CONFIG_SMP
+	update_process_times(user_mode(get_irq_regs()));
+#endif
 	return(IRQ_HANDLED);
 }
 
diff --git a/arch/sh/kernel/timers/timer-cmt.c b/arch/sh/kernel/timers/timer-cmt.c
index 499e07beebe2..71312324b5de 100644
--- a/arch/sh/kernel/timers/timer-cmt.c
+++ b/arch/sh/kernel/timers/timer-cmt.c
@@ -100,16 +100,7 @@ static irqreturn_t cmt_timer_interrupt(int irq, void *dev_id)
 	timer_status &= ~0x80;
 	ctrl_outw(timer_status, CMT_CMCSR_0);
 
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
 	handle_timer_tick();
-	write_sequnlock(&xtime_lock);
 
 	return IRQ_HANDLED;
 }
diff --git a/arch/sh/kernel/timers/timer-mtu2.c b/arch/sh/kernel/timers/timer-mtu2.c
index b7499a2a9188..463cd08f9517 100644
--- a/arch/sh/kernel/timers/timer-mtu2.c
+++ b/arch/sh/kernel/timers/timer-mtu2.c
@@ -100,9 +100,7 @@ static irqreturn_t mtu2_timer_interrupt(int irq, void *dev_id)
 	ctrl_outb(timer_status, MTU2_TSR_1);
 
 	/* Do timer tick */
-	write_seqlock(&xtime_lock);
 	handle_timer_tick();
-	write_sequnlock(&xtime_lock);
 
 	return IRQ_HANDLED;
 }
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index 4cd5d7818dc6..a6a6f9823370 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -713,10 +713,10 @@ static irqreturn_t pcic_timer_handler (int irq, void *h)
 	write_seqlock(&xtime_lock);	/* Dummy, to show that we remember */
 	pcic_clear_clock_irq();
 	do_timer(1);
+	write_sequnlock(&xtime_lock);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(get_irq_regs()));
 #endif
-	write_sequnlock(&xtime_lock);
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c
index 00b393c3a4a0..cfaf22c05bc4 100644
--- a/arch/sparc/kernel/time.c
+++ b/arch/sparc/kernel/time.c
@@ -128,10 +128,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
 	clear_clock_irq();
 
 	do_timer(1);
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(get_irq_regs()));
-#endif
-
 
 	/* Determine when to update the Mostek clock. */
 	if (ntp_synced() &&
@@ -145,6 +141,9 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
 	}
 	write_sequnlock(&xtime_lock);
 
+#ifndef CONFIG_SMP
+	update_process_times(user_mode(get_irq_regs()));
+#endif
 	return IRQ_HANDLED;
 }
 

From b3c97528689619fc66569b30bf83d09d9929521a Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 13 Feb 2008 15:03:15 -0800
Subject: [PATCH 44/66] include/linux: Remove all users of FASTCALL() macro

FASTCALL() is always expanded to empty, remove it.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/aio.h            | 20 ++++++++++----------
 include/linux/buffer_head.h    |  6 +++---
 include/linux/file.h           | 16 ++++++++--------
 include/linux/gfp.h            | 15 +++++++--------
 include/linux/interrupt.h      |  8 ++++----
 include/linux/mm.h             |  4 ++--
 include/linux/mutex-debug.h    |  2 +-
 include/linux/namei.h          |  6 +++---
 include/linux/netdevice.h      |  2 +-
 include/linux/pagemap.h        | 10 +++++-----
 include/linux/pid.h            | 21 ++++++++++-----------
 include/linux/rwsem-spinlock.h | 16 ++++++++--------
 include/linux/sched.h          | 14 +++++++-------
 include/linux/swap.h           |  8 ++++----
 include/linux/wait.h           | 34 ++++++++++++++++------------------
 include/linux/workqueue.h      | 13 ++++++-------
 16 files changed, 95 insertions(+), 100 deletions(-)

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 7ef8de662001..a9931e2e5624 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -206,21 +206,21 @@ struct kioctx {
 /* prototypes */
 extern unsigned aio_max_size;
 
-extern ssize_t FASTCALL(wait_on_sync_kiocb(struct kiocb *iocb));
-extern int FASTCALL(aio_put_req(struct kiocb *iocb));
-extern void FASTCALL(kick_iocb(struct kiocb *iocb));
-extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2));
-extern void FASTCALL(__put_ioctx(struct kioctx *ctx));
+extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
+extern int aio_put_req(struct kiocb *iocb);
+extern void kick_iocb(struct kiocb *iocb);
+extern int aio_complete(struct kiocb *iocb, long res, long res2);
+extern void __put_ioctx(struct kioctx *ctx);
 struct mm_struct;
-extern void FASTCALL(exit_aio(struct mm_struct *mm));
+extern void exit_aio(struct mm_struct *mm);
 extern struct kioctx *lookup_ioctx(unsigned long ctx_id);
-extern int FASTCALL(io_submit_one(struct kioctx *ctx,
-			struct iocb __user *user_iocb, struct iocb *iocb));
+extern int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
+			 struct iocb *iocb);
 
 /* semi private, but used by the 32bit emulations: */
 struct kioctx *lookup_ioctx(unsigned long ctx_id);
-int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-				  struct iocb *iocb));
+int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
+		  struct iocb *iocb);
 
 #define get_ioctx(kioctx) do {						\
 	BUG_ON(atomic_read(&(kioctx)->users) <= 0);			\
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index e98801f06dcc..932eb02a2753 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -144,7 +144,7 @@ BUFFER_FNS(Unwritten, unwritten)
  * Declarations
  */
 
-void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
+void mark_buffer_dirty(struct buffer_head *bh);
 void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
 void set_bh_page(struct buffer_head *bh,
 		struct page *page, unsigned long offset);
@@ -185,8 +185,8 @@ struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size
 void invalidate_bh_lrus(void);
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
-void FASTCALL(unlock_buffer(struct buffer_head *bh));
-void FASTCALL(__lock_buffer(struct buffer_head *bh));
+void unlock_buffer(struct buffer_head *bh);
+void __lock_buffer(struct buffer_head *bh);
 void ll_rw_block(int, int, struct buffer_head * bh[]);
 int sync_dirty_buffer(struct buffer_head *bh);
 int submit_bh(int, struct buffer_head *);
diff --git a/include/linux/file.h b/include/linux/file.h
index 56023c74e9fd..7239baac81a9 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -59,8 +59,8 @@ struct files_struct {
 
 extern struct kmem_cache *filp_cachep;
 
-extern void FASTCALL(__fput(struct file *));
-extern void FASTCALL(fput(struct file *));
+extern void __fput(struct file *);
+extern void fput(struct file *);
 
 struct file_operations;
 struct vfsmount;
@@ -77,13 +77,13 @@ static inline void fput_light(struct file *file, int fput_needed)
 		fput(file);
 }
 
-extern struct file * FASTCALL(fget(unsigned int fd));
-extern struct file * FASTCALL(fget_light(unsigned int fd, int *fput_needed));
-extern void FASTCALL(set_close_on_exec(unsigned int fd, int flag));
+extern struct file *fget(unsigned int fd);
+extern struct file *fget_light(unsigned int fd, int *fput_needed);
+extern void set_close_on_exec(unsigned int fd, int flag);
 extern void put_filp(struct file *);
 extern int get_unused_fd(void);
 extern int get_unused_fd_flags(int flags);
-extern void FASTCALL(put_unused_fd(unsigned int fd));
+extern void put_unused_fd(unsigned int fd);
 struct kmem_cache;
 
 extern int expand_files(struct files_struct *, int nr);
@@ -110,12 +110,12 @@ static inline struct file * fcheck_files(struct files_struct *files, unsigned in
  */
 #define fcheck(fd)	fcheck_files(current->files, fd)
 
-extern void FASTCALL(fd_install(unsigned int fd, struct file * file));
+extern void fd_install(unsigned int fd, struct file *file);
 
 struct task_struct;
 
 struct files_struct *get_files_struct(struct task_struct *);
-void FASTCALL(put_files_struct(struct files_struct *fs));
+void put_files_struct(struct files_struct *fs);
 void reset_files_struct(struct task_struct *, struct files_struct *);
 
 extern struct kmem_cache *files_cachep;
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0c6ce515185d..164be9da3c1b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -172,8 +172,7 @@ static inline void arch_free_page(struct page *page, int order) { }
 static inline void arch_alloc_page(struct page *page, int order) { }
 #endif
 
-extern struct page *
-FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
+extern struct page *__alloc_pages(gfp_t, unsigned int, struct zonelist *);
 
 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
@@ -209,8 +208,8 @@ extern struct page *alloc_page_vma(gfp_t gfp_mask,
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 
-extern unsigned long FASTCALL(__get_free_pages(gfp_t gfp_mask, unsigned int order));
-extern unsigned long FASTCALL(get_zeroed_page(gfp_t gfp_mask));
+extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
+extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
 #define __get_free_page(gfp_mask) \
 		__get_free_pages((gfp_mask),0)
@@ -218,10 +217,10 @@ extern unsigned long FASTCALL(get_zeroed_page(gfp_t gfp_mask));
 #define __get_dma_pages(gfp_mask, order) \
 		__get_free_pages((gfp_mask) | GFP_DMA,(order))
 
-extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
-extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
-extern void FASTCALL(free_hot_page(struct page *page));
-extern void FASTCALL(free_cold_page(struct page *page));
+extern void __free_pages(struct page *page, unsigned int order);
+extern void free_pages(unsigned long addr, unsigned int order);
+extern void free_hot_page(struct page *page);
+extern void free_cold_page(struct page *page);
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr),0)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index dea7598aeff4..f8ab4ce70564 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -273,8 +273,8 @@ asmlinkage void do_softirq(void);
 extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data);
 extern void softirq_init(void);
 #define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
-extern void FASTCALL(raise_softirq_irqoff(unsigned int nr));
-extern void FASTCALL(raise_softirq(unsigned int nr));
+extern void raise_softirq_irqoff(unsigned int nr);
+extern void raise_softirq(unsigned int nr);
 
 
 /* Tasklets --- multithreaded analogue of BHs.
@@ -341,7 +341,7 @@ static inline void tasklet_unlock_wait(struct tasklet_struct *t)
 #define tasklet_unlock(t) do { } while (0)
 #endif
 
-extern void FASTCALL(__tasklet_schedule(struct tasklet_struct *t));
+extern void __tasklet_schedule(struct tasklet_struct *t);
 
 static inline void tasklet_schedule(struct tasklet_struct *t)
 {
@@ -349,7 +349,7 @@ static inline void tasklet_schedule(struct tasklet_struct *t)
 		__tasklet_schedule(t);
 }
 
-extern void FASTCALL(__tasklet_hi_schedule(struct tasklet_struct *t));
+extern void __tasklet_hi_schedule(struct tasklet_struct *t);
 
 static inline void tasklet_hi_schedule(struct tasklet_struct *t)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8abb3814209..26c7124b841a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -786,7 +786,7 @@ int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
-int FASTCALL(set_page_dirty(struct page *page));
+int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
@@ -829,7 +829,7 @@ extern void unregister_shrinker(struct shrinker *);
 
 int vma_wants_writenotify(struct vm_area_struct *vma);
 
-extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl));
+extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl);
 
 #ifdef __PAGETABLE_PUD_FOLDED
 static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd,
diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
index 2537285e1064..731d77d6e155 100644
--- a/include/linux/mutex-debug.h
+++ b/include/linux/mutex-debug.h
@@ -18,6 +18,6 @@ do {									\
 	__mutex_init((mutex), #mutex, &__key);				\
 } while (0)
 
-extern void FASTCALL(mutex_destroy(struct mutex *lock));
+extern void mutex_destroy(struct mutex *lock);
 
 #endif
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 4cb4f8d2f78d..c13e411491f4 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -62,13 +62,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_ACCESS		(0x0400)
 #define LOOKUP_CHDIR		(0x0800)
 
-extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
-extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *));
+extern int __user_walk(const char __user *, unsigned, struct nameidata *);
+extern int __user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *);
 #define user_path_walk(name,nd) \
 	__user_walk_fd(AT_FDCWD, name, LOOKUP_FOLLOW, nd)
 #define user_path_walk_link(name,nd) \
 	__user_walk_fd(AT_FDCWD, name, 0, nd)
-extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
+extern int path_lookup(const char *, unsigned, struct nameidata *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct nameidata *);
 extern void path_release(struct nameidata *);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 047d432bde55..7128a02f1d37 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -322,7 +322,7 @@ enum
 	NAPI_STATE_DISABLE,	/* Disable pending */
 };
 
-extern void FASTCALL(__napi_schedule(struct napi_struct *n));
+extern void __napi_schedule(struct napi_struct *n);
 
 static inline int napi_disable_pending(struct napi_struct *n)
 {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b62a105622b..d2fca802f809 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -156,10 +156,10 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 	return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 }
 
-extern void FASTCALL(__lock_page(struct page *page));
-extern int FASTCALL(__lock_page_killable(struct page *page));
-extern void FASTCALL(__lock_page_nosync(struct page *page));
-extern void FASTCALL(unlock_page(struct page *page));
+extern void __lock_page(struct page *page);
+extern int __lock_page_killable(struct page *page);
+extern void __lock_page_nosync(struct page *page);
+extern void unlock_page(struct page *page);
 
 /*
  * lock_page may only be called if we have the page's inode pinned.
@@ -199,7 +199,7 @@ static inline void lock_page_nosync(struct page *page)
  * This is exported only for wait_on_page_locked/wait_on_page_writeback.
  * Never use this directly!
  */
-extern void FASTCALL(wait_on_page_bit(struct page *page, int bit_nr));
+extern void wait_on_page_bit(struct page *page, int bit_nr);
 
 /* 
  * Wait for a page to be unlocked.
diff --git a/include/linux/pid.h b/include/linux/pid.h
index f84d532b5d23..c7980810eb09 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -79,10 +79,9 @@ static inline struct pid *get_pid(struct pid *pid)
 	return pid;
 }
 
-extern void FASTCALL(put_pid(struct pid *pid));
-extern struct task_struct *FASTCALL(pid_task(struct pid *pid, enum pid_type));
-extern struct task_struct *FASTCALL(get_pid_task(struct pid *pid,
-						enum pid_type));
+extern void put_pid(struct pid *pid);
+extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
+extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
 
 extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
 
@@ -90,11 +89,11 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
  * attach_pid() and detach_pid() must be called with the tasklist_lock
  * write-held.
  */
-extern int FASTCALL(attach_pid(struct task_struct *task,
-				enum pid_type type, struct pid *pid));
-extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type));
-extern void FASTCALL(transfer_pid(struct task_struct *old,
-				  struct task_struct *new, enum pid_type));
+extern int attach_pid(struct task_struct *task, enum pid_type type,
+		      struct pid *pid);
+extern void detach_pid(struct task_struct *task, enum pid_type);
+extern void transfer_pid(struct task_struct *old, struct task_struct *new,
+			 enum pid_type);
 
 struct pid_namespace;
 extern struct pid_namespace init_pid_ns;
@@ -109,7 +108,7 @@ extern struct pid_namespace init_pid_ns;
  *
  * see also find_task_by_pid() set in include/linux/sched.h
  */
-extern struct pid *FASTCALL(find_pid_ns(int nr, struct pid_namespace *ns));
+extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
 extern struct pid *find_vpid(int nr);
 extern struct pid *find_pid(int nr);
 
@@ -121,7 +120,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
 int next_pidmap(struct pid_namespace *pid_ns, int last);
 
 extern struct pid *alloc_pid(struct pid_namespace *ns);
-extern void FASTCALL(free_pid(struct pid *pid));
+extern void free_pid(struct pid *pid);
 
 /*
  * the helpers to get the pid's id seen from different namespaces
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index 813cee13da0d..6c3c0f6c261f 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -60,14 +60,14 @@ do {								\
 	__init_rwsem((sem), #sem, &__key);			\
 } while (0)
 
-extern void FASTCALL(__down_read(struct rw_semaphore *sem));
-extern int FASTCALL(__down_read_trylock(struct rw_semaphore *sem));
-extern void FASTCALL(__down_write(struct rw_semaphore *sem));
-extern void FASTCALL(__down_write_nested(struct rw_semaphore *sem, int subclass));
-extern int FASTCALL(__down_write_trylock(struct rw_semaphore *sem));
-extern void FASTCALL(__up_read(struct rw_semaphore *sem));
-extern void FASTCALL(__up_write(struct rw_semaphore *sem));
-extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem));
+extern void __down_read(struct rw_semaphore *sem);
+extern int __down_read_trylock(struct rw_semaphore *sem);
+extern void __down_write(struct rw_semaphore *sem);
+extern void __down_write_nested(struct rw_semaphore *sem, int subclass);
+extern int __down_write_trylock(struct rw_semaphore *sem);
+extern void __up_read(struct rw_semaphore *sem);
+extern void __up_write(struct rw_semaphore *sem);
+extern void __downgrade_write(struct rw_semaphore *sem);
 
 static inline int rwsem_is_locked(struct rw_semaphore *sem)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b9bb313fe1ae..e217d188a102 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -323,7 +323,7 @@ extern char __sched_text_start[], __sched_text_end[];
 extern int in_sched_functions(unsigned long addr);
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
-extern signed long FASTCALL(schedule_timeout(signed long timeout));
+extern signed long schedule_timeout(signed long timeout);
 extern signed long schedule_timeout_interruptible(signed long timeout);
 extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
@@ -1648,10 +1648,10 @@ extern void release_uids(struct user_namespace *ns);
 
 extern void do_timer(unsigned long ticks);
 
-extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
-extern int FASTCALL(wake_up_process(struct task_struct * tsk));
-extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
-						unsigned long clone_flags));
+extern int wake_up_state(struct task_struct *tsk, unsigned int state);
+extern int wake_up_process(struct task_struct *tsk);
+extern void wake_up_new_task(struct task_struct *tsk,
+				unsigned long clone_flags);
 #ifdef CONFIG_SMP
  extern void kick_process(struct task_struct *tsk);
 #else
@@ -1741,7 +1741,7 @@ static inline int sas_ss_flags(unsigned long sp)
 extern struct mm_struct * mm_alloc(void);
 
 /* mmdrop drops the mm and the page tables */
-extern void FASTCALL(__mmdrop(struct mm_struct *));
+extern void __mmdrop(struct mm_struct *);
 static inline void mmdrop(struct mm_struct * mm)
 {
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
@@ -1925,7 +1925,7 @@ static inline int signal_pending(struct task_struct *p)
 	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
 }
 
-extern int FASTCALL(__fatal_signal_pending(struct task_struct *p));
+extern int __fatal_signal_pending(struct task_struct *p);
 
 static inline int fatal_signal_pending(struct task_struct *p)
 {
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3ca5c4bd6d3f..878459ae0454 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -171,10 +171,10 @@ extern unsigned int nr_free_pagecache_pages(void);
 
 
 /* linux/mm/swap.c */
-extern void FASTCALL(lru_cache_add(struct page *));
-extern void FASTCALL(lru_cache_add_active(struct page *));
-extern void FASTCALL(activate_page(struct page *));
-extern void FASTCALL(mark_page_accessed(struct page *));
+extern void lru_cache_add(struct page *);
+extern void lru_cache_add_active(struct page *);
+extern void activate_page(struct page *);
+extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
 extern int lru_add_drain_all(void);
 extern int rotate_reclaimable_page(struct page *page);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 33a2aa9e02f2..0081147a9fe8 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -117,9 +117,9 @@ static inline int waitqueue_active(wait_queue_head_t *q)
  */
 #define is_sync_wait(wait)	(!(wait) || ((wait)->private))
 
-extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
-extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
-extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
+extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait);
+extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
 
 static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
 {
@@ -141,16 +141,16 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
 	list_del(&old->task_list);
 }
 
-void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key));
-extern void FASTCALL(__wake_up_locked(wait_queue_head_t *q, unsigned int mode));
-extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
-void FASTCALL(__wake_up_bit(wait_queue_head_t *, void *, int));
-int FASTCALL(__wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned));
-int FASTCALL(__wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned));
-void FASTCALL(wake_up_bit(void *, int));
-int FASTCALL(out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned));
-int FASTCALL(out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned));
-wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int));
+void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
+extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
+extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
+void __wake_up_bit(wait_queue_head_t *, void *, int);
+int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
+int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
+void wake_up_bit(void *, int);
+int out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned);
+int out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned);
+wait_queue_head_t *bit_waitqueue(void *, int);
 
 #define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
 #define wake_up_nr(x, nr)		__wake_up(x, TASK_NORMAL, nr, NULL)
@@ -437,11 +437,9 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
 /*
  * Waitqueues which are removed from the waitqueue_head at wakeup time
  */
-void FASTCALL(prepare_to_wait(wait_queue_head_t *q,
-				wait_queue_t *wait, int state));
-void FASTCALL(prepare_to_wait_exclusive(wait_queue_head_t *q,
-				wait_queue_t *wait, int state));
-void FASTCALL(finish_wait(wait_queue_head_t *q, wait_queue_t *wait));
+void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
+void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
+void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 7f28c32d9aca..542526c6e8ef 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -178,18 +178,17 @@ __create_workqueue_key(const char *name, int singlethread,
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
-extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
-extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq,
-			struct delayed_work *work, unsigned long delay));
+extern int queue_work(struct workqueue_struct *wq, struct work_struct *work);
+extern int queue_delayed_work(struct workqueue_struct *wq,
+			struct delayed_work *work, unsigned long delay);
 extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			struct delayed_work *work, unsigned long delay);
 
-extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq));
+extern void flush_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
 
-extern int FASTCALL(schedule_work(struct work_struct *work));
-extern int FASTCALL(schedule_delayed_work(struct delayed_work *work,
-					unsigned long delay));
+extern int schedule_work(struct work_struct *work);
+extern int schedule_delayed_work(struct delayed_work *work, unsigned long delay);
 extern int schedule_delayed_work_on(int cpu, struct delayed_work *work,
 					unsigned long delay);
 extern int schedule_on_each_cpu(work_func_t func);

From fbf6bfca76d50abef478ba902b8597ecbadfd390 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 13 Feb 2008 15:03:15 -0800
Subject: [PATCH 45/66] rcupdate: fix comment

This comment caused some consternation during fastcall removal.  Make it
truthful.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/rcupdate.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 760dfc233a00..c09605f8d16c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
 
-/* Because of FASTCALL declaration of complete, we use this wrapper */
+/*
+ * Awaken the corresponding synchronize_rcu() instance now that a
+ * grace period has elapsed.
+ */
 static void wakeme_after_rcu(struct rcu_head  *head)
 {
 	struct rcu_synchronize *rcu;

From b5606c2d4447e80b1d72406af4e78af1eda611d4 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 13 Feb 2008 15:03:16 -0800
Subject: [PATCH 46/66] remove final fastcall users

fastcall always expands to empty, remove it.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/RCU/NMI-RCU.txt |  2 +-
 Documentation/kprobes.txt     | 11 +++++------
 kernel/signal.c               |  2 +-
 mm/filemap.c                  |  2 +-
 net/bluetooth/rfcomm/core.c   |  4 ++--
 net/core/dev.c                |  2 +-
 net/core/sock.c               |  4 ++--
 7 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt
index d0634a5c3445..c64158ecde43 100644
--- a/Documentation/RCU/NMI-RCU.txt
+++ b/Documentation/RCU/NMI-RCU.txt
@@ -25,7 +25,7 @@ the NMI handler to take the default machine-specific action.
 This nmi_callback variable is a global function pointer to the current
 NMI handler.
 
-	fastcall void do_nmi(struct pt_regs * regs, long error_code)
+	void do_nmi(struct pt_regs * regs, long error_code)
 	{
 		int cpu;
 
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 30c101761d0d..83f515c2905a 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -92,9 +92,8 @@ handler has run.  Up to MAX_STACK_SIZE bytes are copied -- e.g.,
 64 bytes on i386.
 
 Note that the probed function's args may be passed on the stack
-or in registers (e.g., for x86_64 or for an i386 fastcall function).
-The jprobe will work in either case, so long as the handler's
-prototype matches that of the probed function.
+or in registers.  The jprobe will work in either case, so long as the
+handler's prototype matches that of the probed function.
 
 1.3 Return Probes
 
@@ -270,9 +269,9 @@ Kprobes runs the handler whose address is jp->entry.
 The handler should have the same arg list and return type as the probed
 function; and just before it returns, it must call jprobe_return().
 (The handler never actually returns, since jprobe_return() returns
-control to Kprobes.)  If the probed function is declared asmlinkage,
-fastcall, or anything else that affects how args are passed, the
-handler's declaration must match.
+control to Kprobes.)  If the probed function is declared asmlinkage
+or anything else that affects how args are passed, the handler's
+declaration must match.
 
 register_jprobe() returns 0 on success, or a negative errno otherwise.
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 2c1f08defac2..84917fe507f7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -972,7 +972,7 @@ void zap_other_threads(struct task_struct *p)
 	}
 }
 
-int fastcall __fatal_signal_pending(struct task_struct *tsk)
+int __fatal_signal_pending(struct task_struct *tsk)
 {
 	return sigismember(&tsk->pending.signal, SIGKILL);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index b7b1be6dbd83..5c74b68935ac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -604,7 +604,7 @@ void __lock_page(struct page *page)
 }
 EXPORT_SYMBOL(__lock_page);
 
-int fastcall __lock_page_killable(struct page *page)
+int __lock_page_killable(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index d3e4e1877e6a..0c2c93735e93 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -465,7 +465,7 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
 	return len;
 }
 
-void fastcall __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
+void __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
 {
 	BT_DBG("dlc %p state %ld", d, d->state);
 
@@ -476,7 +476,7 @@ void fastcall __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
 	rfcomm_schedule(RFCOMM_SCHED_TX);
 }
 
-void fastcall __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
+void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
 {
 	BT_DBG("dlc %p state %ld", d, d->state);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 9549417250bb..b2f6cb5e0f72 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2143,7 +2143,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
  *
  * The entry's receive function will be scheduled to run
  */
-void fastcall __napi_schedule(struct napi_struct *n)
+void __napi_schedule(struct napi_struct *n)
 {
 	unsigned long flags;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 433715fb141a..09cb3a74de7f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1731,7 +1731,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	atomic_set(&sk->sk_drops, 0);
 }
 
-void fastcall lock_sock_nested(struct sock *sk, int subclass)
+void lock_sock_nested(struct sock *sk, int subclass)
 {
 	might_sleep();
 	spin_lock_bh(&sk->sk_lock.slock);
@@ -1748,7 +1748,7 @@ void fastcall lock_sock_nested(struct sock *sk, int subclass)
 
 EXPORT_SYMBOL(lock_sock_nested);
 
-void fastcall release_sock(struct sock *sk)
+void release_sock(struct sock *sk)
 {
 	/*
 	 * The sk_lock has mutex_unlock() semantics:

From 21534301ea1801783bd88fba2a2e617ee4d2bd28 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 13 Feb 2008 15:03:17 -0800
Subject: [PATCH 47/66] Final removal of FASTCALL()/fastcall

All users are gone, remove definitions and comments referring
to them.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/drm/i830_dma.c   | 2 +-
 include/asm-mn10300/highmem.h | 4 ++--
 include/asm-mn10300/linkage.h | 2 --
 include/linux/irq.h           | 1 -
 include/linux/linkage.h       | 5 -----
 5 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/char/drm/i830_dma.c b/drivers/char/drm/i830_dma.c
index 379cbdad4921..9df08105f4f3 100644
--- a/drivers/char/drm/i830_dma.c
+++ b/drivers/char/drm/i830_dma.c
@@ -36,7 +36,7 @@
 #include "i830_drm.h"
 #include "i830_drv.h"
 #include <linux/interrupt.h>	/* For task queue support */
-#include <linux/pagemap.h>	/* For FASTCALL on unlock_page() */
+#include <linux/pagemap.h>
 #include <linux/delay.h>
 #include <asm/uaccess.h>
 
diff --git a/include/asm-mn10300/highmem.h b/include/asm-mn10300/highmem.h
index 383c0c42982e..5256854c0453 100644
--- a/include/asm-mn10300/highmem.h
+++ b/include/asm-mn10300/highmem.h
@@ -42,8 +42,8 @@ extern void __init kmap_init(void);
 #define PKMAP_NR(virt)  ((virt - PKMAP_BASE) >> PAGE_SHIFT)
 #define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
 
-extern unsigned long __fastcall kmap_high(struct page *page);
-extern void __fastcall kunmap_high(struct page *page);
+extern unsigned long kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);
 
 static inline unsigned long kmap(struct page *page)
 {
diff --git a/include/asm-mn10300/linkage.h b/include/asm-mn10300/linkage.h
index 29a32e467523..dda3002a5dfa 100644
--- a/include/asm-mn10300/linkage.h
+++ b/include/asm-mn10300/linkage.h
@@ -13,8 +13,6 @@
 
 /* don't override anything */
 #define asmlinkage
-#define FASTCALL(x) x
-#define fastcall
 
 #define __ALIGN		.align 4,0xcb
 #define __ALIGN_STR	".align 4,0xcb"
diff --git a/include/linux/irq.h b/include/linux/irq.h
index bfd9efb5cb49..176e5e790a44 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -285,7 +285,6 @@ extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc);
 
 /*
  * Monolithic do_IRQ implementation.
- * (is an explicit fastcall, because i386 4KSTACKS calls it from assembly)
  */
 #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 extern unsigned int __do_IRQ(unsigned int irq);
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 3faf599ea58e..0592936344c4 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -73,9 +73,4 @@
 #define ATTRIB_NORET  __attribute__((noreturn))
 #define NORET_AND     noreturn,
 
-#ifndef FASTCALL
-#define FASTCALL(x)	x
-#define fastcall
-#endif
-
 #endif

From 2695a14d315c014474ccadbaed40b0169b00cb5b Mon Sep 17 00:00:00 2001
From: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Date: Wed, 13 Feb 2008 15:03:17 -0800
Subject: [PATCH 48/66] SC26XX: missing PORT define in serial_core.h

When submitting the driver for inclusion to 2.6.25 I've missed the change to
serial_core.h. This patch fixes this.

Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/serial_core.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 1a0b6cf83ff1..289942fc6655 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -149,6 +149,8 @@
 /* Freescale ColdFire */
 #define PORT_MCF	78
 
+#define PORT_SC26XX	79
+
 
 /* MN10300 on-chip UART numbers */
 #define PORT_MN10300		80

From 064d9efe947542097be669581f82d6b097e81d1a Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan <nacc@us.ibm.com>
Date: Wed, 13 Feb 2008 15:03:19 -0800
Subject: [PATCH 49/66] hugetlb: fix overcommit locking

proc_doulongvec_minmax() calls copy_to_user()/copy_from_user(), so we can't
hold hugetlb_lock over the call.  Use a dummy variable to store the sysctl
result, like in hugetlb_sysctl_handler(), then grab the lock to update
nr_overcommit_huge_pages.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Reported-by: Miles Lane <miles.lane@gmail.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 2 +-
 kernel/sysctl.c         | 4 ++--
 mm/hugetlb.c            | 6 ++++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7ca198b379af..addca4cd4f11 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -33,8 +33,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
+extern unsigned long sysctl_overcommit_huge_pages;
 extern unsigned long hugepages_treat_as_movable;
-extern unsigned long nr_overcommit_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
 extern int sysctl_hugetlb_shm_group;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 924c674b76ea..8b7e95411795 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -978,8 +978,8 @@ static struct ctl_table vm_table[] = {
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nr_overcommit_hugepages",
-		.data		= &nr_overcommit_huge_pages,
-		.maxlen		= sizeof(nr_overcommit_huge_pages),
+		.data		= &sysctl_overcommit_huge_pages,
+		.maxlen		= sizeof(sysctl_overcommit_huge_pages),
 		.mode		= 0644,
 		.proc_handler	= &hugetlb_overcommit_handler,
 	},
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d9a380312467..cb1b3a7ecdfc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,14 +24,15 @@
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
 static unsigned long surplus_huge_pages;
+static unsigned long nr_overcommit_huge_pages;
 unsigned long max_huge_pages;
+unsigned long sysctl_overcommit_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
 static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
-unsigned long nr_overcommit_huge_pages;
 static int hugetlb_next_nid;
 
 /*
@@ -609,8 +610,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 			struct file *file, void __user *buffer,
 			size_t *length, loff_t *ppos)
 {
-	spin_lock(&hugetlb_lock);
 	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	spin_lock(&hugetlb_lock);
+	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
 	spin_unlock(&hugetlb_lock);
 	return 0;
 }

From df24d9a6a9014010513d6af1105f4de05c504a4b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 13 Feb 2008 15:03:20 -0800
Subject: [PATCH 50/66] Documentation: prune redundant SubmitChecklist items

Kernel style is mentioned twice, and the git apply trick is a bit redundant
given the checkpatch.pl recommendation (which also checks for bad
whitespace).

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/SubmitChecklist | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index 34e06d2f194f..da10e0714241 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -20,7 +20,11 @@ kernel patches.
 4: ppc64 is a good architecture for cross-compilation checking because it
    tends to use `unsigned long' for 64-bit quantities.
 
-5: Matches kernel coding style(!)
+5: Check your patch for general style as detailed in
+   Documentation/CodingStyle.  Check for trivial violations with the
+   patch style checker prior to submission (scripts/checkpatch.pl).
+   You should be able to justify all violations that remain in
+   your patch.
 
 6: Any new or modified CONFIG options don't muck up the config menu.
 
@@ -79,13 +83,3 @@ kernel patches.
 23: Tested after it has been merged into the -mm patchset to make sure
     that it still works with all of the other queued patches and various
     changes in the VM, VFS, and other subsystems.
-
-24: Avoid whitespace damage such as indenting with spaces or whitespace
-    at the end of lines.  You can test this by feeding the patch to
-    "git apply --check --whitespace=error-all"
-
-25: Check your patch for general style as detailed in
-    Documentation/CodingStyle.  Check for trivial violations with the
-    patch style checker prior to submission (scripts/checkpatch.pl).
-    You should be able to justify all violations that remain in
-    your patch.

From 55265b00ad423898bb743bce515f073c3f290bdb Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Wed, 13 Feb 2008 15:03:21 -0800
Subject: [PATCH 51/66] parport: section fixup

Fix section warning for parport_ECP_supported(); it's called from a routine
exported to modules, so it can't be removed with __devinit section pruning.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parport/parport_pc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 238628d3a854..d76d37bcb9cc 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -1768,7 +1768,7 @@ static int parport_PS2_supported(struct parport *pb)
 }
 
 #ifdef CONFIG_PARPORT_PC_FIFO
-static int __devinit parport_ECP_supported(struct parport *pb)
+static int parport_ECP_supported(struct parport *pb)
 {
 	int i;
 	int config, configb;
@@ -1992,7 +1992,7 @@ static int parport_ECPEPP_supported(struct parport *pb)
 /* Don't bother probing for modes we know we won't use. */
 static int __devinit parport_PS2_supported(struct parport *pb) { return 0; }
 #ifdef CONFIG_PARPORT_PC_FIFO
-static int __devinit parport_ECP_supported(struct parport *pb) { return 0; }
+static int parport_ECP_supported(struct parport *pb) { return 0; }
 #endif
 static int __devinit parport_EPP_supported(struct parport *pb) { return 0; }
 static int __devinit parport_ECPEPP_supported(struct parport *pb){return 0;}

From b51d63c6d3078f47c26bcf7584b5c3ebea2defd0 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 13 Feb 2008 15:03:22 -0800
Subject: [PATCH 52/66] kernel-doc: fix fs/pipe.c notation

Fix several kernel-doc notation errors in fs/pipe.c.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index a07e9a542064..3c185b6527bc 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -171,7 +171,7 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
  *
  * Description:
  *	This function returns a kernel virtual address mapping for the
- *	passed in @pipe_buffer. If @atomic is set, an atomic map is provided
+ *	pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
  *	and the caller has to be careful not to fault before calling
  *	the unmap function.
  *
@@ -208,15 +208,15 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 }
 
 /**
- * generic_pipe_buf_steal - attempt to take ownership of a @pipe_buffer
+ * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
  * @pipe:	the pipe that the buffer belongs to
  * @buf:	the buffer to attempt to steal
  *
  * Description:
- *	This function attempts to steal the @struct page attached to
+ *	This function attempts to steal the &struct page attached to
  *	@buf. If successful, this function returns 0 and returns with
  *	the page locked. The caller may then reuse the page for whatever
- *	he wishes, the typical use is insertion into a different file
+ *	he wishes; the typical use is insertion into a different file
  *	page cache.
  */
 int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -238,7 +238,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 }
 
 /**
- * generic_pipe_buf_get - get a reference to a @struct pipe_buffer
+ * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
  * @pipe:	the pipe that the buffer belongs to
  * @buf:	the buffer to get a reference to
  *

From 073b86dacc3c0fa79c71f3519169ea18d5521227 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 13 Feb 2008 15:03:23 -0800
Subject: [PATCH 53/66] docbook: move pipe and splice to filesystems docbook

Move pipes and splice docbook to filesystems book.
kernel-api book is huge (10x most other books) & slow to process.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DocBook/filesystems.tmpl | 20 ++++++++++++++++++++
 Documentation/DocBook/kernel-api.tmpl  | 20 --------------------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index 5eaef87e8f1b..5e87ad58c0b5 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -398,4 +398,24 @@ an example.
 
   </chapter>
 
+  <chapter id="splice">
+      <title>splice API</title>
+  <para>
+	splice is a method for moving blocks of data around inside the
+	kernel, without continually transferring them between the kernel
+	and user space.
+  </para>
+!Ffs/splice.c
+  </chapter>
+
+  <chapter id="pipes">
+      <title>pipes API</title>
+  <para>
+	Pipe interfaces are all for in-kernel (builtin image) use.
+	They are not exported for use by modules.
+  </para>
+!Iinclude/linux/pipe_fs_i.h
+!Ffs/pipe.c
+  </chapter>
+
 </book>
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 059aaf20951a..75e4ed15ab9a 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -712,24 +712,4 @@ X!Idrivers/video/console/fonts.c
 !Edrivers/i2c/i2c-core.c
   </chapter>
 
-  <chapter id="splice">
-      <title>splice API</title>
-  <para>
-	splice is a method for moving blocks of data around inside the
-	kernel, without continually transferring them between the kernel
-	and user space.
-  </para>
-!Ffs/splice.c
-  </chapter>
-
-  <chapter id="pipes">
-      <title>pipes API</title>
-  <para>
-	Pipe interfaces are all for in-kernel (builtin image) use.
-	They are not exported for use by modules.
-  </para>
-!Iinclude/linux/pipe_fs_i.h
-!Ffs/pipe.c
-  </chapter>
-
 </book>

From 65b6e42cdc5b6a1ce2ada31cc294d7e60b22bb43 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 13 Feb 2008 15:03:23 -0800
Subject: [PATCH 54/66] docbook: sunrpc filenames and notation fixes

Use updated file list for docbook files and
fix kernel-doc warnings in sunrpc:
Warning(linux-2.6.24-git12//net/sunrpc/rpc_pipe.c:689): No description found for parameter 'rpc_client'
Warning(linux-2.6.24-git12//net/sunrpc/rpc_pipe.c:765): No description found for parameter 'flags'
Warning(linux-2.6.24-git12//net/sunrpc/clnt.c:584): No description found for parameter 'tk_ops'
Warning(linux-2.6.24-git12//net/sunrpc/clnt.c:618): No description found for parameter 'bufsize'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Neil Brown <neilb@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DocBook/kernel-api.tmpl |  8 +++++++-
 net/sunrpc/clnt.c                     | 10 +++++-----
 net/sunrpc/rpc_pipe.c                 |  3 ++-
 net/sunrpc/xprt.c                     |  2 +-
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 75e4ed15ab9a..6c0e5f018615 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -231,8 +231,14 @@ X!Ilib/string.c
 !Dnet/sunrpc/sunrpc_syms.c
 -->
 !Enet/sunrpc/xdr.c
-!Enet/sunrpc/svcsock.c
+!Enet/sunrpc/svc_xprt.c
+!Enet/sunrpc/xprt.c
 !Enet/sunrpc/sched.c
+!Enet/sunrpc/socklib.c
+!Enet/sunrpc/stats.c
+!Enet/sunrpc/rpc_pipe.c
+!Enet/sunrpc/rpcb_clnt.c
+!Enet/sunrpc/clnt.c
      </sect1>
   </chapter>
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0998e6d09664..8c6a7f1a25e9 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -464,9 +464,9 @@ rpc_release_client(struct rpc_clnt *clnt)
 
 /**
  * rpc_bind_new_program - bind a new RPC program to an existing client
- * @old - old rpc_client
- * @program - rpc program to set
- * @vers - rpc program version
+ * @old: old rpc_client
+ * @program: rpc program to set
+ * @vers: rpc program version
  *
  * Clones the rpc client and sets up a new RPC program. This is mainly
  * of use for enabling different RPC programs to share the same transport.
@@ -575,7 +575,7 @@ EXPORT_SYMBOL_GPL(rpc_call_sync);
  * @clnt: pointer to RPC client
  * @msg: RPC call parameters
  * @flags: RPC call flags
- * @ops: RPC call ops
+ * @tk_ops: RPC call ops
  * @data: user call data
  */
 int
@@ -610,7 +610,7 @@ EXPORT_SYMBOL_GPL(rpc_call_start);
  * rpc_peeraddr - extract remote peer address from clnt's xprt
  * @clnt: RPC client structure
  * @buf: target buffer
- * @size: length of target buffer
+ * @bufsize: length of target buffer
  *
  * Returns the number of bytes that are actually in the stored address.
  */
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 7e197168a245..0e3ead7e11b9 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -677,7 +677,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd)
 /**
  * rpc_mkdir - Create a new directory in rpc_pipefs
  * @path: path from the rpc_pipefs root to the new directory
- * @rpc_clnt: rpc client to associate with this directory
+ * @rpc_client: rpc client to associate with this directory
  *
  * This creates a directory at the given @path associated with
  * @rpc_clnt, which will contain a file named "info" with some basic
@@ -748,6 +748,7 @@ rpc_rmdir(struct dentry *dentry)
  * @private: private data to associate with the pipe, for the caller's use
  * @ops: operations defining the behavior of the pipe: upcall, downcall,
  *	release_pipe, and destroy_msg.
+ * @flags: rpc_inode flags
  *
  * Data is made available for userspace to read by calls to
  * rpc_queue_upcall().  The actual reads will result in calls to
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index cfcade906a56..d5553b8179f9 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL_GPL(xprt_register_transport);
 
 /**
  * xprt_unregister_transport - unregister a transport implementation
- * transport: transport to unregister
+ * @transport: transport to unregister
  *
  * Returns:
  * 0:		transport successfully unregistered

From bc2cda1ebd4430f55deb60f0193a3e3b835499a2 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 13 Feb 2008 15:03:25 -0800
Subject: [PATCH 55/66] docbook: make a networking book and fix a few errors

Move networking (core and drivers) docbook to its own networking book.
Fix a few kernel-doc errors in header and source files.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Neil Brown <neilb@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DocBook/Makefile        |   2 +-
 Documentation/DocBook/kernel-api.tmpl |  65 ----------------
 Documentation/DocBook/networking.tmpl | 106 ++++++++++++++++++++++++++
 include/linux/etherdevice.h           |   3 +-
 net/core/dev.c                        |   3 +-
 net/core/skbuff.c                     |   4 +-
 6 files changed, 111 insertions(+), 72 deletions(-)
 create mode 100644 Documentation/DocBook/networking.tmpl

diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 6a0ad4715e9f..300e1707893f 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -8,7 +8,7 @@
 
 DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \
 	    kernel-hacking.xml kernel-locking.xml deviceiobook.xml \
-	    procfs-guide.xml writing_usb_driver.xml \
+	    procfs-guide.xml writing_usb_driver.xml networking.xml \
 	    kernel-api.xml filesystems.xml lsm.xml usb.xml \
 	    gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
 	    genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 6c0e5f018615..7e054c9124e6 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -204,71 +204,6 @@ X!Ilib/string.c
      </sect1>
   </chapter>
 
-  <chapter id="netcore">
-     <title>Linux Networking</title>
-     <sect1><title>Networking Base Types</title>
-!Iinclude/linux/net.h
-     </sect1>
-     <sect1><title>Socket Buffer Functions</title>
-!Iinclude/linux/skbuff.h
-!Iinclude/net/sock.h
-!Enet/socket.c
-!Enet/core/skbuff.c
-!Enet/core/sock.c
-!Enet/core/datagram.c
-!Enet/core/stream.c
-     </sect1>
-     <sect1><title>Socket Filter</title>
-!Enet/core/filter.c
-     </sect1>
-     <sect1><title>Generic Network Statistics</title>
-!Iinclude/linux/gen_stats.h
-!Enet/core/gen_stats.c
-!Enet/core/gen_estimator.c
-     </sect1>
-     <sect1><title>SUN RPC subsystem</title>
-<!-- The !D functionality is not perfect, garbage has to be protected by comments
-!Dnet/sunrpc/sunrpc_syms.c
--->
-!Enet/sunrpc/xdr.c
-!Enet/sunrpc/svc_xprt.c
-!Enet/sunrpc/xprt.c
-!Enet/sunrpc/sched.c
-!Enet/sunrpc/socklib.c
-!Enet/sunrpc/stats.c
-!Enet/sunrpc/rpc_pipe.c
-!Enet/sunrpc/rpcb_clnt.c
-!Enet/sunrpc/clnt.c
-     </sect1>
-  </chapter>
-
-  <chapter id="netdev">
-     <title>Network device support</title>
-     <sect1><title>Driver Support</title>
-!Enet/core/dev.c
-!Enet/ethernet/eth.c
-!Enet/sched/sch_generic.c
-!Iinclude/linux/etherdevice.h
-!Iinclude/linux/netdevice.h
-     </sect1>
-     <sect1><title>PHY Support</title>
-!Edrivers/net/phy/phy.c
-!Idrivers/net/phy/phy.c
-!Edrivers/net/phy/phy_device.c
-!Idrivers/net/phy/phy_device.c
-!Edrivers/net/phy/mdio_bus.c
-!Idrivers/net/phy/mdio_bus.c
-     </sect1>
-<!-- FIXME: Removed for now since no structured comments in source
-     <sect1><title>Wireless</title>
-X!Enet/core/wireless.c
-     </sect1>
--->
-     <sect1><title>Synchronous PPP</title>
-!Edrivers/net/wan/syncppp.c
-     </sect1>
-  </chapter>
-
   <chapter id="modload">
      <title>Module Support</title>
      <sect1><title>Module Loading</title>
diff --git a/Documentation/DocBook/networking.tmpl b/Documentation/DocBook/networking.tmpl
new file mode 100644
index 000000000000..f24f9e85e4ae
--- /dev/null
+++ b/Documentation/DocBook/networking.tmpl
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="LinuxNetworking">
+ <bookinfo>
+  <title>Linux Networking and Network Devices APIs</title>
+
+  <legalnotice>
+   <para>
+     This documentation is free software; you can redistribute
+     it and/or modify it under the terms of the GNU General Public
+     License as published by the Free Software Foundation; either
+     version 2 of the License, or (at your option) any later
+     version.
+   </para>
+
+   <para>
+     This program is distributed in the hope that it will be
+     useful, but WITHOUT ANY WARRANTY; without even the implied
+     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+     See the GNU General Public License for more details.
+   </para>
+
+   <para>
+     You should have received a copy of the GNU General Public
+     License along with this program; if not, write to the Free
+     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+     MA 02111-1307 USA
+   </para>
+
+   <para>
+     For more details see the file COPYING in the source
+     distribution of Linux.
+   </para>
+  </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+  <chapter id="netcore">
+     <title>Linux Networking</title>
+     <sect1><title>Networking Base Types</title>
+!Iinclude/linux/net.h
+     </sect1>
+     <sect1><title>Socket Buffer Functions</title>
+!Iinclude/linux/skbuff.h
+!Iinclude/net/sock.h
+!Enet/socket.c
+!Enet/core/skbuff.c
+!Enet/core/sock.c
+!Enet/core/datagram.c
+!Enet/core/stream.c
+     </sect1>
+     <sect1><title>Socket Filter</title>
+!Enet/core/filter.c
+     </sect1>
+     <sect1><title>Generic Network Statistics</title>
+!Iinclude/linux/gen_stats.h
+!Enet/core/gen_stats.c
+!Enet/core/gen_estimator.c
+     </sect1>
+     <sect1><title>SUN RPC subsystem</title>
+<!-- The !D functionality is not perfect, garbage has to be protected by comments
+!Dnet/sunrpc/sunrpc_syms.c
+-->
+!Enet/sunrpc/xdr.c
+!Enet/sunrpc/svc_xprt.c
+!Enet/sunrpc/xprt.c
+!Enet/sunrpc/sched.c
+!Enet/sunrpc/socklib.c
+!Enet/sunrpc/stats.c
+!Enet/sunrpc/rpc_pipe.c
+!Enet/sunrpc/rpcb_clnt.c
+!Enet/sunrpc/clnt.c
+     </sect1>
+  </chapter>
+
+  <chapter id="netdev">
+     <title>Network device support</title>
+     <sect1><title>Driver Support</title>
+!Enet/core/dev.c
+!Enet/ethernet/eth.c
+!Enet/sched/sch_generic.c
+!Iinclude/linux/etherdevice.h
+!Iinclude/linux/netdevice.h
+     </sect1>
+     <sect1><title>PHY Support</title>
+!Edrivers/net/phy/phy.c
+!Idrivers/net/phy/phy.c
+!Edrivers/net/phy/phy_device.c
+!Idrivers/net/phy/phy_device.c
+!Edrivers/net/phy/mdio_bus.c
+!Idrivers/net/phy/mdio_bus.c
+     </sect1>
+<!-- FIXME: Removed for now since no structured comments in source
+     <sect1><title>Wireless</title>
+X!Enet/core/wireless.c
+     </sect1>
+-->
+     <sect1><title>Synchronous PPP</title>
+!Edrivers/net/wan/syncppp.c
+     </sect1>
+  </chapter>
+
+</book>
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index b7558ec81ed5..25d62e6e3290 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -70,8 +70,7 @@ static inline int is_multicast_ether_addr(const u8 *addr)
 }
 
 /**
- * is_local_ether_addr - Determine if the Ethernet address is locally-assigned
- * one (IEEE 802).
+ * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802).
  * @addr: Pointer to a six-byte array containing the Ethernet address
  *
  * Return true if the address is a local address.
diff --git a/net/core/dev.c b/net/core/dev.c
index b2f6cb5e0f72..b3e19ae57f95 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3038,8 +3038,7 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from)
 EXPORT_SYMBOL(dev_unicast_sync);
 
 /**
- *	dev_unicast_unsync - Remove synchronized addresses from the destination
- *			     device
+ *	dev_unicast_unsync - Remove synchronized addresses from the destination device
  *	@to: destination device
  *	@from: source device
  *
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4e354221ec23..cfc07dac636c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1907,11 +1907,11 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
  * of bytes already consumed and the next call to
  * skb_seq_read() will return the remaining part of the block.
  *
- * Note: The size of each block of data returned can be arbitary,
+ * Note 1: The size of each block of data returned can be arbitary,
  *       this limitation is the cost for zerocopy seqeuental
  *       reads of potentially non linear data.
  *
- * Note: Fragment lists within fragments are not implemented
+ * Note 2: Fragment lists within fragments are not implemented
  *       at the moment, state->root_skb could be replaced with
  *       a stack for this purpose.
  */

From 91d35dd93e14c34539a8005183ea500f25caad02 Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Wed, 13 Feb 2008 15:03:26 -0800
Subject: [PATCH 56/66] moduleparam: fix alpha, ia64 and ppc64 compile failures

On alpha, ia64 and ppc64 only relocations to local data can go into
read-only sections. The vast majority of module parameters use the global
generic param_set_*/param_get_* functions, so the 'const' attribute for
struct kernel_param is not only useless, but it also causes compile
failures due to 'section type conflict' in those rare cases where
param_set/get are local functions.

This fixes http://bugzilla.kernel.org/show_bug.cgi?id=8964

Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Richard Henderson <rth@twiddle.net>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Adrian Bunk <bunk@stusta.de>
Cc: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/moduleparam.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 8126e55c5bdc..ec624381c844 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -62,6 +62,16 @@ struct kparam_array
 	void *elem;
 };
 
+/* On alpha, ia64 and ppc64 relocations to global data cannot go into
+   read-only sections (which is part of respective UNIX ABI on these
+   platforms). So 'const' makes no sense and even causes compile failures
+   with some compilers. */
+#if defined(CONFIG_ALPHA) || defined(CONFIG_IA64) || defined(CONFIG_PPC64)
+#define __moduleparam_const
+#else
+#define __moduleparam_const const
+#endif
+
 /* This is the fundamental function for registering boot/module
    parameters.  perm sets the visibility in sysfs: 000 means it's
    not there, read bits mean it's readable, write bits mean it's
@@ -71,7 +81,7 @@ struct kparam_array
 	static int __param_perm_check_##name __attribute__((unused)) =	\
 	BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2));	\
 	static const char __param_str_##name[] = prefix #name;		\
-	static struct kernel_param const __param_##name			\
+	static struct kernel_param __moduleparam_const __param_##name	\
 	__used								\
     __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \
 	= { __param_str_##name, perm, set, get, { arg } }

From 413d57c9907c72ed608df2be72ef8ed13a3eeb46 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Wed, 13 Feb 2008 15:03:29 -0800
Subject: [PATCH 57/66] xfs: convert beX_add to beX_add_cpu (new common API)

remove beX_add functions and replace all uses with beX_add_cpu

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Dave Chinner <dgc@sgi.com>
Cc: Timothy Shimmin <tes@sgi.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/xfs/quota/xfs_qm.c          |  6 ++---
 fs/xfs/quota/xfs_trans_dquot.c |  6 ++---
 fs/xfs/xfs_alloc.c             | 16 ++++++------
 fs/xfs/xfs_alloc_btree.c       | 16 ++++++------
 fs/xfs/xfs_arch.h              | 15 -----------
 fs/xfs/xfs_attr_leaf.c         | 46 +++++++++++++++++-----------------
 fs/xfs/xfs_bmap_btree.c        | 16 ++++++------
 fs/xfs/xfs_da_btree.c          | 14 +++++------
 fs/xfs/xfs_dir2_block.c        |  8 +++---
 fs/xfs/xfs_dir2_data.c         |  4 +--
 fs/xfs/xfs_dir2_leaf.c         | 16 ++++++------
 fs/xfs/xfs_dir2_node.c         | 18 ++++++-------
 fs/xfs/xfs_fsops.c             |  4 +--
 fs/xfs/xfs_ialloc.c            | 12 ++++-----
 fs/xfs/xfs_ialloc_btree.c      | 16 ++++++------
 fs/xfs/xfs_log.c               |  6 ++---
 fs/xfs/xfs_trans.c             | 24 +++++++++---------
 17 files changed, 114 insertions(+), 129 deletions(-)

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 35582fe9d648..1f3da5b8657b 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1648,14 +1648,14 @@ xfs_qm_quotacheck_dqadjust(
 	 * Adjust the inode count and the block count to reflect this inode's
 	 * resource usage.
 	 */
-	be64_add(&dqp->q_core.d_icount, 1);
+	be64_add_cpu(&dqp->q_core.d_icount, 1);
 	dqp->q_res_icount++;
 	if (nblks) {
-		be64_add(&dqp->q_core.d_bcount, nblks);
+		be64_add_cpu(&dqp->q_core.d_bcount, nblks);
 		dqp->q_res_bcount += nblks;
 	}
 	if (rtblks) {
-		be64_add(&dqp->q_core.d_rtbcount, rtblks);
+		be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
 		dqp->q_res_rtbcount += rtblks;
 	}
 
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 7de6874bf1b8..f441f836ca8b 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -421,13 +421,13 @@ xfs_trans_apply_dquot_deltas(
 				       (xfs_qcnt_t) -qtrx->qt_icount_delta);
 #endif
 			if (totalbdelta)
-				be64_add(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
+				be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
 
 			if (qtrx->qt_icount_delta)
-				be64_add(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
+				be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
 
 			if (totalrtbdelta)
-				be64_add(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
+				be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
 
 			/*
 			 * Get any default limits in use.
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index ea6aa60ace06..bdbfbbee4959 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -592,7 +592,7 @@ xfs_alloc_ag_vextent(
 		if (!(args->wasfromfl)) {
 
 			agf = XFS_BUF_TO_AGF(args->agbp);
-			be32_add(&agf->agf_freeblks, -(args->len));
+			be32_add_cpu(&agf->agf_freeblks, -(args->len));
 			xfs_trans_agblocks_delta(args->tp,
 						 -((long)(args->len)));
 			args->pag->pagf_freeblks -= args->len;
@@ -1720,7 +1720,7 @@ xfs_free_ag_extent(
 
 		agf = XFS_BUF_TO_AGF(agbp);
 		pag = &mp->m_perag[agno];
-		be32_add(&agf->agf_freeblks, len);
+		be32_add_cpu(&agf->agf_freeblks, len);
 		xfs_trans_agblocks_delta(tp, len);
 		pag->pagf_freeblks += len;
 		XFS_WANT_CORRUPTED_GOTO(
@@ -2008,18 +2008,18 @@ xfs_alloc_get_freelist(
 	 * Get the block number and update the data structures.
 	 */
 	bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
-	be32_add(&agf->agf_flfirst, 1);
+	be32_add_cpu(&agf->agf_flfirst, 1);
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
 		agf->agf_flfirst = 0;
 	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
-	be32_add(&agf->agf_flcount, -1);
+	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
 	pag->pagf_flcount--;
 
 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
 	if (btreeblk) {
-		be32_add(&agf->agf_btreeblks, 1);
+		be32_add_cpu(&agf->agf_btreeblks, 1);
 		pag->pagf_btreeblks++;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
@@ -2117,17 +2117,17 @@ xfs_alloc_put_freelist(
 			be32_to_cpu(agf->agf_seqno), &agflbp)))
 		return error;
 	agfl = XFS_BUF_TO_AGFL(agflbp);
-	be32_add(&agf->agf_fllast, 1);
+	be32_add_cpu(&agf->agf_fllast, 1);
 	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
 		agf->agf_fllast = 0;
 	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
-	be32_add(&agf->agf_flcount, 1);
+	be32_add_cpu(&agf->agf_flcount, 1);
 	xfs_trans_agflist_delta(tp, 1);
 	pag->pagf_flcount++;
 
 	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
 	if (btreeblk) {
-		be32_add(&agf->agf_btreeblks, -1);
+		be32_add_cpu(&agf->agf_btreeblks, -1);
 		pag->pagf_btreeblks--;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 1603ce595853..3ce2645508ae 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -221,7 +221,7 @@ xfs_alloc_delrec(
 			 */
 			bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
 			agf->agf_roots[cur->bc_btnum] = *lpp;
-			be32_add(&agf->agf_levels[cur->bc_btnum], -1);
+			be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1);
 			mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
 			/*
 			 * Put this buffer/block on the ag's freelist.
@@ -1256,9 +1256,9 @@ xfs_alloc_lshift(
 	/*
 	 * Bump and log left's numrecs, decrement and log right's numrecs.
 	 */
-	be16_add(&left->bb_numrecs, 1);
+	be16_add_cpu(&left->bb_numrecs, 1);
 	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-	be16_add(&right->bb_numrecs, -1);
+	be16_add_cpu(&right->bb_numrecs, -1);
 	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
 	/*
 	 * Slide the contents of right down one entry.
@@ -1346,7 +1346,7 @@ xfs_alloc_newroot(
 
 		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
 		agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
-		be32_add(&agf->agf_levels[cur->bc_btnum], 1);
+		be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1);
 		seqno = be32_to_cpu(agf->agf_seqno);
 		mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
 		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
@@ -1558,9 +1558,9 @@ xfs_alloc_rshift(
 	/*
 	 * Decrement and log left's numrecs, bump and log right's numrecs.
 	 */
-	be16_add(&left->bb_numrecs, -1);
+	be16_add_cpu(&left->bb_numrecs, -1);
 	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-	be16_add(&right->bb_numrecs, 1);
+	be16_add_cpu(&right->bb_numrecs, 1);
 	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
 	/*
 	 * Using a temporary cursor, update the parent key values of the
@@ -1643,7 +1643,7 @@ xfs_alloc_split(
 	 */
 	if ((be16_to_cpu(left->bb_numrecs) & 1) &&
 	    cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-		be16_add(&right->bb_numrecs, 1);
+		be16_add_cpu(&right->bb_numrecs, 1);
 	i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
 	/*
 	 * For non-leaf blocks, copy keys and addresses over to the new block.
@@ -1689,7 +1689,7 @@ xfs_alloc_split(
 	 * Adjust numrecs, sibling pointers.
 	 */
 	lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
-	be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
+	be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
 	right->bb_rightsib = left->bb_rightsib;
 	left->bb_rightsib = cpu_to_be32(rbno);
 	right->bb_leftsib = cpu_to_be32(lbno);
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index c4836890b726..f9472a2076d4 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -170,21 +170,6 @@
     } \
 }
 
-static inline void be16_add(__be16 *a, __s16 b)
-{
-	*a = cpu_to_be16(be16_to_cpu(*a) + b);
-}
-
-static inline void be32_add(__be32 *a, __s32 b)
-{
-	*a = cpu_to_be32(be32_to_cpu(*a) + b);
-}
-
-static inline void be64_add(__be64 *a, __s64 b)
-{
-	*a = cpu_to_be64(be64_to_cpu(*a) + b);
-}
-
 /*
  * In directories inode numbers are stored as unaligned arrays of unsigned
  * 8bit integers on disk.
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index eb3815ebb7aa..b08e2a2a8add 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -317,7 +317,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
 	memcpy(sfe->nameval, args->name, args->namelen);
 	memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
 	sf->hdr.count++;
-	be16_add(&sf->hdr.totsize, size);
+	be16_add_cpu(&sf->hdr.totsize, size);
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
 
 	xfs_sbversion_add_attr2(mp, args->trans);
@@ -363,7 +363,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
 	if (end != totsize)
 		memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
 	sf->hdr.count--;
-	be16_add(&sf->hdr.totsize, -size);
+	be16_add_cpu(&sf->hdr.totsize, -size);
 
 	/*
 	 * Fix up the start offset of the attribute fork
@@ -1133,7 +1133,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
 		xfs_da_log_buf(args->trans, bp,
 		    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
 	}
-	be16_add(&hdr->count, 1);
+	be16_add_cpu(&hdr->count, 1);
 
 	/*
 	 * Allocate space for the new string (at the end of the run).
@@ -1147,7 +1147,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
 					 mp->m_sb.sb_blocksize, NULL));
 	ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
 	ASSERT((be16_to_cpu(map->size) & 0x3) == 0);
-	be16_add(&map->size,
+	be16_add_cpu(&map->size,
 		-xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
 					  mp->m_sb.sb_blocksize, &tmp));
 	entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) +
@@ -1214,12 +1214,12 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
 	map = &hdr->freemap[0];
 	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
 		if (be16_to_cpu(map->base) == tmp) {
-			be16_add(&map->base, sizeof(xfs_attr_leaf_entry_t));
-			be16_add(&map->size,
+			be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t));
+			be16_add_cpu(&map->size,
 				 -((int)sizeof(xfs_attr_leaf_entry_t)));
 		}
 	}
-	be16_add(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index));
+	be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index));
 	xfs_da_log_buf(args->trans, bp,
 		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
 	return(0);
@@ -1727,9 +1727,9 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
 		ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
 		ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
 		if (be16_to_cpu(map->base) == tablesize) {
-			be16_add(&map->base,
+			be16_add_cpu(&map->base,
 				 -((int)sizeof(xfs_attr_leaf_entry_t)));
-			be16_add(&map->size, sizeof(xfs_attr_leaf_entry_t));
+			be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t));
 		}
 
 		if ((be16_to_cpu(map->base) + be16_to_cpu(map->size))
@@ -1751,19 +1751,19 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
 	if ((before >= 0) || (after >= 0)) {
 		if ((before >= 0) && (after >= 0)) {
 			map = &hdr->freemap[before];
-			be16_add(&map->size, entsize);
-			be16_add(&map->size,
+			be16_add_cpu(&map->size, entsize);
+			be16_add_cpu(&map->size,
 				 be16_to_cpu(hdr->freemap[after].size));
 			hdr->freemap[after].base = 0;
 			hdr->freemap[after].size = 0;
 		} else if (before >= 0) {
 			map = &hdr->freemap[before];
-			be16_add(&map->size, entsize);
+			be16_add_cpu(&map->size, entsize);
 		} else {
 			map = &hdr->freemap[after];
 			/* both on-disk, don't endian flip twice */
 			map->base = entry->nameidx;
-			be16_add(&map->size, entsize);
+			be16_add_cpu(&map->size, entsize);
 		}
 	} else {
 		/*
@@ -1788,7 +1788,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
 	 * Compress the remaining entries and zero out the removed stuff.
 	 */
 	memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize);
-	be16_add(&hdr->usedbytes, -entsize);
+	be16_add_cpu(&hdr->usedbytes, -entsize);
 	xfs_da_log_buf(args->trans, bp,
 	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
 				   entsize));
@@ -1796,7 +1796,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
 	tmp = (be16_to_cpu(hdr->count) - args->index)
 					* sizeof(xfs_attr_leaf_entry_t);
 	memmove((char *)entry, (char *)(entry+1), tmp);
-	be16_add(&hdr->count, -1);
+	be16_add_cpu(&hdr->count, -1);
 	xfs_da_log_buf(args->trans, bp,
 	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
 	entry = &leaf->entries[be16_to_cpu(hdr->count)];
@@ -2182,15 +2182,15 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 		 */
 		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
 			memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
-			be16_add(&hdr_s->usedbytes, -tmp);
-			be16_add(&hdr_s->count, -1);
+			be16_add_cpu(&hdr_s->usedbytes, -tmp);
+			be16_add_cpu(&hdr_s->count, -1);
 			entry_d--;	/* to compensate for ++ in loop hdr */
 			desti--;
 			if ((start_s + i) < offset)
 				result++;	/* insertion index adjustment */
 		} else {
 #endif /* GROT */
-			be16_add(&hdr_d->firstused, -tmp);
+			be16_add_cpu(&hdr_d->firstused, -tmp);
 			/* both on-disk, don't endian flip twice */
 			entry_d->hashval = entry_s->hashval;
 			/* both on-disk, don't endian flip twice */
@@ -2203,10 +2203,10 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 			ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
 							<= XFS_LBSIZE(mp));
 			memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
-			be16_add(&hdr_s->usedbytes, -tmp);
-			be16_add(&hdr_d->usedbytes, tmp);
-			be16_add(&hdr_s->count, -1);
-			be16_add(&hdr_d->count, 1);
+			be16_add_cpu(&hdr_s->usedbytes, -tmp);
+			be16_add_cpu(&hdr_d->usedbytes, tmp);
+			be16_add_cpu(&hdr_s->count, -1);
+			be16_add_cpu(&hdr_d->count, 1);
 			tmp = be16_to_cpu(hdr_d->count)
 						* sizeof(xfs_attr_leaf_entry_t)
 						+ sizeof(xfs_attr_leaf_hdr_t);
@@ -2247,7 +2247,7 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 	 * Fill in the freemap information
 	 */
 	hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
-	be16_add(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) *
+	be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) *
 			sizeof(xfs_attr_leaf_entry_t));
 	hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused)
 			      - be16_to_cpu(hdr_d->freemap[0].base));
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index c4181d85605c..bd18987326a3 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -631,7 +631,7 @@ xfs_bmbt_delrec(
 		memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
 		xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
 	}
-	be16_add(&left->bb_numrecs, numrrecs);
+	be16_add_cpu(&left->bb_numrecs, numrrecs);
 	left->bb_rightsib = right->bb_rightsib;
 	xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
 	if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) {
@@ -924,7 +924,7 @@ xfs_bmbt_killroot(
 		xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
 		block = ifp->if_broot;
 	}
-	be16_add(&block->bb_numrecs, i);
+	be16_add_cpu(&block->bb_numrecs, i);
 	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
 	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
 	ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
@@ -947,7 +947,7 @@ xfs_bmbt_killroot(
 			XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(cur->bc_tp, cbp);
 	cur->bc_bufs[level - 1] = NULL;
-	be16_add(&block->bb_level, -1);
+	be16_add_cpu(&block->bb_level, -1);
 	xfs_trans_log_inode(cur->bc_tp, ip,
 		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
 	cur->bc_nlevels--;
@@ -1401,9 +1401,9 @@ xfs_bmbt_rshift(
 		key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
 		rkp = &key;
 	}
-	be16_add(&left->bb_numrecs, -1);
+	be16_add_cpu(&left->bb_numrecs, -1);
 	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
-	be16_add(&right->bb_numrecs, 1);
+	be16_add_cpu(&right->bb_numrecs, 1);
 #ifdef DEBUG
 	if (level > 0)
 		xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
@@ -1535,7 +1535,7 @@ xfs_bmbt_split(
 	right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
 	if ((be16_to_cpu(left->bb_numrecs) & 1) &&
 	    cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-		be16_add(&right->bb_numrecs, 1);
+		be16_add_cpu(&right->bb_numrecs, 1);
 	i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
 	if (level > 0) {
 		lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
@@ -1562,7 +1562,7 @@ xfs_bmbt_split(
 		xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
 		*startoff = xfs_bmbt_disk_get_startoff(rrp);
 	}
-	be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
+	be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
 	right->bb_rightsib = left->bb_rightsib;
 	left->bb_rightsib = cpu_to_be64(args.fsbno);
 	right->bb_leftsib = cpu_to_be64(lbno);
@@ -2240,7 +2240,7 @@ xfs_bmbt_newroot(
 	bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
 	cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
 	*cblock = *block;
-	be16_add(&block->bb_level, 1);
+	be16_add_cpu(&block->bb_level, 1);
 	block->bb_numrecs = cpu_to_be16(1);
 	cur->bc_nlevels++;
 	cur->bc_ptrs[level + 1] = 1;
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 1b446849fb3d..021a8f7e563f 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -511,12 +511,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		 * Move the req'd B-tree elements from high in node1 to
 		 * low in node2.
 		 */
-		be16_add(&node2->hdr.count, count);
+		be16_add_cpu(&node2->hdr.count, count);
 		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
 		btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count];
 		btree_d = &node2->btree[0];
 		memcpy(btree_d, btree_s, tmp);
-		be16_add(&node1->hdr.count, -count);
+		be16_add_cpu(&node1->hdr.count, -count);
 	} else {
 		/*
 		 * Move the req'd B-tree elements from low in node2 to
@@ -527,7 +527,7 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		btree_s = &node2->btree[0];
 		btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)];
 		memcpy(btree_d, btree_s, tmp);
-		be16_add(&node1->hdr.count, count);
+		be16_add_cpu(&node1->hdr.count, count);
 		xfs_da_log_buf(tp, blk1->bp,
 			XFS_DA_LOGRANGE(node1, btree_d, tmp));
 
@@ -539,7 +539,7 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		btree_s = &node2->btree[count];
 		btree_d = &node2->btree[0];
 		memmove(btree_d, btree_s, tmp);
-		be16_add(&node2->hdr.count, -count);
+		be16_add_cpu(&node2->hdr.count, -count);
 	}
 
 	/*
@@ -604,7 +604,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	btree->before = cpu_to_be32(newblk->blkno);
 	xfs_da_log_buf(state->args->trans, oldblk->bp,
 		XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
-	be16_add(&node->hdr.count, 1);
+	be16_add_cpu(&node->hdr.count, 1);
 	xfs_da_log_buf(state->args->trans, oldblk->bp,
 		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
 
@@ -959,7 +959,7 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
 	memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
 	xfs_da_log_buf(state->args->trans, drop_blk->bp,
 	    XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
-	be16_add(&node->hdr.count, -1);
+	be16_add_cpu(&node->hdr.count, -1);
 	xfs_da_log_buf(state->args->trans, drop_blk->bp,
 	    XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
 
@@ -1018,7 +1018,7 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 	 */
 	tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
 	memcpy(btree, &drop_node->btree[0], tmp);
-	be16_add(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count));
+	be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count));
 
 	xfs_da_log_buf(tp, save_blk->bp,
 		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index a5f4f4fb8868..fb5a556725b3 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -271,7 +271,7 @@ xfs_dir2_block_addname(
 		}
 		lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
 		lfloghigh -= be32_to_cpu(btp->stale) - 1;
-		be32_add(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+		be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
 		xfs_dir2_data_make_free(tp, bp,
 			(xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
 			(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
@@ -326,7 +326,7 @@ xfs_dir2_block_addname(
 		/*
 		 * Update the tail (entry count).
 		 */
-		be32_add(&btp->count, 1);
+		be32_add_cpu(&btp->count, 1);
 		/*
 		 * If we now need to rebuild the bestfree map, do so.
 		 * This needs to happen before the next call to use_free.
@@ -387,7 +387,7 @@ xfs_dir2_block_addname(
 			lfloglow = MIN(mid, lfloglow);
 			lfloghigh = MAX(highstale, lfloghigh);
 		}
-		be32_add(&btp->stale, -1);
+		be32_add_cpu(&btp->stale, -1);
 	}
 	/*
 	 * Point to the new data entry.
@@ -767,7 +767,7 @@ xfs_dir2_block_removename(
 	/*
 	 * Fix up the block tail.
 	 */
-	be32_add(&btp->stale, 1);
+	be32_add_cpu(&btp->stale, 1);
 	xfs_dir2_block_log_tail(tp, bp);
 	/*
 	 * Remove the leaf entry by marking it stale.
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index d2452699e9b1..fb8c9e08b23d 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -587,7 +587,7 @@ xfs_dir2_data_make_free(
 		/*
 		 * Fix up the new big freespace.
 		 */
-		be16_add(&prevdup->length, len + be16_to_cpu(postdup->length));
+		be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
 		*xfs_dir2_data_unused_tag_p(prevdup) =
 			cpu_to_be16((char *)prevdup - (char *)d);
 		xfs_dir2_data_log_unused(tp, bp, prevdup);
@@ -621,7 +621,7 @@ xfs_dir2_data_make_free(
 	 */
 	else if (prevdup) {
 		dfp = xfs_dir2_data_freefind(d, prevdup);
-		be16_add(&prevdup->length, len);
+		be16_add_cpu(&prevdup->length, len);
 		*xfs_dir2_data_unused_tag_p(prevdup) =
 			cpu_to_be16((char *)prevdup - (char *)d);
 		xfs_dir2_data_log_unused(tp, bp, prevdup);
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0ca0020ba09f..bc52b803d79b 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -359,7 +359,7 @@ xfs_dir2_leaf_addname(
 			bestsp--;
 			memmove(&bestsp[0], &bestsp[1],
 				be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
-			be32_add(&ltp->bestcount, 1);
+			be32_add_cpu(&ltp->bestcount, 1);
 			xfs_dir2_leaf_log_tail(tp, lbp);
 			xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
 		}
@@ -445,7 +445,7 @@ xfs_dir2_leaf_addname(
 		 */
 		lfloglow = index;
 		lfloghigh = be16_to_cpu(leaf->hdr.count);
-		be16_add(&leaf->hdr.count, 1);
+		be16_add_cpu(&leaf->hdr.count, 1);
 	}
 	/*
 	 * There are stale entries.
@@ -523,7 +523,7 @@ xfs_dir2_leaf_addname(
 			lfloglow = MIN(index, lfloglow);
 			lfloghigh = MAX(highstale, lfloghigh);
 		}
-		be16_add(&leaf->hdr.stale, -1);
+		be16_add_cpu(&leaf->hdr.stale, -1);
 	}
 	/*
 	 * Fill in the new leaf entry.
@@ -626,7 +626,7 @@ xfs_dir2_leaf_compact(
 	 * Update and log the header, log the leaf entries.
 	 */
 	ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to);
-	be16_add(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale)));
+	be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale)));
 	leaf->hdr.stale = 0;
 	xfs_dir2_leaf_log_header(args->trans, bp);
 	if (loglow != -1)
@@ -728,7 +728,7 @@ xfs_dir2_leaf_compact_x1(
 	/*
 	 * Adjust the leaf header values.
 	 */
-	be16_add(&leaf->hdr.count, -(from - to));
+	be16_add_cpu(&leaf->hdr.count, -(from - to));
 	leaf->hdr.stale = cpu_to_be16(1);
 	/*
 	 * Remember the low/high stale value only in the "right"
@@ -1470,7 +1470,7 @@ xfs_dir2_leaf_removename(
 	/*
 	 * We just mark the leaf entry stale by putting a null in it.
 	 */
-	be16_add(&leaf->hdr.stale, 1);
+	be16_add_cpu(&leaf->hdr.stale, 1);
 	xfs_dir2_leaf_log_header(tp, lbp);
 	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
 	xfs_dir2_leaf_log_ents(tp, lbp, index, index);
@@ -1531,7 +1531,7 @@ xfs_dir2_leaf_removename(
 			 */
 			memmove(&bestsp[db - i], bestsp,
 				(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
-			be32_add(&ltp->bestcount, -(db - i));
+			be32_add_cpu(&ltp->bestcount, -(db - i));
 			xfs_dir2_leaf_log_tail(tp, lbp);
 			xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
 		} else
@@ -1712,7 +1712,7 @@ xfs_dir2_leaf_trim_data(
 	 * Eliminate the last bests entry from the table.
 	 */
 	bestsp = xfs_dir2_leaf_bests_p(ltp);
-	be32_add(&ltp->bestcount, -1);
+	be32_add_cpu(&ltp->bestcount, -1);
 	memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
 	xfs_dir2_leaf_log_tail(tp, lbp);
 	xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index eb18e399e836..8dade711f099 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -254,7 +254,7 @@ xfs_dir2_leafn_add(
 				(be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep));
 		lfloglow = index;
 		lfloghigh = be16_to_cpu(leaf->hdr.count);
-		be16_add(&leaf->hdr.count, 1);
+		be16_add_cpu(&leaf->hdr.count, 1);
 	}
 	/*
 	 * There are stale entries.  We'll use one for the new entry.
@@ -322,7 +322,7 @@ xfs_dir2_leafn_add(
 			lfloglow = MIN(index, lfloglow);
 			lfloghigh = MAX(highstale, lfloghigh);
 		}
-		be16_add(&leaf->hdr.stale, -1);
+		be16_add_cpu(&leaf->hdr.stale, -1);
 	}
 	/*
 	 * Insert the new entry, log everything.
@@ -697,10 +697,10 @@ xfs_dir2_leafn_moveents(
 	/*
 	 * Update the headers and log them.
 	 */
-	be16_add(&leaf_s->hdr.count, -(count));
-	be16_add(&leaf_s->hdr.stale, -(stale));
-	be16_add(&leaf_d->hdr.count, count);
-	be16_add(&leaf_d->hdr.stale, stale);
+	be16_add_cpu(&leaf_s->hdr.count, -(count));
+	be16_add_cpu(&leaf_s->hdr.stale, -(stale));
+	be16_add_cpu(&leaf_d->hdr.count, count);
+	be16_add_cpu(&leaf_d->hdr.stale, stale);
 	xfs_dir2_leaf_log_header(tp, bp_s);
 	xfs_dir2_leaf_log_header(tp, bp_d);
 	xfs_dir2_leafn_check(args->dp, bp_s);
@@ -885,7 +885,7 @@ xfs_dir2_leafn_remove(
 	 * Kill the leaf entry by marking it stale.
 	 * Log the leaf block changes.
 	 */
-	be16_add(&leaf->hdr.stale, 1);
+	be16_add_cpu(&leaf->hdr.stale, 1);
 	xfs_dir2_leaf_log_header(tp, bp);
 	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
 	xfs_dir2_leaf_log_ents(tp, bp, index, index);
@@ -971,7 +971,7 @@ xfs_dir2_leafn_remove(
 			/*
 			 * One less used entry in the free table.
 			 */
-			be32_add(&free->hdr.nused, -1);
+			be32_add_cpu(&free->hdr.nused, -1);
 			xfs_dir2_free_log_header(tp, fbp);
 			/*
 			 * If this was the last entry in the table, we can
@@ -1642,7 +1642,7 @@ xfs_dir2_node_addname_int(
 		 * (this should always be true) then update the header.
 		 */
 		if (be16_to_cpu(free->bests[findex]) == NULLDATAOFF) {
-			be32_add(&free->hdr.nused, 1);
+			be32_add_cpu(&free->hdr.nused, 1);
 			xfs_dir2_free_log_header(tp, fbp);
 		}
 		/*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b8de7f3cc17e..eadc1591c795 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -318,7 +318,7 @@ xfs_growfs_data_private(
 		}
 		ASSERT(bp);
 		agi = XFS_BUF_TO_AGI(bp);
-		be32_add(&agi->agi_length, new);
+		be32_add_cpu(&agi->agi_length, new);
 		ASSERT(nagcount == oagcount ||
 		       be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
 		xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
@@ -331,7 +331,7 @@ xfs_growfs_data_private(
 		}
 		ASSERT(bp);
 		agf = XFS_BUF_TO_AGF(bp);
-		be32_add(&agf->agf_length, new);
+		be32_add_cpu(&agf->agf_length, new);
 		ASSERT(be32_to_cpu(agf->agf_length) ==
 		       be32_to_cpu(agi->agi_length));
 		xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 1409c2d61c11..c5836b951d0c 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -301,8 +301,8 @@ xfs_ialloc_ag_alloc(
 		}
 		xfs_trans_inode_alloc_buf(tp, fbuf);
 	}
-	be32_add(&agi->agi_count, newlen);
-	be32_add(&agi->agi_freecount, newlen);
+	be32_add_cpu(&agi->agi_count, newlen);
+	be32_add_cpu(&agi->agi_freecount, newlen);
 	agno = be32_to_cpu(agi->agi_seqno);
 	down_read(&args.mp->m_peraglock);
 	args.mp->m_perag[agno].pagi_freecount += newlen;
@@ -885,7 +885,7 @@ xfs_dialloc(
 	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
 			rec.ir_free)))
 		goto error0;
-	be32_add(&agi->agi_freecount, -1);
+	be32_add_cpu(&agi->agi_freecount, -1);
 	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
 	down_read(&mp->m_peraglock);
 	mp->m_perag[tagno].pagi_freecount--;
@@ -1065,8 +1065,8 @@ xfs_difree(
 		 * to be freed when the transaction is committed.
 		 */
 		ilen = XFS_IALLOC_INODES(mp);
-		be32_add(&agi->agi_count, -ilen);
-		be32_add(&agi->agi_freecount, -(ilen - 1));
+		be32_add_cpu(&agi->agi_count, -ilen);
+		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
 		down_read(&mp->m_peraglock);
 		mp->m_perag[agno].pagi_freecount -= ilen - 1;
@@ -1095,7 +1095,7 @@ xfs_difree(
 		/* 
 		 * Change the inode free counts and log the ag/sb changes.
 		 */
-		be32_add(&agi->agi_freecount, 1);
+		be32_add_cpu(&agi->agi_freecount, 1);
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
 		down_read(&mp->m_peraglock);
 		mp->m_perag[agno].pagi_freecount++;
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 8cdeeaf8632b..e5310c90e50f 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -189,7 +189,7 @@ xfs_inobt_delrec(
 			 */
 			bno = be32_to_cpu(agi->agi_root);
 			agi->agi_root = *pp;
-			be32_add(&agi->agi_level, -1);
+			be32_add_cpu(&agi->agi_level, -1);
 			/*
 			 * Free the block.
 			 */
@@ -1132,7 +1132,7 @@ xfs_inobt_lshift(
 	/*
 	 * Bump and log left's numrecs, decrement and log right's numrecs.
 	 */
-	be16_add(&left->bb_numrecs, 1);
+	be16_add_cpu(&left->bb_numrecs, 1);
 	xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
 #ifdef DEBUG
 	if (level > 0)
@@ -1140,7 +1140,7 @@ xfs_inobt_lshift(
 	else
 		xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
 #endif
-	be16_add(&right->bb_numrecs, -1);
+	be16_add_cpu(&right->bb_numrecs, -1);
 	xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
 	/*
 	 * Slide the contents of right down one entry.
@@ -1232,7 +1232,7 @@ xfs_inobt_newroot(
 	 * Set the root data in the a.g. inode structure.
 	 */
 	agi->agi_root = cpu_to_be32(args.agbno);
-	be32_add(&agi->agi_level, 1);
+	be32_add_cpu(&agi->agi_level, 1);
 	xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp,
 		XFS_AGI_ROOT | XFS_AGI_LEVEL);
 	/*
@@ -1426,9 +1426,9 @@ xfs_inobt_rshift(
 	/*
 	 * Decrement and log left's numrecs, bump and log right's numrecs.
 	 */
-	be16_add(&left->bb_numrecs, -1);
+	be16_add_cpu(&left->bb_numrecs, -1);
 	xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-	be16_add(&right->bb_numrecs, 1);
+	be16_add_cpu(&right->bb_numrecs, 1);
 #ifdef DEBUG
 	if (level > 0)
 		xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
@@ -1529,7 +1529,7 @@ xfs_inobt_split(
 	 */
 	if ((be16_to_cpu(left->bb_numrecs) & 1) &&
 	    cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-		be16_add(&right->bb_numrecs, 1);
+		be16_add_cpu(&right->bb_numrecs, 1);
 	i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
 	/*
 	 * For non-leaf blocks, copy keys and addresses over to the new block.
@@ -1565,7 +1565,7 @@ xfs_inobt_split(
 	 * Find the left block number by looking in the buffer.
 	 * Adjust numrecs, sibling pointers.
 	 */
-	be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
+	be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
 	right->bb_rightsib = left->bb_rightsib;
 	left->bb_rightsib = cpu_to_be32(args.agbno);
 	right->bb_leftsib = cpu_to_be32(lbno);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b3ac3805d3c4..a75edca1860f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1509,9 +1509,9 @@ xlog_sync(xlog_t		*log,
 		 * case, though.
 		 */
 		for (i = 0; i < split; i += BBSIZE) {
-			be32_add((__be32 *)dptr, 1);
+			be32_add_cpu((__be32 *)dptr, 1);
 			if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
-				be32_add((__be32 *)dptr, 1);
+				be32_add_cpu((__be32 *)dptr, 1);
 			dptr += BBSIZE;
 		}
 
@@ -1600,7 +1600,7 @@ xlog_state_finish_copy(xlog_t		*log,
 {
 	spin_lock(&log->l_icloglock);
 
-	be32_add(&iclog->ic_header.h_num_logops, record_cnt);
+	be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
 	iclog->ic_offset += copy_bytes;
 
 	spin_unlock(&log->l_icloglock);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 71e4c8dcc69b..140386434aa3 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -567,26 +567,26 @@ xfs_trans_apply_sb_deltas(
 	 */
 	if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) {
 		if (tp->t_icount_delta)
-			be64_add(&sbp->sb_icount, tp->t_icount_delta);
+			be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
 		if (tp->t_ifree_delta)
-			be64_add(&sbp->sb_ifree, tp->t_ifree_delta);
+			be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta);
 		if (tp->t_fdblocks_delta)
-			be64_add(&sbp->sb_fdblocks, tp->t_fdblocks_delta);
+			be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta);
 		if (tp->t_res_fdblocks_delta)
-			be64_add(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
+			be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
 	}
 
 	if (tp->t_frextents_delta)
-		be64_add(&sbp->sb_frextents, tp->t_frextents_delta);
+		be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
 	if (tp->t_res_frextents_delta)
-		be64_add(&sbp->sb_frextents, tp->t_res_frextents_delta);
+		be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
 
 	if (tp->t_dblocks_delta) {
-		be64_add(&sbp->sb_dblocks, tp->t_dblocks_delta);
+		be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
 		whole = 1;
 	}
 	if (tp->t_agcount_delta) {
-		be32_add(&sbp->sb_agcount, tp->t_agcount_delta);
+		be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta);
 		whole = 1;
 	}
 	if (tp->t_imaxpct_delta) {
@@ -594,19 +594,19 @@ xfs_trans_apply_sb_deltas(
 		whole = 1;
 	}
 	if (tp->t_rextsize_delta) {
-		be32_add(&sbp->sb_rextsize, tp->t_rextsize_delta);
+		be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta);
 		whole = 1;
 	}
 	if (tp->t_rbmblocks_delta) {
-		be32_add(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta);
+		be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta);
 		whole = 1;
 	}
 	if (tp->t_rblocks_delta) {
-		be64_add(&sbp->sb_rblocks, tp->t_rblocks_delta);
+		be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
 		whole = 1;
 	}
 	if (tp->t_rextents_delta) {
-		be64_add(&sbp->sb_rextents, tp->t_rextents_delta);
+		be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta);
 		whole = 1;
 	}
 	if (tp->t_rextslog_delta) {

From f1a5955d90981c602ab77a8a181a0aa0f4f12cd9 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 13 Feb 2008 15:03:29 -0800
Subject: [PATCH 58/66] docbook: drop z85230 library from kernel-api

Drop z85230 support library info from kernel-api since it's duplicated in
the Z85230 book.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DocBook/kernel-api.tmpl | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 7e054c9124e6..f31601e8bd89 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -449,11 +449,6 @@ X!Isound/sound_firmware.c
 !Edrivers/serial/8250.c
   </chapter>
 
-  <chapter id="z85230">
-     <title>Z85230 Support Library</title>
-!Edrivers/net/wan/z85230.c
-  </chapter>
-
   <chapter id="fbdev">
      <title>Frame Buffer Library</title>
 

From ed58f8027945f1cf415bfe3805e1fa3fe8ed9edf Mon Sep 17 00:00:00 2001
From: Sergio Luis <sergio@uece.br>
Date: Wed, 13 Feb 2008 15:03:30 -0800
Subject: [PATCH 59/66] fs/smbfs/inode.c: fix warning message deprecating smbfs

Fix the warning message regarding smbfs to

"smbfs is deprecated and will be removed from the 2.6.27 kernel. Please migrate to cifs"

instead of

"smbfs is deprecated and will be removedfrom the 2.6.27 kernel.  Please migrate to cifs"

Signed-off-by: Sergio Luis <sergio@uece.br>
Screwed-up-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Steven French <sfrench@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/smbfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 4e5c22ca802e..376ef3ee6ed7 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -505,7 +505,7 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (warn_count < 5) {
 		warn_count++;
 		printk(KERN_EMERG "smbfs is deprecated and will be removed"
-			"from the 2.6.27 kernel.  Please migrate to cifs\n");
+			" from the 2.6.27 kernel. Please migrate to cifs\n");
 	}
 
 	if (!raw_data)

From ac76cff2ecd73944473a437cd87770f812635025 Mon Sep 17 00:00:00 2001
From: Michael Opdenacker <michael-lists@free-electrons.com>
Date: Wed, 13 Feb 2008 15:03:32 -0800
Subject: [PATCH 60/66] Documentation: sysctl/kernel.txt: fix documentation
 reference

This patch fixes a reference to Documentation/kmod.txt
which was apparently renamed to Documentation/debugging-modules.txt

Signed-off-by: Michael Opdenacker <michael@free-electrons.com>
Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index dc8801d4e944..276a7e637822 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -29,7 +29,7 @@ show up in /proc/sys/kernel:
 - java-interpreter            [ binfmt_java, obsolete ]
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
-- modprobe                    ==> Documentation/kmod.txt
+- modprobe                    ==> Documentation/debugging-modules.txt
 - msgmax
 - msgmnb
 - msgmni

From e28d80f18211e5d49e450ba0f07b8fdca6dfb83b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 13 Feb 2008 15:03:33 -0800
Subject: [PATCH 61/66] udf: fix directory offset handling

Patch cleaning up UDF directory offset handling missed modifications in dir.c
(because I've submitted an old version :(). Fix it.

Signed-off-by: Jan Kara <jack@suse.cz>
Reported-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Tested-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/udf/dir.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 4b44e23caa12..8d8643ada199 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -43,13 +43,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	struct fileIdentDesc *fi = NULL;
 	struct fileIdentDesc cfi;
 	int block, iblock;
-	loff_t nf_pos = filp->f_pos - 1;
+	loff_t nf_pos = (filp->f_pos - 1) << 2;
 	int flen;
 	char fname[UDF_NAME_LEN];
 	char *nameptr;
 	uint16_t liu;
 	uint8_t lfi;
-	loff_t size = (udf_ext0_offset(dir) + dir->i_size) >> 2;
+	loff_t size = udf_ext0_offset(dir) + dir->i_size;
 	struct buffer_head *tmp, *bha[16];
 	kernel_lb_addr eloc;
 	uint32_t elen;
@@ -63,13 +63,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 		return 0;
 
 	if (nf_pos == 0)
-		nf_pos = (udf_ext0_offset(dir) >> 2);
+		nf_pos = udf_ext0_offset(dir);
 
-	fibh.soffset = fibh.eoffset = (nf_pos & ((dir->i_sb->s_blocksize - 1) >> 2)) << 2;
+	fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
 	iinfo = UDF_I(dir);
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
 		fibh.sbh = fibh.ebh = NULL;
-	} else if (inode_bmap(dir, nf_pos >> (dir->i_sb->s_blocksize_bits - 2),
+	} else if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
 			      &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) {
 		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
@@ -111,7 +111,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	}
 
 	while (nf_pos < size) {
-		filp->f_pos = nf_pos + 1;
+		filp->f_pos = (nf_pos >> 2) + 1;
 
 		fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
 					&elen, &offset);
@@ -178,7 +178,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 		}
 	} /* end while */
 
-	filp->f_pos = nf_pos + 1;
+	filp->f_pos = (nf_pos >> 2) + 1;
 
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);

From cba44359d15ac7a3bca2c9199b7ff403d7edc69e Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Wed, 13 Feb 2008 15:03:33 -0800
Subject: [PATCH 62/66] udf: fix udf_add_free_space

In commit 742ba02a51c8d0bf5446b154531179760c1ed0a2 (udf: create common
function for changing free space counter) by accident I reversed safety
condition which lead to null pointer dereference in case of media error and
wrong counting of free space in normal situation

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/udf/balloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index d721a1af1972..f855dcbbdfb8 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -145,7 +145,7 @@ static bool udf_add_free_space(struct udf_sb_info *sbi,
 {
 	struct logicalVolIntegrityDesc *lvid;
 
-	if (sbi->s_lvid_bh)
+	if (sbi->s_lvid_bh == NULL)
 		return false;
 
 	lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;

From 2e1d146a19f2941aec08f60ca67fb2763baad595 Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <darwish.07@gmail.com>
Date: Wed, 13 Feb 2008 15:03:34 -0800
Subject: [PATCH 63/66] Smack: check for 'struct socket' with NULL sk

There's a small problem with smack and NFS. A similar report was also
sent here: http://lkml.org/lkml/2007/10/27/85

I've also added similar checks in inode_{get/set}security().  Cheating from
SELinux post_create_socket(), it does the same.

[akpm@linux-foundation.org: remove uneeded BUG_ON()]
Signed-off-by: Ahmed S. Darwish <darwish.07@gmail.com>
Acked-by: Casey Schaufler <casey@schuafler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 security/smack/smack_lsm.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 1c11e4245859..5b690482f8cb 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -701,7 +701,7 @@ static int smack_inode_getsecurity(const struct inode *inode,
 		return -EOPNOTSUPP;
 
 	sock = SOCKET_I(ip);
-	if (sock == NULL)
+	if (sock == NULL || sock->sk == NULL)
 		return -EOPNOTSUPP;
 
 	ssp = sock->sk->sk_security;
@@ -1280,10 +1280,11 @@ static void smack_to_secattr(char *smack, struct netlbl_lsm_secattr *nlsp)
  */
 static int smack_netlabel(struct sock *sk)
 {
-	struct socket_smack *ssp = sk->sk_security;
+	struct socket_smack *ssp;
 	struct netlbl_lsm_secattr secattr;
 	int rc = 0;
 
+	ssp = sk->sk_security;
 	netlbl_secattr_init(&secattr);
 	smack_to_secattr(ssp->smk_out, &secattr);
 	if (secattr.flags != NETLBL_SECATTR_NONE)
@@ -1331,7 +1332,7 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name,
 		return -EOPNOTSUPP;
 
 	sock = SOCKET_I(inode);
-	if (sock == NULL)
+	if (sock == NULL || sock->sk == NULL)
 		return -EOPNOTSUPP;
 
 	ssp = sock->sk->sk_security;
@@ -1362,7 +1363,7 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name,
 static int smack_socket_post_create(struct socket *sock, int family,
 				    int type, int protocol, int kern)
 {
-	if (family != PF_INET)
+	if (family != PF_INET || sock->sk == NULL)
 		return 0;
 	/*
 	 * Set the outbound netlbl.

From 9170d2f6e1dc4d79650fbf492d1cd45291c66504 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 Feb 2008 15:03:36 -0800
Subject: [PATCH 64/66] pcmcia: ipwireless depends on NETDEVICES

ipwireless (added by 099dc4fb62653f6019d78db55fba7a18ef02d65b) is clearly
a net device:

drivers/built-in.o: In function `ipwireless_ppp_start_xmit':
/home/pmundt/devel/git/sh-2.6.25/drivers/char/pcmcia/ipwireless/network.c:165: undefined reference to `skb_under_panic'
/home/pmundt/devel/git/sh-2.6.25/drivers/char/pcmcia/ipwireless/network.c:165: undefined reference to `kfree_skb'
drivers/built-in.o: In function `ipwireless_network_packet_received':
/home/pmundt/devel/git/sh-2.6.25/drivers/char/pcmcia/ipwireless/network.c:377: undefined reference to `__alloc_skb'
/home/pmundt/devel/git/sh-2.6.25/drivers/char/pcmcia/ipwireless/network.c:377: undefined reference to `skb_over_panic'
drivers/built-in.o: In function `ppp_shutdown_interface':
/home/pmundt/devel/git/sh-2.6.25/drivers/net/ppp_generic.c:2517: undefined reference to `unregister_netdev'
/home/pmundt/devel/git/sh-2.6.25/drivers/net/ppp_generic.c:2517: undefined reference to `free_netdev'
[ ... and many more ... ]

select strikes again. ipwireless selects PPP which in turn tries to select
SLHC, both of which are technically "protected" by an if NETDEVICES
in drivers/net/Kconfig. This leads to .config hilarity, with net suddenly
ending up in the SCSI menu:

	#
	# SCSI device support
	#
	# CONFIG_SCSI_DMA is not set
	# CONFIG_SCSI_NETLINK is not set
	CONFIG_PPP=y
	# CONFIG_PHONE is not set

Curiously the SLHC select from PPP doesn't seem to happen, as there's no
CONFIG_SLHC=y (only CONFIG_PPP=y gets set) -- Kconfig bug? Caught with a
randconfig.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/pcmcia/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/pcmcia/Kconfig b/drivers/char/pcmcia/Kconfig
index 00b8a84b0319..ffa0efce0aed 100644
--- a/drivers/char/pcmcia/Kconfig
+++ b/drivers/char/pcmcia/Kconfig
@@ -45,7 +45,7 @@ config CARDMAN_4040
 
 config IPWIRELESS
 	tristate "IPWireless 3G UMTS PCMCIA card support"
-	depends on PCMCIA
+	depends on PCMCIA && NETDEVICES
 	select PPP
 	help
 	  This is a driver for 3G UMTS PCMCIA card from IPWireless company. In

From fb40bd78b0f91b274879cf5db8facd1e04b6052e Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Wed, 13 Feb 2008 15:03:37 -0800
Subject: [PATCH 65/66] Linux Kernel Markers: support multiple probes

RCU style multiple probes support for the Linux Kernel Markers.  Common case
(one probe) is still fast and does not require dynamic allocation or a
supplementary pointer dereference on the fast path.

- Move preempt disable from the marker site to the callback.

Since we now have an internal callback, move the preempt disable/enable to the
callback instead of the marker site.

Since the callback change is done asynchronously (passing from a handler that
supports arguments to a handler that does not setup the arguments is no
arguments are passed), we can safely update it even if it is outside the
preempt disable section.

- Move probe arm to probe connection. Now, a connected probe is automatically
  armed.

Remove MARK_MAX_FORMAT_LEN, unused.

This patch modifies the Linux Kernel Markers API : it removes the probe
"arm/disarm" and changes the probe function prototype : it now expects a
va_list * instead of a "...".

If we want to have more than one probe connected to a marker at a given
time (LTTng, or blktrace, ssytemtap) then we need this patch. Without it,
connecting a second probe handler to a marker will fail.

It allow us, for instance, to do interesting combinations :

Do standard tracing with LTTng and, eventually, to compute statistics
with SystemTAP, or to have a special trigger on an event that would call
a systemtap script which would stop flight recorder tracing.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Mason <mmlnx@us.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: David Smith <dsmith@redhat.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: "Frank Ch. Eigler" <fche@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/cell/spufs/sputrace.c |  31 +-
 include/linux/marker.h                       |  59 +-
 include/linux/module.h                       |   2 +-
 kernel/marker.c                              | 677 ++++++++++++++-----
 kernel/module.c                              |   7 +-
 samples/markers/probe-example.c              |  25 +-
 6 files changed, 565 insertions(+), 236 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.c b/arch/powerpc/platforms/cell/spufs/sputrace.c
index 2b1953f6f12e..01974f7776e1 100644
--- a/arch/powerpc/platforms/cell/spufs/sputrace.c
+++ b/arch/powerpc/platforms/cell/spufs/sputrace.c
@@ -146,34 +146,28 @@ static void sputrace_log_item(const char *name, struct spu_context *ctx,
 	wake_up(&sputrace_wait);
 }
 
-static void spu_context_event(const struct marker *mdata,
-		void *private, const char *format, ...)
+static void spu_context_event(void *probe_private, void *call_data,
+		const char *format, va_list *args)
 {
-	struct spu_probe *p = mdata->private;
-	va_list ap;
+	struct spu_probe *p = probe_private;
 	struct spu_context *ctx;
 	struct spu *spu;
 
-	va_start(ap, format);
-	ctx = va_arg(ap, struct spu_context *);
-	spu = va_arg(ap, struct spu *);
+	ctx = va_arg(*args, struct spu_context *);
+	spu = va_arg(*args, struct spu *);
 
 	sputrace_log_item(p->name, ctx, spu);
-	va_end(ap);
 }
 
-static void spu_context_nospu_event(const struct marker *mdata,
-		void *private, const char *format, ...)
+static void spu_context_nospu_event(void *probe_private, void *call_data,
+		const char *format, va_list *args)
 {
-	struct spu_probe *p = mdata->private;
-	va_list ap;
+	struct spu_probe *p = probe_private;
 	struct spu_context *ctx;
 
-	va_start(ap, format);
-	ctx = va_arg(ap, struct spu_context *);
+	ctx = va_arg(*args, struct spu_context *);
 
 	sputrace_log_item(p->name, ctx, NULL);
-	va_end(ap);
 }
 
 struct spu_probe spu_probes[] = {
@@ -219,10 +213,6 @@ static int __init sputrace_init(void)
 		if (error)
 			printk(KERN_INFO "Unable to register probe %s\n",
 					p->name);
-
-		error = marker_arm(p->name);
-		if (error)
-			printk(KERN_INFO "Unable to arm probe %s\n", p->name);
 	}
 
 	return 0;
@@ -238,7 +228,8 @@ static void __exit sputrace_exit(void)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(spu_probes); i++)
-		marker_probe_unregister(spu_probes[i].name);
+		marker_probe_unregister(spu_probes[i].name,
+			spu_probes[i].probe_func, &spu_probes[i]);
 
 	remove_proc_entry("sputrace", NULL);
 	kfree(sputrace_log);
diff --git a/include/linux/marker.h b/include/linux/marker.h
index 5f36cf946bcb..b5f95637f289 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -19,16 +19,23 @@ struct marker;
 
 /**
  * marker_probe_func - Type of a marker probe function
- * @mdata: pointer of type struct marker
- * @private_data: caller site private data
+ * @probe_private: probe private data
+ * @call_private: call site private data
  * @fmt: format string
- * @...: variable argument list
+ * @args: variable argument list pointer. Use a pointer to overcome C's
+ *        inability to pass this around as a pointer in a portable manner in
+ *        the callee otherwise.
  *
  * Type of marker probe functions. They receive the mdata and need to parse the
  * format string to recover the variable argument list.
  */
-typedef void marker_probe_func(const struct marker *mdata,
-	void *private_data, const char *fmt, ...);
+typedef void marker_probe_func(void *probe_private, void *call_private,
+		const char *fmt, va_list *args);
+
+struct marker_probe_closure {
+	marker_probe_func *func;	/* Callback */
+	void *probe_private;		/* Private probe data */
+};
 
 struct marker {
 	const char *name;	/* Marker name */
@@ -36,8 +43,11 @@ struct marker {
 				 * variable argument list.
 				 */
 	char state;		/* Marker state. */
-	marker_probe_func *call;/* Probe handler function pointer */
-	void *private;		/* Private probe data */
+	char ptype;		/* probe type : 0 : single, 1 : multi */
+	void (*call)(const struct marker *mdata,	/* Probe wrapper */
+		void *call_private, const char *fmt, ...);
+	struct marker_probe_closure single;
+	struct marker_probe_closure *multi;
 } __attribute__((aligned(8)));
 
 #ifdef CONFIG_MARKERS
@@ -49,7 +59,7 @@ struct marker {
  * not add unwanted padding between the beginning of the section and the
  * structure. Force alignment to the same alignment as the section start.
  */
-#define __trace_mark(name, call_data, format, args...)			\
+#define __trace_mark(name, call_private, format, args...)		\
 	do {								\
 		static const char __mstrtab_name_##name[]		\
 		__attribute__((section("__markers_strings")))		\
@@ -60,24 +70,23 @@ struct marker {
 		static struct marker __mark_##name			\
 		__attribute__((section("__markers"), aligned(8))) =	\
 		{ __mstrtab_name_##name, __mstrtab_format_##name,	\
-		0, __mark_empty_function, NULL };			\
+		0, 0, marker_probe_cb,					\
+		{ __mark_empty_function, NULL}, NULL };			\
 		__mark_check_format(format, ## args);			\
 		if (unlikely(__mark_##name.state)) {			\
-			preempt_disable();				\
 			(*__mark_##name.call)				\
-				(&__mark_##name, call_data,		\
+				(&__mark_##name, call_private,		\
 				format, ## args);			\
-			preempt_enable();				\
 		}							\
 	} while (0)
 
 extern void marker_update_probe_range(struct marker *begin,
-	struct marker *end, struct module *probe_module, int *refcount);
+	struct marker *end);
 #else /* !CONFIG_MARKERS */
-#define __trace_mark(name, call_data, format, args...) \
+#define __trace_mark(name, call_private, format, args...) \
 		__mark_check_format(format, ## args)
 static inline void marker_update_probe_range(struct marker *begin,
-	struct marker *end, struct module *probe_module, int *refcount)
+	struct marker *end)
 { }
 #endif /* CONFIG_MARKERS */
 
@@ -92,8 +101,6 @@ static inline void marker_update_probe_range(struct marker *begin,
 #define trace_mark(name, format, args...) \
 	__trace_mark(name, NULL, format, ## args)
 
-#define MARK_MAX_FORMAT_LEN	1024
-
 /**
  * MARK_NOARGS - Format string for a marker with no argument.
  */
@@ -106,24 +113,30 @@ static inline void __printf(1, 2) __mark_check_format(const char *fmt, ...)
 
 extern marker_probe_func __mark_empty_function;
 
+extern void marker_probe_cb(const struct marker *mdata,
+	void *call_private, const char *fmt, ...);
+extern void marker_probe_cb_noarg(const struct marker *mdata,
+	void *call_private, const char *fmt, ...);
+
 /*
  * Connect a probe to a marker.
  * private data pointer must be a valid allocated memory address, or NULL.
  */
 extern int marker_probe_register(const char *name, const char *format,
-				marker_probe_func *probe, void *private);
+				marker_probe_func *probe, void *probe_private);
 
 /*
  * Returns the private data given to marker_probe_register.
  */
-extern void *marker_probe_unregister(const char *name);
+extern int marker_probe_unregister(const char *name,
+	marker_probe_func *probe, void *probe_private);
 /*
  * Unregister a marker by providing the registered private data.
  */
-extern void *marker_probe_unregister_private_data(void *private);
+extern int marker_probe_unregister_private_data(marker_probe_func *probe,
+	void *probe_private);
 
-extern int marker_arm(const char *name);
-extern int marker_disarm(const char *name);
-extern void *marker_get_private_data(const char *name);
+extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
+	int num);
 
 #endif
diff --git a/include/linux/module.h b/include/linux/module.h
index ac28e8761e84..330bec08c2c4 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -465,7 +465,7 @@ int unregister_module_notifier(struct notifier_block * nb);
 
 extern void print_modules(void);
 
-extern void module_update_markers(struct module *probe_module, int *refcount);
+extern void module_update_markers(void);
 
 #else /* !CONFIG_MODULES... */
 #define EXPORT_SYMBOL(sym)
diff --git a/kernel/marker.c b/kernel/marker.c
index 5323cfaedbce..c4c2cd8b61f5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -27,22 +27,15 @@
 extern struct marker __start___markers[];
 extern struct marker __stop___markers[];
 
+/* Set to 1 to enable marker debug output */
+const int marker_debug;
+
 /*
  * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
- * and module markers, the hash table and deferred_sync.
+ * and module markers and the hash table.
  */
 static DEFINE_MUTEX(markers_mutex);
 
-/*
- * Marker deferred synchronization.
- * Upon marker probe_unregister, we delay call to synchronize_sched() to
- * accelerate mass unregistration (only when there is no more reference to a
- * given module do we call synchronize_sched()). However, we need to make sure
- * every critical region has ended before we re-arm a marker that has been
- * unregistered and then registered back with a different probe data.
- */
-static int deferred_sync;
-
 /*
  * Marker hash table, containing the active markers.
  * Protected by module_mutex.
@@ -50,12 +43,26 @@ static int deferred_sync;
 #define MARKER_HASH_BITS 6
 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
 
+/*
+ * Note about RCU :
+ * It is used to make sure every handler has finished using its private data
+ * between two consecutive operation (add or remove) on a given marker.  It is
+ * also used to delay the free of multiple probes array until a quiescent state
+ * is reached.
+ * marker entries modifications are protected by the markers_mutex.
+ */
 struct marker_entry {
 	struct hlist_node hlist;
 	char *format;
-	marker_probe_func *probe;
-	void *private;
+	void (*call)(const struct marker *mdata,	/* Probe wrapper */
+		void *call_private, const char *fmt, ...);
+	struct marker_probe_closure single;
+	struct marker_probe_closure *multi;
 	int refcount;	/* Number of times armed. 0 if disarmed. */
+	struct rcu_head rcu;
+	void *oldptr;
+	char rcu_pending:1;
+	char ptype:1;
 	char name[0];	/* Contains name'\0'format'\0' */
 };
 
@@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
 
 /**
  * __mark_empty_function - Empty probe callback
- * @mdata: pointer of type const struct marker
+ * @probe_private: probe private data
+ * @call_private: call site private data
  * @fmt: format string
  * @...: variable argument list
  *
@@ -72,12 +80,266 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
  * though the function pointer change and the marker enabling are two distinct
  * operations that modifies the execution flow of preemptible code.
  */
-void __mark_empty_function(const struct marker *mdata, void *private,
-	const char *fmt, ...)
+void __mark_empty_function(void *probe_private, void *call_private,
+	const char *fmt, va_list *args)
 {
 }
 EXPORT_SYMBOL_GPL(__mark_empty_function);
 
+/*
+ * marker_probe_cb Callback that prepares the variable argument list for probes.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @fmt: format string
+ * @...:  Variable argument list.
+ *
+ * Since we do not use "typical" pointer based RCU in the 1 argument case, we
+ * need to put a full smp_rmb() in this branch. This is why we do not use
+ * rcu_dereference() for the pointer read.
+ */
+void marker_probe_cb(const struct marker *mdata, void *call_private,
+	const char *fmt, ...)
+{
+	va_list args;
+	char ptype;
+
+	/*
+	 * disabling preemption to make sure the teardown of the callbacks can
+	 * be done correctly when they are in modules and they insure RCU read
+	 * coherency.
+	 */
+	preempt_disable();
+	ptype = ACCESS_ONCE(mdata->ptype);
+	if (likely(!ptype)) {
+		marker_probe_func *func;
+		/* Must read the ptype before ptr. They are not data dependant,
+		 * so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		func = ACCESS_ONCE(mdata->single.func);
+		/* Must read the ptr before private data. They are not data
+		 * dependant, so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		va_start(args, fmt);
+		func(mdata->single.probe_private, call_private, fmt, &args);
+		va_end(args);
+	} else {
+		struct marker_probe_closure *multi;
+		int i;
+		/*
+		 * multi points to an array, therefore accessing the array
+		 * depends on reading multi. However, even in this case,
+		 * we must insure that the pointer is read _before_ the array
+		 * data. Same as rcu_dereference, but we need a full smp_rmb()
+		 * in the fast path, so put the explicit barrier here.
+		 */
+		smp_read_barrier_depends();
+		multi = ACCESS_ONCE(mdata->multi);
+		for (i = 0; multi[i].func; i++) {
+			va_start(args, fmt);
+			multi[i].func(multi[i].probe_private, call_private, fmt,
+				&args);
+			va_end(args);
+		}
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb);
+
+/*
+ * marker_probe_cb Callback that does not prepare the variable argument list.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @fmt: format string
+ * @...:  Variable argument list.
+ *
+ * Should be connected to markers "MARK_NOARGS".
+ */
+void marker_probe_cb_noarg(const struct marker *mdata,
+	void *call_private, const char *fmt, ...)
+{
+	va_list args;	/* not initialized */
+	char ptype;
+
+	preempt_disable();
+	ptype = ACCESS_ONCE(mdata->ptype);
+	if (likely(!ptype)) {
+		marker_probe_func *func;
+		/* Must read the ptype before ptr. They are not data dependant,
+		 * so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		func = ACCESS_ONCE(mdata->single.func);
+		/* Must read the ptr before private data. They are not data
+		 * dependant, so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		func(mdata->single.probe_private, call_private, fmt, &args);
+	} else {
+		struct marker_probe_closure *multi;
+		int i;
+		/*
+		 * multi points to an array, therefore accessing the array
+		 * depends on reading multi. However, even in this case,
+		 * we must insure that the pointer is read _before_ the array
+		 * data. Same as rcu_dereference, but we need a full smp_rmb()
+		 * in the fast path, so put the explicit barrier here.
+		 */
+		smp_read_barrier_depends();
+		multi = ACCESS_ONCE(mdata->multi);
+		for (i = 0; multi[i].func; i++)
+			multi[i].func(multi[i].probe_private, call_private, fmt,
+				&args);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
+
+static void free_old_closure(struct rcu_head *head)
+{
+	struct marker_entry *entry = container_of(head,
+		struct marker_entry, rcu);
+	kfree(entry->oldptr);
+	/* Make sure we free the data before setting the pending flag to 0 */
+	smp_wmb();
+	entry->rcu_pending = 0;
+}
+
+static void debug_print_probes(struct marker_entry *entry)
+{
+	int i;
+
+	if (!marker_debug)
+		return;
+
+	if (!entry->ptype) {
+		printk(KERN_DEBUG "Single probe : %p %p\n",
+			entry->single.func,
+			entry->single.probe_private);
+	} else {
+		for (i = 0; entry->multi[i].func; i++)
+			printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
+				entry->multi[i].func,
+				entry->multi[i].probe_private);
+	}
+}
+
+static struct marker_probe_closure *
+marker_entry_add_probe(struct marker_entry *entry,
+		marker_probe_func *probe, void *probe_private)
+{
+	int nr_probes = 0;
+	struct marker_probe_closure *old, *new;
+
+	WARN_ON(!probe);
+
+	debug_print_probes(entry);
+	old = entry->multi;
+	if (!entry->ptype) {
+		if (entry->single.func == probe &&
+				entry->single.probe_private == probe_private)
+			return ERR_PTR(-EBUSY);
+		if (entry->single.func == __mark_empty_function) {
+			/* 0 -> 1 probes */
+			entry->single.func = probe;
+			entry->single.probe_private = probe_private;
+			entry->refcount = 1;
+			entry->ptype = 0;
+			debug_print_probes(entry);
+			return NULL;
+		} else {
+			/* 1 -> 2 probes */
+			nr_probes = 1;
+			old = NULL;
+		}
+	} else {
+		/* (N -> N+1), (N != 0, 1) probes */
+		for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+			if (old[nr_probes].func == probe
+					&& old[nr_probes].probe_private
+						== probe_private)
+				return ERR_PTR(-EBUSY);
+	}
+	/* + 2 : one for new probe, one for NULL func */
+	new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
+			GFP_KERNEL);
+	if (new == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (!old)
+		new[0] = entry->single;
+	else
+		memcpy(new, old,
+			nr_probes * sizeof(struct marker_probe_closure));
+	new[nr_probes].func = probe;
+	new[nr_probes].probe_private = probe_private;
+	entry->refcount = nr_probes + 1;
+	entry->multi = new;
+	entry->ptype = 1;
+	debug_print_probes(entry);
+	return old;
+}
+
+static struct marker_probe_closure *
+marker_entry_remove_probe(struct marker_entry *entry,
+		marker_probe_func *probe, void *probe_private)
+{
+	int nr_probes = 0, nr_del = 0, i;
+	struct marker_probe_closure *old, *new;
+
+	old = entry->multi;
+
+	debug_print_probes(entry);
+	if (!entry->ptype) {
+		/* 0 -> N is an error */
+		WARN_ON(entry->single.func == __mark_empty_function);
+		/* 1 -> 0 probes */
+		WARN_ON(probe && entry->single.func != probe);
+		WARN_ON(entry->single.probe_private != probe_private);
+		entry->single.func = __mark_empty_function;
+		entry->refcount = 0;
+		entry->ptype = 0;
+		debug_print_probes(entry);
+		return NULL;
+	} else {
+		/* (N -> M), (N > 1, M >= 0) probes */
+		for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+			if ((!probe || old[nr_probes].func == probe)
+					&& old[nr_probes].probe_private
+						== probe_private)
+				nr_del++;
+		}
+	}
+
+	if (nr_probes - nr_del == 0) {
+		/* N -> 0, (N > 1) */
+		entry->single.func = __mark_empty_function;
+		entry->refcount = 0;
+		entry->ptype = 0;
+	} else if (nr_probes - nr_del == 1) {
+		/* N -> 1, (N > 1) */
+		for (i = 0; old[i].func; i++)
+			if ((probe && old[i].func != probe) ||
+					old[i].probe_private != probe_private)
+				entry->single = old[i];
+		entry->refcount = 1;
+		entry->ptype = 0;
+	} else {
+		int j = 0;
+		/* N -> M, (N > 1, M > 1) */
+		/* + 1 for NULL */
+		new = kzalloc((nr_probes - nr_del + 1)
+			* sizeof(struct marker_probe_closure), GFP_KERNEL);
+		if (new == NULL)
+			return ERR_PTR(-ENOMEM);
+		for (i = 0; old[i].func; i++)
+			if ((probe && old[i].func != probe) ||
+					old[i].probe_private != probe_private)
+				new[j++] = old[i];
+		entry->refcount = nr_probes - nr_del;
+		entry->ptype = 1;
+		entry->multi = new;
+	}
+	debug_print_probes(entry);
+	return old;
+}
+
 /*
  * Get marker if the marker is present in the marker hash table.
  * Must be called with markers_mutex held.
@@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name)
  * Add the marker to the marker hash table. Must be called with markers_mutex
  * held.
  */
-static int add_marker(const char *name, const char *format,
-	marker_probe_func *probe, void *private)
+static struct marker_entry *add_marker(const char *name, const char *format)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
@@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format,
 	hlist_for_each_entry(e, node, head, hlist) {
 		if (!strcmp(name, e->name)) {
 			printk(KERN_NOTICE
-				"Marker %s busy, probe %p already installed\n",
-				name, e->probe);
-			return -EBUSY;	/* Already there */
+				"Marker %s busy\n", name);
+			return ERR_PTR(-EBUSY);	/* Already there */
 		}
 	}
 	/*
@@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format,
 	e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
 			GFP_KERNEL);
 	if (!e)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	memcpy(&e->name[0], name, name_len);
 	if (format) {
 		e->format = &e->name[name_len];
 		memcpy(e->format, format, format_len);
+		if (strcmp(e->format, MARK_NOARGS) == 0)
+			e->call = marker_probe_cb_noarg;
+		else
+			e->call = marker_probe_cb;
 		trace_mark(core_marker_format, "name %s format %s",
 				e->name, e->format);
-	} else
+	} else {
 		e->format = NULL;
-	e->probe = probe;
-	e->private = private;
+		e->call = marker_probe_cb;
+	}
+	e->single.func = __mark_empty_function;
+	e->single.probe_private = NULL;
+	e->multi = NULL;
+	e->ptype = 0;
 	e->refcount = 0;
+	e->rcu_pending = 0;
 	hlist_add_head(&e->hlist, head);
-	return 0;
+	return e;
 }
 
 /*
  * Remove the marker from the marker hash table. Must be called with mutex_lock
  * held.
  */
-static void *remove_marker(const char *name)
+static int remove_marker(const char *name)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
 	struct marker_entry *e;
 	int found = 0;
 	size_t len = strlen(name) + 1;
-	void *private = NULL;
 	u32 hash = jhash(name, len-1, 0);
 
 	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
@@ -167,12 +435,16 @@ static void *remove_marker(const char *name)
 			break;
 		}
 	}
-	if (found) {
-		private = e->private;
-		hlist_del(&e->hlist);
-		kfree(e);
-	}
-	return private;
+	if (!found)
+		return -ENOENT;
+	if (e->single.func != __mark_empty_function)
+		return -EBUSY;
+	hlist_del(&e->hlist);
+	/* Make sure the call_rcu has been executed */
+	if (e->rcu_pending)
+		rcu_barrier();
+	kfree(e);
+	return 0;
 }
 
 /*
@@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
 	size_t name_len = strlen((*entry)->name) + 1;
 	size_t format_len = strlen(format) + 1;
 
+
 	e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
 			GFP_KERNEL);
 	if (!e)
@@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
 	memcpy(&e->name[0], (*entry)->name, name_len);
 	e->format = &e->name[name_len];
 	memcpy(e->format, format, format_len);
-	e->probe = (*entry)->probe;
-	e->private = (*entry)->private;
+	if (strcmp(e->format, MARK_NOARGS) == 0)
+		e->call = marker_probe_cb_noarg;
+	else
+		e->call = marker_probe_cb;
+	e->single = (*entry)->single;
+	e->multi = (*entry)->multi;
+	e->ptype = (*entry)->ptype;
 	e->refcount = (*entry)->refcount;
+	e->rcu_pending = 0;
 	hlist_add_before(&e->hlist, &(*entry)->hlist);
 	hlist_del(&(*entry)->hlist);
+	/* Make sure the call_rcu has been executed */
+	if ((*entry)->rcu_pending)
+		rcu_barrier();
 	kfree(*entry);
 	*entry = e;
 	trace_mark(core_marker_format, "name %s format %s",
@@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
 /*
  * Sets the probe callback corresponding to one marker.
  */
-static int set_marker(struct marker_entry **entry, struct marker *elem)
+static int set_marker(struct marker_entry **entry, struct marker *elem,
+		int active)
 {
 	int ret;
 	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
@@ -226,9 +509,43 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
 		if (ret)
 			return ret;
 	}
-	elem->call = (*entry)->probe;
-	elem->private = (*entry)->private;
-	elem->state = 1;
+
+	/*
+	 * probe_cb setup (statically known) is done here. It is
+	 * asynchronous with the rest of execution, therefore we only
+	 * pass from a "safe" callback (with argument) to an "unsafe"
+	 * callback (does not set arguments).
+	 */
+	elem->call = (*entry)->call;
+	/*
+	 * Sanity check :
+	 * We only update the single probe private data when the ptr is
+	 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
+	 */
+	WARN_ON(elem->single.func != __mark_empty_function
+		&& elem->single.probe_private
+		!= (*entry)->single.probe_private &&
+		!elem->ptype);
+	elem->single.probe_private = (*entry)->single.probe_private;
+	/*
+	 * Make sure the private data is valid when we update the
+	 * single probe ptr.
+	 */
+	smp_wmb();
+	elem->single.func = (*entry)->single.func;
+	/*
+	 * We also make sure that the new probe callbacks array is consistent
+	 * before setting a pointer to it.
+	 */
+	rcu_assign_pointer(elem->multi, (*entry)->multi);
+	/*
+	 * Update the function or multi probe array pointer before setting the
+	 * ptype.
+	 */
+	smp_wmb();
+	elem->ptype = (*entry)->ptype;
+	elem->state = active;
+
 	return 0;
 }
 
@@ -240,8 +557,12 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
  */
 static void disable_marker(struct marker *elem)
 {
+	/* leave "call" as is. It is known statically. */
 	elem->state = 0;
-	elem->call = __mark_empty_function;
+	elem->single.func = __mark_empty_function;
+	/* Update the function before setting the ptype */
+	smp_wmb();
+	elem->ptype = 0;	/* single probe */
 	/*
 	 * Leave the private data and id there, because removal is racy and
 	 * should be done only after a synchronize_sched(). These are never used
@@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem)
  * marker_update_probe_range - Update a probe range
  * @begin: beginning of the range
  * @end: end of the range
- * @probe_module: module address of the probe being updated
- * @refcount: number of references left to the given probe_module (out)
  *
  * Updates the probe callback corresponding to a range of markers.
  */
 void marker_update_probe_range(struct marker *begin,
-	struct marker *end, struct module *probe_module,
-	int *refcount)
+	struct marker *end)
 {
 	struct marker *iter;
 	struct marker_entry *mark_entry;
@@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin,
 	mutex_lock(&markers_mutex);
 	for (iter = begin; iter < end; iter++) {
 		mark_entry = get_marker(iter->name);
-		if (mark_entry && mark_entry->refcount) {
-			set_marker(&mark_entry, iter);
+		if (mark_entry) {
+			set_marker(&mark_entry, iter,
+					!!mark_entry->refcount);
 			/*
 			 * ignore error, continue
 			 */
-			if (probe_module)
-				if (probe_module ==
-			__module_text_address((unsigned long)mark_entry->probe))
-					(*refcount)++;
 		} else {
 			disable_marker(iter);
 		}
@@ -289,20 +604,27 @@ void marker_update_probe_range(struct marker *begin,
  * Issues a synchronize_sched() when no reference to the module passed
  * as parameter is found in the probes so the probe module can be
  * safely unloaded from now on.
+ *
+ * Internal callback only changed before the first probe is connected to it.
+ * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
+ * transitions.  All other transitions will leave the old private data valid.
+ * This makes the non-atomicity of the callback/private data updates valid.
+ *
+ * "special case" updates :
+ * 0 -> 1 callback
+ * 1 -> 0 callback
+ * 1 -> 2 callbacks
+ * 2 -> 1 callbacks
+ * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
+ * Site effect : marker_set_format may delete the marker entry (creating a
+ * replacement).
  */
-static void marker_update_probes(struct module *probe_module)
+static void marker_update_probes(void)
 {
-	int refcount = 0;
-
 	/* Core kernel markers */
-	marker_update_probe_range(__start___markers,
-			__stop___markers, probe_module, &refcount);
+	marker_update_probe_range(__start___markers, __stop___markers);
 	/* Markers in modules. */
-	module_update_markers(probe_module, &refcount);
-	if (probe_module && refcount == 0) {
-		synchronize_sched();
-		deferred_sync = 0;
-	}
+	module_update_markers();
 }
 
 /**
@@ -310,33 +632,49 @@ static void marker_update_probes(struct module *probe_module)
  * @name: marker name
  * @format: format string
  * @probe: probe handler
- * @private: probe private data
+ * @probe_private: probe private data
  *
  * private data must be a valid allocated memory address, or NULL.
  * Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
  */
 int marker_probe_register(const char *name, const char *format,
-			marker_probe_func *probe, void *private)
+			marker_probe_func *probe, void *probe_private)
 {
 	struct marker_entry *entry;
 	int ret = 0;
+	struct marker_probe_closure *old;
 
 	mutex_lock(&markers_mutex);
 	entry = get_marker(name);
-	if (entry && entry->refcount) {
-		ret = -EBUSY;
-		goto end;
-	}
-	if (deferred_sync) {
-		synchronize_sched();
-		deferred_sync = 0;
+	if (!entry) {
+		entry = add_marker(name, format);
+		if (IS_ERR(entry)) {
+			ret = PTR_ERR(entry);
+			goto end;
+		}
 	}
-	ret = add_marker(name, format, probe, private);
-	if (ret)
+	/*
+	 * If we detect that a call_rcu is pending for this marker,
+	 * make sure it's executed now.
+	 */
+	if (entry->rcu_pending)
+		rcu_barrier();
+	old = marker_entry_add_probe(entry, probe, probe_private);
+	if (IS_ERR(old)) {
+		ret = PTR_ERR(old);
 		goto end;
+	}
 	mutex_unlock(&markers_mutex);
-	marker_update_probes(NULL);
-	return ret;
+	marker_update_probes();		/* may update entry */
+	mutex_lock(&markers_mutex);
+	entry = get_marker(name);
+	WARN_ON(!entry);
+	entry->oldptr = old;
+	entry->rcu_pending = 1;
+	/* write rcu_pending before calling the RCU callback */
+	smp_wmb();
+	call_rcu(&entry->rcu, free_old_closure);
 end:
 	mutex_unlock(&markers_mutex);
 	return ret;
@@ -346,171 +684,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register);
 /**
  * marker_probe_unregister -  Disconnect a probe from a marker
  * @name: marker name
+ * @probe: probe function pointer
+ * @probe_private: probe private data
  *
  * Returns the private data given to marker_probe_register, or an ERR_PTR().
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which insures that every preempt disabled section
+ * have finished.
  */
-void *marker_probe_unregister(const char *name)
+int marker_probe_unregister(const char *name,
+	marker_probe_func *probe, void *probe_private)
 {
-	struct module *probe_module;
 	struct marker_entry *entry;
-	void *private;
+	struct marker_probe_closure *old;
+	int ret = 0;
 
 	mutex_lock(&markers_mutex);
 	entry = get_marker(name);
 	if (!entry) {
-		private = ERR_PTR(-ENOENT);
+		ret = -ENOENT;
 		goto end;
 	}
-	entry->refcount = 0;
-	/* In what module is the probe handler ? */
-	probe_module = __module_text_address((unsigned long)entry->probe);
-	private = remove_marker(name);
-	deferred_sync = 1;
+	if (entry->rcu_pending)
+		rcu_barrier();
+	old = marker_entry_remove_probe(entry, probe, probe_private);
 	mutex_unlock(&markers_mutex);
-	marker_update_probes(probe_module);
-	return private;
+	marker_update_probes();		/* may update entry */
+	mutex_lock(&markers_mutex);
+	entry = get_marker(name);
+	entry->oldptr = old;
+	entry->rcu_pending = 1;
+	/* write rcu_pending before calling the RCU callback */
+	smp_wmb();
+	call_rcu(&entry->rcu, free_old_closure);
+	remove_marker(name);	/* Ignore busy error message */
 end:
 	mutex_unlock(&markers_mutex);
-	return private;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(marker_probe_unregister);
 
-/**
- * marker_probe_unregister_private_data -  Disconnect a probe from a marker
- * @private: probe private data
- *
- * Unregister a marker by providing the registered private data.
- * Returns the private data given to marker_probe_register, or an ERR_PTR().
- */
-void *marker_probe_unregister_private_data(void *private)
+static struct marker_entry *
+get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
 {
-	struct module *probe_module;
-	struct hlist_head *head;
-	struct hlist_node *node;
 	struct marker_entry *entry;
-	int found = 0;
 	unsigned int i;
+	struct hlist_head *head;
+	struct hlist_node *node;
 
-	mutex_lock(&markers_mutex);
 	for (i = 0; i < MARKER_TABLE_SIZE; i++) {
 		head = &marker_table[i];
 		hlist_for_each_entry(entry, node, head, hlist) {
-			if (entry->private == private) {
-				found = 1;
-				goto iter_end;
+			if (!entry->ptype) {
+				if (entry->single.func == probe
+						&& entry->single.probe_private
+						== probe_private)
+					return entry;
+			} else {
+				struct marker_probe_closure *closure;
+				closure = entry->multi;
+				for (i = 0; closure[i].func; i++) {
+					if (closure[i].func == probe &&
+							closure[i].probe_private
+							== probe_private)
+						return entry;
+				}
 			}
 		}
 	}
-iter_end:
-	if (!found) {
-		private = ERR_PTR(-ENOENT);
-		goto end;
-	}
-	entry->refcount = 0;
-	/* In what module is the probe handler ? */
-	probe_module = __module_text_address((unsigned long)entry->probe);
-	private = remove_marker(entry->name);
-	deferred_sync = 1;
-	mutex_unlock(&markers_mutex);
-	marker_update_probes(probe_module);
-	return private;
-end:
-	mutex_unlock(&markers_mutex);
-	return private;
+	return NULL;
 }
-EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
 
 /**
- * marker_arm - Arm a marker
- * @name: marker name
+ * marker_probe_unregister_private_data -  Disconnect a probe from a marker
+ * @probe: probe function
+ * @probe_private: probe private data
  *
- * Activate a marker. It keeps a reference count of the number of
- * arming/disarming done.
- * Returns 0 if ok, error value on error.
+ * Unregister a probe by providing the registered private data.
+ * Only removes the first marker found in hash table.
+ * Return 0 on success or error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which insures that every preempt disabled section
+ * have finished.
  */
-int marker_arm(const char *name)
+int marker_probe_unregister_private_data(marker_probe_func *probe,
+		void *probe_private)
 {
 	struct marker_entry *entry;
 	int ret = 0;
+	struct marker_probe_closure *old;
 
 	mutex_lock(&markers_mutex);
-	entry = get_marker(name);
+	entry = get_marker_from_private_data(probe, probe_private);
 	if (!entry) {
 		ret = -ENOENT;
 		goto end;
 	}
-	/*
-	 * Only need to update probes when refcount passes from 0 to 1.
-	 */
-	if (entry->refcount++)
-		goto end;
-end:
+	if (entry->rcu_pending)
+		rcu_barrier();
+	old = marker_entry_remove_probe(entry, NULL, probe_private);
 	mutex_unlock(&markers_mutex);
-	marker_update_probes(NULL);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(marker_arm);
-
-/**
- * marker_disarm - Disarm a marker
- * @name: marker name
- *
- * Disarm a marker. It keeps a reference count of the number of arming/disarming
- * done.
- * Returns 0 if ok, error value on error.
- */
-int marker_disarm(const char *name)
-{
-	struct marker_entry *entry;
-	int ret = 0;
-
+	marker_update_probes();		/* may update entry */
 	mutex_lock(&markers_mutex);
-	entry = get_marker(name);
-	if (!entry) {
-		ret = -ENOENT;
-		goto end;
-	}
-	/*
-	 * Only permit decrement refcount if higher than 0.
-	 * Do probe update only on 1 -> 0 transition.
-	 */
-	if (entry->refcount) {
-		if (--entry->refcount)
-			goto end;
-	} else {
-		ret = -EPERM;
-		goto end;
-	}
+	entry = get_marker_from_private_data(probe, probe_private);
+	WARN_ON(!entry);
+	entry->oldptr = old;
+	entry->rcu_pending = 1;
+	/* write rcu_pending before calling the RCU callback */
+	smp_wmb();
+	call_rcu(&entry->rcu, free_old_closure);
+	remove_marker(entry->name);	/* Ignore busy error message */
 end:
 	mutex_unlock(&markers_mutex);
-	marker_update_probes(NULL);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(marker_disarm);
+EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
 
 /**
  * marker_get_private_data - Get a marker's probe private data
  * @name: marker name
+ * @probe: probe to match
+ * @num: get the nth matching probe's private data
  *
+ * Returns the nth private data pointer (starting from 0) matching, or an
+ * ERR_PTR.
  * Returns the private data pointer, or an ERR_PTR.
  * The private data pointer should _only_ be dereferenced if the caller is the
  * owner of the data, or its content could vanish. This is mostly used to
  * confirm that a caller is the owner of a registered probe.
  */
-void *marker_get_private_data(const char *name)
+void *marker_get_private_data(const char *name, marker_probe_func *probe,
+		int num)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
 	struct marker_entry *e;
 	size_t name_len = strlen(name) + 1;
 	u32 hash = jhash(name, name_len-1, 0);
-	int found = 0;
+	int i;
 
 	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
 	hlist_for_each_entry(e, node, head, hlist) {
 		if (!strcmp(name, e->name)) {
-			found = 1;
-			return e->private;
+			if (!e->ptype) {
+				if (num == 0 && e->single.func == probe)
+					return e->single.probe_private;
+				else
+					break;
+			} else {
+				struct marker_probe_closure *closure;
+				int match = 0;
+				closure = e->multi;
+				for (i = 0; closure[i].func; i++) {
+					if (closure[i].func != probe)
+						continue;
+					if (match++ == num)
+						return closure[i].probe_private;
+				}
+			}
 		}
 	}
 	return ERR_PTR(-ENOENT);
diff --git a/kernel/module.c b/kernel/module.c
index 4202da97a1da..92595bad3812 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2038,7 +2038,7 @@ static struct module *load_module(void __user *umod,
 #ifdef CONFIG_MARKERS
 	if (!mod->taints)
 		marker_update_probe_range(mod->markers,
-			mod->markers + mod->num_markers, NULL, NULL);
+			mod->markers + mod->num_markers);
 #endif
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
@@ -2564,7 +2564,7 @@ EXPORT_SYMBOL(struct_module);
 #endif
 
 #ifdef CONFIG_MARKERS
-void module_update_markers(struct module *probe_module, int *refcount)
+void module_update_markers(void)
 {
 	struct module *mod;
 
@@ -2572,8 +2572,7 @@ void module_update_markers(struct module *probe_module, int *refcount)
 	list_for_each_entry(mod, &modules, list)
 		if (!mod->taints)
 			marker_update_probe_range(mod->markers,
-				mod->markers + mod->num_markers,
-				probe_module, refcount);
+				mod->markers + mod->num_markers);
 	mutex_unlock(&module_mutex);
 }
 #endif
diff --git a/samples/markers/probe-example.c b/samples/markers/probe-example.c
index a36797535615..c8e099d4d1fd 100644
--- a/samples/markers/probe-example.c
+++ b/samples/markers/probe-example.c
@@ -20,31 +20,27 @@ struct probe_data {
 	marker_probe_func *probe_func;
 };
 
-void probe_subsystem_event(const struct marker *mdata, void *private,
-	const char *format, ...)
+void probe_subsystem_event(void *probe_data, void *call_data,
+	const char *format, va_list *args)
 {
-	va_list ap;
 	/* Declare args */
 	unsigned int value;
 	const char *mystr;
 
 	/* Assign args */
-	va_start(ap, format);
-	value = va_arg(ap, typeof(value));
-	mystr = va_arg(ap, typeof(mystr));
+	value = va_arg(*args, typeof(value));
+	mystr = va_arg(*args, typeof(mystr));
 
 	/* Call printk */
-	printk(KERN_DEBUG "Value %u, string %s\n", value, mystr);
+	printk(KERN_INFO "Value %u, string %s\n", value, mystr);
 
 	/* or count, check rights, serialize data in a buffer */
-
-	va_end(ap);
 }
 
 atomic_t eventb_count = ATOMIC_INIT(0);
 
-void probe_subsystem_eventb(const struct marker *mdata, void *private,
-	const char *format, ...)
+void probe_subsystem_eventb(void *probe_data, void *call_data,
+	const char *format, va_list *args)
 {
 	/* Increment counter */
 	atomic_inc(&eventb_count);
@@ -72,10 +68,6 @@ static int __init probe_init(void)
 		if (result)
 			printk(KERN_INFO "Unable to register probe %s\n",
 				probe_array[i].name);
-		result = marker_arm(probe_array[i].name);
-		if (result)
-			printk(KERN_INFO "Unable to arm probe %s\n",
-				probe_array[i].name);
 	}
 	return 0;
 }
@@ -85,7 +77,8 @@ static void __exit probe_fini(void)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(probe_array); i++)
-		marker_probe_unregister(probe_array[i].name);
+		marker_probe_unregister(probe_array[i].name,
+			probe_array[i].probe_func, &probe_array[i]);
 	printk(KERN_INFO "Number of event b : %u\n",
 			atomic_read(&eventb_count));
 }

From b2e3e658b344c6bcfb8fb694100ab2f2b5b2edb0 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Wed, 13 Feb 2008 15:03:39 -0800
Subject: [PATCH 66/66] Linux Kernel Markers: create modpost file

This adds some new magic in the MODPOST phase for CONFIG_MARKERS.  Analogous
to the Module.symvers file, the build will now write a Module.markers file
when CONFIG_MARKERS=y is set.  This file lists the name, defining module, and
format string of each marker, separated by \t characters.  This simple text
file can be used by offline build procedures for instrumentation code,
analogous to how System.map and Module.symvers can be useful to have for
kernels other than the one you are running right now.

The strings are made easy to extract by having the __trace_mark macro define
the name and format together in a single array called __mstrtab_* in the
__markers_strings section.  This is straightforward and reliable as long as
the marker structs are always defined by this macro.  It is an unreasonable
amount of hairy work to extract the string pointers from the __markers section
structs, which entails handling a relocation type for every machine under the
sun.

Mathieu :
- Ran through checkpatch.pl

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: David Smith <dsmith@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/marker.h   |   9 +--
 scripts/Makefile.modpost |  11 +++
 scripts/mod/modpost.c    | 164 ++++++++++++++++++++++++++++++++++++++-
 scripts/mod/modpost.h    |   3 +
 4 files changed, 180 insertions(+), 7 deletions(-)

diff --git a/include/linux/marker.h b/include/linux/marker.h
index b5f95637f289..5df879dc3776 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -61,15 +61,12 @@ struct marker {
  */
 #define __trace_mark(name, call_private, format, args...)		\
 	do {								\
-		static const char __mstrtab_name_##name[]		\
+		static const char __mstrtab_##name[]			\
 		__attribute__((section("__markers_strings")))		\
-		= #name;						\
-		static const char __mstrtab_format_##name[]		\
-		__attribute__((section("__markers_strings")))		\
-		= format;						\
+		= #name "\0" format;					\
 		static struct marker __mark_##name			\
 		__attribute__((section("__markers"), aligned(8))) =	\
-		{ __mstrtab_name_##name, __mstrtab_format_##name,	\
+		{ __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],	\
 		0, 0, marker_probe_cb,					\
 		{ __mark_empty_function, NULL}, NULL };			\
 		__mark_check_format(format, ## args);			\
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index 65e707e1ffc3..cfc004e04417 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -13,6 +13,7 @@
 # 2) modpost is then used to
 # 3)  create one <module>.mod.c file pr. module
 # 4)  create one Module.symvers file with CRC for all exported symbols
+# 4a) [CONFIG_MARKERS] create one Module.markers file listing defined markers
 # 5) compile all <module>.mod.c files
 # 6) final link of the module to a <module.ko> file
 
@@ -45,6 +46,10 @@ include scripts/Makefile.lib
 
 kernelsymfile := $(objtree)/Module.symvers
 modulesymfile := $(firstword $(KBUILD_EXTMOD))/Module.symvers
+kernelmarkersfile := $(objtree)/Module.markers
+modulemarkersfile := $(firstword $(KBUILD_EXTMOD))/Module.markers
+
+markersfile = $(if $(KBUILD_EXTMOD),$(modulemarkersfile),$(kernelmarkersfile))
 
 # Step 1), find all modules listed in $(MODVERDIR)/
 __modules := $(sort $(shell grep -h '\.ko' /dev/null $(wildcard $(MODVERDIR)/*.mod)))
@@ -63,6 +68,8 @@ modpost = scripts/mod/modpost                    \
  $(if $(KBUILD_EXTMOD),-I $(modulesymfile))      \
  $(if $(KBUILD_EXTMOD),-o $(modulesymfile))      \
  $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S)      \
+ $(if $(CONFIG_MARKERS),-K $(kernelmarkersfile)) \
+ $(if $(CONFIG_MARKERS),-M $(markersfile))	 \
  $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w)
 
 quiet_cmd_modpost = MODPOST $(words $(filter-out vmlinux FORCE, $^)) modules
@@ -82,6 +89,10 @@ vmlinux.o: FORCE
 $(symverfile):         __modpost ;
 $(modules:.ko=.mod.c): __modpost ;
 
+ifdef CONFIG_MARKERS
+$(markersfile):	       __modpost ;
+endif
+
 
 # Step 5), compile all *.mod.c files
 
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index dbe1fb5e8cc0..61742771c65d 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -11,6 +11,8 @@
  * Usage: modpost vmlinux module1.o module2.o ...
  */
 
+#define _GNU_SOURCE
+#include <stdio.h>
 #include <ctype.h>
 #include "modpost.h"
 #include "../../include/linux/license.h"
@@ -435,6 +437,8 @@ static int parse_elf(struct elf_info *info, const char *filename)
 			info->export_unused_gpl_sec = i;
 		else if (strcmp(secname, "__ksymtab_gpl_future") == 0)
 			info->export_gpl_future_sec = i;
+		else if (strcmp(secname, "__markers_strings") == 0)
+			info->markers_strings_sec = i;
 
 		if (sechdrs[i].sh_type != SHT_SYMTAB)
 			continue;
@@ -1470,6 +1474,62 @@ static void check_sec_ref(struct module *mod, const char *modname,
 	}
 }
 
+static void get_markers(struct elf_info *info, struct module *mod)
+{
+	const Elf_Shdr *sh = &info->sechdrs[info->markers_strings_sec];
+	const char *strings = (const char *) info->hdr + sh->sh_offset;
+	const Elf_Sym *sym, *first_sym, *last_sym;
+	size_t n;
+
+	if (!info->markers_strings_sec)
+		return;
+
+	/*
+	 * First count the strings.  We look for all the symbols defined
+	 * in the __markers_strings section named __mstrtab_*.  For
+	 * these local names, the compiler puts a random .NNN suffix on,
+	 * so the names don't correspond exactly.
+	 */
+	first_sym = last_sym = NULL;
+	n = 0;
+	for (sym = info->symtab_start; sym < info->symtab_stop; sym++)
+		if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT &&
+		    sym->st_shndx == info->markers_strings_sec &&
+		    !strncmp(info->strtab + sym->st_name,
+			     "__mstrtab_", sizeof "__mstrtab_" - 1)) {
+			if (first_sym == NULL)
+				first_sym = sym;
+			last_sym = sym;
+			++n;
+		}
+
+	if (n == 0)
+		return;
+
+	/*
+	 * Now collect each name and format into a line for the output.
+	 * Lines look like:
+	 *	marker_name	vmlinux	marker %s format %d
+	 * The format string after the second \t can use whitespace.
+	 */
+	mod->markers = NOFAIL(malloc(sizeof mod->markers[0] * n));
+	mod->nmarkers = n;
+
+	n = 0;
+	for (sym = first_sym; sym <= last_sym; sym++)
+		if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT &&
+		    sym->st_shndx == info->markers_strings_sec &&
+		    !strncmp(info->strtab + sym->st_name,
+			     "__mstrtab_", sizeof "__mstrtab_" - 1)) {
+			const char *name = strings + sym->st_value;
+			const char *fmt = strchr(name, '\0') + 1;
+			char *line = NULL;
+			asprintf(&line, "%s\t%s\t%s\n", name, mod->name, fmt);
+			NOFAIL(line);
+			mod->markers[n++] = line;
+		}
+}
+
 static void read_symbols(char *modname)
 {
 	const char *symname;
@@ -1521,6 +1581,8 @@ static void read_symbols(char *modname)
 		get_src_version(modname, mod->srcversion,
 				sizeof(mod->srcversion)-1);
 
+	get_markers(&info, mod);
+
 	parse_elf_finish(&info);
 
 	/* Our trick to get versioning for struct_module - it's
@@ -1867,16 +1929,104 @@ static void write_dump(const char *fname)
 	write_if_changed(&buf, fname);
 }
 
+static void add_marker(struct module *mod, const char *name, const char *fmt)
+{
+	char *line = NULL;
+	asprintf(&line, "%s\t%s\t%s\n", name, mod->name, fmt);
+	NOFAIL(line);
+
+	mod->markers = NOFAIL(realloc(mod->markers, ((mod->nmarkers + 1) *
+						     sizeof mod->markers[0])));
+	mod->markers[mod->nmarkers++] = line;
+}
+
+static void read_markers(const char *fname)
+{
+	unsigned long size, pos = 0;
+	void *file = grab_file(fname, &size);
+	char *line;
+
+	if (!file)		/* No old markers, silently ignore */
+		return;
+
+	while ((line = get_next_line(&pos, file, size))) {
+		char *marker, *modname, *fmt;
+		struct module *mod;
+
+		marker = line;
+		modname = strchr(marker, '\t');
+		if (!modname)
+			goto fail;
+		*modname++ = '\0';
+		fmt = strchr(modname, '\t');
+		if (!fmt)
+			goto fail;
+		*fmt++ = '\0';
+		if (*marker == '\0' || *modname == '\0')
+			goto fail;
+
+		mod = find_module(modname);
+		if (!mod) {
+			if (is_vmlinux(modname))
+				have_vmlinux = 1;
+			mod = new_module(NOFAIL(strdup(modname)));
+			mod->skip = 1;
+		}
+
+		add_marker(mod, marker, fmt);
+	}
+	return;
+fail:
+	fatal("parse error in markers list file\n");
+}
+
+static int compare_strings(const void *a, const void *b)
+{
+	return strcmp(*(const char **) a, *(const char **) b);
+}
+
+static void write_markers(const char *fname)
+{
+	struct buffer buf = { };
+	struct module *mod;
+	size_t i;
+
+	for (mod = modules; mod; mod = mod->next)
+		if ((!external_module || !mod->skip) && mod->markers != NULL) {
+			/*
+			 * Sort the strings so we can skip duplicates when
+			 * we write them out.
+			 */
+			qsort(mod->markers, mod->nmarkers,
+			      sizeof mod->markers[0], &compare_strings);
+			for (i = 0; i < mod->nmarkers; ++i) {
+				char *line = mod->markers[i];
+				buf_write(&buf, line, strlen(line));
+				while (i + 1 < mod->nmarkers &&
+				       !strcmp(mod->markers[i],
+					       mod->markers[i + 1]))
+					free(mod->markers[i++]);
+				free(mod->markers[i]);
+			}
+			free(mod->markers);
+			mod->markers = NULL;
+		}
+
+	write_if_changed(&buf, fname);
+}
+
 int main(int argc, char **argv)
 {
 	struct module *mod;
 	struct buffer buf = { };
 	char *kernel_read = NULL, *module_read = NULL;
 	char *dump_write = NULL;
+	char *markers_read = NULL;
+	char *markers_write = NULL;
 	int opt;
 	int err;
 
-	while ((opt = getopt(argc, argv, "i:I:msSo:aw")) != -1) {
+	while ((opt = getopt(argc, argv, "i:I:msSo:awM:K:")) != -1) {
 		switch (opt) {
 		case 'i':
 			kernel_read = optarg;
@@ -1903,6 +2053,12 @@ int main(int argc, char **argv)
 		case 'w':
 			warn_unresolved = 1;
 			break;
+			case 'M':
+				markers_write = optarg;
+				break;
+			case 'K':
+				markers_read = optarg;
+				break;
 		default:
 			exit(1);
 		}
@@ -1950,5 +2106,11 @@ int main(int argc, char **argv)
 		     "'make CONFIG_DEBUG_SECTION_MISMATCH=y'\n",
 		     sec_mismatch_count);
 
+	if (markers_read)
+		read_markers(markers_read);
+
+	if (markers_write)
+		write_markers(markers_write);
+
 	return err;
 }
diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h
index 999f15e0e008..565c5872407e 100644
--- a/scripts/mod/modpost.h
+++ b/scripts/mod/modpost.h
@@ -112,6 +112,8 @@ struct module {
 	int has_init;
 	int has_cleanup;
 	struct buffer dev_table_buf;
+	char **markers;
+	size_t nmarkers;
 	char	     srcversion[25];
 };
 
@@ -126,6 +128,7 @@ struct elf_info {
 	Elf_Section  export_gpl_sec;
 	Elf_Section  export_unused_gpl_sec;
 	Elf_Section  export_gpl_future_sec;
+	Elf_Section  markers_strings_sec;
 	const char   *strtab;
 	char	     *modinfo;
 	unsigned int modinfo_len;