From 0bf26b2ce294b7ab7b7b196572de8a4504b6084f Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Wed, 29 Dec 2010 11:52:53 +0100 Subject: [PATCH] --- yaml --- r: 223695 b: refs/heads/master c: 42ce7fd6319bed8ecb26d656c476365da46b29e9 h: refs/heads/master i: 223693: ebfa0e85968bbdbb6b333b1c572e677a573871ac 223691: c091cd5b7ee4e1d5cfa476a34574ea8bfd393fe9 223687: f31eddd2140d1a7a6225dbe62ba60ba32eaee61f 223679: 0f2052efeb2d56a46868cfb455a99b0ab6909806 v: v3 --- [refs] | 2 +- trunk/Documentation/filesystems/Locking | 214 +++++++++++++----------- trunk/drivers/spi/omap2_mcspi.c | 39 +++++ trunk/kernel/user.c | 1 - trunk/mm/memcontrol.c | 19 ++- 5 files changed, 162 insertions(+), 113 deletions(-) diff --git a/[refs] b/[refs] index 7a2ed3dac7eb..02d8d5bd5810 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: ebb76ce16daf6908dc030dec1c00827d37129fe5 +refs/heads/master: 42ce7fd6319bed8ecb26d656c476365da46b29e9 diff --git a/trunk/Documentation/filesystems/Locking b/trunk/Documentation/filesystems/Locking index 7686e7684495..b6426f15b4ae 100644 --- a/trunk/Documentation/filesystems/Locking +++ b/trunk/Documentation/filesystems/Locking @@ -18,6 +18,7 @@ prototypes: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); locking rules: + none have BKL dcache_lock rename_lock ->d_lock may block d_revalidate: no no no yes d_hash no no no yes @@ -41,23 +42,18 @@ ata *); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); int (*readlink) (struct dentry *, char __user *,int); - void * (*follow_link) (struct dentry *, struct nameidata *); - void (*put_link) (struct dentry *, struct nameidata *, void *); + int (*follow_link) (struct dentry *, struct nameidata *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, struct nameidata *); - int (*check_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); - void (*truncate_range)(struct inode *, loff_t, loff_t); - long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); - int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); locking rules: - all may block + all may block, none have BKL i_mutex(inode) lookup: yes create: yes @@ -70,24 +66,19 @@ rmdir: yes (both) (see below) rename: yes (all) (see below) readlink: no follow_link: no -put_link: no truncate: yes (see below) setattr: yes permission: no -check_acl: no getattr: no setxattr: yes getxattr: no listxattr: no removexattr: yes -truncate_range: yes -fallocate: no -fiemap: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. ->truncate() is never called directly - it's a callback, not a -method. It's called by vmtruncate() - deprecated library function used by +method. It's called by vmtruncate() - library function normally used by ->setattr(). Locking information above applies to that call (i.e. is inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been passed). @@ -100,7 +91,7 @@ prototypes: struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); - int (*write_inode) (struct inode *, struct writeback_control *wbc); + int (*write_inode) (struct inode *, int); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); @@ -114,11 +105,10 @@ prototypes: int (*show_options)(struct seq_file *, struct vfsmount *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); - int (*trim_fs) (struct super_block *, struct fstrim_range *); locking rules: All may block [not true, see below] + None have BKL s_umount alloc_inode: destroy_inode: @@ -137,8 +127,6 @@ umount_begin: no show_options: no (namespace_sem) quota_read: no (see below) quota_write: no (see below) -bdev_try_to_free_page: no (see below) -trim_fs: no ->statfs() has s_umount (shared) when called by ustat(2) (native or compat), but that's an accident of bad API; s_umount is used to pin @@ -151,25 +139,19 @@ be the only ones operating on the quota file by the quota code (via dqio_sem) (unless an admin really wants to screw up something and writes to quota files with quotas on). For other details about locking see also dquot_operations section. -->bdev_try_to_free_page is called from the ->releasepage handler of -the block device inode. See there for more details. --------------------------- file_system_type --------------------------- prototypes: int (*get_sb) (struct file_system_type *, int, const char *, void *, struct vfsmount *); - struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); void (*kill_sb) (struct super_block *); locking rules: - may block -get_sb yes -mount yes -kill_sb yes + may block BKL +get_sb yes no +kill_sb yes no ->get_sb() returns error or 0 with locked superblock attached to the vfsmount (exclusive on ->s_umount). -->mount() returns ERR_PTR or the root dentry. ->kill_sb() takes a write-locked superblock, does all shutdown work on it, unlocks and drops the reference. @@ -194,35 +176,27 @@ prototypes: void (*freepage)(struct page *); int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, loff_t offset, unsigned long nr_segs); - int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, - unsigned long *); - int (*migratepage)(struct address_space *, struct page *, struct page *); - int (*launder_page)(struct page *); - int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long); - int (*error_remove_page)(struct address_space *, struct page *); + int (*launder_page) (struct page *); locking rules: All except set_page_dirty and freepage may block - PageLocked(page) i_mutex -writepage: yes, unlocks (see below) -readpage: yes, unlocks -sync_page: maybe -writepages: -set_page_dirty no -readpages: -write_begin: locks the page yes -write_end: yes, unlocks yes -bmap: -invalidatepage: yes -releasepage: yes -freepage: yes -direct_IO: -get_xip_mem: maybe -migratepage: yes (both) -launder_page: yes -is_partially_uptodate: yes -error_remove_page: yes + BKL PageLocked(page) i_mutex +writepage: no yes, unlocks (see below) +readpage: no yes, unlocks +sync_page: no maybe +writepages: no +set_page_dirty no no +readpages: no +write_begin: no locks the page yes +write_end: no yes, unlocks yes +perform_write: no n/a yes +bmap: no +invalidatepage: no yes +releasepage: no yes +freepage: no yes +direct_IO: no +launder_page: no yes ->write_begin(), ->write_end(), ->sync_page() and ->readpage() may be called from the request handler (/dev/loop). @@ -302,8 +276,9 @@ under spinlock (it cannot block) and is sometimes called with the page not locked. ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some -filesystems and by the swapper. The latter will eventually go away. Please, -keep it that way and don't breed new callers. +filesystems and by the swapper. The latter will eventually go away. All +instances do not actually need the BKL. Please, keep it that way and don't +breed new callers. ->invalidatepage() is called when the filesystem must attempt to drop some or all of the buffers from the page when it is being truncated. It @@ -324,37 +299,47 @@ cleaned, or an error value if not. Note that in order to prevent the page getting mapped back in and redirtied, it needs to be kept locked across the entire operation. + Note: currently almost all instances of address_space methods are +using BKL for internal serialization and that's one of the worst sources +of contention. Normally they are calling library functions (in fs/buffer.c) +and pass foo_get_block() as a callback (on local block-based filesystems, +indeed). BKL is not needed for library stuff and is usually taken by +foo_get_block(). It's an overkill, since block bitmaps can be protected by +internal fs locking and real critical areas are much smaller than the areas +filesystems protect now. + ----------------------- file_lock_operations ------------------------------ prototypes: + void (*fl_insert)(struct file_lock *); /* lock insertion callback */ + void (*fl_remove)(struct file_lock *); /* lock removal callback */ void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); locking rules: - file_lock_lock may block -fl_copy_lock: yes no -fl_release_private: maybe no + BKL may block +fl_insert: yes no +fl_remove: yes no +fl_copy_lock: yes no +fl_release_private: yes yes ----------------------- lock_manager_operations --------------------------- prototypes: int (*fl_compare_owner)(struct file_lock *, struct file_lock *); void (*fl_notify)(struct file_lock *); /* unblock callback */ - int (*fl_grant)(struct file_lock *, struct file_lock *, int); void (*fl_release_private)(struct file_lock *); void (*fl_break)(struct file_lock *); /* break_lease callback */ - int (*fl_mylease)(struct file_lock *, struct file_lock *); - int (*fl_change)(struct file_lock **, int); locking rules: - file_lock_lock may block -fl_compare_owner: yes no -fl_notify: yes no -fl_grant: no no -fl_release_private: maybe no -fl_break: yes no -fl_mylease: yes no -fl_change yes no - + BKL may block +fl_compare_owner: yes no +fl_notify: yes no +fl_release_private: yes yes +fl_break: yes no + + Currently only NFSD and NLM provide instances of this class. None of the +them block. If you have out-of-tree instances - please, show up. Locking +in that area will change. --------------------------- buffer_head ----------------------------------- prototypes: void (*b_end_io)(struct buffer_head *bh, int uptodate); @@ -379,17 +364,17 @@ prototypes: void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: - bd_mutex -open: yes -release: yes -ioctl: no -compat_ioctl: no -direct_access: no -media_changed: no -unlock_native_capacity: no -revalidate_disk: no -getgeo: no -swap_slot_free_notify: no (see below) + BKL bd_mutex +open: no yes +release: no yes +ioctl: no no +compat_ioctl: no no +direct_access: no no +media_changed: no no +unlock_native_capacity: no no +revalidate_disk: no no +getgeo: no no +swap_slot_free_notify: no no (see below) media_changed, unlock_native_capacity and revalidate_disk are called only from check_disk_change(). @@ -428,21 +413,34 @@ prototypes: unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); - int (*flock) (struct file *, int, struct file_lock *); - ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, - size_t, unsigned int); - ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, - size_t, unsigned int); - int (*setlease)(struct file *, long, struct file_lock **); }; locking rules: - All may block except for ->setlease. - No VFS locks held on entry except for ->fsync and ->setlease. - -->fsync() has i_mutex on inode. - -->setlease has the file_list_lock held and must not sleep. + All may block. + BKL +llseek: no (see below) +read: no +aio_read: no +write: no +aio_write: no +readdir: no +poll: no +unlocked_ioctl: no +compat_ioctl: no +mmap: no +open: no +flush: no +release: no +fsync: no (see below) +aio_fsync: no +fasync: no +lock: yes +readv: no +writev: no +sendfile: no +sendpage: no +get_unmapped_area: no +check_flags: no ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you @@ -452,10 +450,17 @@ mutex or just to use i_size_read() instead. Note: this does not protect the file->f_pos against concurrent modifications since this is something the userspace has to take care about. -->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. -Most instances call fasync_helper(), which does that maintenance, so it's -not normally something one needs to worry about. Return values > 0 will be -mapped to zero in the VFS layer. +Note: ext2_release() was *the* source of contention on fs-intensive +loads and dropping BKL on ->release() helps to get rid of that (we still +grab BKL for cases when we close a file that had been opened r/w, but that +can and should be done using the internal locking with smaller critical areas). +Current worst offender is ext2_get_block()... + +->fasync() is called without BKL protection, and is responsible for +maintaining the FASYNC bit in filp->f_flags. Most instances call +fasync_helper(), which does that maintenance, so it's not normally +something one needs to worry about. Return values > 0 will be mapped to +zero in the VFS layer. ->readdir() and ->ioctl() on directories must be changed. Ideally we would move ->readdir() to inode_operations and use a separate method for directory @@ -466,6 +471,8 @@ components. And there are other reasons why the current interface is a mess... ->read on directories probably must go away - we should just enforce -EISDIR in sys_read() and friends. +->fsync() has i_mutex on inode. + --------------------------- dquot_operations ------------------------------- prototypes: int (*write_dquot) (struct dquot *); @@ -500,12 +507,12 @@ prototypes: int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: - mmap_sem PageLocked(page) -open: yes -close: yes -fault: yes can return with page locked -page_mkwrite: yes can return with page locked -access: yes + BKL mmap_sem PageLocked(page) +open: no yes +close: no yes +fault: no yes can return with page locked +page_mkwrite: no yes can return with page locked +access: no yes ->fault() is called when a previously not present pte is about to be faulted in. The filesystem must find and return the page associated @@ -532,3 +539,6 @@ VM_IO | VM_PFNMAP VMAs. (if you break something or notice that it is broken and do not fix it yourself - at least put it here) + +ipc/shm.c::shm_delete() - may need BKL. +->read() and ->write() in many drivers are (probably) missing BKL. diff --git a/trunk/drivers/spi/omap2_mcspi.c b/trunk/drivers/spi/omap2_mcspi.c index 2a651e61bfbf..951a160fc27f 100644 --- a/trunk/drivers/spi/omap2_mcspi.c +++ b/trunk/drivers/spi/omap2_mcspi.c @@ -1305,10 +1305,49 @@ static int __exit omap2_mcspi_remove(struct platform_device *pdev) /* work with hotplug and coldplug */ MODULE_ALIAS("platform:omap2_mcspi"); +#ifdef CONFIG_SUSPEND +/* + * When SPI wake up from off-mode, CS is in activate state. If it was in + * unactive state when driver was suspend, then force it to unactive state at + * wake up. + */ +static int omap2_mcspi_resume(struct device *dev) +{ + struct spi_master *master = dev_get_drvdata(dev); + struct omap2_mcspi *mcspi = spi_master_get_devdata(master); + struct omap2_mcspi_cs *cs; + + omap2_mcspi_enable_clocks(mcspi); + list_for_each_entry(cs, &omap2_mcspi_ctx[master->bus_num - 1].cs, + node) { + if ((cs->chconf0 & OMAP2_MCSPI_CHCONF_FORCE) == 0) { + + /* + * We need to toggle CS state for OMAP take this + * change in account. + */ + MOD_REG_BIT(cs->chconf0, OMAP2_MCSPI_CHCONF_FORCE, 1); + __raw_writel(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0); + MOD_REG_BIT(cs->chconf0, OMAP2_MCSPI_CHCONF_FORCE, 0); + __raw_writel(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0); + } + } + omap2_mcspi_disable_clocks(mcspi); + return 0; +} +#else +#define omap2_mcspi_resume NULL +#endif + +static const struct dev_pm_ops omap2_mcspi_pm_ops = { + .resume = omap2_mcspi_resume, +}; + static struct platform_driver omap2_mcspi_driver = { .driver = { .name = "omap2_mcspi", .owner = THIS_MODULE, + .pm = &omap2_mcspi_pm_ops }, .remove = __exit_p(omap2_mcspi_remove), }; diff --git a/trunk/kernel/user.c b/trunk/kernel/user.c index 5c598ca781df..2c7d8d5914b1 100644 --- a/trunk/kernel/user.c +++ b/trunk/kernel/user.c @@ -158,7 +158,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - put_user_ns(ns); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); diff --git a/trunk/mm/memcontrol.c b/trunk/mm/memcontrol.c index 00bb8a64d028..7a22b4129211 100644 --- a/trunk/mm/memcontrol.c +++ b/trunk/mm/memcontrol.c @@ -1925,18 +1925,19 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, rcu_read_lock(); p = rcu_dereference(mm->owner); + VM_BUG_ON(!p); /* - * Because we don't have task_lock(), "p" can exit. - * In that case, "mem" can point to root or p can be NULL with - * race with swapoff. Then, we have small risk of mis-accouning. - * But such kind of mis-account by race always happens because - * we don't have cgroup_mutex(). It's overkill and we allo that - * small race, here. - * (*) swapoff at el will charge against mm-struct not against - * task-struct. So, mm->owner can be NULL. + * because we don't have task_lock(), "p" can exit while + * we're here. In that case, "mem" can point to root + * cgroup but never be NULL. (and task_struct itself is freed + * by RCU, cgroup itself is RCU safe.) Then, we have small + * risk here to get wrong cgroup. But such kind of mis-account + * by race always happens because we don't have cgroup_mutex(). + * It's overkill and we allow that small race, here. */ mem = mem_cgroup_from_task(p); - if (!mem || mem_cgroup_is_root(mem)) { + VM_BUG_ON(!mem); + if (mem_cgroup_is_root(mem)) { rcu_read_unlock(); goto done; }