From 77e63aad6ee1eff7f5bd7d6ea2fe3072caed07bb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Nov 2005 14:34:23 -0800 Subject: [PATCH] --- yaml --- r: 14709 b: refs/heads/master c: 6aab341e0a28aff100a09831c5300a2994b8b986 h: refs/heads/master i: 14707: f8c85894f249fc963989aada3eb6b290f96704a4 v: v3 --- [refs] | 2 +- trunk/arch/powerpc/kernel/vdso.c | 6 +- trunk/drivers/char/mem.c | 2 +- trunk/fs/cifs/CHANGES | 8 +- trunk/fs/cifs/README | 30 +---- trunk/fs/cifs/TODO | 4 +- trunk/fs/cifs/cifsfs.c | 23 +--- trunk/fs/cifs/cifssmb.c | 25 ---- trunk/fs/cifs/dir.c | 34 ++---- trunk/fs/cifs/inode.c | 50 ++------ trunk/fs/cifs/misc.c | 17 +-- trunk/fs/cifs/netmisc.c | 4 +- trunk/fs/cifs/transport.c | 1 - trunk/fs/proc/task_mmu.c | 7 +- trunk/include/linux/mm.h | 5 +- trunk/mm/fremap.c | 22 ++-- trunk/mm/madvise.c | 2 +- trunk/mm/memory.c | 189 ++++++++++++++++--------------- trunk/mm/mempolicy.c | 12 +- trunk/mm/msync.c | 12 +- trunk/mm/nommu.c | 2 +- trunk/mm/rmap.c | 14 +-- 22 files changed, 168 insertions(+), 303 deletions(-) diff --git a/[refs] b/[refs] index ade14adc7ec9..07136cafae33 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 6ab16d249513a50bef3f1b275cea6aa8d3f51832 +refs/heads/master: 6aab341e0a28aff100a09831c5300a2994b8b986 diff --git a/trunk/arch/powerpc/kernel/vdso.c b/trunk/arch/powerpc/kernel/vdso.c index b44b36e0c293..f0c47dab0903 100644 --- a/trunk/arch/powerpc/kernel/vdso.c +++ b/trunk/arch/powerpc/kernel/vdso.c @@ -145,8 +145,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma) struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); struct page *upg = (vma && vma->vm_mm) ? - follow_page(vma->vm_mm, vma->vm_start + - i*PAGE_SIZE, 0) + follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0) : NULL; dump_one_vdso_page(pg, upg); } @@ -157,8 +156,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma) struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); struct page *upg = (vma && vma->vm_mm) ? - follow_page(vma->vm_mm, vma->vm_start + - i*PAGE_SIZE, 0) + follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0) : NULL; dump_one_vdso_page(pg, upg); } diff --git a/trunk/drivers/char/mem.c b/trunk/drivers/char/mem.c index 29c3b631445a..91dd669273e0 100644 --- a/trunk/drivers/char/mem.c +++ b/trunk/drivers/char/mem.c @@ -591,7 +591,7 @@ static inline size_t read_zero_pagealigned(char __user * buf, size_t size) if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0) goto out_up; - if (vma->vm_flags & (VM_SHARED | VM_HUGETLB | VM_UNPAGED)) + if (vma->vm_flags & (VM_SHARED | VM_HUGETLB)) break; count = vma->vm_end - addr; if (count > size) diff --git a/trunk/fs/cifs/CHANGES b/trunk/fs/cifs/CHANGES index 943ef9b82244..6bded10c0d50 100644 --- a/trunk/fs/cifs/CHANGES +++ b/trunk/fs/cifs/CHANGES @@ -1,12 +1,10 @@ Version 1.39 ------------ -Defer close of a file handle slightly if pending writes depend on that handle +Defer close of a file handle slightly if pending writes depend on that file handle (this reduces the EBADF bad file handle errors that can be logged under heavy stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 -Fix SFU style symlinks and mknod needed for servers which do not support the -CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative -dentries so files that the client sees as deleted but that later get created -on the server will be recognized. Add client side permission check on setattr. +Fix SFU style symlinks and mknod needed for servers which do not support the CIFS +Unix Extensions. Fix setfacl/getfacl on bigendian. Version 1.38 ------------ diff --git a/trunk/fs/cifs/README b/trunk/fs/cifs/README index e5d09a2fc7a5..bb90941826ad 100644 --- a/trunk/fs/cifs/README +++ b/trunk/fs/cifs/README @@ -278,9 +278,7 @@ A partial list of the supported mount options follows: (such as Windows), permissions can also be checked at the client, and a crude form of client side permission checking can be enabled by specifying file_mode and dir_mode on - the client. Note that the mount.cifs helper must be - at version 1.10 or higher to support specifying the uid - (or gid) in non-numberic form. + the client gid If CIFS Unix extensions are not supported by the server this overrides the default gid for inodes. file_mode If CIFS Unix extensions are not supported by the server @@ -347,10 +345,7 @@ A partial list of the supported mount options follows: client system. It is typically only needed when the server supports the CIFS Unix Extensions but the UIDs/GIDs on the client and server system do not match closely enough to allow - access by the user doing the mount, but it may be useful with - non CIFS Unix Extension mounts for cases in which the default - mode is specified on the mount but is not to be enforced on the - client (e.g. perhaps when MultiUserMount is enabled) + access by the user doing the mount. Note that this does not affect the normal ACL check on the target machine done by the server software (of the server ACL against the user name provided at mount time). @@ -373,21 +368,15 @@ A partial list of the supported mount options follows: setuids If the CIFS Unix extensions are negotiated with the server the client will attempt to set the effective uid and gid of the local process on newly created files, directories, and - devices (create, mkdir, mknod). If the CIFS Unix Extensions - are not negotiated, for newly created files and directories - instead of using the default uid and gid specified on the - the mount, cache the new file's uid and gid locally which means - that the uid for the file can change when the inode is - reloaded (or the user remounts the share). + devices (create, mkdir, mknod). nosetuids The client will not attempt to set the uid and gid on on newly created files, directories, and devices (create, mkdir, mknod) which will result in the server setting the uid and gid to the default (usually the server uid of the user who mounted the share). Letting the server (rather than - the client) set the uid and gid is the default. If the CIFS - Unix Extensions are not negotiated then the uid and gid for - new files will appear to be the uid (gid) of the mounter or the - uid (gid) parameter specified on the mount. + the client) set the uid and gid is the default. This + parameter has no effect if the CIFS Unix Extensions are not + negotiated. netbiosname When mounting to servers via port 139, specifies the RFC1001 source name to use to represent the client netbios machine name when doing the RFC1001 netbios session initialize. @@ -429,13 +418,6 @@ A partial list of the supported mount options follows: byte range locks). remount remount the share (often used to change from ro to rw mounts or vice versa) - sfu When the CIFS Unix Extensions are not negotiated, attempt to - create device files and fifos in a format compatible with - Services for Unix (SFU). In addition retrieve bits 10-12 - of the mode via the SETFILEBITS extended attribute (as - SFU does). In the future the bottom 9 bits of the mode - mode also will be emulated using queries of the security - descriptor (ACL). The mount.cifs mount helper also accepts a few mount options before -o including: diff --git a/trunk/fs/cifs/TODO b/trunk/fs/cifs/TODO index fc34c74ec4be..c909298d11ed 100644 --- a/trunk/fs/cifs/TODO +++ b/trunk/fs/cifs/TODO @@ -1,4 +1,4 @@ -Version 1.39 November 30, 2005 +version 1.37 October 9, 2005 A Partial List of Missing Features ================================== @@ -58,7 +58,7 @@ o) Improve performance of readpages by sending more than one read at a time when 8 pages or more are requested. In conjuntion add support for async_cifs_readpages. -p) Add support for storing symlink info to Windows servers +p) Add support for storing symlink and fifo info to Windows servers in the Extended Attribute format their SFU clients would recognize. q) Finish fcntl D_NOTIFY support so kde and gnome file list windows diff --git a/trunk/fs/cifs/cifsfs.c b/trunk/fs/cifs/cifsfs.c index f4974b41e485..51548ed2e9cc 100644 --- a/trunk/fs/cifs/cifsfs.c +++ b/trunk/fs/cifs/cifsfs.c @@ -32,7 +32,6 @@ #include #include #include -#include #include "cifsfs.h" #include "cifspdu.h" #define DECLARE_GLOBALS_HERE @@ -430,11 +429,6 @@ static void cifs_umount_begin(struct super_block * sblock) { cFYI(1,("wake up tasks now - umount begin not complete")); wake_up_all(&tcon->ses->server->request_q); - wake_up_all(&tcon->ses->server->response_q); - msleep(1); /* yield */ - /* we have to kick the requests once more */ - wake_up_all(&tcon->ses->server->response_q); - msleep(1); } /* BB FIXME - finish add checks for tidStatus BB */ @@ -901,9 +895,6 @@ static int cifs_oplock_thread(void * dummyarg) static int cifs_dnotify_thread(void * dummyarg) { - struct list_head *tmp; - struct cifsSesInfo *ses; - daemonize("cifsdnotifyd"); allow_signal(SIGTERM); @@ -912,19 +903,7 @@ static int cifs_dnotify_thread(void * dummyarg) if(try_to_freeze()) continue; set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(15*HZ); - read_lock(&GlobalSMBSeslock); - /* check if any stuck requests that need - to be woken up and wakeq so the - thread can wake up and error out */ - list_for_each(tmp, &GlobalSMBSessionList) { - ses = list_entry(tmp, struct cifsSesInfo, - cifsSessionList); - if(ses && ses->server && - atomic_read(&ses->server->inSend)) - wake_up_all(&ses->server->response_q); - } - read_unlock(&GlobalSMBSeslock); + schedule_timeout(39*HZ); } while(!signal_pending(current)); complete_and_exit (&cifs_dnotify_exited, 0); } diff --git a/trunk/fs/cifs/cifssmb.c b/trunk/fs/cifs/cifssmb.c index 6867e556d37e..d179b0c3eee4 100644 --- a/trunk/fs/cifs/cifssmb.c +++ b/trunk/fs/cifs/cifssmb.c @@ -90,18 +90,6 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, check for tcp and smb session status done differently for those three - in the calling routine */ if(tcon) { - if(tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1,("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } - } if((tcon->ses) && (tcon->ses->status != CifsExiting) && (tcon->ses->server)){ struct nls_table *nls_codepage; @@ -199,19 +187,6 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, check for tcp and smb session status done differently for those three - in the calling routine */ if(tcon) { - if(tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1,("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } - } - if((tcon->ses) && (tcon->ses->status != CifsExiting) && (tcon->ses->server)){ struct nls_table *nls_codepage; diff --git a/trunk/fs/cifs/dir.c b/trunk/fs/cifs/dir.c index 32cc96cafa3e..16b21522e8fe 100644 --- a/trunk/fs/cifs/dir.c +++ b/trunk/fs/cifs/dir.c @@ -228,15 +228,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, else { rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,xid); - if(newinode) { + if(newinode) newinode->i_mode = mode; - if((oplock & CIFS_CREATE_ACTION) && - (cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_SET_UID)) { - newinode->i_uid = current->fsuid; - newinode->i_gid = current->fsgid; - } - } } if (rc != 0) { @@ -472,20 +465,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct name direntry->d_op = &cifs_dentry_ops; d_add(direntry, newInode); - /* since paths are not looked up by component - the parent - directories are presumed to be good here */ + /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); } else if (rc == -ENOENT) { rc = 0; - direntry->d_time = jiffies; - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; d_add(direntry, NULL); - /* if it was once a directory (but how can we tell?) we could do - shrink_dcache_parent(direntry); */ } else { cERROR(1,("Error 0x%x on cifs_get_inode_info in lookup of %s", rc,full_path)); @@ -504,20 +489,21 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) { int isValid = 1; +/* lock_kernel(); *//* surely we do not want to lock the kernel for a whole network round trip which could take seconds */ + if (direntry->d_inode) { if (cifs_revalidate(direntry)) { + /* unlock_kernel(); */ return 0; } } else { - cFYI(1, ("neg dentry 0x%p name = %s", - direntry, direntry->d_name.name)); - if(time_after(jiffies, direntry->d_time + HZ) || - !lookupCacheEnabled) { - d_drop(direntry); - isValid = 0; - } + cFYI(1, + ("In cifs_d_revalidate with no inode but name = %s and dentry 0x%p", + direntry->d_name.name, direntry)); } +/* unlock_kernel(); */ + return isValid; } diff --git a/trunk/fs/cifs/inode.c b/trunk/fs/cifs/inode.c index 053c1cadf703..05b525812adb 100644 --- a/trunk/fs/cifs/inode.c +++ b/trunk/fs/cifs/inode.c @@ -710,7 +710,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) char *full_path = NULL; struct inode *newinode = NULL; - cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode)); + cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p ", mode, inode)); xid = GetXid(); @@ -768,16 +768,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) /* BB to be implemented via Windows secrty descriptors eg CIFSSMBWinSetPerms(xid, pTcon, full_path, mode, -1, -1, local_nls); */ - if(direntry->d_inode) { - direntry->d_inode->i_mode = mode; - if(cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_SET_UID) { - direntry->d_inode->i_uid = - current->fsuid; - direntry->d_inode->i_gid = - current->fsgid; - } - } + } } kfree(full_path); FreeXid(xid); @@ -1048,20 +1039,14 @@ int cifs_revalidate(struct dentry *direntry) filemap_fdatawrite(direntry->d_inode->i_mapping); } if (invalidate_inode) { - /* shrink_dcache not necessary now that cifs dentry ops - are exported for negative dentries */ -/* if(S_ISDIR(direntry->d_inode->i_mode)) - shrink_dcache_parent(direntry); */ - if (S_ISREG(direntry->d_inode->i_mode)) { - if (direntry->d_inode->i_mapping) - filemap_fdatawait(direntry->d_inode->i_mapping); - /* may eventually have to do this for open files too */ - if (list_empty(&(cifsInode->openFileList))) { - /* changed on server - flush read ahead pages */ - cFYI(1, ("Invalidating read ahead data on " - "closed file")); - invalidate_remote_inode(direntry->d_inode); - } + if (direntry->d_inode->i_mapping) + filemap_fdatawait(direntry->d_inode->i_mapping); + /* may eventually have to do this for open files too */ + if (list_empty(&(cifsInode->openFileList))) { + /* Has changed on server - flush read ahead pages */ + cFYI(1, ("Invalidating read ahead data on " + "closed file")); + invalidate_remote_inode(direntry->d_inode); } } /* up(&direntry->d_inode->i_sem); */ @@ -1120,20 +1105,9 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) cFYI(1, ("In cifs_setattr, name = %s attrs->iavalid 0x%x ", direntry->d_name.name, attrs->ia_valid)); - cifs_sb = CIFS_SB(direntry->d_inode->i_sb); pTcon = cifs_sb->tcon; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM == 0) { - /* check if we have permission to change attrs */ - rc = inode_change_ok(direntry->d_inode, attrs); - if(rc < 0) { - FreeXid(xid); - return rc; - } else - rc = 0; - } - down(&direntry->d_sb->s_vfs_rename_sem); full_path = build_path_from_dentry(direntry); up(&direntry->d_sb->s_vfs_rename_sem); @@ -1173,9 +1147,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) 1 /* 45 seconds */); cFYI(1,("Wrt seteof rc %d", rc)); } - } else - rc = -EINVAL; - + } if (rc != 0) { /* Set file size by pathname rather than by handle either because no valid, writeable file handle for diff --git a/trunk/fs/cifs/misc.c b/trunk/fs/cifs/misc.c index 94baf6c8ecbd..ca27a82c54cd 100644 --- a/trunk/fs/cifs/misc.c +++ b/trunk/fs/cifs/misc.c @@ -397,12 +397,12 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid) if(smb->Command == SMB_COM_LOCKING_ANDX) return 0; else - cERROR(1, ("Rcvd Request not response")); + cERROR(1, ("Rcvd Request not response ")); } } else { /* bad signature or mid */ if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) cERROR(1, - ("Bad protocol string signature header %x", + ("Bad protocol string signature header %x ", *(unsigned int *) smb->Protocol)); if (mid != smb->Mid) cERROR(1, ("Mids do not match")); @@ -417,7 +417,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length) __u32 len = smb->smb_buf_length; __u32 clc_len; /* calculated length */ cFYI(0, - ("Entering checkSMB with Length: %x, smb_buf_length: %x", + ("Entering checkSMB with Length: %x, smb_buf_length: %x ", length, len)); if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) || (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)) { @@ -451,16 +451,9 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length) cERROR(1, ("bad smb size detected for Mid=%d", smb->Mid)); /* Windows XP can return a few bytes too much, presumably an illegal pad, at the end of byte range lock responses - so we allow for that three byte pad, as long as actual + so we allow for up to eight byte pad, as long as actual received length is as long or longer than calculated length */ - /* We have now had to extend this more, since there is a - case in which it needs to be bigger still to handle a - malformed response to transact2 findfirst from WinXP when - access denied is returned and thus bcc and wct are zero - but server says length is 0x21 bytes too long as if the server - forget to reset the smb rfc1001 length when it reset the - wct and bcc to minimum size and drop the t2 parms and data */ - if((4+len > clc_len) && (len <= clc_len + 512)) + if((4+len > clc_len) && (len <= clc_len + 3)) return 0; else return 1; diff --git a/trunk/fs/cifs/netmisc.c b/trunk/fs/cifs/netmisc.c index 5de74d216fdd..f7814689844b 100644 --- a/trunk/fs/cifs/netmisc.c +++ b/trunk/fs/cifs/netmisc.c @@ -330,7 +330,7 @@ static const struct { ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, { ERRSRV, 2241, NT_STATUS_INVALID_LOGON_HOURS}, { ERRSRV, 2240, NT_STATUS_INVALID_WORKSTATION}, { - ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_EXPIRED}, { + ERRSRV, 2242, NT_STATUS_PASSWORD_EXPIRED}, { ERRSRV, 2239, NT_STATUS_ACCOUNT_DISABLED}, { ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, { @@ -676,7 +676,7 @@ static const struct { ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, { ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, { - ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_MUST_CHANGE}, { + ERRSRV, 2242, NT_STATUS_PASSWORD_MUST_CHANGE}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, { ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, { diff --git a/trunk/fs/cifs/transport.c b/trunk/fs/cifs/transport.c index f8871196098c..41a9659c16bc 100644 --- a/trunk/fs/cifs/transport.c +++ b/trunk/fs/cifs/transport.c @@ -515,7 +515,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, *pbytes_returned = in_buf->smb_buf_length; /* BB special case reconnect tid and uid here? */ - /* BB special case Errbadpassword and pwdexpired here */ rc = map_smb_to_linux_error(in_buf); /* convert ByteCount if necessary */ diff --git a/trunk/fs/proc/task_mmu.c b/trunk/fs/proc/task_mmu.c index 9ab97cef0daa..50bd5a8f0446 100644 --- a/trunk/fs/proc/task_mmu.c +++ b/trunk/fs/proc/task_mmu.c @@ -402,12 +402,11 @@ struct numa_maps { /* * Calculate numa node maps for a vma */ -static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) +static struct numa_maps *get_numa_maps(struct vm_area_struct *vma) { + int i; struct page *page; unsigned long vaddr; - struct mm_struct *mm = vma->vm_mm; - int i; struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL); if (!md) @@ -420,7 +419,7 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) md->node[i] =0; for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { - page = follow_page(mm, vaddr, 0); + page = follow_page(vma, vaddr, 0); if (page) { int count = page_mapcount(page); diff --git a/trunk/include/linux/mm.h b/trunk/include/linux/mm.h index f0cdfd18db55..6a75a7a78bf1 100644 --- a/trunk/include/linux/mm.h +++ b/trunk/include/linux/mm.h @@ -145,7 +145,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_GROWSUP 0x00000200 #define VM_SHM 0x00000000 /* Means nothing: delete it later */ -#define VM_UNPAGED 0x00000400 /* Pages managed without map count */ +#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_EXECUTABLE 0x00001000 @@ -664,6 +664,7 @@ struct zap_details { unsigned long truncate_count; /* Compare vm_truncate_count */ }; +struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t); unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); unsigned long unmap_vmas(struct mmu_gather **tlb, @@ -953,7 +954,7 @@ unsigned long vmalloc_to_pfn(void *addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); -struct page *follow_page(struct mm_struct *, unsigned long address, +struct page *follow_page(struct vm_area_struct *, unsigned long address, unsigned int foll_flags); #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ diff --git a/trunk/mm/fremap.c b/trunk/mm/fremap.c index 007cbad9331e..f851775e09c2 100644 --- a/trunk/mm/fremap.c +++ b/trunk/mm/fremap.c @@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; if (pte_present(pte)) { - unsigned long pfn = pte_pfn(pte); - flush_cache_page(vma, addr, pfn); + flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out; + page = vm_normal_page(vma, addr, pte); + if (page) { + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); } - page = pfn_to_page(pfn); - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); - page_cache_release(page); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(mm, addr, ptep); } -out: return !!page; } @@ -65,8 +61,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_UNPAGED); - pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) @@ -122,8 +116,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_UNPAGED); - pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) diff --git a/trunk/mm/madvise.c b/trunk/mm/madvise.c index 328a3bcce527..2b7cf0400a21 100644 --- a/trunk/mm/madvise.c +++ b/trunk/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c index d1f46f4e4c8a..b57fbc636058 100644 --- a/trunk/mm/memory.c +++ b/trunk/mm/memory.c @@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) } /* - * This function is called to print an error when a pte in a - * !VM_UNPAGED region is found pointing to an invalid pfn (which - * is an error. + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. * * The calling function must still handle the error. */ @@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) } /* - * page_is_anon applies strict checks for an anonymous page belonging to - * this vma at this address. It is used on VM_UNPAGED vmas, which are - * usually populated with shared originals (which must not be counted), - * but occasionally contain private COWed copies (when !VM_SHARED, or - * perhaps via ptrace when VM_SHARED). An mmap of /dev/mem might window - * free pages, pages from other processes, or from other parts of this: - * it's tricky, but try not to be deceived by foreign anonymous pages. + * This function gets the "struct page" associated with a pte. + * + * NOTE! Some mappings do not have "struct pages". A raw PFN mapping + * will have each page table entry just pointing to a raw page frame + * number, and as far as the VM layer is concerned, those do not have + * pages associated with them - even if the PFN might point to memory + * that otherwise is perfectly fine and has a "struct page". + * + * The way we recognize those mappings is through the rules set up + * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, + * and the vm_pgoff will point to the first PFN mapped: thus every + * page that is a raw mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * and if that isn't true, the page has been COW'ed (in which case it + * _does_ have a "struct page" associated with it even if it is in a + * VM_PFNMAP range). */ -static inline int page_is_anon(struct page *page, - struct vm_area_struct *vma, unsigned long addr) +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - return page && PageAnon(page) && page_mapped(page) && - page_address_in_vma(page, vma) == addr; + unsigned long pfn = pte_pfn(pte); + + if (vma->vm_flags & VM_PFNMAP) { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + } + + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + * + * Remove this test eventually! + */ + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page + * tables. + * + * The PAGE_ZERO() pages and various VDSO mappings can + * cause them to exist. + */ + return pfn_to_page(pfn); } /* @@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; - unsigned long pfn; /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { @@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_set_pte; } - pfn = pte_pfn(pte); - page = pfn_valid(pfn)? pfn_to_page(pfn): NULL; - - if (unlikely(vm_flags & VM_UNPAGED)) - if (!page_is_anon(page, vma, addr)) - goto out_set_pte; - - /* - * If the pte points outside of valid memory but - * the region is not VM_UNPAGED, we have a problem. - */ - if (unlikely(!page)) { - print_bad_pte(vma, pte, addr); - goto out_set_pte; /* try to do something sane */ - } - /* * If it's a COW mapping, write protect it both * in the parent and the child @@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); - get_page(page); - page_dup_rmap(page); - rss[!!PageAnon(page)]++; + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page); + rss[!!PageAnon(page)]++; + } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); @@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) { + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) { if (!vma->anon_vma) return 0; } @@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, } if (pte_present(ptent)) { struct page *page; - unsigned long pfn; (*zap_work) -= PAGE_SIZE; - pfn = pte_pfn(ptent); - page = pfn_valid(pfn)? pfn_to_page(pfn): NULL; - - if (unlikely(vma->vm_flags & VM_UNPAGED)) { - if (!page_is_anon(page, vma, addr)) - page = NULL; - } else if (unlikely(!page)) - print_bad_pte(vma, ptent, addr); - + page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, /* * Do a quick page-table lookup for a single page. */ -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pgd_t *pgd; @@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; - unsigned long pfn; struct page *page; + struct mm_struct *mm = vma->vm_mm; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { @@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, goto unlock; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - pfn = pte_pfn(pte); - if (!pfn_valid(pfn)) + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) goto unlock; - page = pfn_to_page(pfn); if (flags & FOLL_GET) get_page(page); if (flags & FOLL_TOUCH) { @@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? : -EFAULT; } if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + struct page *page = vm_normal_page(vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); } pte_unmap(pte); if (vmas) @@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags |= FOLL_WRITE; cond_resched(); - while (!(page = follow_page(mm, start, foll_flags))) { + while (!(page = follow_page(vma, start, foll_flags))) { int ret; ret = __handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); @@ -1214,11 +1230,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * in 2.6 the LRU scan won't even find its pages, so this * flag means no more than count its pages in reserved_vm, * and omit it from core dump, even when VM_IO turned off. - * VM_UNPAGED tells the core MM not to "manage" these pages - * (e.g. refcount, mapcount, try to swap them out): in - * particular, zap_pte_range does not try to free them. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. */ - vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_pgoff = pfn; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -1273,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE); + if (left) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + return; + + } + copy_user_highpage(dst, src, va); +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -1296,28 +1333,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) { struct page *old_page, *src_page, *new_page; - unsigned long pfn = pte_pfn(orig_pte); pte_t entry; int ret = VM_FAULT_MINOR; - if (unlikely(!pfn_valid(pfn))) { - /* - * Page table corrupted: show pte and kill process. - * Or it's an attempt to COW an out-of-map VM_UNPAGED - * entry, which copy_user_highpage does not support. - */ - print_bad_pte(vma, orig_pte, address); - ret = VM_FAULT_OOM; - goto unlock; - } - old_page = pfn_to_page(pfn); + old_page = vm_normal_page(vma, address, orig_pte); src_page = old_page; - - if (unlikely(vma->vm_flags & VM_UNPAGED)) - if (!page_is_anon(old_page, vma, address)) { - old_page = NULL; - goto gotten; - } + if (!old_page) + goto gotten; if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); @@ -1351,7 +1373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto oom; - copy_user_highpage(new_page, src_page, address); + cow_user_page(new_page, src_page, address); } /* @@ -1812,16 +1834,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; - /* - * A VM_UNPAGED vma will normally be filled with present ptes - * by remap_pfn_range, and never arrive here; but it might have - * holes, or if !VM_DONTEXPAND, mremap might have expanded it. - * It's weird enough handling anon pages in unpaged vmas, we do - * not want to worry about ZERO_PAGEs too (it may or may not - * matter if their counts wrap): just give them anon pages. - */ - - if (write_access || (vma->vm_flags & VM_UNPAGED)) { + if (write_access) { /* Allocate our own private page. */ pte_unmap(page_table); @@ -1896,8 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, int anon = 0; pte_unmap(page_table); - BUG_ON(vma->vm_flags & VM_UNPAGED); - if (vma->vm_file) { mapping = vma->vm_file->f_mapping; sequence = mapping->truncate_count; @@ -1930,7 +1941,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; - copy_user_highpage(page, new_page, address); + cow_user_page(page, new_page, address); page_cache_release(new_page); new_page = page; anon = 1; diff --git a/trunk/mm/mempolicy.c b/trunk/mm/mempolicy.c index 5609a31bdf22..bec88c81244e 100644 --- a/trunk/mm/mempolicy.c +++ b/trunk/mm/mempolicy.c @@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; + struct page *page; unsigned int nid; if (!pte_present(*pte)) continue; - pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - nid = pfn_to_nid(pfn); + nid = page_to_nid(page); if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); - if (first->vm_flags & VM_UNPAGED) - return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) diff --git a/trunk/mm/msync.c b/trunk/mm/msync.c index b3f4caf3010b..1b5b6f662dcf 100644 --- a/trunk/mm/msync.c +++ b/trunk/mm/msync.c @@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; struct page *page; if (progress >= 64) { @@ -40,13 +39,9 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; if (!pte_maybe_dirty(*pte)) continue; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - page = pfn_to_page(pfn); - if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) set_page_dirty(page); @@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma, /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). - * Can't do anything with VM_UNPAGED regions either. */ - if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED)) + if (vma->vm_flags & VM_HUGETLB) return; BUG_ON(addr >= end); diff --git a/trunk/mm/nommu.c b/trunk/mm/nommu.c index 6deb6ab3d6ad..c1196812876b 100644 --- a/trunk/mm/nommu.c +++ b/trunk/mm/nommu.c @@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { return NULL; diff --git a/trunk/mm/rmap.c b/trunk/mm/rmap.c index 2e034a0b89ab..6389cda02a20 100644 --- a/trunk/mm/rmap.c +++ b/trunk/mm/rmap.c @@ -226,8 +226,6 @@ vma_address(struct page *page, struct vm_area_struct *vma) /* * At what user virtual address is page expected in vma? checking that the * page matches the vma: currently only used on anon pages, by unuse_vma; - * and by extraordinary checks on anon pages in VM_UNPAGED vmas, taking - * care that an mmap of /dev/mem might window free and foreign pages. */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { @@ -614,7 +612,6 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; - unsigned long pfn; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -643,15 +640,8 @@ static void try_to_unmap_cluster(unsigned long cursor, for (; address < end; pte++, address += PAGE_SIZE) { if (!pte_present(*pte)) continue; - - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, address); - continue; - } - - page = pfn_to_page(pfn); - BUG_ON(PageAnon(page)); + page = vm_normal_page(vma, address, *pte); + BUG_ON(!page || PageAnon(page)); if (ptep_clear_flush_young(vma, address, pte)) continue;