diff --git a/[refs] b/[refs]
index a2b4ef65ce1f..f6c477dc5e46 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: a487b6705a811087c182c8cab7e3b5845dfa6ccb
+refs/heads/master: e08b061ec0fca1f63bb1006bf1edc0556f36d0ae
diff --git a/trunk/Documentation/auxdisplay/cfag12864b-example.c b/trunk/Documentation/auxdisplay/cfag12864b-example.c
index e7823ffb1ca0..1d2c010bae12 100644
--- a/trunk/Documentation/auxdisplay/cfag12864b-example.c
+++ b/trunk/Documentation/auxdisplay/cfag12864b-example.c
@@ -194,6 +194,7 @@ static void cfag12864b_blit(void)
  */

 #include 
+#include 

 #define EXAMPLES 6
diff --git a/trunk/Documentation/cgroups/cgroups.txt b/trunk/Documentation/cgroups/cgroups.txt
index 455d4e6d346d..6eb1a97e88ce 100644
--- a/trunk/Documentation/cgroups/cgroups.txt
+++ b/trunk/Documentation/cgroups/cgroups.txt
@@ -408,26 +408,6 @@ You can attach the current shell task by echoing 0:
 # echo 0 > tasks

-2.3 Mounting hierarchies by name
---------------------------------
-
-Passing the name= option when mounting a cgroups hierarchy
-associates the given name with the hierarchy. This can be used when
-mounting a pre-existing hierarchy, in order to refer to it by name
-rather than by its set of active subsystems. Each hierarchy is either
-nameless, or has a unique name.
-
-The name should match [\w.-]+
-
-When passing a name= option for a new hierarchy, you need to
-specify subsystems manually; the legacy behaviour of mounting all
-subsystems when none are explicitly specified is not supported when
-you give a subsystem a name.
-
-The name of the subsystem appears as part of the hierarchy description
-in /proc/mounts and /proc//cgroups.
-
-
 3. Kernel API
 =============
@@ -521,7 +501,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
 called multiple times against a cgroup.

 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	       struct task_struct *task, bool threadgroup)
+	       struct task_struct *task)
 (cgroup_mutex held by caller)

 Called prior to moving a task into a cgroup; if the subsystem
@@ -529,20 +509,14 @@ returns an error, this will abort the attach operation. If a NULL
 task is passed, then a successful result indicates that *any*
 unspecified task can be moved into the cgroup. Note that this isn't
 called on a fork. If this method returns 0 (success) then this should
-remain valid while the caller holds cgroup_mutex. If threadgroup is
-true, then a successful result indicates that all threads in the given
-thread's threadgroup can be moved together.
+remain valid while the caller holds cgroup_mutex.

 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	    struct cgroup *old_cgrp, struct task_struct *task,
-	    bool threadgroup)
+	    struct cgroup *old_cgrp, struct task_struct *task)
 (cgroup_mutex held by caller)

 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
-If threadgroup is true, the subsystem should take care of all threads
-in the specified thread's threadgroup. Currently does not support any
-subsystem that might need the old_cgrp for every thread in the group.
 void fork(struct cgroup_subsy *ss, struct task_struct *task)
diff --git a/trunk/Documentation/cgroups/memory.txt b/trunk/Documentation/cgroups/memory.txt
index b871f2552b45..23d1262c0775 100644
--- a/trunk/Documentation/cgroups/memory.txt
+++ b/trunk/Documentation/cgroups/memory.txt
@@ -179,9 +179,6 @@ The reclaim algorithm has not been modified for cgroups, except that
 pages that are selected for reclaiming come from the per cgroup LRU
 list.

-NOTE: Reclaim does not work for the root cgroup, since we cannot set any
-limits on the root cgroup.
-
 2. Locking

 The memory controller uses the following hierarchy
@@ -213,7 +210,6 @@ We can alter the memory limit:
 NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in
 kilo, mega or gigabytes.
 NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
-NOTE: We cannot set limits on the root cgroup any more.

 # cat /cgroups/0/memory.limit_in_bytes
 4194304
@@ -379,42 +375,7 @@ cgroups created below it.

 NOTE2: This feature can be enabled/disabled per subtree.

-7. Soft limits
-
-Soft limits allow for greater sharing of memory. The idea behind soft limits
-is to allow control groups to use as much of the memory as needed, provided
-
-a. There is no memory contention
-b. They do not exceed their hard limit
-
-When the system detects memory contention or low memory control groups
-are pushed back to their soft limits. If the soft limit of each control
-group is very high, they are pushed back as much as possible to make
-sure that one control group does not starve the others of memory.
-
-Please note that soft limits is a best effort feature, it comes with
-no guarantees, but it does its best to make sure that when memory is
-heavily contended for, memory is allocated based on the soft limit
-hints/setup. Currently soft limit based reclaim is setup such that
-it gets invoked from balance_pgdat (kswapd).
-
-7.1 Interface
-
-Soft limits can be setup by using the following commands (in this example we
-assume a soft limit of 256 megabytes)
-
-# echo 256M > memory.soft_limit_in_bytes
-
-If we want to change this to 1G, we can at any time use
-
-# echo 1G > memory.soft_limit_in_bytes
-
-NOTE1: Soft limits take effect over a long period of time, since they involve
-       reclaiming memory for balancing between memory cgroups
-NOTE2: It is recommended to set the soft limit always below the hard limit,
-       otherwise the hard limit will take precedence.
-
-8. TODO
+7. TODO

 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/trunk/Documentation/crypto/async-tx-api.txt b/trunk/Documentation/crypto/async-tx-api.txt
index ba046b8fa92f..9f59fcbf5d82 100644
--- a/trunk/Documentation/crypto/async-tx-api.txt
+++ b/trunk/Documentation/crypto/async-tx-api.txt
@@ -54,23 +54,20 @@ features surfaced as a result:

 3.1 General format of the API:
 struct dma_async_tx_descriptor *
-async_(, struct async_submit_ctl *submit)
+async_(,
+       enum async_tx_flags flags,
+       struct dma_async_tx_descriptor *dependency,
+       dma_async_tx_callback callback_routine,
+       void *callback_parameter);

 3.2 Supported operations:
-memcpy  - memory copy between a source and a destination buffer
-memset  - fill a destination buffer with a byte value
-xor     - xor a series of source buffers and write the result to a
-          destination buffer
-xor_val - xor a series of source buffers and set a flag if the
-          result is zero. The implementation attempts to prevent
-          writes to memory
-pq      - generate the p+q (raid6 syndrome) from a series of source buffers
-pq_val  - validate that a p and or q buffer are in sync with a given series of
-          sources
-datap   - (raid6_datap_recov) recover a raid6 data block and the p block
-          from the given sources
-2data   - (raid6_2data_recov) recover 2 raid6 data blocks from the given
-          sources
+memcpy       - memory copy between a source and a destination buffer
+memset       - fill a destination buffer with a byte value
+xor          - xor a series of source buffers and write the result to a
+               destination buffer
+xor_zero_sum - xor a series of source buffers and set a flag if the
+               result is zero. The implementation attempts to prevent
+               writes to memory

 3.3 Descriptor management:
 The return value is non-NULL and points to a 'descriptor' when the operation
@@ -83,8 +80,8 @@ acknowledged by the application before the offload engine driver is allowed to
 recycle (or free) the descriptor. A descriptor can be acked by one of the
 following methods:
 1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted
-2/ submitting an unacknowledged descriptor as a dependency to another
-   async_tx call will implicitly set the acknowledged state.
+2/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent
+   descriptor of a new operation.
 3/ calling async_tx_ack() on the descriptor.

 3.4 When does the operation execute?
@@ -122,42 +119,30 @@ of an operation.
 Perform a xor->copy->xor operation where each operation depends on the
 result from the previous operation:

-void callback(void *param)
+void complete_xor_copy_xor(void *param)
 {
-	struct completion *cmp = param;
-
-	complete(cmp);
+	printk("complete\n");
 }

-void run_xor_copy_xor(struct page **xor_srcs,
-		      int xor_src_cnt,
-		      struct page *xor_dest,
-		      size_t xor_len,
-		      struct page *copy_src,
-		      struct page *copy_dest,
-		      size_t copy_len)
+int run_xor_copy_xor(struct page **xor_srcs,
+		     int xor_src_cnt,
+		     struct page *xor_dest,
+		     size_t xor_len,
+		     struct page *copy_src,
+		     struct page *copy_dest,
+		     size_t copy_len)
 {
 	struct dma_async_tx_descriptor *tx;
-	addr_conv_t addr_conv[xor_src_cnt];
-	struct async_submit_ctl submit;
-	addr_conv_t addr_conv[NDISKS];
-	struct completion cmp;
-
-	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL,
-			  addr_conv);
-	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit)
-	submit->depend_tx = tx;
-	tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, &submit);
-
-	init_completion(&cmp);
-	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK, tx,
-			  callback, &cmp, addr_conv);
-	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);
+	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
+		       ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL);
+	tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len,
+			  ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
+		       ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK,
+		       tx, complete_xor_copy_xor, NULL);

 	async_tx_issue_pending_all();
-
-	wait_for_completion(&cmp);
 }

 See include/linux/async_tx.h for more information on the flags.
 See the
diff --git a/trunk/Documentation/filesystems/sharedsubtree.txt b/trunk/Documentation/filesystems/sharedsubtree.txt
index 23a181074f94..736540045dc7 100644
--- a/trunk/Documentation/filesystems/sharedsubtree.txt
+++ b/trunk/Documentation/filesystems/sharedsubtree.txt
@@ -4,7 +4,7 @@ Shared Subtrees
 Contents:
 	1) Overview
 	2) Features
-	3) Setting mount states
+	3) smount command
 	4) Use-case
 	5) Detailed semantics
 	6) Quiz
@@ -41,14 +41,14 @@ replicas continue to be exactly same.

 	Here is an example:

-	Let's say /mnt has a mount that is shared.
+	Lets say /mnt has a mount that is shared.
 	mount --make-shared /mnt

-	Note: mount(8) command now supports the --make-shared flag,
-	so the sample 'smount' program is no longer needed and has been
-	removed.
+	note: mount command does not yet support the --make-shared flag.
+	I have included a small C program which does the same by executing
+	'smount /mnt shared'

-	# mount --bind /mnt /tmp
+	#mount --bind /mnt /tmp

 	The above command replicates the mount at /mnt to the mountpoint
 	/tmp and the contents of both the mounts remain identical.
@@ -58,8 +58,8 @@ replicas continue to be exactly same.
 	#ls /tmp
 	a b c

-	Now let's say we mount a device at /tmp/a
-	# mount /dev/sd0 /tmp/a
+	Now lets say we mount a device at /tmp/a
+	#mount /dev/sd0 /tmp/a

 	#ls /tmp/a
 	t1 t2 t2
@@ -80,20 +80,21 @@ replicas continue to be exactly same.

 	Here is an example:

-	Let's say /mnt has a mount which is shared.
-	# mount --make-shared /mnt
+	Lets say /mnt has a mount which is shared.
+	#mount --make-shared /mnt

-	Let's bind mount /mnt to /tmp
-	# mount --bind /mnt /tmp
+	Lets bind mount /mnt to /tmp
+	#mount --bind /mnt /tmp

 	the new mount at /tmp becomes a shared mount and it is a replica of
 	the mount at /mnt.

-	Now let's make the mount at /tmp; a slave of /mnt
-	# mount --make-slave /tmp
+	Now lets make the mount at /tmp; a slave of /mnt
+	#mount --make-slave /tmp
+	[or smount /tmp slave]

-	let's mount /dev/sd0 on /mnt/a
-	# mount /dev/sd0 /mnt/a
+	lets mount /dev/sd0 on /mnt/a
+	#mount /dev/sd0 /mnt/a

 	#ls /mnt/a
 	t1 t2 t3
@@ -103,9 +104,9 @@ replicas continue to be exactly same.

 	Note the mount event has propagated to the mount at /tmp

-	However let's see what happens if we mount something on the mount at /tmp
+	However lets see what happens if we mount something on the mount at /tmp

-	# mount /dev/sd1 /tmp/b
+	#mount /dev/sd1 /tmp/b

 	#ls /tmp/b
 	s1 s2 s3
@@ -123,11 +124,12 @@ replicas continue to be exactly same.

 2d) A unbindable mount is a unbindable private mount

-	let's say we have a mount at /mnt and we make is unbindable
+	lets say we have a mount at /mnt and we make is unbindable

-	# mount --make-unbindable /mnt
+	#mount --make-unbindable /mnt
+	[ smount /mnt unbindable ]

-	Let's try to bind mount this mount somewhere else.
+	Lets try to bind mount this mount somewhere else.
 	# mount --bind /mnt /tmp
 	mount: wrong fs type, bad option, bad superblock on /mnt,
 	       or too many mounted file systems
@@ -135,15 +137,149 @@ replicas continue to be exactly same.

 	Binding a unbindable mount is a invalid operation.

-3) Setting mount states
+3) smount command

-	The mount command (util-linux package) can be used to set mount
-	states:
+	Currently the mount command is not aware of shared subtree features.
+	Work is in progress to add the support in mount ( util-linux package ).
+	Till then use the following program.
-	mount --make-shared mountpoint
-	mount --make-slave mountpoint
-	mount --make-private mountpoint
-	mount --make-unbindable mountpoint
+	------------------------------------------------------------------------
+	//
+	//this code was developed my Miklos Szeredi
+	//and modified by Ram Pai
+	// sample usage:
+	// smount /tmp shared
+	//
+	#include 
+	#include 
+	#include 
+	#include 
+	#include 
+	#include 
+
+	#ifndef MS_REC
+	#define MS_REC		0x4000	/* 16384: Recursive loopback */
+	#endif
+
+	#ifndef MS_SHARED
+	#define MS_SHARED	1<<20	/* Shared */
+	#endif
+
+	#ifndef MS_PRIVATE
+	#define MS_PRIVATE	1<<18	/* Private */
+	#endif
+
+	#ifndef MS_SLAVE
+	#define MS_SLAVE	1<<19	/* Slave */
+	#endif
+
+	#ifndef MS_UNBINDABLE
+	#define MS_UNBINDABLE	1<<17	/* Unbindable */
+	#endif
+
+	int main(int argc, char *argv[])
+	{
+		int type;
+		if(argc != 3) {
+			fprintf(stderr, "usage: %s dir "
+				"\n" , argv[0]);
+			return 1;
+		}
+
+		fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]);
+
+		if (strcmp(argv[2],"rshared")==0)
+			type=(MS_SHARED|MS_REC);
+		else if (strcmp(argv[2],"rslave")==0)
+			type=(MS_SLAVE|MS_REC);
+		else if (strcmp(argv[2],"rprivate")==0)
+			type=(MS_PRIVATE|MS_REC);
+		else if (strcmp(argv[2],"runbindable")==0)
+			type=(MS_UNBINDABLE|MS_REC);
+		else if (strcmp(argv[2],"shared")==0)
+			type=MS_SHARED;
+		else if (strcmp(argv[2],"slave")==0)
+			type=MS_SLAVE;
+		else if (strcmp(argv[2],"private")==0)
+			type=MS_PRIVATE;
+		else if (strcmp(argv[2],"unbindable")==0)
+			type=MS_UNBINDABLE;
+		else {
+			fprintf(stderr, "invalid operation: %s\n", argv[2]);
+			return 1;
+		}
+		setfsuid(getuid());
+
+		if(mount("", argv[1], "dontcare", type, "") == -1) {
+			perror("mount");
+			return 1;
+		}
+		return 0;
+	}
+	-----------------------------------------------------------------------
+
+	Copy the above code snippet into smount.c
+	gcc -o smount smount.c
+
+
+	(i) To mark all the mounts under /mnt as shared execute the following
+	command:
+
+		smount /mnt rshared
+		the corresponding syntax planned for mount command is
+		mount --make-rshared /mnt
+
+	    just to mark a mount /mnt as shared, execute the following
+	    command:
+		smount /mnt shared
+		the corresponding syntax planned for mount command is
+		mount --make-shared /mnt
+
+	(ii) To mark all the shared mounts under /mnt as slave execute the
+	following
+
+	     command:
+		smount /mnt rslave
+		the corresponding syntax planned for mount command is
+		mount --make-rslave /mnt
+
+	    just to mark a mount /mnt as slave, execute the following
+	    command:
+		smount /mnt slave
+		the corresponding syntax planned for mount command is
+		mount --make-slave /mnt
+
+	(iii) To mark all the mounts under /mnt as private execute the
+	following command:
+
+		smount /mnt rprivate
+		the corresponding syntax planned for mount command is
+		mount --make-rprivate /mnt
+
+	    just to mark a mount /mnt as private, execute the following
+	    command:
+		smount /mnt private
+		the corresponding syntax planned for mount command is
+		mount --make-private /mnt
+
+	NOTE: by default all the mounts are created as private. But if
+	you want to change some shared/slave/unbindable mount as
+	private at a later point in time, this command can help.
+
+	(iv) To mark all the mounts under /mnt as unbindable execute the
+	following
+
+	     command:
+		smount /mnt runbindable
+		the corresponding syntax planned for mount command is
+		mount --make-runbindable /mnt
+
+	    just to mark a mount /mnt as unbindable, execute the following
+	    command:
+		smount /mnt unbindable
+		the corresponding syntax planned for mount command is
+		mount --make-unbindable /mnt

 4) Use cases
@@ -214,7 +350,7 @@ replicas continue to be exactly same.
 	mount --rbind / /view/v3
 	mount --rbind / /view/v4

-	and if /usr has a versioning filesystem mounted, then that
+	and if /usr has a versioning filesystem mounted, than that
 	mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and
 	/view/v4/usr too
@@ -254,7 +390,7 @@ replicas continue to be exactly same.
 	For example:
 		mount --make-shared /mnt
-		mount --bind /mnt /tmp
+		mount --bin /mnt /tmp

 	The mount at /mnt and that at /tmp are both shared and belong
 	to the same peer group. Anything mounted or unmounted under
@@ -422,7 +558,7 @@ replicas continue to be exactly same.
 	then the subtree under the unbindable mount is pruned in the new
 	location.

-	eg: let's say we have the following mount tree.
+	eg: lets say we have the following mount tree.

 		A
 	       / \
@@ -430,7 +566,7 @@ replicas continue to be exactly same.
 	      / \ / \
 	      D E F G

-	Let's say all the mount except the mount C in the tree are
+	Lets say all the mount except the mount C in the tree are
 	of a type other than unbindable.

 	If this tree is rbound to say Z
@@ -547,13 +683,13 @@ replicas continue to be exactly same.
 	'b' on mounts that receive propagation from mount 'B' and does not have
 	sub-mounts within them are unmounted.

-	Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to
+	Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to
 	each other.

-	let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
+	lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
 	'B1', 'B2' and 'B3' respectively.

-	let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
+	lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
 	mount 'B1', 'B2' and 'B3' respectively.

 	if 'C1' is unmounted, all the mounts that are most-recently-mounted on
@@ -574,7 +710,7 @@ replicas continue to be exactly same.
 	A cloned namespace contains all the mounts as that of the parent
 	namespace.

-	Let's say 'A' and 'B' are the corresponding mounts in the parent and the
+	Lets say 'A' and 'B' are the corresponding mounts in the parent and the
 	child namespace.

 	If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to
@@ -623,11 +759,11 @@ replicas continue to be exactly same.
 	mount --make-slave /mnt

 	At this point we have the first mount at /tmp and
-	its root dentry is 1. Let's call this mount 'A'
+	its root dentry is 1. Lets call this mount 'A'
 	And then we have a second mount at /tmp1 with root
-	dentry 2. Let's call this mount 'B'
+	dentry 2. Lets call this mount 'B'
 	Next we have a third mount at /mnt with root dentry
-	mnt. Let's call this mount 'C'
+	mnt. Lets call this mount 'C'

 	'B' is the slave of 'A' and 'C' is a slave of 'B'
 	A -> B -> C
@@ -658,7 +794,7 @@ replicas continue to be exactly same.

 	Q3 Why is unbindable mount needed?

-	Let's say we want to replicate the mount tree at multiple
+	Lets say we want to replicate the mount tree at multiple
 	locations within the same subtree.

 	if one rbind mounts a tree within the same subtree 'n' times
@@ -667,7 +803,7 @@ replicas continue to be exactly same.
 	mounts. Here is a example.
 	step 1:
-	let's say the root tree has just two directories with
+	lets say the root tree has just two directories with
 	one vfsmount.
 		root
 	       /    \
 	      tmp    usr
@@ -739,7 +875,7 @@ replicas continue to be exactly same.
 	Unclonable mounts come in handy here.

 	step 1:
-	let's say the root tree has just two directories with
+	lets say the root tree has just two directories with
 	one vfsmount.
 		root
 	       /    \
 	      tmp    usr
diff --git a/trunk/Documentation/filesystems/vfs.txt b/trunk/Documentation/filesystems/vfs.txt
index 623f094c9d8d..f49eecf2e573 100644
--- a/trunk/Documentation/filesystems/vfs.txt
+++ b/trunk/Documentation/filesystems/vfs.txt
@@ -536,7 +536,6 @@ struct address_space_operations {
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
-	int (*error_remove_page) (struct mapping *mapping, struct page *page);
 };

 writepage: called by the VM to write a dirty page to backing store.
@@ -695,12 +694,6 @@ struct address_space_operations {
 	prevent redirtying the page, it is kept locked during the whole
 	operation.

-  error_remove_page: normally set to generic_error_remove_page if truncation
-	is ok for this address space. Used for memory failure handling.
-	Setting this implies you deal with pages going away under you,
-	unless you have them locked or reference counts increased.
-
-
 The File Object
 ===============
diff --git a/trunk/Documentation/ioctl/ioctl-number.txt b/trunk/Documentation/ioctl/ioctl-number.txt
index 947374977ca5..aafca0a8f66a 100644
--- a/trunk/Documentation/ioctl/ioctl-number.txt
+++ b/trunk/Documentation/ioctl/ioctl-number.txt
@@ -135,7 +135,6 @@ Code	Seq#	Include File	Comments
 'l'	40-7F	linux/udf_fs_i.h	in development:

-'m'	00-09	linux/mmtimer.h
 'm'	all	linux/mtio.h	conflict!
 'm'	all	linux/soundcard.h	conflict!
 'm'	all	linux/synclink.h	conflict!
diff --git a/trunk/Documentation/sysctl/fs.txt b/trunk/Documentation/sysctl/fs.txt
index 62682500878a..1458448436cc 100644
--- a/trunk/Documentation/sysctl/fs.txt
+++ b/trunk/Documentation/sysctl/fs.txt
@@ -96,16 +96,13 @@ handles that the Linux kernel will allocate. When you get lots
 of error messages about running out of file handles, you might
 want to increase this limit.

-Historically, the three values in file-nr denoted the number of
-allocated file handles, the number of allocated but unused file
-handles, and the maximum number of file handles. Linux 2.6 always
-reports 0 as the number of free file handles -- this is not an
-error, it just means that the number of allocated file handles
-exactly matches the number of used file handles.
-
-Attempts to allocate more file descriptors than file-max are
-reported with printk, look for "VFS: file-max limit
-reached".
+The three values in file-nr denote the number of allocated
+file handles, the number of unused file handles and the maximum
+number of file handles. When the allocated file handles come
+close to the maximum, but the number of unused file handles is
+significantly greater than 0, you've encountered a peak in your
+usage of file handles and you don't need to increase the maximum.
+
 ==============================================================

 nr_open:
diff --git a/trunk/Documentation/sysctl/kernel.txt b/trunk/Documentation/sysctl/kernel.txt
index a028b92001ed..b3d8b4922740 100644
--- a/trunk/Documentation/sysctl/kernel.txt
+++ b/trunk/Documentation/sysctl/kernel.txt
@@ -22,7 +22,6 @@ show up in /proc/sys/kernel:
 - callhome		     [ S390 only ]
 - auto_msgmni
 - core_pattern
-- core_pipe_limit
 - core_uses_pid
 - ctrl-alt-del
 - dentry-state
@@ -136,27 +135,6 @@ core_pattern is used to specify a core dumpfile pattern name.

 ==============================================================

-core_pipe_limit:
-
-This sysctl is only applicable when core_pattern is configured to pipe core
-files to user space helper a (when the first character of core_pattern is a '|',
-see above). When collecting cores via a pipe to an application, it is
-occasionally usefull for the collecting application to gather data about the
-crashing process from its /proc/pid directory. In order to do this safely, the
-kernel must wait for the collecting process to exit, so as not to remove the
-crashing processes proc files prematurely. This in turn creates the possibility
-that a misbehaving userspace collecting process can block the reaping of a
-crashed process simply by never exiting. This sysctl defends against that. It
-defines how many concurrent crashing processes may be piped to user space
-applications in parallel. If this value is exceeded, then those crashing
-processes above that value are noted via the kernel log and their cores are
-skipped. 0 is a special value, indicating that unlimited processes may be
-captured in parallel, but that no waiting will take place (i.e. the collecting
-process is not guaranteed access to /proc//). This value defaults
-to 0.
-
-==============================================================
-
 core_uses_pid:

 The default coredump filename is "core". By setting
diff --git a/trunk/Documentation/sysctl/vm.txt b/trunk/Documentation/sysctl/vm.txt
index a6e360d2055c..e6fb1ec2744b 100644
--- a/trunk/Documentation/sysctl/vm.txt
+++ b/trunk/Documentation/sysctl/vm.txt
@@ -32,8 +32,6 @@ Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
-- memory_failure_early_kill
-- memory_failure_recovery
 - min_free_kbytes
 - min_slab_ratio
 - min_unmapped_ratio
@@ -55,6 +53,7 @@ Currently, these files are in /proc/sys/vm:
 - vfs_cache_pressure
 - zone_reclaim_mode

+
 ==============================================================

 block_dump
@@ -276,44 +275,6 @@ e.g., up to one or two maps per allocation.

 The default value is 65536.

-=============================================================
-
-memory_failure_early_kill:
-
-Control how to kill processes when uncorrected memory error (typically
-a 2bit error in a memory module) is detected in the background by hardware
-that cannot be handled by the kernel. In some cases (like the page
-still having a valid copy on disk) the kernel will handle the failure
-transparently without affecting any applications. But if there is
-no other uptodate copy of the data it will kill to prevent any data
-corruptions from propagating.
-
-1: Kill all processes that have the corrupted and not reloadable page mapped
-as soon as the corruption is detected. Note this is not supported
-for a few types of pages, like kernel internally allocated data or
-the swap cache, but works for the majority of user pages.
-
-0: Only unmap the corrupted page from all processes and only kill a process
-who tries to access it.
-
-The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
-handle this if they want to.
-
-This is only active on architectures/platforms with advanced machine
-check handling and depends on the hardware capabilities.
-
-Applications can override this setting individually with the PR_MCE_KILL prctl
-
-==============================================================
-
-memory_failure_recovery
-
-Enable memory failure recovery (when supported by the platform)
-
-1: Attempt recovery.
-
-0: Always panic on a memory failure.
-
 ==============================================================

 min_free_kbytes:
diff --git a/trunk/Documentation/vm/.gitignore b/trunk/Documentation/vm/.gitignore
index 09b164a5700f..33e8a023df02 100644
--- a/trunk/Documentation/vm/.gitignore
+++ b/trunk/Documentation/vm/.gitignore
@@ -1,2 +1 @@
-page-types
 slabinfo
diff --git a/trunk/Documentation/vm/page-types.c b/trunk/Documentation/vm/page-types.c
index fa1a30d9e9d5..3eda8ea00852 100644
--- a/trunk/Documentation/vm/page-types.c
+++ b/trunk/Documentation/vm/page-types.c
@@ -5,7 +5,6 @@
  * Copyright (C) 2009 Wu Fengguang
  */

-#define _LARGEFILE64_SOURCE
 #include 
 #include 
 #include 
@@ -14,32 +13,11 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 

-/*
- * pagemap kernel ABI bits
- */
-
-#define PM_ENTRY_BYTES	sizeof(uint64_t)
-#define PM_STATUS_BITS	3
-#define PM_STATUS_OFFSET	(64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK	(((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr)	(((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS	6
-#define PM_PSHIFT_OFFSET	(PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK	(((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x)	(((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK	((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x)	((x) & PM_PFRAME_MASK)
-
-#define PM_PRESENT	PM_STATUS(4LL)
-#define PM_SWAP	PM_STATUS(2LL)
-
-
 /*
  * kernel page flags
 */
@@ -148,14 +126,6 @@ static int nr_addr_ranges;
 static unsigned long opt_offset[MAX_ADDR_RANGES];
 static unsigned long opt_size[MAX_ADDR_RANGES];

-#define MAX_VMAS 10240
-static int nr_vmas;
-static unsigned long pg_start[MAX_VMAS];
-static unsigned long pg_end[MAX_VMAS];
-static unsigned long voffset;
-
-static int pagemap_fd;
-
 #define MAX_BIT_FILTERS 64
 static int nr_bit_filters;
 static uint64_t opt_mask[MAX_BIT_FILTERS];
@@ -165,6 +135,7 @@ static int page_size;

 #define PAGES_BATCH (64 << 10)	/* 64k pages */
 static int kpageflags_fd;
+static uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];

 #define HASH_SHIFT 13
 #define HASH_SIZE (1 << HASH_SHIFT)
@@ -187,11 +158,6 @@ static uint64_t page_flags[HASH_SIZE];
 	type __min2 = (y);	\
 	__min1 < __min2 ? __min1 : __min2; })

-#define max_t(type, x, y) ({	\
-	type __max1 = (x);	\
-	type __max2 = (y);	\
-	__max1 > __max2 ? __max1 : __max2; })
-
 static unsigned long pages2mb(unsigned long pages)
 {
 	return (pages * page_size) >> 20;
@@ -258,34 +224,26 @@ static char *page_flag_longname(uint64_t flags)
 static void show_page_range(unsigned long offset, uint64_t flags)
 {
 	static uint64_t flags0;
-	static unsigned long voff;
 	static unsigned long index;
 	static unsigned long count;

-	if (flags == flags0 && offset == index + count &&
-	    (!opt_pid || voffset == voff + count)) {
+	if (flags == flags0 && offset == index + count) {
 		count++;
 		return;
 	}

-	if (count) {
-		if (opt_pid)
-			printf("%lx\t", voff);
-		printf("%lx\t%lx\t%s\n",
+	if (count)
+		printf("%lu\t%lu\t%s\n",
 				index, count, page_flag_name(flags0));
-	}

 	flags0 = flags;
 	index = offset;
-	voff = voffset;
 	count = 1;
 }

 static void show_page(unsigned long offset, uint64_t flags)
 {
-	if (opt_pid)
-		printf("%lx\t", voffset);
-	printf("%lx\t%s\n", offset, page_flag_name(flags));
+	printf("%lu\t%s\n", offset, page_flag_name(flags));
 }

 static void show_summary(void)
@@ -425,8 +383,6 @@ static void walk_pfn(unsigned long index, unsigned long count)
 	lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET);

 	while (count) {
-		uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];
-
 		batch = min_t(unsigned long, count, PAGES_BATCH);
 		n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES);
 		if (n == 0)
@@ -448,81 +404,6 @@ static void walk_pfn(unsigned long index, unsigned long count)
 	}
 }

-
-#define PAGEMAP_BATCH 4096
-static unsigned long task_pfn(unsigned long pgoff)
-{
-	static uint64_t buf[PAGEMAP_BATCH];
-	static unsigned long start;
-	static long count;
-	uint64_t pfn;
-
-	if (pgoff < start || pgoff >= start + count) {
-		if (lseek64(pagemap_fd,
-			    (uint64_t)pgoff * PM_ENTRY_BYTES,
-			    SEEK_SET) < 0) {
-			perror("pagemap seek");
-			exit(EXIT_FAILURE);
-		}
-		count = read(pagemap_fd, buf, sizeof(buf));
-		if (count == 0)
-			return 0;
-		if (count < 0) {
-			perror("pagemap read");
-			exit(EXIT_FAILURE);
-		}
-		if (count % PM_ENTRY_BYTES) {
-			fatal("pagemap read not aligned.\n");
-			exit(EXIT_FAILURE);
-		}
-		count /= PM_ENTRY_BYTES;
-		start = pgoff;
-	}
-
-	pfn = buf[pgoff - start];
-	if (pfn & PM_PRESENT)
-		pfn = PM_PFRAME(pfn);
-	else
-		pfn = 0;
-
-	return pfn;
-}
-
-static void walk_task(unsigned long index, unsigned long count)
-{
-	int i = 0;
-	const unsigned long end = index + count;
-
-	while (index < end) {
-
-		while (pg_end[i] <= index)
-			if (++i >= nr_vmas)
-				return;
-		if (pg_start[i] >= end)
-			return;
-
-		voffset = max_t(unsigned long, pg_start[i], index);
-		index = min_t(unsigned long, pg_end[i], end);
-
-		assert(voffset < index);
-		for (; voffset < index; voffset++) {
-			unsigned long pfn = task_pfn(voffset);
-			if (pfn)
-				walk_pfn(pfn, 1);
-		}
-	}
-}
-
-static void add_addr_range(unsigned long offset, unsigned long size)
-{
-	if (nr_addr_ranges >= MAX_ADDR_RANGES)
-		fatal("too many addr ranges\n");
-
-	opt_offset[nr_addr_ranges] = offset;
-	opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
-	nr_addr_ranges++;
-}
-
 static void walk_addr_ranges(void)
 {
 	int i;
@@ -534,13 +415,10 @@ static void walk_addr_ranges(void)
 	}

 	if (!nr_addr_ranges)
-		add_addr_range(0, ULONG_MAX);
+		walk_pfn(0, ULONG_MAX);

 	for (i = 0; i < nr_addr_ranges; i++)
-		if (!opt_pid)
-			walk_pfn(opt_offset[i], opt_size[i]);
-		else
-			walk_task(opt_offset[i], opt_size[i]);
+		walk_pfn(opt_offset[i], opt_size[i]);

 	close(kpageflags_fd);
 }
@@ -568,8 +446,8 @@ static void usage(void)
 "            -r|--raw                  Raw mode, for kernel developers\n"
 "            -a|--addr    addr-spec    Walk a range of pages\n"
 "            -b|--bits    bits-spec    Walk pages with specified bits\n"
-"            -p|--pid     pid          Walk process address space\n"
 #if 0 /* planned features */
+"            -p|--pid     pid          Walk process address space\n"
 "            -f|--file    filename     Walk file address space\n"
 #endif
 "            -l|--list                 Show page details in ranges\n"
@@ -581,7 +459,7 @@ static void usage(void)
 "            N+M                       pages range from N to N+M-1\n"
 "            N,M                       pages range from N to M-1\n"
 "            N,                        pages range from N to end\n"
-"            ,M                        pages range from 0 to M-1\n"
+"            ,M                        pages range from 0 to M\n"
 "bits-spec:\n"
 "            bit1,bit2                 (flags & (bit1|bit2)) != 0\n"
 "            bit1,bit2=bit1            (flags & (bit1|bit2)) == bit1\n"
@@ -618,59 +496,23 @@ static unsigned long long parse_number(const char *str)

 static void parse_pid(const char *str)
 {
-	FILE *file;
-	char buf[5000];
-
 	opt_pid = parse_number(str);
-
-	sprintf(buf, "/proc/%d/pagemap", opt_pid);
-	pagemap_fd = open(buf, O_RDONLY);
-	if (pagemap_fd < 0) {
-		perror(buf);
-		exit(EXIT_FAILURE);
-	}
-
-	sprintf(buf, "/proc/%d/maps", opt_pid);
-	file = fopen(buf, "r");
-	if (!file) {
-		perror(buf);
-		exit(EXIT_FAILURE);
-	}
-
-	while (fgets(buf, sizeof(buf), file) != NULL) {
-		unsigned long vm_start;
-		unsigned long vm_end;
-		unsigned long long pgoff;
-		int major, minor;
-		char r, w, x, s;
-		unsigned long ino;
-		int n;
-
-		n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
-			   &vm_start,
-			   &vm_end,
-			   &r, &w, &x, &s,
-			   &pgoff,
-			   &major, &minor,
-			   &ino);
-		if (n < 10) {
-			fprintf(stderr, "unexpected line: %s\n", buf);
-			continue;
-		}
-		pg_start[nr_vmas] = vm_start / page_size;
-		pg_end[nr_vmas] = vm_end / page_size;
-		if (++nr_vmas >= MAX_VMAS) {
-			fprintf(stderr, "too many VMAs\n");
-			break;
-		}
-	}
-	fclose(file);
 }

 static void parse_file(const char *name)
 {
 }

+static void add_addr_range(unsigned long offset, unsigned long size)
+{
+	if (nr_addr_ranges >= MAX_ADDR_RANGES)
+		fatal("too much addr ranges\n");
+
+	opt_offset[nr_addr_ranges] = offset;
+	opt_size[nr_addr_ranges] = size;
+	nr_addr_ranges++;
+}
+
 static void parse_addr_range(const char *optarg)
 {
 	unsigned long offset;
@@ -834,10 +676,8 @@ int main(int argc, char *argv[])
 		}
 	}

-	if (opt_list && opt_pid)
-		printf("voffset\t");
 	if (opt_list == 1)
-		printf("offset\tlen\tflags\n");
+		printf("offset\tcount\tflags\n");
 	if (opt_list == 2)
 		printf("offset\tflags\n");
diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS
index 0c138ba86526..7c1c0b05b298 100644
--- a/trunk/MAINTAINERS
+++ b/trunk/MAINTAINERS
@@ -2331,9 +2331,7 @@ S:	Orphan
 F:	drivers/hwmon/

 HARDWARE RANDOM NUMBER GENERATOR CORE
-M:	Matt Mackall
-M:	Herbert Xu
-S:	Odd fixes
+S:	Orphan
 F:	Documentation/hw_random.txt
 F:	drivers/char/hw_random/
 F:	include/linux/hw_random.h
diff --git a/trunk/arch/alpha/include/asm/fcntl.h b/trunk/arch/alpha/include/asm/fcntl.h
index e42823e954aa..25da0017ec87 100644
--- a/trunk/arch/alpha/include/asm/fcntl.h
+++ b/trunk/arch/alpha/include/asm/fcntl.h
@@ -26,8 +26,6 @@
 #define F_GETOWN	6	/* for sockets. */
 #define F_SETSIG	10	/* for sockets. */
 #define F_GETSIG	11	/* for sockets. */
-#define F_SETOWN_EX	12
-#define F_GETOWN_EX	13

 /* for posix fcntl() and lockf() */
 #define F_RDLCK		1
diff --git a/trunk/arch/alpha/kernel/core_marvel.c b/trunk/arch/alpha/kernel/core_marvel.c
index 8e059e58b0ac..e302daecbe56 100644
--- a/trunk/arch/alpha/kernel/core_marvel.c
+++ b/trunk/arch/alpha/kernel/core_marvel.c
@@ -1016,7 +1016,7 @@ marvel_agp_bind_memory(alpha_agp_info *agp, off_t pg_start, struct agp_memory *m
 {
 	struct marvel_agp_aperture *aper = agp->aperture.sysdata;
 	return iommu_bind(aper->arena, aper->pg_start + pg_start,
-			  mem->page_count, mem->pages);
+			  mem->page_count, mem->memory);
 }

 static int
diff --git a/trunk/arch/alpha/kernel/core_titan.c b/trunk/arch/alpha/kernel/core_titan.c
index 76686497b1e2..319fcb74611e 100644
--- a/trunk/arch/alpha/kernel/core_titan.c
+++ b/trunk/arch/alpha/kernel/core_titan.c
@@ -680,7 +680,7 @@ titan_agp_bind_memory(alpha_agp_info *agp, off_t pg_start, struct agp_memory *me
 {
 	struct titan_agp_aperture *aper = agp->aperture.sysdata;
 	return iommu_bind(aper->arena, aper->pg_start + pg_start,
-			  mem->page_count, mem->pages);
+			  mem->page_count, mem->memory);
 }

 static int
diff --git a/trunk/arch/alpha/kernel/pci_impl.h b/trunk/arch/alpha/kernel/pci_impl.h
index 85457b2d4516..00edd04b585e 100644
--- a/trunk/arch/alpha/kernel/pci_impl.h
+++ b/trunk/arch/alpha/kernel/pci_impl.h
@@ -198,7 +198,7 @@ extern unsigned long size_for_memory(unsigned long max);

 extern int iommu_reserve(struct pci_iommu_arena *, long, long);
 extern int iommu_release(struct pci_iommu_arena *, long, long);
-extern int iommu_bind(struct pci_iommu_arena *, long, long, struct page **);
+extern int iommu_bind(struct pci_iommu_arena *, long, long, unsigned long *);
 extern int iommu_unbind(struct pci_iommu_arena *, long, long);

diff --git a/trunk/arch/alpha/kernel/pci_iommu.c b/trunk/arch/alpha/kernel/pci_iommu.c
index 8449504f5e0b..d15aedfe6066 100644
--- a/trunk/arch/alpha/kernel/pci_iommu.c
+++ b/trunk/arch/alpha/kernel/pci_iommu.c
@@ -876,7 +876,7 @@ iommu_release(struct pci_iommu_arena *arena, long pg_start, long pg_count)

 int
 iommu_bind(struct pci_iommu_arena *arena, long pg_start, long pg_count,
-	   struct page **pages)
+	   unsigned long *physaddrs)
 {
 	unsigned long flags;
 	unsigned long *ptes;
@@ -896,7 +896,7 @@ iommu_bind(struct pci_iommu_arena *arena, long pg_start, long pg_count,
 	}

 	for(i = 0, j = pg_start; i < pg_count; i++, j++)
-		ptes[j] = mk_iommu_pte(page_to_phys(pages[i]));
+		ptes[j] = mk_iommu_pte(physaddrs[i]);

 	spin_unlock_irqrestore(&arena->lock, flags);

diff --git a/trunk/arch/arm/include/asm/hardware/iop3xx-adma.h b/trunk/arch/arm/include/asm/hardware/iop3xx-adma.h
index 1a8c7279a28b..83e6ba338e2c 100644
--- a/trunk/arch/arm/include/asm/hardware/iop3xx-adma.h
+++ b/trunk/arch/arm/include/asm/hardware/iop3xx-adma.h
@@ -187,74 +187,11 @@ union iop3xx_desc {
 	void *ptr;
 };

-/* No support for p+q operations */
-static inline int
-iop_chan_pq_slot_count(size_t len, int src_cnt, int *slots_per_op)
-{
-	BUG();
-	return 0;
-}
-
-static inline void
-iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt,
-		 unsigned long flags)
-{
-	BUG();
-}
-
-static inline void
-iop_desc_set_pq_addr(struct iop_adma_desc_slot *desc, dma_addr_t *addr)
-{
-	BUG();
-}
-
-static inline void
-iop_desc_set_pq_src_addr(struct iop_adma_desc_slot *desc, int src_idx,
-			 dma_addr_t addr, unsigned char coef)
-{
-	BUG();
-}
-
-static inline int
-iop_chan_pq_zero_sum_slot_count(size_t len, int src_cnt, int *slots_per_op)
-{
-	BUG();
-	return 0;
-}
-
-static inline void
-iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
-			  unsigned long flags)
-{
-	BUG();
-}
-
-static inline void
-iop_desc_set_pq_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len)
-{
-	BUG();
-}
-
-#define iop_desc_set_pq_zero_sum_src_addr iop_desc_set_pq_src_addr
-
-static inline void
-iop_desc_set_pq_zero_sum_addr(struct iop_adma_desc_slot *desc, int pq_idx,
-			      dma_addr_t *src)
-{
-	BUG();
-}
-
 static inline int iop_adma_get_max_xor(void)
 {
 	return 32;
 }

-static inline int iop_adma_get_max_pq(void)
-{
-	BUG();
-	return 0;
-}
-
 static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan)
 {
 	int id = chan->device->id;
@@ -395,11 +332,6 @@ static inline int iop_chan_zero_sum_slot_count(size_t len, int src_cnt,
 	return slot_cnt;
 }

-static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
-{
-	return 0;
-}
-
 static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan)
 {
@@ -417,14 +349,6 @@ static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
 	return 0;
 }

-
-static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
-					  struct iop_adma_chan *chan)
-{
-	BUG();
-	return 0;
-}
-
 static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan)
 {
@@ -832,14 +756,13 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
 	hw_desc->src[0] = val;
 }

-static inline enum sum_check_flags
-iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
+static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
 {
 	struct iop3xx_desc_aau *hw_desc = desc->hw_desc;
 	struct iop3xx_aau_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;

 	iop_paranoia(!(desc_ctrl.tx_complete && desc_ctrl.zero_result_en));
-	return desc_ctrl.zero_result_err << SUM_CHECK_P;
+	return desc_ctrl.zero_result_err;
 }

 static inline void iop_chan_append(struct iop_adma_chan *chan)
diff --git a/trunk/arch/arm/include/asm/hardware/iop_adma.h b/trunk/arch/arm/include/asm/hardware/iop_adma.h
index 59b8c3892f76..385c6e8cbbd2 100644
--- a/trunk/arch/arm/include/asm/hardware/iop_adma.h
+++ b/trunk/arch/arm/include/asm/hardware/iop_adma.h
@@ -86,7 +86,6 @@ struct iop_adma_chan {
 * @idx: pool index
 * @unmap_src_cnt: number of xor sources
 * @unmap_len: transaction bytecount
- * @tx_list: list of descriptors that are associated with one operation
 * @async_tx: support for the async_tx api
 * @group_list: list of slots that make up a multi-descriptor transaction
 *	for example transfer lengths larger than the supported hw max
@@ -103,12 +102,10 @@ struct iop_adma_desc_slot {
 	u16 idx;
 	u16 unmap_src_cnt;
 	size_t unmap_len;
-	struct list_head tx_list;
 	struct dma_async_tx_descriptor async_tx;
 	union {
 		u32 *xor_check_result;
 		u32 *crc32_result;
-		u32 *pq_check_result;
 	};
 };

diff --git a/trunk/arch/arm/mach-iop13xx/include/mach/adma.h b/trunk/arch/arm/mach-iop13xx/include/mach/adma.h
index 6d3782d85a9f..5722e86f2174 100644
--- a/trunk/arch/arm/mach-iop13xx/include/mach/adma.h
+++ b/trunk/arch/arm/mach-iop13xx/include/mach/adma.h
@@ -150,8 +150,6 @@ static inline int iop_adma_get_max_xor(void)
 	return 16;
 }

-#define iop_adma_get_max_pq iop_adma_get_max_xor
-
 static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan)
 {
 	return __raw_readl(ADMA_ADAR(chan));
@@ -213,10 +211,7 @@ iop_chan_xor_slot_count(size_t len, int src_cnt, int *slots_per_op)
 #define IOP_ADMA_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
 #define IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
 #define IOP_ADMA_XOR_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
-#define IOP_ADMA_PQ_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
 #define iop_chan_zero_sum_slot_count(l, s, o) iop_chan_xor_slot_count(l, s, o)
-#define iop_chan_pq_slot_count iop_chan_xor_slot_count
-#define iop_chan_pq_zero_sum_slot_count iop_chan_xor_slot_count

 static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan)
@@ -225,13 +220,6 @@ static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
 	return hw_desc->dest_addr;
 }

-static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
-					  struct iop_adma_chan *chan)
-{
-	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
-	return hw_desc->q_dest_addr;
-}
-
 static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan)
 {
@@ -331,58 +319,6 @@ iop_desc_init_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
 	return 1;
 }

-static inline void
-iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt,
-		 unsigned long flags)
-{
-	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
-	union {
-		u32 value;
-		struct iop13xx_adma_desc_ctrl field;
-	} u_desc_ctrl;
-
-	u_desc_ctrl.value = 0;
-	u_desc_ctrl.field.src_select = src_cnt - 1;
-	u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */
-	u_desc_ctrl.field.pq_xfer_en = 1;
-	u_desc_ctrl.field.p_xfer_dis = !!(flags & DMA_PREP_PQ_DISABLE_P);
-	u_desc_ctrl.field.int_en = flags & DMA_PREP_INTERRUPT;
-	hw_desc->desc_ctrl = u_desc_ctrl.value;
-}
-
-static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
-{
-	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
-	union {
-		u32 value;
-		struct iop13xx_adma_desc_ctrl field;
-	} u_desc_ctrl;
-
-	u_desc_ctrl.value = hw_desc->desc_ctrl;
-	return u_desc_ctrl.field.pq_xfer_en;
-}
-
-static inline void
-iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
-			  unsigned long flags)
-{
-	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
-	union {
-		u32 value;
-		struct iop13xx_adma_desc_ctrl field;
-	} u_desc_ctrl;
-
-	u_desc_ctrl.value = 0;
-	u_desc_ctrl.field.src_select = src_cnt - 1;
-	u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */
-	u_desc_ctrl.field.zero_result = 1;
-	u_desc_ctrl.field.status_write_back_en = 1;
-	u_desc_ctrl.field.pq_xfer_en = 1;
-	u_desc_ctrl.field.p_xfer_dis = !!(flags & DMA_PREP_PQ_DISABLE_P);
-	u_desc_ctrl.field.int_en = flags & DMA_PREP_INTERRUPT;
-	hw_desc->desc_ctrl = u_desc_ctrl.value;
-}
-
 static inline void
 iop_desc_set_byte_count(struct iop_adma_desc_slot *desc,
 			struct iop_adma_chan *chan,
 			u32 byte_count)
@@ -415,7 +351,6 @@ iop_desc_set_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len)
 	}
 }

-#define iop_desc_set_pq_zero_sum_byte_count iop_desc_set_zero_sum_byte_count

 static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc,
 					  struct iop_adma_chan *chan,
@@ -426,16 +361,6 @@ static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc,
 	hw_desc->upper_dest_addr = 0;
 }

-static inline void
-iop_desc_set_pq_addr(struct iop_adma_desc_slot *desc, dma_addr_t *addr)
-{
-	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
-
-	hw_desc->dest_addr = addr[0];
-	hw_desc->q_dest_addr = addr[1];
-	hw_desc->upper_dest_addr = 0;
-}
-
 static inline void iop_desc_set_memcpy_src_addr(struct iop_adma_desc_slot *desc,
 					dma_addr_t addr)
 {
@@ -463,29 +388,6 @@ static inline void iop_desc_set_xor_src_addr(struct iop_adma_desc_slot *desc,
 	} while (slot_cnt);
 }

-static inline void
-iop_desc_set_pq_src_addr(struct iop_adma_desc_slot *desc, int src_idx,
-			 dma_addr_t addr, unsigned char coef)
-{
-	int slot_cnt = desc->slot_cnt, slots_per_op = desc->slots_per_op;
-	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc, *iter;
-	struct iop13xx_adma_src *src;
-	int i = 0;
-
-	do {
-		iter = iop_hw_desc_slot_idx(hw_desc, i);
-		src = &iter->src[src_idx];
-		src->src_addr = addr;
-		src->pq_upper_src_addr = 0;
-		src->pq_dmlt = coef;
-		slot_cnt -= slots_per_op;
-		if (slot_cnt) {
-			i += slots_per_op;
-			addr += IOP_ADMA_PQ_MAX_BYTE_COUNT;
-		}
-	} while (slot_cnt);
-}
-
 static inline void
 iop_desc_init_interrupt(struct iop_adma_desc_slot *desc,
 			struct iop_adma_chan *chan)
@@ -497,15 +399,6 @@ iop_desc_init_interrupt(struct iop_adma_desc_slot *desc,
 }

 #define iop_desc_set_zero_sum_src_addr iop_desc_set_xor_src_addr
-#define iop_desc_set_pq_zero_sum_src_addr iop_desc_set_pq_src_addr
-
-static inline void
-iop_desc_set_pq_zero_sum_addr(struct iop_adma_desc_slot *desc, int pq_idx,
-			      dma_addr_t *src)
-{
-	iop_desc_set_xor_src_addr(desc, pq_idx, src[pq_idx]);
-	iop_desc_set_xor_src_addr(desc, pq_idx+1, src[pq_idx+1]);
-}

 static inline void iop_desc_set_next_desc(struct iop_adma_desc_slot *desc,
 					u32 next_desc_addr)
@@ -535,20 +428,18 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
 	hw_desc->block_fill_data = val;
 }

-static inline enum sum_check_flags
-iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
+static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
 {
 	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
 	struct iop13xx_adma_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;
 	struct iop13xx_adma_byte_count byte_count = hw_desc->byte_count_field;
-	enum sum_check_flags flags;

 	BUG_ON(!(byte_count.tx_complete && desc_ctrl.zero_result));

-	flags = byte_count.zero_result_err_q << SUM_CHECK_Q;
-	flags |= byte_count.zero_result_err << SUM_CHECK_P;
-
-	return flags;
+	if (desc_ctrl.pq_xfer_en)
+		return byte_count.zero_result_err_q;
+	else
+		return byte_count.zero_result_err;
 }

 static inline void iop_chan_append(struct iop_adma_chan *chan)
diff --git a/trunk/arch/arm/mach-iop13xx/setup.c b/trunk/arch/arm/mach-iop13xx/setup.c
index 5c147fb66a01..bee42c609df6 100644
--- a/trunk/arch/arm/mach-iop13xx/setup.c
+++ b/trunk/arch/arm/mach-iop13xx/setup.c
@@ -477,8 +477,10 @@ void __init iop13xx_platform_init(void)
 			plat_data = &iop13xx_adma_0_data;
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
+			dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
+			dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
+			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			break;
 		case IOP13XX_INIT_ADMA_1:
@@ -487,8 +489,10 @@ void __init iop13xx_platform_init(void)
 			plat_data = &iop13xx_adma_1_data;
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
+			dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
+			dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
+			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			break;
 		case IOP13XX_INIT_ADMA_2:
@@ -497,11 +501,14 @@ void __init iop13xx_platform_init(void)
 			plat_data = &iop13xx_adma_2_data;
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
+			dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
+			dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
+			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
-			dma_cap_set(DMA_PQ, plat_data->cap_mask);
-			dma_cap_set(DMA_PQ_VAL, plat_data->cap_mask);
+			dma_cap_set(DMA_PQ_XOR, plat_data->cap_mask);
+			dma_cap_set(DMA_PQ_UPDATE, plat_data->cap_mask);
+			dma_cap_set(DMA_PQ_ZERO_SUM, plat_data->cap_mask);
 			break;
 		}
 	}
diff --git a/trunk/arch/arm/plat-iop/adma.c b/trunk/arch/arm/plat-iop/adma.c
index 1ff6a37e893c..3c127aabe214 100644
--- a/trunk/arch/arm/plat-iop/adma.c
+++ b/trunk/arch/arm/plat-iop/adma.c
@@ -179,6 +179,7 @@ static int __init iop3xx_adma_cap_init(void)
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
 	#else
 	dma_cap_set(DMA_MEMCPY, iop3xx_dma_0_data.cap_mask);
+	dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_0_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
 	#endif

@@ -187,6 +188,7 @@ static int __init iop3xx_adma_cap_init(void)
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
 	#else
 	dma_cap_set(DMA_MEMCPY, iop3xx_dma_1_data.cap_mask);
+	dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_1_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
 	#endif

@@ -196,7 +198,7 @@ static int __init iop3xx_adma_cap_init(void)
 	dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
 	#else
 	dma_cap_set(DMA_XOR, iop3xx_aau_data.cap_mask);
-	dma_cap_set(DMA_XOR_VAL, iop3xx_aau_data.cap_mask);
+	dma_cap_set(DMA_ZERO_SUM, iop3xx_aau_data.cap_mask);
 	dma_cap_set(DMA_MEMSET, iop3xx_aau_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
 	#endif
diff --git a/trunk/arch/frv/kernel/pm.c b/trunk/arch/frv/kernel/pm.c
index 0d4d3e3a4cfc..be722fc1acff 100644
--- a/trunk/arch/frv/kernel/pm.c
+++ b/trunk/arch/frv/kernel/pm.c
@@ -150,7 +150,7 @@ static int user_atoi(char __user *ubuf, size_t len)
 /*
 * Send us to sleep.
 */
-static int sysctl_pm_do_suspend(ctl_table *ctl, int write,
+static int sysctl_pm_do_suspend(ctl_table *ctl, int write, struct file *filp,
 				void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int retval, mode;
@@ -198,13 +198,13 @@ static int try_set_cmode(int new_cmode)
 }


-static int cmode_procctl(ctl_table *ctl, int write,
+static int cmode_procctl(ctl_table *ctl, int write, struct file *filp,
 			 void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int new_cmode;

 	if (!write)
-		return proc_dointvec(ctl, write, buffer, lenp, fpos);
+		return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);

 	new_cmode = user_atoi(buffer, *lenp);

@@ -301,13 +301,13 @@ static int try_set_cm(int new_cm)
 	return 0;
 }

-static int p0_procctl(ctl_table *ctl, int write,
+static int p0_procctl(ctl_table *ctl, int write, struct file *filp,
 		      void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int new_p0;

 	if (!write)
-		return proc_dointvec(ctl, write, buffer, lenp, fpos);
+		return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);

 	new_p0 = user_atoi(buffer, *lenp);

@@ -345,13 +345,13 @@ static int p0_sysctl(ctl_table *table,
 	return 1;
 }

-static int cm_procctl(ctl_table *ctl, int write,
+static int cm_procctl(ctl_table *ctl, int write, struct file *filp,
 		      void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int new_cm;

 	if (!write)
-		return proc_dointvec(ctl, write, buffer, lenp, fpos);
+		return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);

 	new_cm = user_atoi(buffer, *lenp);
diff --git a/trunk/arch/mips/lasat/sysctl.c b/trunk/arch/mips/lasat/sysctl.c
index b3deed8db619..3f04d4c406b7 100644
--- a/trunk/arch/mips/lasat/sysctl.c
+++ b/trunk/arch/mips/lasat/sysctl.c
@@ -56,12 +56,12 @@ int sysctl_lasatstring(ctl_table *table,

 /* And the same for proc */
-int proc_dolasatstring(ctl_table *table, int write,
+int proc_dolasatstring(ctl_table *table, int write, struct file *filp,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;

-	r = proc_dostring(table, write, buffer, lenp, ppos);
+	r = proc_dostring(table, write, filp, buffer, lenp, ppos);
 	if ((!write) || r)
 		return r;

@@ -71,12 +71,12 @@ int proc_dolasatstring(ctl_table *table, int write,
 }

 /* proc function to write EEPROM after changing int entry */
-int proc_dolasatint(ctl_table *table, int write,
+int proc_dolasatint(ctl_table *table, int write, struct file *filp,
 		    void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;

-	r = proc_dointvec(table, write, buffer, lenp, ppos);
+	r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
 	if ((!write) || r)
 		return r;

@@ -89,7 +89,7 @@ int proc_dolasatint(ctl_table *table, int write,
 static int rtctmp;

 /* proc function to read/write RealTime Clock */
-int proc_dolasatrtc(ctl_table *table, int write,
+int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
 		    void *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct timespec ts;
@@ -102,7 +102,7 @@ int proc_dolasatrtc(ctl_table *table, int write,
 		if (rtctmp < 0)
 			rtctmp = 0;
 	}
-	r = proc_dointvec(table, write, buffer, lenp, ppos);
+	r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
 	if (r)
 		return r;

@@ -154,7 +154,7 @@ int sysctl_lasat_rtc(ctl_table *table,
 #endif

 #ifdef CONFIG_INET
-int proc_lasat_ip(ctl_table *table, int write,
+int proc_lasat_ip(ctl_table *table, int write, struct file *filp,
 		  void *buffer, size_t *lenp, loff_t *ppos)
 {
 	unsigned int ip;
@@ -231,12 +231,12 @@ static int sysctl_lasat_prid(ctl_table *table,
 	return 0;
 }

-int proc_lasat_prid(ctl_table *table, int write,
+int proc_lasat_prid(ctl_table *table, int write, struct file *filp,
 		    void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;

-	r = proc_dointvec(table, write, buffer, lenp, ppos);
+	r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
 	if (r < 0)
 		return r;
 	if (write) {
diff --git a/trunk/arch/parisc/include/asm/fcntl.h b/trunk/arch/parisc/include/asm/fcntl.h
index 5f39d5597ced..1e1c824764ee 100644
--- a/trunk/arch/parisc/include/asm/fcntl.h
+++ b/trunk/arch/parisc/include/asm/fcntl.h
@@ -28,8 +28,6 @@
 #define F_SETOWN	12	/* for sockets. */
 #define F_SETSIG	13	/* for sockets. */
 #define F_GETSIG	14	/* for sockets. */
-#define F_GETOWN_EX	15
-#define F_SETOWN_EX	16

 /* for posix fcntl() and lockf() */
 #define F_RDLCK		01
diff --git a/trunk/arch/powerpc/include/asm/fsldma.h b/trunk/arch/powerpc/include/asm/fsldma.h
deleted file mode 100644
index a67aeed17d40..000000000000
--- a/trunk/arch/powerpc/include/asm/fsldma.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Freescale MPC83XX / MPC85XX DMA Controller
- *
- * Copyright (c) 2009 Ira W. Snyder
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
- */
-
-#ifndef __ARCH_POWERPC_ASM_FSLDMA_H__
-#define __ARCH_POWERPC_ASM_FSLDMA_H__
-
-#include 
-
-/*
- * Definitions for the Freescale DMA controller's DMA_SLAVE implemention
- *
- * The Freescale DMA_SLAVE implementation was designed to handle many-to-many
- * transfers. An example usage would be an accelerated copy between two
- * scatterlists. Another example use would be an accelerated copy from
- * multiple non-contiguous device buffers into a single scatterlist.
- *
- * A DMA_SLAVE transaction is defined by a struct fsl_dma_slave. This
- * structure contains a list of hardware addresses that should be copied
- * to/from the scatterlist passed into device_prep_slave_sg(). The structure
- * also has some fields to enable hardware-specific features.
- */
-
-/**
- * struct fsl_dma_hw_addr
- * @entry: linked list entry
- * @address: the hardware address
- * @length: length to transfer
- *
- * Holds a single physical hardware address / length pair for use
- * with the DMAEngine DMA_SLAVE API.
- */
-struct fsl_dma_hw_addr {
-	struct list_head entry;
-
-	dma_addr_t address;
-	size_t length;
-};
-
-/**
- * struct fsl_dma_slave
- * @addresses: a linked list of struct fsl_dma_hw_addr structures
- * @request_count: value for DMA request count
- * @src_loop_size: setup and enable constant source-address DMA transfers
- * @dst_loop_size: setup and enable constant destination address DMA transfers
- * @external_start: enable externally started DMA transfers
- * @external_pause: enable externally paused DMA transfers
- *
- * Holds a list of address / length pairs for use with the DMAEngine
- * DMA_SLAVE API implementation for the Freescale DMA controller.
- */
-struct fsl_dma_slave {
-
-	/* List of hardware address/length pairs */
-	struct list_head addresses;
-
-	/* Support for extra controller features */
-	unsigned int request_count;
-	unsigned int src_loop_size;
-	unsigned int dst_loop_size;
-	bool external_start;
-	bool external_pause;
-};
-
-/**
- * fsl_dma_slave_append - add an address/length pair to a struct fsl_dma_slave
- * @slave: the &struct fsl_dma_slave to add to
- * @address: the hardware address to add
- * @length: the length of bytes to transfer from @address
- *
- * Add a hardware address/length pair to a struct fsl_dma_slave. Returns 0 on
- * success, -ERRNO otherwise.
- */ -static inline int fsl_dma_slave_append(struct fsl_dma_slave *slave, - dma_addr_t address, size_t length) -{ - struct fsl_dma_hw_addr *addr; - - addr = kzalloc(sizeof(*addr), GFP_ATOMIC); - if (!addr) - return -ENOMEM; - - INIT_LIST_HEAD(&addr->entry); - addr->address = address; - addr->length = length; - - list_add_tail(&addr->entry, &slave->addresses); - return 0; -} - -/** - * fsl_dma_slave_free - free a struct fsl_dma_slave - * @slave: the struct fsl_dma_slave to free - * - * Free a struct fsl_dma_slave and all associated address/length pairs - */ -static inline void fsl_dma_slave_free(struct fsl_dma_slave *slave) -{ - struct fsl_dma_hw_addr *addr, *tmp; - - if (slave) { - list_for_each_entry_safe(addr, tmp, &slave->addresses, entry) { - list_del(&addr->entry); - kfree(addr); - } - - kfree(slave); - } -} - -/** - * fsl_dma_slave_alloc - allocate a struct fsl_dma_slave - * @gfp: the flags to pass to kmalloc when allocating this structure - * - * Allocate a struct fsl_dma_slave for use by the DMA_SLAVE API. Returns a new - * struct fsl_dma_slave on success, or NULL on failure. - */ -static inline struct fsl_dma_slave *fsl_dma_slave_alloc(gfp_t gfp) -{ - struct fsl_dma_slave *slave; - - slave = kzalloc(sizeof(*slave), gfp); - if (!slave) - return NULL; - - INIT_LIST_HEAD(&slave->addresses); - return slave; -} - -#endif /* __ARCH_POWERPC_ASM_FSLDMA_H__ */ diff --git a/trunk/arch/s390/appldata/appldata_base.c b/trunk/arch/s390/appldata/appldata_base.c index b55fd7ed1c31..264528e4f58d 100644 --- a/trunk/arch/s390/appldata/appldata_base.c +++ b/trunk/arch/s390/appldata/appldata_base.c @@ -50,9 +50,10 @@ static struct platform_device *appldata_pdev; * /proc entries (sysctl) */ static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; -static int appldata_timer_handler(ctl_table *ctl, int write, +static int appldata_timer_handler(ctl_table *ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); static int appldata_interval_handler(ctl_table *ctl, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -246,7 +247,7 @@ __appldata_vtimer_setup(int cmd) * Start/Stop timer, show status of timer (0 = not active, 1 = active) */ static int -appldata_timer_handler(ctl_table *ctl, int write, +appldata_timer_handler(ctl_table *ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int len; @@ -288,7 +289,7 @@ appldata_timer_handler(ctl_table *ctl, int write, * current timer interval. 
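All of these sysctl conversions restore the six-argument proc handler prototype, in which the struct file * is threaded through to the generic helpers. As the lasat and appldata handlers illustrate, the common pattern is to delegate parsing to proc_dointvec()/proc_dostring() and then act on the new value. A minimal sketch of a handler in that style, with hypothetical names (my_knob, my_knob_handler):

#include <linux/sysctl.h>

static int my_knob;	/* hypothetical tunable exposed under /proc/sys */

static int my_knob_handler(ctl_table *table, int write, struct file *filp,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);

	if (ret || !write)
		return ret;
	if (my_knob < 0)
		my_knob = 0;	/* clamp writes to a sane range */
	return 0;
}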
*/ static int -appldata_interval_handler(ctl_table *ctl, int write, +appldata_interval_handler(ctl_table *ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int len, interval; @@ -334,7 +335,7 @@ appldata_interval_handler(ctl_table *ctl, int write, * monitoring (0 = not in process, 1 = in process) */ static int -appldata_generic_handler(ctl_table *ctl, int write, +appldata_generic_handler(ctl_table *ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct appldata_ops *ops = NULL, *tmp_ops; diff --git a/trunk/arch/s390/kernel/debug.c b/trunk/arch/s390/kernel/debug.c index 20f282c911c2..4c512561687d 100644 --- a/trunk/arch/s390/kernel/debug.c +++ b/trunk/arch/s390/kernel/debug.c @@ -881,11 +881,11 @@ static int debug_active=1; * if debug_active is already off */ static int -s390dbf_procactive(ctl_table *table, int write, +s390dbf_procactive(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_dointvec(table, write, filp, buffer, lenp, ppos); else return 0; } diff --git a/trunk/arch/s390/mm/cmm.c b/trunk/arch/s390/mm/cmm.c index b201135cc18c..413c240cbca7 100644 --- a/trunk/arch/s390/mm/cmm.c +++ b/trunk/arch/s390/mm/cmm.c @@ -262,7 +262,7 @@ cmm_skip_blanks(char *cp, char **endp) static struct ctl_table cmm_table[]; static int -cmm_pages_handler(ctl_table *ctl, int write, +cmm_pages_handler(ctl_table *ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { char buf[16], *p; @@ -303,7 +303,7 @@ cmm_pages_handler(ctl_table *ctl, int write, } static int -cmm_timeout_handler(ctl_table *ctl, int write, +cmm_timeout_handler(ctl_table *ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; diff --git a/trunk/arch/sh/drivers/dma/Kconfig b/trunk/arch/sh/drivers/dma/Kconfig index 4d58eb0973d4..b91fa8dbf047 100644 --- a/trunk/arch/sh/drivers/dma/Kconfig +++ b/trunk/arch/sh/drivers/dma/Kconfig @@ -1,9 +1,12 @@ menu "DMA support" +config SH_DMA_API + bool config SH_DMA bool "SuperH on-chip DMA controller (DMAC) support" depends on CPU_SH3 || CPU_SH4 + select SH_DMA_API default n config SH_DMA_IRQ_MULTI @@ -16,15 +19,6 @@ config SH_DMA_IRQ_MULTI CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \ CPU_SUBTYPE_SH7760 -config SH_DMA_API - depends on SH_DMA - bool "SuperH DMA API support" - default n - help - SH_DMA_API always enabled DMA API of used SuperH. - If you want to use DMA ENGINE, you must not enable this. - Please enable DMA_ENGINE and SH_DMAE. - config NR_ONCHIP_DMA_CHANNELS int depends on SH_DMA diff --git a/trunk/arch/sh/drivers/dma/Makefile b/trunk/arch/sh/drivers/dma/Makefile index d88c9484762c..c6068137b46f 100644 --- a/trunk/arch/sh/drivers/dma/Makefile +++ b/trunk/arch/sh/drivers/dma/Makefile @@ -2,7 +2,8 @@ # Makefile for the SuperH DMA specific kernel interface routines under Linux. 
# -obj-$(CONFIG_SH_DMA_API) += dma-sh.o dma-api.o dma-sysfs.o +obj-$(CONFIG_SH_DMA_API) += dma-api.o dma-sysfs.o +obj-$(CONFIG_SH_DMA) += dma-sh.o obj-$(CONFIG_PVR2_DMA) += dma-pvr2.o obj-$(CONFIG_G2_DMA) += dma-g2.o obj-$(CONFIG_SH_DMABRG) += dmabrg.o diff --git a/trunk/arch/sh/include/asm/dma-sh.h b/trunk/arch/sh/include/asm/dma-sh.h index 78eed3e0bdf5..68a5f4cb0343 100644 --- a/trunk/arch/sh/include/asm/dma-sh.h +++ b/trunk/arch/sh/include/asm/dma-sh.h @@ -116,17 +116,4 @@ static u32 dma_base_addr[] __maybe_unused = { #define CHCR 0x0C #define DMAOR 0x40 -/* - * for dma engine - * - * SuperH DMA mode - */ -#define SHDMA_MIX_IRQ (1 << 1) -#define SHDMA_DMAOR1 (1 << 2) -#define SHDMA_DMAE1 (1 << 3) - -struct sh_dmae_pdata { - unsigned int mode; -}; - #endif /* __DMA_SH_H */ diff --git a/trunk/arch/x86/include/asm/nmi.h b/trunk/arch/x86/include/asm/nmi.h index 139d4c1a33a7..e63cf7d441e1 100644 --- a/trunk/arch/x86/include/asm/nmi.h +++ b/trunk/arch/x86/include/asm/nmi.h @@ -40,7 +40,8 @@ extern unsigned int nmi_watchdog; #define NMI_INVALID 3 struct ctl_table; -extern int proc_nmi_enabled(struct ctl_table *, int , +struct file; +extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; diff --git a/trunk/arch/x86/kernel/apic/nmi.c b/trunk/arch/x86/kernel/apic/nmi.c index 7ff61d6a188a..cb66a22d98ad 100644 --- a/trunk/arch/x86/kernel/apic/nmi.c +++ b/trunk/arch/x86/kernel/apic/nmi.c @@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) /* * proc handler for /proc/sys/kernel/nmi */ -int proc_nmi_enabled(struct ctl_table *table, int write, +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { int old_state; nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, buffer, length, ppos); + proc_dointvec(table, write, file, buffer, length, ppos); if (!!old_state == !!nmi_watchdog_enabled) return 0; diff --git a/trunk/arch/x86/kernel/vsyscall_64.c b/trunk/arch/x86/kernel/vsyscall_64.c index 8cb4974ff599..cf53a78e2dcf 100644 --- a/trunk/arch/x86/kernel/vsyscall_64.c +++ b/trunk/arch/x86/kernel/vsyscall_64.c @@ -228,11 +228,19 @@ static long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL + +static int +vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); +} + static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec }, + .proc_handler = vsyscall_sysctl_change }, {} }; diff --git a/trunk/arch/x86/mm/fault.c b/trunk/arch/x86/mm/fault.c index f4cee9028cf0..82728f2c6d55 100644 --- a/trunk/arch/x86/mm/fault.c +++ b/trunk/arch/x86/mm/fault.c @@ -167,7 +167,6 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? 
PAGE_SHIFT : 0; force_sig_info(si_signo, &info, tsk); } @@ -791,12 +790,10 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code, } static void -do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - unsigned int fault) +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - int code = BUS_ADRERR; up_read(&mm->mmap_sem); @@ -812,15 +809,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; -#ifdef CONFIG_MEMORY_FAILURE - if (fault & VM_FAULT_HWPOISON) { - printk(KERN_ERR - "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", - tsk->comm, tsk->pid, address); - code = BUS_MCEERR_AR; - } -#endif - force_sig_info_fault(SIGBUS, code, address, tsk); + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } static noinline void @@ -830,8 +819,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) - do_sigbus(regs, error_code, address, fault); + if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); else BUG(); } diff --git a/trunk/crypto/async_tx/Kconfig b/trunk/crypto/async_tx/Kconfig index e5aeb2b79e6f..d8fb39145986 100644 --- a/trunk/crypto/async_tx/Kconfig +++ b/trunk/crypto/async_tx/Kconfig @@ -14,12 +14,3 @@ config ASYNC_MEMSET tristate select ASYNC_CORE -config ASYNC_PQ - tristate - select ASYNC_CORE - -config ASYNC_RAID6_RECOV - tristate - select ASYNC_CORE - select ASYNC_PQ - diff --git a/trunk/crypto/async_tx/Makefile b/trunk/crypto/async_tx/Makefile index d1e0e6f72bc1..27baa7d52fbc 100644 --- a/trunk/crypto/async_tx/Makefile +++ b/trunk/crypto/async_tx/Makefile @@ -2,6 +2,3 @@ obj-$(CONFIG_ASYNC_CORE) += async_tx.o obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o obj-$(CONFIG_ASYNC_XOR) += async_xor.o -obj-$(CONFIG_ASYNC_PQ) += async_pq.o -obj-$(CONFIG_ASYNC_RAID6_RECOV) += async_raid6_recov.o -obj-$(CONFIG_ASYNC_RAID6_TEST) += raid6test.o diff --git a/trunk/crypto/async_tx/async_memcpy.c b/trunk/crypto/async_tx/async_memcpy.c index 0ec1fb69d4ea..ddccfb01c416 100644 --- a/trunk/crypto/async_tx/async_memcpy.c +++ b/trunk/crypto/async_tx/async_memcpy.c @@ -33,31 +33,28 @@ * async_memcpy - attempt to copy memory with a dma engine. 
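The fault.c hunks above back out the hwpoison-aware SIGBUS path: with them applied, do_sigbus() always reports plain BUS_ADRERR and no longer fills si_addr_lsb for BUS_MCEERR_AR. For context, a userspace sketch of how the two reports differ when a kernel does send the machine-check variant. This assumes a libc that defines BUS_MCEERR_AR and the si_addr_lsb field, and the handler prints only for demonstration (fprintf is not async-signal-safe):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* Distinguish an ordinary bus error from a hardware-poison report.
 * BUS_MCEERR_AR carries the poisoned address; si_addr_lsb gives the
 * granularity of the report (typically the page shift).
 */
static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	if (si->si_code == BUS_MCEERR_AR)
		fprintf(stderr, "hwpoison at %p, lsb %d\n",
			si->si_addr, si->si_addr_lsb);
	else
		fprintf(stderr, "bus error at %p\n", si->si_addr);
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction = sigbus_handler,
		.sa_flags = SA_SIGINFO,
	};

	sigaction(SIGBUS, &sa, NULL);
	pause();	/* wait for a SIGBUS to arrive */
	return 0;
}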
* @dest: destination page * @src: src page - * @dest_offset: offset into 'dest' to start transaction - * @src_offset: offset into 'src' to start transaction + * @offset: offset in pages to start transaction * @len: length in bytes - * @submit: submission / completion modifiers - * - * honored flags: ASYNC_TX_ACK + * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, + * @depend_tx: memcpy depends on the result of this transaction + * @cb_fn: function to call when the memcpy completes + * @cb_param: parameter to pass to the callback routine */ struct dma_async_tx_descriptor * async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, - unsigned int src_offset, size_t len, - struct async_submit_ctl *submit) + unsigned int src_offset, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) { - struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMCPY, + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMCPY, &dest, 1, &src, 1, len); struct dma_device *device = chan ? chan->device : NULL; struct dma_async_tx_descriptor *tx = NULL; - if (device && is_dma_copy_aligned(device, src_offset, dest_offset, len)) { + if (device) { dma_addr_t dma_dest, dma_src; - unsigned long dma_prep_flags = 0; + unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0; - if (submit->cb_fn) - dma_prep_flags |= DMA_PREP_INTERRUPT; - if (submit->flags & ASYNC_TX_FENCE) - dma_prep_flags |= DMA_PREP_FENCE; dma_dest = dma_map_page(device->dev, dest, dest_offset, len, DMA_FROM_DEVICE); @@ -70,13 +67,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, if (tx) { pr_debug("%s: (async) len: %zu\n", __func__, len); - async_tx_submit(chan, tx, submit); + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); } else { void *dest_buf, *src_buf; pr_debug("%s: (sync) len: %zu\n", __func__, len); /* wait for any prerequisite operations */ - async_tx_quiesce(&submit->depend_tx); + async_tx_quiesce(&depend_tx); dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset; src_buf = kmap_atomic(src, KM_USER1) + src_offset; @@ -86,13 +83,26 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, kunmap_atomic(dest_buf, KM_USER0); kunmap_atomic(src_buf, KM_USER1); - async_tx_sync_epilog(submit); + async_tx_sync_epilog(cb_fn, cb_param); } return tx; } EXPORT_SYMBOL_GPL(async_memcpy); +static int __init async_memcpy_init(void) +{ + return 0; +} + +static void __exit async_memcpy_exit(void) +{ + do { } while (0); +} + +module_init(async_memcpy_init); +module_exit(async_memcpy_exit); + MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("asynchronous memcpy api"); MODULE_LICENSE("GPL"); diff --git a/trunk/crypto/async_tx/async_memset.c b/trunk/crypto/async_tx/async_memset.c index 58e4a8752aee..5b5eb99bb244 100644 --- a/trunk/crypto/async_tx/async_memset.c +++ b/trunk/crypto/async_tx/async_memset.c @@ -35,26 +35,26 @@ * @val: fill value * @offset: offset in pages to start transaction * @len: length in bytes - * - * honored flags: ASYNC_TX_ACK + * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: memset depends on the result of this transaction + * @cb_fn: function to call when the memcpy completes + * @cb_param: parameter to pass to the callback routine */ struct dma_async_tx_descriptor * -async_memset(struct page *dest, int val, unsigned int offset, size_t len, - struct async_submit_ctl *submit) +async_memset(struct page *dest, int val, unsigned int offset, + size_t len, enum 
async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) { - struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMSET, + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMSET, &dest, 1, NULL, 0, len); struct dma_device *device = chan ? chan->device : NULL; struct dma_async_tx_descriptor *tx = NULL; - if (device && is_dma_fill_aligned(device, offset, 0, len)) { + if (device) { dma_addr_t dma_dest; - unsigned long dma_prep_flags = 0; + unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0; - if (submit->cb_fn) - dma_prep_flags |= DMA_PREP_INTERRUPT; - if (submit->flags & ASYNC_TX_FENCE) - dma_prep_flags |= DMA_PREP_FENCE; dma_dest = dma_map_page(device->dev, dest, offset, len, DMA_FROM_DEVICE); @@ -64,25 +64,38 @@ async_memset(struct page *dest, int val, unsigned int offset, size_t len, if (tx) { pr_debug("%s: (async) len: %zu\n", __func__, len); - async_tx_submit(chan, tx, submit); + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); } else { /* run the memset synchronously */ void *dest_buf; pr_debug("%s: (sync) len: %zu\n", __func__, len); - dest_buf = page_address(dest) + offset; + dest_buf = (void *) (((char *) page_address(dest)) + offset); /* wait for any prerequisite operations */ - async_tx_quiesce(&submit->depend_tx); + async_tx_quiesce(&depend_tx); memset(dest_buf, val, len); - async_tx_sync_epilog(submit); + async_tx_sync_epilog(cb_fn, cb_param); } return tx; } EXPORT_SYMBOL_GPL(async_memset); +static int __init async_memset_init(void) +{ + return 0; +} + +static void __exit async_memset_exit(void) +{ + do { } while (0); +} + +module_init(async_memset_init); +module_exit(async_memset_exit); + MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("asynchronous memset api"); MODULE_LICENSE("GPL"); diff --git a/trunk/crypto/async_tx/async_pq.c b/trunk/crypto/async_tx/async_pq.c deleted file mode 100644 index b88db6d1dc65..000000000000 --- a/trunk/crypto/async_tx/async_pq.c +++ /dev/null @@ -1,395 +0,0 @@ -/* - * Copyright(c) 2007 Yuri Tikhonov - * Copyright(c) 2009 Intel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. 
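The async_memcpy and async_memset conversions above return those entry points to the explicit flags/depend_tx/cb_fn/cb_param calling convention. A sketch of how a caller chains two operations under that convention; the helper names, pages, and completion plumbing are hypothetical:

#include <linux/async_tx.h>
#include <linux/completion.h>

static void chain_done(void *param)
{
	complete((struct completion *)param);
}

/* Hypothetical chain: copy src -> dest, then clear a scratch page.
 * The memset depends on the memcpy; ASYNC_TX_DEP_ACK lets the API ack
 * the dependency once consumed, and the callback fires at the end.
 */
static void copy_then_clear(struct page *dest, struct page *src,
			    struct page *scratch, size_t len,
			    struct completion *cmp)
{
	struct dma_async_tx_descriptor *tx;

	/* step 1: no dependency; leave unacked so step 2 can chain on it */
	tx = async_memcpy(dest, src, 0, 0, len, 0, NULL, NULL, NULL);

	/* step 2: depends on the copy; notify when the chain completes */
	async_memset(scratch, 0, 0, len,
		     ASYNC_TX_ACK | ASYNC_TX_DEP_ACK, tx,
		     chain_done, cmp);

	async_tx_issue_pending_all();
}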
- */ -#include -#include -#include -#include -#include - -/** - * scribble - space to hold throwaway P buffer for synchronous gen_syndrome - */ -static struct page *scribble; - -static bool is_raid6_zero_block(struct page *p) -{ - return p == (void *) raid6_empty_zero_page; -} - -/* the struct page *blocks[] parameter passed to async_gen_syndrome() - * and async_syndrome_val() contains the 'P' destination address at - * blocks[disks-2] and the 'Q' destination address at blocks[disks-1] - * - * note: these are macros as they are used as lvalues - */ -#define P(b, d) (b[d-2]) -#define Q(b, d) (b[d-1]) - -/** - * do_async_gen_syndrome - asynchronously calculate P and/or Q - */ -static __async_inline struct dma_async_tx_descriptor * -do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks, - const unsigned char *scfs, unsigned int offset, int disks, - size_t len, dma_addr_t *dma_src, - struct async_submit_ctl *submit) -{ - struct dma_async_tx_descriptor *tx = NULL; - struct dma_device *dma = chan->device; - enum dma_ctrl_flags dma_flags = 0; - enum async_tx_flags flags_orig = submit->flags; - dma_async_tx_callback cb_fn_orig = submit->cb_fn; - dma_async_tx_callback cb_param_orig = submit->cb_param; - int src_cnt = disks - 2; - unsigned char coefs[src_cnt]; - unsigned short pq_src_cnt; - dma_addr_t dma_dest[2]; - int src_off = 0; - int idx; - int i; - - /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */ - if (P(blocks, disks)) - dma_dest[0] = dma_map_page(dma->dev, P(blocks, disks), offset, - len, DMA_BIDIRECTIONAL); - else - dma_flags |= DMA_PREP_PQ_DISABLE_P; - if (Q(blocks, disks)) - dma_dest[1] = dma_map_page(dma->dev, Q(blocks, disks), offset, - len, DMA_BIDIRECTIONAL); - else - dma_flags |= DMA_PREP_PQ_DISABLE_Q; - - /* convert source addresses being careful to collapse 'empty' - * sources and update the coefficients accordingly - */ - for (i = 0, idx = 0; i < src_cnt; i++) { - if (is_raid6_zero_block(blocks[i])) - continue; - dma_src[idx] = dma_map_page(dma->dev, blocks[i], offset, len, - DMA_TO_DEVICE); - coefs[idx] = scfs[i]; - idx++; - } - src_cnt = idx; - - while (src_cnt > 0) { - submit->flags = flags_orig; - pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags)); - /* if we are submitting additional pqs, leave the chain open, - * clear the callback parameters, and leave the destination - * buffers mapped - */ - if (src_cnt > pq_src_cnt) { - submit->flags &= ~ASYNC_TX_ACK; - submit->flags |= ASYNC_TX_FENCE; - dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP; - submit->cb_fn = NULL; - submit->cb_param = NULL; - } else { - dma_flags &= ~DMA_COMPL_SKIP_DEST_UNMAP; - submit->cb_fn = cb_fn_orig; - submit->cb_param = cb_param_orig; - if (cb_fn_orig) - dma_flags |= DMA_PREP_INTERRUPT; - } - if (submit->flags & ASYNC_TX_FENCE) - dma_flags |= DMA_PREP_FENCE; - - /* Since we have clobbered the src_list we are committed - * to doing this asynchronously. 
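The P()/Q() macros above fix the convention that blocks[disks-2] and blocks[disks-1] hold the parity destinations. For reference, what the deleted gen_syndrome paths compute over data blocks D_0 ... D_{n-3} (n = disks) is the usual RAID-6 pair, in the field named by async_gen_syndrome's kernel-doc below (GF(2^8), primitive polynomial 0x11d, generator {02}):

P = \bigoplus_{i=0}^{n-3} D_i, \qquad
Q = \bigoplus_{i=0}^{n-3} g^{\,i} D_i, \qquad g = \{02\} \in \mathrm{GF}(2^8)

Here \oplus is byte-wise XOR and the products are GF(2^8) multiplications; omitting P or Q simply disables one destination, which is what the DMA_PREP_PQ_DISABLE_P/Q flags select.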
Drivers force forward - * progress in case they can not provide a descriptor - */ - for (;;) { - tx = dma->device_prep_dma_pq(chan, dma_dest, - &dma_src[src_off], - pq_src_cnt, - &coefs[src_off], len, - dma_flags); - if (likely(tx)) - break; - async_tx_quiesce(&submit->depend_tx); - dma_async_issue_pending(chan); - } - - async_tx_submit(chan, tx, submit); - submit->depend_tx = tx; - - /* drop completed sources */ - src_cnt -= pq_src_cnt; - src_off += pq_src_cnt; - - dma_flags |= DMA_PREP_CONTINUE; - } - - return tx; -} - -/** - * do_sync_gen_syndrome - synchronously calculate a raid6 syndrome - */ -static void -do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, - size_t len, struct async_submit_ctl *submit) -{ - void **srcs; - int i; - - if (submit->scribble) - srcs = submit->scribble; - else - srcs = (void **) blocks; - - for (i = 0; i < disks; i++) { - if (is_raid6_zero_block(blocks[i])) { - BUG_ON(i > disks - 3); /* P or Q can't be zero */ - srcs[i] = blocks[i]; - } else - srcs[i] = page_address(blocks[i]) + offset; - } - raid6_call.gen_syndrome(disks, len, srcs); - async_tx_sync_epilog(submit); -} - -/** - * async_gen_syndrome - asynchronously calculate a raid6 syndrome - * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1 - * @offset: common offset into each block (src and dest) to start transaction - * @disks: number of blocks (including missing P or Q, see below) - * @len: length of operation in bytes - * @submit: submission/completion modifiers - * - * General note: This routine assumes a field of GF(2^8) with a - * primitive polynomial of 0x11d and a generator of {02}. - * - * 'disks' note: callers can optionally omit either P or Q (but not - * both) from the calculation by setting blocks[disks-2] or - * blocks[disks-1] to NULL. When P or Q is omitted 'len' must be <= - * PAGE_SIZE as a temporary buffer of this size is used in the - * synchronous path. 'disks' always accounts for both destination - * buffers. - * - * 'blocks' note: if submit->scribble is NULL then the contents of - * 'blocks' may be overridden - */ -struct dma_async_tx_descriptor * -async_gen_syndrome(struct page **blocks, unsigned int offset, int disks, - size_t len, struct async_submit_ctl *submit) -{ - int src_cnt = disks - 2; - struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ, - &P(blocks, disks), 2, - blocks, src_cnt, len); - struct dma_device *device = chan ? 
chan->device : NULL; - dma_addr_t *dma_src = NULL; - - BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks))); - - if (submit->scribble) - dma_src = submit->scribble; - else if (sizeof(dma_addr_t) <= sizeof(struct page *)) - dma_src = (dma_addr_t *) blocks; - - if (dma_src && device && - (src_cnt <= dma_maxpq(device, 0) || - dma_maxpq(device, DMA_PREP_CONTINUE) > 0) && - is_dma_pq_aligned(device, offset, 0, len)) { - /* run the p+q asynchronously */ - pr_debug("%s: (async) disks: %d len: %zu\n", - __func__, disks, len); - return do_async_gen_syndrome(chan, blocks, raid6_gfexp, offset, - disks, len, dma_src, submit); - } - - /* run the pq synchronously */ - pr_debug("%s: (sync) disks: %d len: %zu\n", __func__, disks, len); - - /* wait for any prerequisite operations */ - async_tx_quiesce(&submit->depend_tx); - - if (!P(blocks, disks)) { - P(blocks, disks) = scribble; - BUG_ON(len + offset > PAGE_SIZE); - } - if (!Q(blocks, disks)) { - Q(blocks, disks) = scribble; - BUG_ON(len + offset > PAGE_SIZE); - } - do_sync_gen_syndrome(blocks, offset, disks, len, submit); - - return NULL; -} -EXPORT_SYMBOL_GPL(async_gen_syndrome); - -/** - * async_syndrome_val - asynchronously validate a raid6 syndrome - * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1 - * @offset: common offset into each block (src and dest) to start transaction - * @disks: number of blocks (including missing P or Q, see below) - * @len: length of operation in bytes - * @pqres: on val failure SUM_CHECK_P_RESULT and/or SUM_CHECK_Q_RESULT are set - * @spare: temporary result buffer for the synchronous case - * @submit: submission / completion modifiers - * - * The same notes from async_gen_syndrome apply to the 'blocks', - * and 'disks' parameters of this routine. The synchronous path - * requires a temporary result buffer and submit->scribble to be - * specified. - */ -struct dma_async_tx_descriptor * -async_syndrome_val(struct page **blocks, unsigned int offset, int disks, - size_t len, enum sum_check_flags *pqres, struct page *spare, - struct async_submit_ctl *submit) -{ - struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ_VAL, - NULL, 0, blocks, disks, - len); - struct dma_device *device = chan ? chan->device : NULL; - struct dma_async_tx_descriptor *tx; - enum dma_ctrl_flags dma_flags = submit->cb_fn ? 
DMA_PREP_INTERRUPT : 0; - dma_addr_t *dma_src = NULL; - - BUG_ON(disks < 4); - - if (submit->scribble) - dma_src = submit->scribble; - else if (sizeof(dma_addr_t) <= sizeof(struct page *)) - dma_src = (dma_addr_t *) blocks; - - if (dma_src && device && disks <= dma_maxpq(device, 0) && - is_dma_pq_aligned(device, offset, 0, len)) { - struct device *dev = device->dev; - dma_addr_t *pq = &dma_src[disks-2]; - int i; - - pr_debug("%s: (async) disks: %d len: %zu\n", - __func__, disks, len); - if (!P(blocks, disks)) - dma_flags |= DMA_PREP_PQ_DISABLE_P; - if (!Q(blocks, disks)) - dma_flags |= DMA_PREP_PQ_DISABLE_Q; - if (submit->flags & ASYNC_TX_FENCE) - dma_flags |= DMA_PREP_FENCE; - for (i = 0; i < disks; i++) - if (likely(blocks[i])) { - BUG_ON(is_raid6_zero_block(blocks[i])); - dma_src[i] = dma_map_page(dev, blocks[i], - offset, len, - DMA_TO_DEVICE); - } - - for (;;) { - tx = device->device_prep_dma_pq_val(chan, pq, dma_src, - disks - 2, - raid6_gfexp, - len, pqres, - dma_flags); - if (likely(tx)) - break; - async_tx_quiesce(&submit->depend_tx); - dma_async_issue_pending(chan); - } - async_tx_submit(chan, tx, submit); - - return tx; - } else { - struct page *p_src = P(blocks, disks); - struct page *q_src = Q(blocks, disks); - enum async_tx_flags flags_orig = submit->flags; - dma_async_tx_callback cb_fn_orig = submit->cb_fn; - void *scribble = submit->scribble; - void *cb_param_orig = submit->cb_param; - void *p, *q, *s; - - pr_debug("%s: (sync) disks: %d len: %zu\n", - __func__, disks, len); - - /* caller must provide a temporary result buffer and - * allow the input parameters to be preserved - */ - BUG_ON(!spare || !scribble); - - /* wait for any prerequisite operations */ - async_tx_quiesce(&submit->depend_tx); - - /* recompute p and/or q into the temporary buffer and then - * check to see the result matches the current value - */ - tx = NULL; - *pqres = 0; - if (p_src) { - init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL, - NULL, NULL, scribble); - tx = async_xor(spare, blocks, offset, disks-2, len, submit); - async_tx_quiesce(&tx); - p = page_address(p_src) + offset; - s = page_address(spare) + offset; - *pqres |= !!memcmp(p, s, len) << SUM_CHECK_P; - } - - if (q_src) { - P(blocks, disks) = NULL; - Q(blocks, disks) = spare; - init_async_submit(submit, 0, NULL, NULL, NULL, scribble); - tx = async_gen_syndrome(blocks, offset, disks, len, submit); - async_tx_quiesce(&tx); - q = page_address(q_src) + offset; - s = page_address(spare) + offset; - *pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q; - } - - /* restore P, Q and submit */ - P(blocks, disks) = p_src; - Q(blocks, disks) = q_src; - - submit->cb_fn = cb_fn_orig; - submit->cb_param = cb_param_orig; - submit->flags = flags_orig; - async_tx_sync_epilog(submit); - - return NULL; - } -} -EXPORT_SYMBOL_GPL(async_syndrome_val); - -static int __init async_pq_init(void) -{ - scribble = alloc_page(GFP_KERNEL); - - if (scribble) - return 0; - - pr_err("%s: failed to allocate required spare page\n", __func__); - - return -ENOMEM; -} - -static void __exit async_pq_exit(void) -{ - put_page(scribble); -} - -module_init(async_pq_init); -module_exit(async_pq_exit); - -MODULE_DESCRIPTION("asynchronous raid6 syndrome generation/validation"); -MODULE_LICENSE("GPL"); diff --git a/trunk/crypto/async_tx/async_raid6_recov.c b/trunk/crypto/async_tx/async_raid6_recov.c deleted file mode 100644 index 6d73dde4786d..000000000000 --- a/trunk/crypto/async_tx/async_raid6_recov.c +++ /dev/null @@ -1,468 +0,0 @@ -/* - * Asynchronous RAID-6 recovery calculations 
ASYNC_TX API. - * Copyright(c) 2009 Intel Corporation - * - * based on raid6recov.c: - * Copyright 2002 H. Peter Anvin - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 51 - * Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ -#include -#include -#include -#include -#include - -static struct dma_async_tx_descriptor * -async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef, - size_t len, struct async_submit_ctl *submit) -{ - struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ, - &dest, 1, srcs, 2, len); - struct dma_device *dma = chan ? chan->device : NULL; - const u8 *amul, *bmul; - u8 ax, bx; - u8 *a, *b, *c; - - if (dma) { - dma_addr_t dma_dest[2]; - dma_addr_t dma_src[2]; - struct device *dev = dma->dev; - struct dma_async_tx_descriptor *tx; - enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P; - - if (submit->flags & ASYNC_TX_FENCE) - dma_flags |= DMA_PREP_FENCE; - dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL); - dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE); - dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE); - tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 2, coef, - len, dma_flags); - if (tx) { - async_tx_submit(chan, tx, submit); - return tx; - } - - /* could not get a descriptor, unmap and fall through to - * the synchronous path - */ - dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL); - dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE); - dma_unmap_page(dev, dma_src[1], len, DMA_TO_DEVICE); - } - - /* run the operation synchronously */ - async_tx_quiesce(&submit->depend_tx); - amul = raid6_gfmul[coef[0]]; - bmul = raid6_gfmul[coef[1]]; - a = page_address(srcs[0]); - b = page_address(srcs[1]); - c = page_address(dest); - - while (len--) { - ax = amul[*a++]; - bx = bmul[*b++]; - *c++ = ax ^ bx; - } - - return NULL; -} - -static struct dma_async_tx_descriptor * -async_mult(struct page *dest, struct page *src, u8 coef, size_t len, - struct async_submit_ctl *submit) -{ - struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ, - &dest, 1, &src, 1, len); - struct dma_device *dma = chan ? 
chan->device : NULL; - const u8 *qmul; /* Q multiplier table */ - u8 *d, *s; - - if (dma) { - dma_addr_t dma_dest[2]; - dma_addr_t dma_src[1]; - struct device *dev = dma->dev; - struct dma_async_tx_descriptor *tx; - enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P; - - if (submit->flags & ASYNC_TX_FENCE) - dma_flags |= DMA_PREP_FENCE; - dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL); - dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE); - tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef, - len, dma_flags); - if (tx) { - async_tx_submit(chan, tx, submit); - return tx; - } - - /* could not get a descriptor, unmap and fall through to - * the synchronous path - */ - dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL); - dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE); - } - - /* no channel available, or failed to allocate a descriptor, so - * perform the operation synchronously - */ - async_tx_quiesce(&submit->depend_tx); - qmul = raid6_gfmul[coef]; - d = page_address(dest); - s = page_address(src); - - while (len--) - *d++ = qmul[*s++]; - - return NULL; -} - -static struct dma_async_tx_descriptor * -__2data_recov_4(size_t bytes, int faila, int failb, struct page **blocks, - struct async_submit_ctl *submit) -{ - struct dma_async_tx_descriptor *tx = NULL; - struct page *p, *q, *a, *b; - struct page *srcs[2]; - unsigned char coef[2]; - enum async_tx_flags flags = submit->flags; - dma_async_tx_callback cb_fn = submit->cb_fn; - void *cb_param = submit->cb_param; - void *scribble = submit->scribble; - - p = blocks[4-2]; - q = blocks[4-1]; - - a = blocks[faila]; - b = blocks[failb]; - - /* in the 4 disk case P + Pxy == P and Q + Qxy == Q */ - /* Dx = A*(P+Pxy) + B*(Q+Qxy) */ - srcs[0] = p; - srcs[1] = q; - coef[0] = raid6_gfexi[failb-faila]; - coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]; - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble); - tx = async_sum_product(b, srcs, coef, bytes, submit); - - /* Dy = P+Pxy+Dx */ - srcs[0] = p; - srcs[1] = b; - init_async_submit(submit, flags | ASYNC_TX_XOR_ZERO_DST, tx, cb_fn, - cb_param, scribble); - tx = async_xor(a, srcs, 0, 2, bytes, submit); - - return tx; - -} - -static struct dma_async_tx_descriptor * -__2data_recov_5(size_t bytes, int faila, int failb, struct page **blocks, - struct async_submit_ctl *submit) -{ - struct dma_async_tx_descriptor *tx = NULL; - struct page *p, *q, *g, *dp, *dq; - struct page *srcs[2]; - unsigned char coef[2]; - enum async_tx_flags flags = submit->flags; - dma_async_tx_callback cb_fn = submit->cb_fn; - void *cb_param = submit->cb_param; - void *scribble = submit->scribble; - int uninitialized_var(good); - int i; - - for (i = 0; i < 3; i++) { - if (i == faila || i == failb) - continue; - else { - good = i; - break; - } - } - BUG_ON(i >= 3); - - p = blocks[5-2]; - q = blocks[5-1]; - g = blocks[good]; - - /* Compute syndrome with zero for the missing data pages - * Use the dead data pages as temporary storage for delta p and - * delta q - */ - dp = blocks[faila]; - dq = blocks[failb]; - - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble); - tx = async_memcpy(dp, g, 0, 0, bytes, submit); - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble); - tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit); - - /* compute P + Pxy */ - srcs[0] = dp; - srcs[1] = p; - init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, - NULL, NULL, scribble); - tx = async_xor(dp, srcs, 0, 2, bytes, submit); - - /* 
compute Q + Qxy */ - srcs[0] = dq; - srcs[1] = q; - init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, - NULL, NULL, scribble); - tx = async_xor(dq, srcs, 0, 2, bytes, submit); - - /* Dx = A*(P+Pxy) + B*(Q+Qxy) */ - srcs[0] = dp; - srcs[1] = dq; - coef[0] = raid6_gfexi[failb-faila]; - coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]; - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble); - tx = async_sum_product(dq, srcs, coef, bytes, submit); - - /* Dy = P+Pxy+Dx */ - srcs[0] = dp; - srcs[1] = dq; - init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn, - cb_param, scribble); - tx = async_xor(dp, srcs, 0, 2, bytes, submit); - - return tx; -} - -static struct dma_async_tx_descriptor * -__2data_recov_n(int disks, size_t bytes, int faila, int failb, - struct page **blocks, struct async_submit_ctl *submit) -{ - struct dma_async_tx_descriptor *tx = NULL; - struct page *p, *q, *dp, *dq; - struct page *srcs[2]; - unsigned char coef[2]; - enum async_tx_flags flags = submit->flags; - dma_async_tx_callback cb_fn = submit->cb_fn; - void *cb_param = submit->cb_param; - void *scribble = submit->scribble; - - p = blocks[disks-2]; - q = blocks[disks-1]; - - /* Compute syndrome with zero for the missing data pages - * Use the dead data pages as temporary storage for - * delta p and delta q - */ - dp = blocks[faila]; - blocks[faila] = (void *)raid6_empty_zero_page; - blocks[disks-2] = dp; - dq = blocks[failb]; - blocks[failb] = (void *)raid6_empty_zero_page; - blocks[disks-1] = dq; - - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble); - tx = async_gen_syndrome(blocks, 0, disks, bytes, submit); - - /* Restore pointer table */ - blocks[faila] = dp; - blocks[failb] = dq; - blocks[disks-2] = p; - blocks[disks-1] = q; - - /* compute P + Pxy */ - srcs[0] = dp; - srcs[1] = p; - init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, - NULL, NULL, scribble); - tx = async_xor(dp, srcs, 0, 2, bytes, submit); - - /* compute Q + Qxy */ - srcs[0] = dq; - srcs[1] = q; - init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, - NULL, NULL, scribble); - tx = async_xor(dq, srcs, 0, 2, bytes, submit); - - /* Dx = A*(P+Pxy) + B*(Q+Qxy) */ - srcs[0] = dp; - srcs[1] = dq; - coef[0] = raid6_gfexi[failb-faila]; - coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]; - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble); - tx = async_sum_product(dq, srcs, coef, bytes, submit); - - /* Dy = P+Pxy+Dx */ - srcs[0] = dp; - srcs[1] = dq; - init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn, - cb_param, scribble); - tx = async_xor(dp, srcs, 0, 2, bytes, submit); - - return tx; -} - -/** - * async_raid6_2data_recov - asynchronously calculate two missing data blocks - * @disks: number of disks in the RAID-6 array - * @bytes: block size - * @faila: first failed drive index - * @failb: second failed drive index - * @blocks: array of source pointers where the last two entries are p and q - * @submit: submission/completion modifiers - */ -struct dma_async_tx_descriptor * -async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb, - struct page **blocks, struct async_submit_ctl *submit) -{ - BUG_ON(faila == failb); - if (failb < faila) - swap(faila, failb); - - pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes); - - /* we need to preserve the contents of 'blocks' for the async - * case, so punt to synchronous if a scribble buffer is not available - */ - if (!submit->scribble) { - 
void **ptrs = (void **) blocks; - int i; - - async_tx_quiesce(&submit->depend_tx); - for (i = 0; i < disks; i++) - ptrs[i] = page_address(blocks[i]); - - raid6_2data_recov(disks, bytes, faila, failb, ptrs); - - async_tx_sync_epilog(submit); - - return NULL; - } - - switch (disks) { - case 4: - /* dma devices do not uniformly understand a zero source pq - * operation (in contrast to the synchronous case), so - * explicitly handle the 4 disk special case - */ - return __2data_recov_4(bytes, faila, failb, blocks, submit); - case 5: - /* dma devices do not uniformly understand a single - * source pq operation (in contrast to the synchronous - * case), so explicitly handle the 5 disk special case - */ - return __2data_recov_5(bytes, faila, failb, blocks, submit); - default: - return __2data_recov_n(disks, bytes, faila, failb, blocks, submit); - } -} -EXPORT_SYMBOL_GPL(async_raid6_2data_recov); - -/** - * async_raid6_datap_recov - asynchronously calculate a data and the 'p' block - * @disks: number of disks in the RAID-6 array - * @bytes: block size - * @faila: failed drive index - * @blocks: array of source pointers where the last two entries are p and q - * @submit: submission/completion modifiers - */ -struct dma_async_tx_descriptor * -async_raid6_datap_recov(int disks, size_t bytes, int faila, - struct page **blocks, struct async_submit_ctl *submit) -{ - struct dma_async_tx_descriptor *tx = NULL; - struct page *p, *q, *dq; - u8 coef; - enum async_tx_flags flags = submit->flags; - dma_async_tx_callback cb_fn = submit->cb_fn; - void *cb_param = submit->cb_param; - void *scribble = submit->scribble; - struct page *srcs[2]; - - pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes); - - /* we need to preserve the contents of 'blocks' for the async - * case, so punt to synchronous if a scribble buffer is not available - */ - if (!scribble) { - void **ptrs = (void **) blocks; - int i; - - async_tx_quiesce(&submit->depend_tx); - for (i = 0; i < disks; i++) - ptrs[i] = page_address(blocks[i]); - - raid6_datap_recov(disks, bytes, faila, ptrs); - - async_tx_sync_epilog(submit); - - return NULL; - } - - p = blocks[disks-2]; - q = blocks[disks-1]; - - /* Compute syndrome with zero for the missing data page - * Use the dead data page as temporary storage for delta q - */ - dq = blocks[faila]; - blocks[faila] = (void *)raid6_empty_zero_page; - blocks[disks-1] = dq; - - /* in the 4 disk case we only need to perform a single source - * multiplication - */ - if (disks == 4) { - int good = faila == 0 ? 
1 : 0; - struct page *g = blocks[good]; - - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, - scribble); - tx = async_memcpy(p, g, 0, 0, bytes, submit); - - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, - scribble); - tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit); - } else { - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, - scribble); - tx = async_gen_syndrome(blocks, 0, disks, bytes, submit); - } - - /* Restore pointer table */ - blocks[faila] = dq; - blocks[disks-1] = q; - - /* calculate g^{-faila} */ - coef = raid6_gfinv[raid6_gfexp[faila]]; - - srcs[0] = dq; - srcs[1] = q; - init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, - NULL, NULL, scribble); - tx = async_xor(dq, srcs, 0, 2, bytes, submit); - - init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble); - tx = async_mult(dq, dq, coef, bytes, submit); - - srcs[0] = p; - srcs[1] = dq; - init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn, - cb_param, scribble); - tx = async_xor(p, srcs, 0, 2, bytes, submit); - - return tx; -} -EXPORT_SYMBOL_GPL(async_raid6_datap_recov); - -MODULE_AUTHOR("Dan Williams "); -MODULE_DESCRIPTION("asynchronous RAID-6 recovery api"); -MODULE_LICENSE("GPL"); diff --git a/trunk/crypto/async_tx/async_tx.c b/trunk/crypto/async_tx/async_tx.c index f9cdf04fe7c0..06eb6cc09fef 100644 --- a/trunk/crypto/async_tx/async_tx.c +++ b/trunk/crypto/async_tx/async_tx.c @@ -42,21 +42,16 @@ static void __exit async_tx_exit(void) async_dmaengine_put(); } -module_init(async_tx_init); -module_exit(async_tx_exit); - /** * __async_tx_find_channel - find a channel to carry out the operation or let * the transaction execute synchronously - * @submit: transaction dependency and submission modifiers + * @depend_tx: transaction dependency * @tx_type: transaction type */ struct dma_chan * -__async_tx_find_channel(struct async_submit_ctl *submit, - enum dma_transaction_type tx_type) +__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, + enum dma_transaction_type tx_type) { - struct dma_async_tx_descriptor *depend_tx = submit->depend_tx; - /* see if we can keep the chain on one channel */ if (depend_tx && dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) @@ -64,6 +59,17 @@ __async_tx_find_channel(struct async_submit_ctl *submit, return async_dma_find_channel(tx_type); } EXPORT_SYMBOL_GPL(__async_tx_find_channel); +#else +static int __init async_tx_init(void) +{ + printk(KERN_INFO "async_tx: api initialized (sync-only)\n"); + return 0; +} + +static void __exit async_tx_exit(void) +{ + do { } while (0); +} #endif @@ -77,14 +83,10 @@ static void async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, struct dma_async_tx_descriptor *tx) { - struct dma_chan *chan = depend_tx->chan; - struct dma_device *device = chan->device; + struct dma_chan *chan; + struct dma_device *device; struct dma_async_tx_descriptor *intr_tx = (void *) ~0; - #ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH - BUG(); - #endif - /* first check to see if we can still append to depend_tx */ spin_lock_bh(&depend_tx->lock); if (depend_tx->parent && depend_tx->chan == tx->chan) { @@ -94,11 +96,11 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, } spin_unlock_bh(&depend_tx->lock); - /* attached dependency, flush the parent channel */ - if (!intr_tx) { - device->device_issue_pending(chan); + if (!intr_tx) return; - } + + chan = depend_tx->chan; + device = chan->device; /* see if we can schedule an interrupt * otherwise poll for completion 
@@ -132,7 +134,6 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, intr_tx->tx_submit(intr_tx); async_tx_ack(intr_tx); } - device->device_issue_pending(chan); } else { if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR) panic("%s: DMA_ERROR waiting for depend_tx\n", @@ -143,14 +144,13 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, /** - * submit_disposition - flags for routing an incoming operation + * submit_disposition - while holding depend_tx->lock we must avoid submitting + * new operations to prevent a circular locking dependency with + * drivers that already hold a channel lock when calling + * async_tx_run_dependencies. * @ASYNC_TX_SUBMITTED: we were able to append the new operation under the lock * @ASYNC_TX_CHANNEL_SWITCH: when the lock is dropped schedule a channel switch * @ASYNC_TX_DIRECT_SUBMIT: when the lock is dropped submit directly - * - * while holding depend_tx->lock we must avoid submitting new operations - * to prevent a circular locking dependency with drivers that already - * hold a channel lock when calling async_tx_run_dependencies. */ enum submit_disposition { ASYNC_TX_SUBMITTED, @@ -160,12 +160,11 @@ enum submit_disposition { void async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, - struct async_submit_ctl *submit) + enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) { - struct dma_async_tx_descriptor *depend_tx = submit->depend_tx; - - tx->callback = submit->cb_fn; - tx->callback_param = submit->cb_param; + tx->callback = cb_fn; + tx->callback_param = cb_param; if (depend_tx) { enum submit_disposition s; @@ -221,29 +220,30 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, tx->tx_submit(tx); } - if (submit->flags & ASYNC_TX_ACK) + if (flags & ASYNC_TX_ACK) async_tx_ack(tx); - if (depend_tx) + if (depend_tx && (flags & ASYNC_TX_DEP_ACK)) async_tx_ack(depend_tx); } EXPORT_SYMBOL_GPL(async_tx_submit); /** - * async_trigger_callback - schedules the callback function to be run - * @submit: submission and completion parameters - * - * honored flags: ASYNC_TX_ACK - * - * The callback is run after any dependent operations have completed. + * async_trigger_callback - schedules the callback function to be run after + * any dependent operations have been completed. 
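async_trigger_callback() likewise returns to taking its submission parameters explicitly (the restored prototype follows just below). A sketch of its typical use, turning the tail of an operation chain into a completion event; the helper names are hypothetical. Note the restored async_tx_submit() above only acks depend_tx when ASYNC_TX_DEP_ACK is set, so the caller passes it here:

#include <linux/async_tx.h>
#include <linux/completion.h>

static void notify_done(void *param)
{
	complete((struct completion *)param);
}

/* Hypothetical: run a callback once 'tx' (the tail of a chain built
 * earlier) has finished, whether it ran on a channel or synchronously.
 */
static void wait_for_chain(struct dma_async_tx_descriptor *tx,
			   struct completion *cmp)
{
	async_trigger_callback(ASYNC_TX_ACK | ASYNC_TX_DEP_ACK, tx,
			       notify_done, cmp);
	async_tx_issue_pending_all();
	wait_for_completion(cmp);
}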
+ * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: 'callback' requires the completion of this transaction + * @cb_fn: function to call after depend_tx completes + * @cb_param: parameter to pass to the callback routine */ struct dma_async_tx_descriptor * -async_trigger_callback(struct async_submit_ctl *submit) +async_trigger_callback(enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) { struct dma_chan *chan; struct dma_device *device; struct dma_async_tx_descriptor *tx; - struct dma_async_tx_descriptor *depend_tx = submit->depend_tx; if (depend_tx) { chan = depend_tx->chan; @@ -262,14 +262,14 @@ async_trigger_callback(struct async_submit_ctl *submit) if (tx) { pr_debug("%s: (async)\n", __func__); - async_tx_submit(chan, tx, submit); + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); } else { pr_debug("%s: (sync)\n", __func__); /* wait for any prerequisite operations */ - async_tx_quiesce(&submit->depend_tx); + async_tx_quiesce(&depend_tx); - async_tx_sync_epilog(submit); + async_tx_sync_epilog(cb_fn, cb_param); } return tx; @@ -295,6 +295,9 @@ void async_tx_quiesce(struct dma_async_tx_descriptor **tx) } EXPORT_SYMBOL_GPL(async_tx_quiesce); +module_init(async_tx_init); +module_exit(async_tx_exit); + MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("Asynchronous Bulk Memory Transactions API"); MODULE_LICENSE("GPL"); diff --git a/trunk/crypto/async_tx/async_xor.c b/trunk/crypto/async_tx/async_xor.c index b459a9034aac..90dd3f8bd283 100644 --- a/trunk/crypto/async_tx/async_xor.c +++ b/trunk/crypto/async_tx/async_xor.c @@ -33,16 +33,19 @@ /* do_async_xor - dma map the pages and perform the xor with an engine */ static __async_inline struct dma_async_tx_descriptor * do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, - unsigned int offset, int src_cnt, size_t len, dma_addr_t *dma_src, - struct async_submit_ctl *submit) + unsigned int offset, int src_cnt, size_t len, + enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) { struct dma_device *dma = chan->device; + dma_addr_t *dma_src = (dma_addr_t *) src_list; struct dma_async_tx_descriptor *tx = NULL; int src_off = 0; int i; - dma_async_tx_callback cb_fn_orig = submit->cb_fn; - void *cb_param_orig = submit->cb_param; - enum async_tx_flags flags_orig = submit->flags; + dma_async_tx_callback _cb_fn; + void *_cb_param; + enum async_tx_flags async_flags; enum dma_ctrl_flags dma_flags; int xor_src_cnt; dma_addr_t dma_dest; @@ -60,27 +63,25 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, } while (src_cnt) { - submit->flags = flags_orig; + async_flags = flags; dma_flags = 0; - xor_src_cnt = min(src_cnt, (int)dma->max_xor); + xor_src_cnt = min(src_cnt, dma->max_xor); /* if we are submitting additional xors, leave the chain open, * clear the callback parameters, and leave the destination * buffer mapped */ if (src_cnt > xor_src_cnt) { - submit->flags &= ~ASYNC_TX_ACK; - submit->flags |= ASYNC_TX_FENCE; + async_flags &= ~ASYNC_TX_ACK; dma_flags = DMA_COMPL_SKIP_DEST_UNMAP; - submit->cb_fn = NULL; - submit->cb_param = NULL; + _cb_fn = NULL; + _cb_param = NULL; } else { - submit->cb_fn = cb_fn_orig; - submit->cb_param = cb_param_orig; + _cb_fn = cb_fn; + _cb_param = cb_param; } - if (submit->cb_fn) + if (_cb_fn) dma_flags |= DMA_PREP_INTERRUPT; - if (submit->flags & ASYNC_TX_FENCE) - dma_flags |= DMA_PREP_FENCE; + /* Since we have clobbered 
the src_list we are committed * to doing this asynchronously. Drivers force forward progress * in case they can not provide a descriptor @@ -89,7 +90,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, xor_src_cnt, len, dma_flags); if (unlikely(!tx)) - async_tx_quiesce(&submit->depend_tx); + async_tx_quiesce(&depend_tx); /* spin wait for the preceeding transactions to complete */ while (unlikely(!tx)) { @@ -100,8 +101,11 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, dma_flags); } - async_tx_submit(chan, tx, submit); - submit->depend_tx = tx; + async_tx_submit(chan, tx, async_flags, depend_tx, _cb_fn, + _cb_param); + + depend_tx = tx; + flags |= ASYNC_TX_DEP_ACK; if (src_cnt > xor_src_cnt) { /* drop completed sources */ @@ -120,27 +124,23 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, static void do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, struct async_submit_ctl *submit) + int src_cnt, size_t len, enum async_tx_flags flags, + dma_async_tx_callback cb_fn, void *cb_param) { int i; int xor_src_cnt; int src_off = 0; void *dest_buf; - void **srcs; - - if (submit->scribble) - srcs = submit->scribble; - else - srcs = (void **) src_list; + void **srcs = (void **) src_list; - /* convert to buffer pointers */ + /* reuse the 'src_list' array to convert to buffer pointers */ for (i = 0; i < src_cnt; i++) srcs[i] = page_address(src_list[i]) + offset; /* set destination address */ dest_buf = page_address(dest) + offset; - if (submit->flags & ASYNC_TX_XOR_ZERO_DST) + if (flags & ASYNC_TX_XOR_ZERO_DST) memset(dest_buf, 0, len); while (src_cnt > 0) { @@ -153,70 +153,61 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, src_off += xor_src_cnt; } - async_tx_sync_epilog(submit); + async_tx_sync_epilog(cb_fn, cb_param); } /** * async_xor - attempt to xor a set of blocks with a dma engine. + * xor_blocks always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST + * flag must be set to not include dest data in the calculation. The + * assumption with dma eninges is that they only use the destination + * buffer as a source when it is explicity specified in the source list. * @dest: destination page - * @src_list: array of source pages - * @offset: common src/dst offset to start transaction + * @src_list: array of source pages (if the dest is also a source it must be + * at index zero). The contents of this array may be overwritten. + * @offset: offset in pages to start transaction * @src_cnt: number of source pages * @len: length in bytes - * @submit: submission / completion modifiers - * - * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST - * - * xor_blocks always uses the dest as a source so the - * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in - * the calculation. The assumption with dma eninges is that they only - * use the destination buffer as a source when it is explicity specified - * in the source list. - * - * src_list note: if the dest is also a source it must be at index zero. - * The contents of this array will be overwritten if a scribble region - * is not specified. + * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST, + * ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: xor depends on the result of this transaction. 
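The restored async_xor() interface (its prototype is completed just below) makes the destination rules explicit: the synchronous xor_blocks path always reads the dest as a source, so callers must pass ASYNC_TX_XOR_ZERO_DST to start from zeroes, or ASYNC_TX_XOR_DROP_DST when the dest is already listed at src index zero. A sketch of a parity computation under those rules; compute_parity and its arguments are hypothetical:

#include <linux/async_tx.h>

/* Hypothetical parity generation: the dest starts from zero, so its
 * stale contents must not contribute, hence ASYNC_TX_XOR_ZERO_DST.
 * cnt must be at least 2 (async_xor BUG()s on fewer sources).
 */
static struct dma_async_tx_descriptor *
compute_parity(struct page *parity, struct page **data, int cnt, size_t len)
{
	return async_xor(parity, data, 0, cnt, len,
			 ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
			 NULL, NULL, NULL);
}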
+ * @cb_fn: function to call when the xor completes + * @cb_param: parameter to pass to the callback routine */ struct dma_async_tx_descriptor * async_xor(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, struct async_submit_ctl *submit) + int src_cnt, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) { - struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR, + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR, &dest, 1, src_list, src_cnt, len); - dma_addr_t *dma_src = NULL; - BUG_ON(src_cnt <= 1); - if (submit->scribble) - dma_src = submit->scribble; - else if (sizeof(dma_addr_t) <= sizeof(struct page *)) - dma_src = (dma_addr_t *) src_list; - - if (dma_src && chan && is_dma_xor_aligned(chan->device, offset, 0, len)) { + if (chan) { /* run the xor asynchronously */ pr_debug("%s (async): len: %zu\n", __func__, len); return do_async_xor(chan, dest, src_list, offset, src_cnt, len, - dma_src, submit); + flags, depend_tx, cb_fn, cb_param); } else { /* run the xor synchronously */ pr_debug("%s (sync): len: %zu\n", __func__, len); - WARN_ONCE(chan, "%s: no space for dma address conversion\n", - __func__); /* in the sync case the dest is an implied source * (assumes the dest is the first source) */ - if (submit->flags & ASYNC_TX_XOR_DROP_DST) { + if (flags & ASYNC_TX_XOR_DROP_DST) { src_cnt--; src_list++; } /* wait for any prerequisite operations */ - async_tx_quiesce(&submit->depend_tx); + async_tx_quiesce(&depend_tx); - do_sync_xor(dest, src_list, offset, src_cnt, len, submit); + do_sync_xor(dest, src_list, offset, src_cnt, len, + flags, cb_fn, cb_param); return NULL; } @@ -231,94 +222,104 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len) } /** - * async_xor_val - attempt a xor parity check with a dma engine. + * async_xor_zero_sum - attempt a xor parity check with a dma engine. * @dest: destination page used if the xor is performed synchronously - * @src_list: array of source pages + * @src_list: array of source pages. The dest page must be listed as a source + * at index zero. The contents of this array may be overwritten. * @offset: offset in pages to start transaction * @src_cnt: number of source pages * @len: length in bytes * @result: 0 if sum == 0 else non-zero - * @submit: submission / completion modifiers - * - * honored flags: ASYNC_TX_ACK - * - * src_list note: if the dest is also a source it must be at index zero. - * The contents of this array will be overwritten if a scribble region - * is not specified. + * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: xor depends on the result of this transaction. + * @cb_fn: function to call when the xor completes + * @cb_param: parameter to pass to the callback routine */ struct dma_async_tx_descriptor * -async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, enum sum_check_flags *result, - struct async_submit_ctl *submit) +async_xor_zero_sum(struct page *dest, struct page **src_list, + unsigned int offset, int src_cnt, size_t len, + u32 *result, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) { - struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR_VAL, + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_ZERO_SUM, &dest, 1, src_list, src_cnt, len); struct dma_device *device = chan ? 
chan->device : NULL; struct dma_async_tx_descriptor *tx = NULL; - dma_addr_t *dma_src = NULL; BUG_ON(src_cnt <= 1); - if (submit->scribble) - dma_src = submit->scribble; - else if (sizeof(dma_addr_t) <= sizeof(struct page *)) - dma_src = (dma_addr_t *) src_list; - - if (dma_src && device && src_cnt <= device->max_xor && - is_dma_xor_aligned(device, offset, 0, len)) { - unsigned long dma_prep_flags = 0; + if (device && src_cnt <= device->max_xor) { + dma_addr_t *dma_src = (dma_addr_t *) src_list; + unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0; int i; pr_debug("%s: (async) len: %zu\n", __func__, len); - if (submit->cb_fn) - dma_prep_flags |= DMA_PREP_INTERRUPT; - if (submit->flags & ASYNC_TX_FENCE) - dma_prep_flags |= DMA_PREP_FENCE; for (i = 0; i < src_cnt; i++) dma_src[i] = dma_map_page(device->dev, src_list[i], offset, len, DMA_TO_DEVICE); - tx = device->device_prep_dma_xor_val(chan, dma_src, src_cnt, - len, result, - dma_prep_flags); + tx = device->device_prep_dma_zero_sum(chan, dma_src, src_cnt, + len, result, + dma_prep_flags); if (unlikely(!tx)) { - async_tx_quiesce(&submit->depend_tx); + async_tx_quiesce(&depend_tx); while (!tx) { dma_async_issue_pending(chan); - tx = device->device_prep_dma_xor_val(chan, + tx = device->device_prep_dma_zero_sum(chan, dma_src, src_cnt, len, result, dma_prep_flags); } } - async_tx_submit(chan, tx, submit); + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); } else { - enum async_tx_flags flags_orig = submit->flags; + unsigned long xor_flags = flags; pr_debug("%s: (sync) len: %zu\n", __func__, len); - WARN_ONCE(device && src_cnt <= device->max_xor, - "%s: no space for dma address conversion\n", - __func__); - submit->flags |= ASYNC_TX_XOR_DROP_DST; - submit->flags &= ~ASYNC_TX_ACK; + xor_flags |= ASYNC_TX_XOR_DROP_DST; + xor_flags &= ~ASYNC_TX_ACK; - tx = async_xor(dest, src_list, offset, src_cnt, len, submit); + tx = async_xor(dest, src_list, offset, src_cnt, len, xor_flags, + depend_tx, NULL, NULL); async_tx_quiesce(&tx); - *result = !page_is_zero(dest, offset, len) << SUM_CHECK_P; + *result = page_is_zero(dest, offset, len) ? 0 : 1; - async_tx_sync_epilog(submit); - submit->flags = flags_orig; + async_tx_sync_epilog(cb_fn, cb_param); } return tx; } -EXPORT_SYMBOL_GPL(async_xor_val); +EXPORT_SYMBOL_GPL(async_xor_zero_sum); + +static int __init async_xor_init(void) +{ + #ifdef CONFIG_ASYNC_TX_DMA + /* To conserve stack space the input src_list (array of page pointers) + * is reused to hold the array of dma addresses passed to the driver. + * This conversion is only possible when dma_addr_t is no larger than + * the size of a pointer. HIGHMEM64G is known to violate this + * assumption. + */ + BUILD_BUG_ON(sizeof(dma_addr_t) > sizeof(struct page *)); + #endif + + return 0; +} + +static void __exit async_xor_exit(void) +{ + do { } while (0); +} + +module_init(async_xor_init); +module_exit(async_xor_exit); MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("asynchronous xor/xor-zero-sum api"); diff --git a/trunk/crypto/async_tx/raid6test.c b/trunk/crypto/async_tx/raid6test.c deleted file mode 100644 index 3ec27c7e62ea..000000000000 --- a/trunk/crypto/async_tx/raid6test.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * asynchronous raid6 recovery self test - * Copyright (c) 2009, Intel Corporation. - * - * based on drivers/md/raid6test/test.c: - * Copyright 2002-2007 H.
Peter Anvin - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ -#include -#include - -#undef pr -#define pr(fmt, args...) pr_info("raid6test: " fmt, ##args) - -#define NDISKS 16 /* Including P and Q */ - -static struct page *dataptrs[NDISKS]; -static addr_conv_t addr_conv[NDISKS]; -static struct page *data[NDISKS+3]; -static struct page *spare; -static struct page *recovi; -static struct page *recovj; - -static void callback(void *param) -{ - struct completion *cmp = param; - - complete(cmp); -} - -static void makedata(int disks) -{ - int i, j; - - for (i = 0; i < disks; i++) { - for (j = 0; j < PAGE_SIZE/sizeof(u32); j += sizeof(u32)) { - u32 *p = page_address(data[i]) + j; - - *p = random32(); - } - - dataptrs[i] = data[i]; - } -} - -static char disk_type(int d, int disks) -{ - if (d == disks - 2) - return 'P'; - else if (d == disks - 1) - return 'Q'; - else - return 'D'; -} - -/* Recover two failed blocks. */ -static void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, struct page **ptrs) -{ - struct async_submit_ctl submit; - struct completion cmp; - struct dma_async_tx_descriptor *tx = NULL; - enum sum_check_flags result = ~0; - - if (faila > failb) - swap(faila, failb); - - if (failb == disks-1) { - if (faila == disks-2) { - /* P+Q failure. Just rebuild the syndrome. */ - init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv); - tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit); - } else { - struct page *blocks[disks]; - struct page *dest; - int count = 0; - int i; - - /* data+Q failure. Reconstruct data from P, - * then rebuild syndrome - */ - for (i = disks; i-- ; ) { - if (i == faila || i == failb) - continue; - blocks[count++] = ptrs[i]; - } - dest = ptrs[faila]; - init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, - NULL, NULL, addr_conv); - tx = async_xor(dest, blocks, 0, count, bytes, &submit); - - init_async_submit(&submit, 0, tx, NULL, NULL, addr_conv); - tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit); - } - } else { - if (failb == disks-2) { - /* data+P failure. */ - init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv); - tx = async_raid6_datap_recov(disks, bytes, faila, ptrs, &submit); - } else { - /* data+data failure. */ - init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv); - tx = async_raid6_2data_recov(disks, bytes, faila, failb, ptrs, &submit); - } - } - init_completion(&cmp); - init_async_submit(&submit, ASYNC_TX_ACK, tx, callback, &cmp, addr_conv); - tx = async_syndrome_val(ptrs, 0, disks, bytes, &result, spare, &submit); - async_tx_issue_pending(tx); - - if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0) - pr("%s: timeout! (faila: %d failb: %d disks: %d)\n", - __func__, faila, failb, disks); - - if (result != 0) - pr("%s: validation failure! 
faila: %d failb: %d sum_check_flags: %x\n", - __func__, faila, failb, result); -} - -static int test_disks(int i, int j, int disks) -{ - int erra, errb; - - memset(page_address(recovi), 0xf0, PAGE_SIZE); - memset(page_address(recovj), 0xba, PAGE_SIZE); - - dataptrs[i] = recovi; - dataptrs[j] = recovj; - - raid6_dual_recov(disks, PAGE_SIZE, i, j, dataptrs); - - erra = memcmp(page_address(data[i]), page_address(recovi), PAGE_SIZE); - errb = memcmp(page_address(data[j]), page_address(recovj), PAGE_SIZE); - - pr("%s(%d, %d): faila=%3d(%c) failb=%3d(%c) %s\n", - __func__, i, j, i, disk_type(i, disks), j, disk_type(j, disks), - (!erra && !errb) ? "OK" : !erra ? "ERRB" : !errb ? "ERRA" : "ERRAB"); - - dataptrs[i] = data[i]; - dataptrs[j] = data[j]; - - return erra || errb; -} - -static int test(int disks, int *tests) -{ - struct dma_async_tx_descriptor *tx; - struct async_submit_ctl submit; - struct completion cmp; - int err = 0; - int i, j; - - recovi = data[disks]; - recovj = data[disks+1]; - spare = data[disks+2]; - - makedata(disks); - - /* Nuke syndromes */ - memset(page_address(data[disks-2]), 0xee, PAGE_SIZE); - memset(page_address(data[disks-1]), 0xee, PAGE_SIZE); - - /* Generate assumed good syndrome */ - init_completion(&cmp); - init_async_submit(&submit, ASYNC_TX_ACK, NULL, callback, &cmp, addr_conv); - tx = async_gen_syndrome(dataptrs, 0, disks, PAGE_SIZE, &submit); - async_tx_issue_pending(tx); - - if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0) { - pr("error: initial gen_syndrome(%d) timed out\n", disks); - return 1; - } - - pr("testing the %d-disk case...\n", disks); - for (i = 0; i < disks-1; i++) - for (j = i+1; j < disks; j++) { - (*tests)++; - err += test_disks(i, j, disks); - } - - return err; -} - - -static int raid6_test(void) -{ - int err = 0; - int tests = 0; - int i; - - for (i = 0; i < NDISKS+3; i++) { - data[i] = alloc_page(GFP_KERNEL); - if (!data[i]) { - while (i--) - put_page(data[i]); - return -ENOMEM; - } - } - - /* the 4-disk and 5-disk cases are special for the recovery code */ - if (NDISKS > 4) - err += test(4, &tests); - if (NDISKS > 5) - err += test(5, &tests); - err += test(NDISKS, &tests); - - pr("\n"); - pr("complete (%d tests, %d failure%s)\n", - tests, err, err == 1 ? 
"" : "s"); - - for (i = 0; i < NDISKS+3; i++) - put_page(data[i]); - - return 0; -} - -static void raid6_test_exit(void) -{ -} - -/* when compiled-in wait for drivers to load first (assumes dma drivers - * are also compliled-in) - */ -late_initcall(raid6_test); -module_exit(raid6_test_exit); -MODULE_AUTHOR("Dan Williams "); -MODULE_DESCRIPTION("asynchronous RAID-6 recovery self tests"); -MODULE_LICENSE("GPL"); diff --git a/trunk/drivers/cdrom/cdrom.c b/trunk/drivers/cdrom/cdrom.c index 614da5b8613a..71d1b9bab70b 100644 --- a/trunk/drivers/cdrom/cdrom.c +++ b/trunk/drivers/cdrom/cdrom.c @@ -3412,7 +3412,7 @@ static int cdrom_print_info(const char *header, int val, char *info, return 0; } -static int cdrom_sysctl_info(ctl_table *ctl, int write, +static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int pos; @@ -3489,7 +3489,7 @@ static int cdrom_sysctl_info(ctl_table *ctl, int write, goto done; doit: mutex_unlock(&cdrom_mutex); - return proc_dostring(ctl, write, buffer, lenp, ppos); + return proc_dostring(ctl, write, filp, buffer, lenp, ppos); done: printk(KERN_INFO "cdrom: info buffer too small\n"); goto doit; @@ -3525,12 +3525,12 @@ static void cdrom_update_settings(void) mutex_unlock(&cdrom_mutex); } -static int cdrom_sysctl_handler(ctl_table *ctl, int write, +static int cdrom_sysctl_handler(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write) { diff --git a/trunk/drivers/char/Kconfig b/trunk/drivers/char/Kconfig index 08a6f50ae791..6a06913b01d3 100644 --- a/trunk/drivers/char/Kconfig +++ b/trunk/drivers/char/Kconfig @@ -1087,14 +1087,6 @@ config MMTIMER The mmtimer device allows direct userspace access to the Altix system timer. -config UV_MMTIMER - tristate "UV_MMTIMER Memory mapped RTC for SGI UV" - depends on X86_UV - default m - help - The uv_mmtimer device allows direct userspace access to the - UV system timer. - source "drivers/char/tpm/Kconfig" config TELCLOCK diff --git a/trunk/drivers/char/Makefile b/trunk/drivers/char/Makefile index 19a79dd79eee..66f779ad4f4c 100644 --- a/trunk/drivers/char/Makefile +++ b/trunk/drivers/char/Makefile @@ -58,7 +58,6 @@ obj-$(CONFIG_RAW_DRIVER) += raw.o obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o obj-$(CONFIG_MSPEC) += mspec.o obj-$(CONFIG_MMTIMER) += mmtimer.o -obj-$(CONFIG_UV_MMTIMER) += uv_mmtimer.o obj-$(CONFIG_VIOTAPE) += viotape.o obj-$(CONFIG_HVCS) += hvcs.o obj-$(CONFIG_IBM_BSR) += bsr.o diff --git a/trunk/drivers/char/bfin-otp.c b/trunk/drivers/char/bfin-otp.c index e3dd24bff514..0a01329451e4 100644 --- a/trunk/drivers/char/bfin-otp.c +++ b/trunk/drivers/char/bfin-otp.c @@ -1,7 +1,8 @@ /* * Blackfin On-Chip OTP Memory Interface + * Supports BF52x/BF54x * - * Copyright 2007-2009 Analog Devices Inc. + * Copyright 2007-2008 Analog Devices Inc. * * Enter bugs at http://blackfin.uclinux.org/ * @@ -16,10 +17,8 @@ #include #include #include -#include #include -#include #include #define stamp(fmt, args...) 
pr_debug("%s:%i: " fmt "\n", __func__, __LINE__, ## args) @@ -31,6 +30,39 @@ static DEFINE_MUTEX(bfin_otp_lock); +/* OTP Boot ROM functions */ +#define _BOOTROM_OTP_COMMAND 0xEF000018 +#define _BOOTROM_OTP_READ 0xEF00001A +#define _BOOTROM_OTP_WRITE 0xEF00001C + +static u32 (* const otp_command)(u32 command, u32 value) = (void *)_BOOTROM_OTP_COMMAND; +static u32 (* const otp_read)(u32 page, u32 flags, u64 *page_content) = (void *)_BOOTROM_OTP_READ; +static u32 (* const otp_write)(u32 page, u32 flags, u64 *page_content) = (void *)_BOOTROM_OTP_WRITE; + +/* otp_command(): defines for "command" */ +#define OTP_INIT 0x00000001 +#define OTP_CLOSE 0x00000002 + +/* otp_{read,write}(): defines for "flags" */ +#define OTP_LOWER_HALF 0x00000000 /* select upper/lower 64-bit half (bit 0) */ +#define OTP_UPPER_HALF 0x00000001 +#define OTP_NO_ECC 0x00000010 /* do not use ECC */ +#define OTP_LOCK 0x00000020 /* sets page protection bit for page */ +#define OTP_ACCESS_READ 0x00001000 +#define OTP_ACCESS_READWRITE 0x00002000 + +/* Return values for all functions */ +#define OTP_SUCCESS 0x00000000 +#define OTP_MASTER_ERROR 0x001 +#define OTP_WRITE_ERROR 0x003 +#define OTP_READ_ERROR 0x005 +#define OTP_ACC_VIO_ERROR 0x009 +#define OTP_DATA_MULT_ERROR 0x011 +#define OTP_ECC_MULT_ERROR 0x021 +#define OTP_PREV_WR_ERROR 0x041 +#define OTP_DATA_SB_WARN 0x100 +#define OTP_ECC_SB_WARN 0x200 + /** * bfin_otp_read - Read OTP pages * @@ -54,11 +86,9 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count, page = *pos / (sizeof(u64) * 2); while (bytes_done < count) { flags = (*pos % (sizeof(u64) * 2) ? OTP_UPPER_HALF : OTP_LOWER_HALF); - stamp("processing page %i (0x%x:%s)", page, flags, - (flags & OTP_UPPER_HALF ? "upper" : "lower")); - ret = bfrom_OtpRead(page, flags, &content); + stamp("processing page %i (%s)", page, (flags == OTP_UPPER_HALF ? "upper" : "lower")); + ret = otp_read(page, flags, &content); if (ret & OTP_MASTER_ERROR) { - stamp("error from otp: 0x%x", ret); bytes_done = -EIO; break; } @@ -66,7 +96,7 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count, bytes_done = -EFAULT; break; } - if (flags & OTP_UPPER_HALF) + if (flags == OTP_UPPER_HALF) ++page; bytes_done += sizeof(content); *pos += sizeof(content); @@ -78,53 +108,14 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count, } #ifdef CONFIG_BFIN_OTP_WRITE_ENABLE -static bool allow_writes; - -/** - * bfin_otp_init_timing - setup OTP timing parameters - * - * Required before doing any write operation. Algorithms from HRM. - */ -static u32 bfin_otp_init_timing(void) -{ - u32 tp1, tp2, tp3, timing; - - tp1 = get_sclk() / 1000000; - tp2 = (2 * get_sclk() / 10000000) << 8; - tp3 = (0x1401) << 15; - timing = tp1 | tp2 | tp3; - if (bfrom_OtpCommand(OTP_INIT, timing)) - return 0; - - return timing; -} - -/** - * bfin_otp_deinit_timing - set timings to only allow reads - * - * Should be called after all writes are done. - */ -static void bfin_otp_deinit_timing(u32 timing) -{ - /* mask bits [31:15] so that any attempts to write fail */ - bfrom_OtpCommand(OTP_CLOSE, 0); - bfrom_OtpCommand(OTP_INIT, timing & ~(-1 << 15)); - bfrom_OtpCommand(OTP_CLOSE, 0); -} - /** - * bfin_otp_write - write OTP pages + * bfin_otp_write - Write OTP pages * * All writes must be in half page chunks (half page == 64 bits). 
*/ static ssize_t bfin_otp_write(struct file *filp, const char __user *buff, size_t count, loff_t *pos) { - ssize_t bytes_done; - u32 timing, page, base_flags, flags, ret; - u64 content; - - if (!allow_writes) - return -EACCES; + stampit(); if (count % sizeof(u64)) return -EMSGSIZE; @@ -132,96 +123,20 @@ static ssize_t bfin_otp_write(struct file *filp, const char __user *buff, size_t if (mutex_lock_interruptible(&bfin_otp_lock)) return -ERESTARTSYS; - stampit(); - - timing = bfin_otp_init_timing(); - if (timing == 0) { - mutex_unlock(&bfin_otp_lock); - return -EIO; - } - - base_flags = OTP_CHECK_FOR_PREV_WRITE; - - bytes_done = 0; - page = *pos / (sizeof(u64) * 2); - while (bytes_done < count) { - flags = base_flags | (*pos % (sizeof(u64) * 2) ? OTP_UPPER_HALF : OTP_LOWER_HALF); - stamp("processing page %i (0x%x:%s) from %p", page, flags, - (flags & OTP_UPPER_HALF ? "upper" : "lower"), buff + bytes_done); - if (copy_from_user(&content, buff + bytes_done, sizeof(content))) { - bytes_done = -EFAULT; - break; - } - ret = bfrom_OtpWrite(page, flags, &content); - if (ret & OTP_MASTER_ERROR) { - stamp("error from otp: 0x%x", ret); - bytes_done = -EIO; - break; - } - if (flags & OTP_UPPER_HALF) - ++page; - bytes_done += sizeof(content); - *pos += sizeof(content); - } - - bfin_otp_deinit_timing(timing); + /* need otp_init() documentation before this can be implemented */ mutex_unlock(&bfin_otp_lock); - return bytes_done; -} - -static long bfin_otp_ioctl(struct file *filp, unsigned cmd, unsigned long arg) -{ - stampit(); - - switch (cmd) { - case OTPLOCK: { - u32 timing; - int ret = -EIO; - - if (!allow_writes) - return -EACCES; - - if (mutex_lock_interruptible(&bfin_otp_lock)) - return -ERESTARTSYS; - - timing = bfin_otp_init_timing(); - if (timing) { - u32 otp_result = bfrom_OtpWrite(arg, OTP_LOCK, NULL); - stamp("locking page %lu resulted in 0x%x", arg, otp_result); - if (!(otp_result & OTP_MASTER_ERROR)) - ret = 0; - - bfin_otp_deinit_timing(timing); - } - - mutex_unlock(&bfin_otp_lock); - - return ret; - } - - case MEMLOCK: - allow_writes = false; - return 0; - - case MEMUNLOCK: - allow_writes = true; - return 0; - } - return -EINVAL; } #else # define bfin_otp_write NULL -# define bfin_otp_ioctl NULL #endif static struct file_operations bfin_otp_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = bfin_otp_ioctl, - .read = bfin_otp_read, - .write = bfin_otp_write, + .owner = THIS_MODULE, + .read = bfin_otp_read, + .write = bfin_otp_write, }; static struct miscdevice bfin_otp_misc_device = { diff --git a/trunk/drivers/char/hpet.c b/trunk/drivers/char/hpet.c index 70a770ac0138..4a9f3492b921 100644 --- a/trunk/drivers/char/hpet.c +++ b/trunk/drivers/char/hpet.c @@ -166,8 +166,9 @@ static irqreturn_t hpet_interrupt(int irq, void *data) unsigned long m, t; t = devp->hd_ireqfreq; - m = read_counter(&devp->hd_timer->hpet_compare); - write_counter(t + m, &devp->hd_timer->hpet_compare); + m = read_counter(&devp->hd_hpet->hpet_mc); + write_counter(t + m + devp->hd_hpets->hp_delta, + &devp->hd_timer->hpet_compare); } if (devp->hd_flags & HPET_SHARED_IRQ) @@ -503,25 +504,21 @@ static int hpet_ioctl_ieon(struct hpet_dev *devp) g = v | Tn_32MODE_CNF_MASK | Tn_INT_ENB_CNF_MASK; if (devp->hd_flags & HPET_PERIODIC) { + write_counter(t, &timer->hpet_compare); g |= Tn_TYPE_CNF_MASK; - v |= Tn_TYPE_CNF_MASK | Tn_VAL_SET_CNF_MASK; + v |= Tn_TYPE_CNF_MASK; + writeq(v, &timer->hpet_config); + v |= Tn_VAL_SET_CNF_MASK; writeq(v, &timer->hpet_config); local_irq_save(flags); - /* - * NOTE: First we modify the hidden 
accumulator + /* NOTE: what we modify here is a hidden accumulator * register supported by periodic-capable comparators. * We never want to modify the (single) counter; that - * would affect all the comparators. The value written - * is the counter value when the first interrupt is due. + * would affect all the comparators. */ m = read_counter(&hpet->hpet_mc); write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare); - /* - * Then we modify the comparator, indicating the period - * for subsequent interrupt. - */ - write_counter(t, &timer->hpet_compare); } else { local_irq_save(flags); m = read_counter(&hpet->hpet_mc); diff --git a/trunk/drivers/char/mem.c b/trunk/drivers/char/mem.c index 6c8b65d069e5..0aede1d6a9ea 100644 --- a/trunk/drivers/char/mem.c +++ b/trunk/drivers/char/mem.c @@ -690,7 +690,7 @@ static ssize_t read_zero(struct file * file, char __user * buf, if (chunk > PAGE_SIZE) chunk = PAGE_SIZE; /* Just for latency reasons */ - unwritten = __clear_user(buf, chunk); + unwritten = clear_user(buf, chunk); written += chunk - unwritten; if (unwritten) break; diff --git a/trunk/drivers/char/mwave/mwavedd.c b/trunk/drivers/char/mwave/mwavedd.c index a4ec50c95072..94ad2c3bfc4a 100644 --- a/trunk/drivers/char/mwave/mwavedd.c +++ b/trunk/drivers/char/mwave/mwavedd.c @@ -281,6 +281,12 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd, case IOCTL_MW_REGISTER_IPC: { unsigned int ipcnum = (unsigned int) ioarg; + PRINTK_3(TRACE_MWAVE, + "mwavedd::mwave_ioctl IOCTL_MW_REGISTER_IPC" + " ipcnum %x entry usIntCount %x\n", + ipcnum, + pDrvData->IPCs[ipcnum].usIntCount); + if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) { PRINTK_ERROR(KERN_ERR_MWAVE "mwavedd::mwave_ioctl:" @@ -289,12 +295,6 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd, ipcnum); return -EINVAL; } - PRINTK_3(TRACE_MWAVE, - "mwavedd::mwave_ioctl IOCTL_MW_REGISTER_IPC" - " ipcnum %x entry usIntCount %x\n", - ipcnum, - pDrvData->IPCs[ipcnum].usIntCount); - lock_kernel(); pDrvData->IPCs[ipcnum].bIsHere = FALSE; pDrvData->IPCs[ipcnum].bIsEnabled = TRUE; @@ -310,6 +310,11 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd, case IOCTL_MW_GET_IPC: { unsigned int ipcnum = (unsigned int) ioarg; + PRINTK_3(TRACE_MWAVE, + "mwavedd::mwave_ioctl IOCTL_MW_GET_IPC" + " ipcnum %x, usIntCount %x\n", + ipcnum, + pDrvData->IPCs[ipcnum].usIntCount); if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) { PRINTK_ERROR(KERN_ERR_MWAVE "mwavedd::mwave_ioctl:" @@ -317,11 +322,6 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd, " Invalid ipcnum %x\n", ipcnum); return -EINVAL; } - PRINTK_3(TRACE_MWAVE, - "mwavedd::mwave_ioctl IOCTL_MW_GET_IPC" - " ipcnum %x, usIntCount %x\n", - ipcnum, - pDrvData->IPCs[ipcnum].usIntCount); lock_kernel(); if (pDrvData->IPCs[ipcnum].bIsEnabled == TRUE) { diff --git a/trunk/drivers/char/random.c b/trunk/drivers/char/random.c index 04b505e5a5e2..d8a9255e1a3f 100644 --- a/trunk/drivers/char/random.c +++ b/trunk/drivers/char/random.c @@ -1231,7 +1231,7 @@ static char sysctl_bootid[16]; * as an ASCII string in the standard UUID format. If accessed via the * sysctl system call, it is returned as 16 bytes of binary data.
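A usage note, not part of the patch: proc_do_uuid() is the handler behind /proc/sys/kernel/random/uuid, so a fresh UUID string can be fetched from userspace with plain file I/O. A minimal sketch:

#include <stdio.h>

int main(void)
{
	char uuid[64];
	FILE *f = fopen("/proc/sys/kernel/random/uuid", "r");

	if (!f)
		return 1;
	if (fgets(uuid, sizeof(uuid), f))
		printf("uuid: %s", uuid);	/* fgets keeps the newline */
	fclose(f);
	return 0;
}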
*/ -static int proc_do_uuid(ctl_table *table, int write, +static int proc_do_uuid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { ctl_table fake_table; @@ -1254,7 +1254,7 @@ static int proc_do_uuid(ctl_table *table, int write, fake_table.data = buf; fake_table.maxlen = sizeof(buf); - return proc_dostring(&fake_table, write, buffer, lenp, ppos); + return proc_dostring(&fake_table, write, filp, buffer, lenp, ppos); } static int uuid_strategy(ctl_table *table, diff --git a/trunk/drivers/char/rio/rioctrl.c b/trunk/drivers/char/rio/rioctrl.c index 74339559f0b9..eecee0f576d2 100644 --- a/trunk/drivers/char/rio/rioctrl.c +++ b/trunk/drivers/char/rio/rioctrl.c @@ -873,7 +873,7 @@ int riocontrol(struct rio_info *p, dev_t dev, int cmd, unsigned long arg, int su /* ** It is important that the product code is an unsigned object! */ - if (DownLoad.ProductCode >= MAX_PRODUCT) { + if (DownLoad.ProductCode > MAX_PRODUCT) { rio_dprintk(RIO_DEBUG_CTRL, "RIO_DOWNLOAD: Bad product code %d passed\n", DownLoad.ProductCode); p->RIOError.Error = NO_SUCH_PRODUCT; return -ENXIO; diff --git a/trunk/drivers/char/uv_mmtimer.c b/trunk/drivers/char/uv_mmtimer.c deleted file mode 100644 index 867b67be9f0a..000000000000 --- a/trunk/drivers/char/uv_mmtimer.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Timer device implementation for SGI UV platform. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (c) 2009 Silicon Graphics, Inc. All rights reserved. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -MODULE_AUTHOR("Dimitri Sivanich "); -MODULE_DESCRIPTION("SGI UV Memory Mapped RTC Timer"); -MODULE_LICENSE("GPL"); - -/* name of the device, usually in /dev */ -#define UV_MMTIMER_NAME "mmtimer" -#define UV_MMTIMER_DESC "SGI UV Memory Mapped RTC Timer" -#define UV_MMTIMER_VERSION "1.0" - -static long uv_mmtimer_ioctl(struct file *file, unsigned int cmd, - unsigned long arg); -static int uv_mmtimer_mmap(struct file *file, struct vm_area_struct *vma); - -/* - * Period in femtoseconds (10^-15 s) - */ -static unsigned long uv_mmtimer_femtoperiod; - -static const struct file_operations uv_mmtimer_fops = { - .owner = THIS_MODULE, - .mmap = uv_mmtimer_mmap, - .unlocked_ioctl = uv_mmtimer_ioctl, -}; - -/** - * uv_mmtimer_ioctl - ioctl interface for /dev/uv_mmtimer - * @file: file structure for the device - * @cmd: command to execute - * @arg: optional argument to command - * - * Executes the command specified by @cmd. Returns 0 for success, < 0 for - * failure. - * - * Valid commands: - * - * %MMTIMER_GETOFFSET - Should return the offset (relative to the start - * of the page where the registers are mapped) for the counter in question. - * - * %MMTIMER_GETRES - Returns the resolution of the clock in femto (10^-15) - * seconds - * - * %MMTIMER_GETFREQ - Copies the frequency of the clock in Hz to the address - * specified by @arg - * - * %MMTIMER_GETBITS - Returns the number of bits in the clock's counter - * - * %MMTIMER_MMAPAVAIL - Returns 1 if registers can be mmap'd into userspace - * - * %MMTIMER_GETCOUNTER - Gets the current value in the counter and places it - * in the address specified by @arg. 
- */ -static long uv_mmtimer_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret = 0; - - switch (cmd) { - case MMTIMER_GETOFFSET: /* offset of the counter */ - /* - * UV RTC register is on its own page - */ - if (PAGE_SIZE <= (1 << 16)) - ret = ((UV_LOCAL_MMR_BASE | UVH_RTC) & (PAGE_SIZE-1)) - / 8; - else - ret = -ENOSYS; - break; - - case MMTIMER_GETRES: /* resolution of the clock in 10^-15 s */ - if (copy_to_user((unsigned long __user *)arg, - &uv_mmtimer_femtoperiod, sizeof(unsigned long))) - ret = -EFAULT; - break; - - case MMTIMER_GETFREQ: /* frequency in Hz */ - if (copy_to_user((unsigned long __user *)arg, - &sn_rtc_cycles_per_second, - sizeof(unsigned long))) - ret = -EFAULT; - break; - - case MMTIMER_GETBITS: /* number of bits in the clock */ - ret = hweight64(UVH_RTC_REAL_TIME_CLOCK_MASK); - break; - - case MMTIMER_MMAPAVAIL: /* can we mmap the clock into userspace? */ - ret = (PAGE_SIZE <= (1 << 16)) ? 1 : 0; - break; - - case MMTIMER_GETCOUNTER: - if (copy_to_user((unsigned long __user *)arg, - (unsigned long *)uv_local_mmr_address(UVH_RTC), - sizeof(unsigned long))) - ret = -EFAULT; - break; - default: - ret = -ENOTTY; - break; - } - return ret; -} - -/** - * uv_mmtimer_mmap - maps the clock's registers into userspace - * @file: file structure for the device - * @vma: VMA to map the registers into - * - * Calls remap_pfn_range() to map the clock's registers into - * the calling process' address space. - */ -static int uv_mmtimer_mmap(struct file *file, struct vm_area_struct *vma) -{ - unsigned long uv_mmtimer_addr; - - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; - - if (vma->vm_flags & VM_WRITE) - return -EPERM; - - if (PAGE_SIZE > (1 << 16)) - return -ENOSYS; - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - uv_mmtimer_addr = UV_LOCAL_MMR_BASE | UVH_RTC; - uv_mmtimer_addr &= ~(PAGE_SIZE - 1); - uv_mmtimer_addr &= 0xfffffffffffffffUL; - - if (remap_pfn_range(vma, vma->vm_start, uv_mmtimer_addr >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - printk(KERN_ERR "remap_pfn_range failed in uv_mmtimer_mmap\n"); - return -EAGAIN; - } - - return 0; -} - -static struct miscdevice uv_mmtimer_miscdev = { - MISC_DYNAMIC_MINOR, - UV_MMTIMER_NAME, - &uv_mmtimer_fops -}; - - -/** - * uv_mmtimer_init - device initialization routine - * - * Does initial setup for the uv_mmtimer device. 
- */ -static int __init uv_mmtimer_init(void) -{ - if (!is_uv_system()) { - printk(KERN_ERR "%s: Hardware unsupported\n", UV_MMTIMER_NAME); - return -1; - } - - /* - * Sanity check the cycles/sec variable - */ - if (sn_rtc_cycles_per_second < 100000) { - printk(KERN_ERR "%s: unable to determine clock frequency\n", - UV_MMTIMER_NAME); - return -1; - } - - uv_mmtimer_femtoperiod = ((unsigned long)1E15 + - sn_rtc_cycles_per_second / 2) / - sn_rtc_cycles_per_second; - - if (misc_register(&uv_mmtimer_miscdev)) { - printk(KERN_ERR "%s: failed to register device\n", - UV_MMTIMER_NAME); - return -1; - } - - printk(KERN_INFO "%s: v%s, %ld MHz\n", UV_MMTIMER_DESC, - UV_MMTIMER_VERSION, - sn_rtc_cycles_per_second/(unsigned long)1E6); - - return 0; -} - -module_init(uv_mmtimer_init); diff --git a/trunk/drivers/dca/dca-core.c b/trunk/drivers/dca/dca-core.c index 52e6bb70a490..25b743abfb59 100644 --- a/trunk/drivers/dca/dca-core.c +++ b/trunk/drivers/dca/dca-core.c @@ -28,7 +28,7 @@ #include #include -#define DCA_VERSION "1.12.1" +#define DCA_VERSION "1.8" MODULE_VERSION(DCA_VERSION); MODULE_LICENSE("GPL"); @@ -36,92 +36,20 @@ MODULE_AUTHOR("Intel Corporation"); static DEFINE_SPINLOCK(dca_lock); -static LIST_HEAD(dca_domains); - -static struct pci_bus *dca_pci_rc_from_dev(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct pci_bus *bus = pdev->bus; - - while (bus->parent) - bus = bus->parent; - - return bus; -} - -static struct dca_domain *dca_allocate_domain(struct pci_bus *rc) -{ - struct dca_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_NOWAIT); - if (!domain) - return NULL; - - INIT_LIST_HEAD(&domain->dca_providers); - domain->pci_rc = rc; - - return domain; -} - -static void dca_free_domain(struct dca_domain *domain) -{ - list_del(&domain->node); - kfree(domain); -} - -static struct dca_domain *dca_find_domain(struct pci_bus *rc) -{ - struct dca_domain *domain; - - list_for_each_entry(domain, &dca_domains, node) - if (domain->pci_rc == rc) - return domain; - - return NULL; -} - -static struct dca_domain *dca_get_domain(struct device *dev) -{ - struct pci_bus *rc; - struct dca_domain *domain; - - rc = dca_pci_rc_from_dev(dev); - domain = dca_find_domain(rc); - - if (!domain) { - domain = dca_allocate_domain(rc); - if (domain) - list_add(&domain->node, &dca_domains); - } - - return domain; -} +static LIST_HEAD(dca_providers); static struct dca_provider *dca_find_provider_by_dev(struct device *dev) { - struct dca_provider *dca; - struct pci_bus *rc; - struct dca_domain *domain; - - if (dev) { - rc = dca_pci_rc_from_dev(dev); - domain = dca_find_domain(rc); - if (!domain) - return NULL; - } else { - if (!list_empty(&dca_domains)) - domain = list_first_entry(&dca_domains, - struct dca_domain, - node); - else - return NULL; - } + struct dca_provider *dca, *ret = NULL; - list_for_each_entry(dca, &domain->dca_providers, node) - if ((!dev) || (dca->ops->dev_managed(dca, dev))) - return dca; + list_for_each_entry(dca, &dca_providers, node) { + if ((!dev) || (dca->ops->dev_managed(dca, dev))) { + ret = dca; + break; + } + } - return NULL; + return ret; } /** @@ -133,8 +61,6 @@ int dca_add_requester(struct device *dev) struct dca_provider *dca; int err, slot = -ENODEV; unsigned long flags; - struct pci_bus *pci_rc; - struct dca_domain *domain; if (!dev) return -EFAULT; @@ -148,14 +74,7 @@ int dca_add_requester(struct device *dev) return -EEXIST; } - pci_rc = dca_pci_rc_from_dev(dev); - domain = dca_find_domain(pci_rc); - if (!domain) { - spin_unlock_irqrestore(&dca_lock, 
flags); - return -ENODEV; - } - - list_for_each_entry(dca, &domain->dca_providers, node) { + list_for_each_entry(dca, &dca_providers, node) { slot = dca->ops->add_requester(dca, dev); if (slot >= 0) break; @@ -303,19 +222,13 @@ int register_dca_provider(struct dca_provider *dca, struct device *dev) { int err; unsigned long flags; - struct dca_domain *domain; err = dca_sysfs_add_provider(dca, dev); if (err) return err; spin_lock_irqsave(&dca_lock, flags); - domain = dca_get_domain(dev); - if (!domain) { - spin_unlock_irqrestore(&dca_lock, flags); - return -ENODEV; - } - list_add(&dca->node, &domain->dca_providers); + list_add(&dca->node, &dca_providers); spin_unlock_irqrestore(&dca_lock, flags); blocking_notifier_call_chain(&dca_provider_chain, @@ -328,24 +241,15 @@ EXPORT_SYMBOL_GPL(register_dca_provider); * unregister_dca_provider - remove a dca provider * @dca - struct created by alloc_dca_provider() */ -void unregister_dca_provider(struct dca_provider *dca, struct device *dev) +void unregister_dca_provider(struct dca_provider *dca) { unsigned long flags; - struct pci_bus *pci_rc; - struct dca_domain *domain; blocking_notifier_call_chain(&dca_provider_chain, DCA_PROVIDER_REMOVE, NULL); spin_lock_irqsave(&dca_lock, flags); - list_del(&dca->node); - - pci_rc = dca_pci_rc_from_dev(dev); - domain = dca_find_domain(pci_rc); - if (list_empty(&domain->dca_providers)) - dca_free_domain(domain); - spin_unlock_irqrestore(&dca_lock, flags); dca_sysfs_remove_provider(dca); @@ -372,7 +276,7 @@ EXPORT_SYMBOL_GPL(dca_unregister_notify); static int __init dca_init(void) { - pr_info("dca service started, version %s\n", DCA_VERSION); + printk(KERN_ERR "dca service started, version %s\n", DCA_VERSION); return dca_sysfs_init(); } diff --git a/trunk/drivers/dma/Kconfig b/trunk/drivers/dma/Kconfig index 5903a88351bf..81e1020fb514 100644 --- a/trunk/drivers/dma/Kconfig +++ b/trunk/drivers/dma/Kconfig @@ -17,15 +17,11 @@ if DMADEVICES comment "DMA Devices" -config ASYNC_TX_DISABLE_CHANNEL_SWITCH - bool - config INTEL_IOATDMA tristate "Intel I/OAT DMA support" depends on PCI && X86 select DMA_ENGINE select DCA - select ASYNC_TX_DISABLE_CHANNEL_SWITCH help Enable support for the Intel(R) I/OAT DMA engine present in recent Intel Xeon chipsets. @@ -101,14 +97,6 @@ config TXX9_DMAC Support the TXx9 SoC internal DMA controller. This can be integrated in chips such as the Toshiba TX4927/38/39. -config SH_DMAE - tristate "Renesas SuperH DMAC support" - depends on SUPERH && SH_DMA - depends on !SH_DMA_API - select DMA_ENGINE - help - Enable support for the Renesas SuperH DMA controllers. - config DMA_ENGINE bool @@ -128,7 +116,7 @@ config NET_DMA config ASYNC_TX_DMA bool "Async_tx: Offload support for the async_tx api" - depends on DMA_ENGINE + depends on DMA_ENGINE && !HIGHMEM64G help This allows the async_tx api to take advantage of offload engines for memcpy, memset, xor, and raid6 p+q operations. 
If your platform has diff --git a/trunk/drivers/dma/Makefile b/trunk/drivers/dma/Makefile index eca71ba78ae9..40e1e0083571 100644 --- a/trunk/drivers/dma/Makefile +++ b/trunk/drivers/dma/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_DMATEST) += dmatest.o -obj-$(CONFIG_INTEL_IOATDMA) += ioat/ +obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o +ioatdma-objs := ioat.o ioat_dma.o ioat_dca.o obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o obj-$(CONFIG_FSL_DMA) += fsldma.o obj-$(CONFIG_MV_XOR) += mv_xor.o @@ -9,4 +10,3 @@ obj-$(CONFIG_DW_DMAC) += dw_dmac.o obj-$(CONFIG_AT_HDMAC) += at_hdmac.o obj-$(CONFIG_MX3_IPU) += ipu/ obj-$(CONFIG_TXX9_DMAC) += txx9dmac.o -obj-$(CONFIG_SH_DMAE) += shdma.o diff --git a/trunk/drivers/dma/at_hdmac.c b/trunk/drivers/dma/at_hdmac.c index 7585c4164bd5..c8522e6f1ad2 100644 --- a/trunk/drivers/dma/at_hdmac.c +++ b/trunk/drivers/dma/at_hdmac.c @@ -87,7 +87,6 @@ static struct at_desc *atc_alloc_descriptor(struct dma_chan *chan, desc = dma_pool_alloc(atdma->dma_desc_pool, gfp_flags, &phys); if (desc) { memset(desc, 0, sizeof(struct at_desc)); - INIT_LIST_HEAD(&desc->tx_list); dma_async_tx_descriptor_init(&desc->txd, chan); /* txd.flags will be overwritten in prep functions */ desc->txd.flags = DMA_CTRL_ACK; @@ -151,11 +150,11 @@ static void atc_desc_put(struct at_dma_chan *atchan, struct at_desc *desc) struct at_desc *child; spin_lock_bh(&atchan->lock); - list_for_each_entry(child, &desc->tx_list, desc_node) + list_for_each_entry(child, &desc->txd.tx_list, desc_node) dev_vdbg(chan2dev(&atchan->chan_common), "moving child desc %p to freelist\n", child); - list_splice_init(&desc->tx_list, &atchan->free_list); + list_splice_init(&desc->txd.tx_list, &atchan->free_list); dev_vdbg(chan2dev(&atchan->chan_common), "moving desc %p to freelist\n", desc); list_add(&desc->desc_node, &atchan->free_list); @@ -248,33 +247,30 @@ atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc) param = txd->callback_param; /* move children to free_list */ - list_splice_init(&desc->tx_list, &atchan->free_list); + list_splice_init(&txd->tx_list, &atchan->free_list); /* move myself to free_list */ list_move(&desc->desc_node, &atchan->free_list); /* unmap dma addresses */ - if (!atchan->chan_common.private) { - struct device *parent = chan2parent(&atchan->chan_common); - if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) - dma_unmap_single(parent, - desc->lli.daddr, - desc->len, DMA_FROM_DEVICE); - else - dma_unmap_page(parent, - desc->lli.daddr, - desc->len, DMA_FROM_DEVICE); - } - if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) - dma_unmap_single(parent, - desc->lli.saddr, - desc->len, DMA_TO_DEVICE); - else - dma_unmap_page(parent, - desc->lli.saddr, - desc->len, DMA_TO_DEVICE); - } + if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { + if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) + dma_unmap_single(chan2parent(&atchan->chan_common), + desc->lli.daddr, + desc->len, DMA_FROM_DEVICE); + else + dma_unmap_page(chan2parent(&atchan->chan_common), + desc->lli.daddr, + desc->len, DMA_FROM_DEVICE); + } + if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) + dma_unmap_single(chan2parent(&atchan->chan_common), + desc->lli.saddr, + desc->len, DMA_TO_DEVICE); + else + dma_unmap_page(chan2parent(&atchan->chan_common), + desc->lli.saddr, + desc->len, DMA_TO_DEVICE); } /* @@ -338,7 +334,7 @@ static void 
atc_cleanup_descriptors(struct at_dma_chan *atchan) /* This one is currently in progress */ return; - list_for_each_entry(child, &desc->tx_list, desc_node) + list_for_each_entry(child, &desc->txd.tx_list, desc_node) if (!(child->lli.ctrla & ATC_DONE)) /* Currently in progress */ return; @@ -411,7 +407,7 @@ static void atc_handle_error(struct at_dma_chan *atchan) dev_crit(chan2dev(&atchan->chan_common), " cookie: %d\n", bad_desc->txd.cookie); atc_dump_lli(atchan, &bad_desc->lli); - list_for_each_entry(child, &bad_desc->tx_list, desc_node) + list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) atc_dump_lli(atchan, &child->lli); /* Pretend the descriptor completed successfully */ @@ -591,7 +587,7 @@ atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, prev->lli.dscr = desc->txd.phys; /* insert the link descriptor to the LD ring */ list_add_tail(&desc->desc_node, - &first->tx_list); + &first->txd.tx_list); } prev = desc; } @@ -650,6 +646,8 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, reg_width = atslave->reg_width; + sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction); + ctrla = ATC_DEFAULT_CTRLA | atslave->ctrla; ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN; @@ -689,7 +687,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, prev->lli.dscr = desc->txd.phys; /* insert the link descriptor to the LD ring */ list_add_tail(&desc->desc_node, - &first->tx_list); + &first->txd.tx_list); } prev = desc; total_len += len; @@ -731,7 +729,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, prev->lli.dscr = desc->txd.phys; /* insert the link descriptor to the LD ring */ list_add_tail(&desc->desc_node, - &first->tx_list); + &first->txd.tx_list); } prev = desc; total_len += len; diff --git a/trunk/drivers/dma/at_hdmac_regs.h b/trunk/drivers/dma/at_hdmac_regs.h index 495457e3dc4b..4c972afc49ec 100644 --- a/trunk/drivers/dma/at_hdmac_regs.h +++ b/trunk/drivers/dma/at_hdmac_regs.h @@ -165,7 +165,6 @@ struct at_desc { struct at_lli lli; /* THEN values for driver housekeeping */ - struct list_head tx_list; struct dma_async_tx_descriptor txd; struct list_head desc_node; size_t len; diff --git a/trunk/drivers/dma/dmaengine.c b/trunk/drivers/dma/dmaengine.c index bd0b248de2cf..5a87384ea4ff 100644 --- a/trunk/drivers/dma/dmaengine.c +++ b/trunk/drivers/dma/dmaengine.c @@ -608,40 +608,6 @@ void dmaengine_put(void) } EXPORT_SYMBOL(dmaengine_put); -static bool device_has_all_tx_types(struct dma_device *device) -{ - /* A device that satisfies this test has channels that will never cause - * an async_tx channel switch event as all possible operation types can - * be handled. 
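For context on the capability tests in the removed helper whose body follows, and in dma_async_device_register() below it: a DMA driver advertises the operations it implements in device->cap_mask before registering, and dma_has_cap() keys off that mask. A minimal sketch of the driver side (illustrative function, not from this patch):

#include <linux/dmaengine.h>

static void example_advertise_caps(struct dma_device *device)
{
	/* declare which prep callbacks this device provides */
	dma_cap_zero(device->cap_mask);
	dma_cap_set(DMA_MEMCPY, device->cap_mask);
	dma_cap_set(DMA_XOR, device->cap_mask);
	dma_cap_set(DMA_INTERRUPT, device->cap_mask);
}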
- */ - #ifdef CONFIG_ASYNC_TX_DMA - if (!dma_has_cap(DMA_INTERRUPT, device->cap_mask)) - return false; - #endif - - #if defined(CONFIG_ASYNC_MEMCPY) || defined(CONFIG_ASYNC_MEMCPY_MODULE) - if (!dma_has_cap(DMA_MEMCPY, device->cap_mask)) - return false; - #endif - - #if defined(CONFIG_ASYNC_MEMSET) || defined(CONFIG_ASYNC_MEMSET_MODULE) - if (!dma_has_cap(DMA_MEMSET, device->cap_mask)) - return false; - #endif - - #if defined(CONFIG_ASYNC_XOR) || defined(CONFIG_ASYNC_XOR_MODULE) - if (!dma_has_cap(DMA_XOR, device->cap_mask)) - return false; - #endif - - #if defined(CONFIG_ASYNC_PQ) || defined(CONFIG_ASYNC_PQ_MODULE) - if (!dma_has_cap(DMA_PQ, device->cap_mask)) - return false; - #endif - - return true; -} - static int get_dma_id(struct dma_device *device) { int rc; @@ -678,12 +644,8 @@ int dma_async_device_register(struct dma_device *device) !device->device_prep_dma_memcpy); BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) && !device->device_prep_dma_xor); - BUG_ON(dma_has_cap(DMA_XOR_VAL, device->cap_mask) && - !device->device_prep_dma_xor_val); - BUG_ON(dma_has_cap(DMA_PQ, device->cap_mask) && - !device->device_prep_dma_pq); - BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) && - !device->device_prep_dma_pq_val); + BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) && + !device->device_prep_dma_zero_sum); BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) && !device->device_prep_dma_memset); BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) && @@ -699,12 +661,6 @@ int dma_async_device_register(struct dma_device *device) BUG_ON(!device->device_issue_pending); BUG_ON(!device->dev); - /* note: this only matters in the - * CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH=y case - */ - if (device_has_all_tx_types(device)) - dma_cap_set(DMA_ASYNC_TX, device->cap_mask); - idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); if (!idr_ref) return -ENOMEM; @@ -977,29 +933,55 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, { tx->chan = chan; spin_lock_init(&tx->lock); + INIT_LIST_HEAD(&tx->tx_list); } EXPORT_SYMBOL(dma_async_tx_descriptor_init); /* dma_wait_for_async_tx - spin wait for a transaction to complete * @tx: in-flight transaction to wait on + * + * This routine assumes that tx was obtained from a call to async_memcpy, + * async_xor, async_memset, etc., which ensures that tx is "in-flight" (prepped + * and submitted). Walking the parent chain is only meant to cover for DMA + * drivers that do not implement the DMA_INTERRUPT capability and may race with + * the driver's descriptor cleanup routine.
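A typical caller of dma_wait_for_async_tx() is the async_tx quiesce path; a simplified sketch of that pattern (modeled on async_tx_quiesce(), abbreviated here):

static void example_quiesce(struct dma_async_tx_descriptor **tx)
{
	if (*tx) {
		/* spin until the prerequisite operation completes */
		if (dma_wait_for_async_tx(*tx) == DMA_ERROR)
			panic("%s: DMA_ERROR waiting for depend_tx\n",
			      __func__);
		/* allow the driver to recycle the descriptor */
		async_tx_ack(*tx);
		*tx = NULL;
	}
}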
*/ enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) { - unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000); + enum dma_status status; + struct dma_async_tx_descriptor *iter; + struct dma_async_tx_descriptor *parent; if (!tx) return DMA_SUCCESS; - while (tx->cookie == -EBUSY) { - if (time_after_eq(jiffies, dma_sync_wait_timeout)) { - pr_err("%s timeout waiting for descriptor submission\n", - __func__); - return DMA_ERROR; - } - cpu_relax(); - } - return dma_sync_wait(tx->chan, tx->cookie); + WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for" + " %s\n", __func__, dma_chan_name(tx->chan)); + + /* poll through the dependency chain, return when tx is complete */ + do { + iter = tx; + + /* find the root of the unsubmitted dependency chain */ + do { + parent = iter->parent; + if (!parent) + break; + else + iter = parent; + } while (parent); + + /* there is a small window for ->parent == NULL and + * ->cookie == -EBUSY + */ + while (iter->cookie == -EBUSY) + cpu_relax(); + + status = dma_sync_wait(iter->chan, iter->cookie); + } while (status == DMA_IN_PROGRESS || (iter != tx)); + + return status; } EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); diff --git a/trunk/drivers/dma/dmatest.c b/trunk/drivers/dma/dmatest.c index a32a4cf7b1e0..d93017fc7872 100644 --- a/trunk/drivers/dma/dmatest.c +++ b/trunk/drivers/dma/dmatest.c @@ -48,11 +48,6 @@ module_param(xor_sources, uint, S_IRUGO); MODULE_PARM_DESC(xor_sources, "Number of xor source buffers (default: 3)"); -static unsigned int pq_sources = 3; -module_param(pq_sources, uint, S_IRUGO); -MODULE_PARM_DESC(pq_sources, - "Number of p+q source buffers (default: 3)"); - /* * Initialization patterns. All bytes in the source buffer have bit 7 * set, all bytes in the destination buffer have bit 7 cleared.
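The pattern scheme described in that comment fits in a few lines. A simplified sketch (u8 from <linux/types.h>; the real dmatest additionally marks the copied region and fills the areas around the transfer window):

static void example_init_patterns(u8 *src, u8 *dst, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i++) {
		src[i] = 0x80 | (~i & 0x1f);	/* bit 7 set + position */
		dst[i] = ~i & 0x1f;		/* bit 7 clear + position */
	}
}

Because every byte also encodes its inverted position, a verification pass can report not just that a byte is wrong but where it came from.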
@@ -237,7 +232,6 @@ static int dmatest_func(void *data) dma_cookie_t cookie; enum dma_status status; enum dma_ctrl_flags flags; - u8 pq_coefs[pq_sources]; int ret; int src_cnt; int dst_cnt; @@ -254,11 +248,6 @@ static int dmatest_func(void *data) else if (thread->type == DMA_XOR) { src_cnt = xor_sources | 1; /* force odd to ensure dst = src */ dst_cnt = 1; - } else if (thread->type == DMA_PQ) { - src_cnt = pq_sources | 1; /* force odd to ensure dst = src */ - dst_cnt = 2; - for (i = 0; i < pq_sources; i++) - pq_coefs[i] = 1; } else goto err_srcs; @@ -294,7 +283,6 @@ static int dmatest_func(void *data) dma_addr_t dma_dsts[dst_cnt]; struct completion cmp; unsigned long tmo = msecs_to_jiffies(3000); - u8 align = 0; total_tests++; @@ -302,18 +290,6 @@ static int dmatest_func(void *data) src_off = dmatest_random() % (test_buf_size - len + 1); dst_off = dmatest_random() % (test_buf_size - len + 1); - /* honor alignment restrictions */ - if (thread->type == DMA_MEMCPY) - align = dev->copy_align; - else if (thread->type == DMA_XOR) - align = dev->xor_align; - else if (thread->type == DMA_PQ) - align = dev->pq_align; - - len = (len >> align) << align; - src_off = (src_off >> align) << align; - dst_off = (dst_off >> align) << align; - dmatest_init_srcs(thread->srcs, src_off, len); dmatest_init_dsts(thread->dsts, dst_off, len); @@ -330,7 +306,6 @@ static int dmatest_func(void *data) DMA_BIDIRECTIONAL); } - if (thread->type == DMA_MEMCPY) tx = dev->device_prep_dma_memcpy(chan, dma_dsts[0] + dst_off, @@ -341,15 +316,6 @@ static int dmatest_func(void *data) dma_dsts[0] + dst_off, dma_srcs, xor_sources, len, flags); - else if (thread->type == DMA_PQ) { - dma_addr_t dma_pq[dst_cnt]; - - for (i = 0; i < dst_cnt; i++) - dma_pq[i] = dma_dsts[i] + dst_off; - tx = dev->device_prep_dma_pq(chan, dma_pq, dma_srcs, - pq_sources, pq_coefs, - len, flags); - } if (!tx) { for (i = 0; i < src_cnt; i++) @@ -493,8 +459,6 @@ static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_ty op = "copy"; else if (type == DMA_XOR) op = "xor"; - else if (type == DMA_PQ) - op = "pq"; else return -EINVAL; @@ -550,10 +514,6 @@ static int dmatest_add_channel(struct dma_chan *chan) cnt = dmatest_add_threads(dtc, DMA_XOR); thread_count += cnt > 0 ? 
cnt : 0; } - if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) { - cnt = dmatest_add_threads(dtc, DMA_PQ); - thread_count += cnt > 0 ?: 0; - } pr_info("dmatest: Started %u threads using %s\n", thread_count, dma_chan_name(chan)); diff --git a/trunk/drivers/dma/dw_dmac.c b/trunk/drivers/dma/dw_dmac.c index 2eea823516a7..933c143b6a74 100644 --- a/trunk/drivers/dma/dw_dmac.c +++ b/trunk/drivers/dma/dw_dmac.c @@ -116,7 +116,7 @@ static void dwc_sync_desc_for_cpu(struct dw_dma_chan *dwc, struct dw_desc *desc) { struct dw_desc *child; - list_for_each_entry(child, &desc->tx_list, desc_node) + list_for_each_entry(child, &desc->txd.tx_list, desc_node) dma_sync_single_for_cpu(chan2parent(&dwc->chan), child->txd.phys, sizeof(child->lli), DMA_TO_DEVICE); @@ -137,11 +137,11 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc) dwc_sync_desc_for_cpu(dwc, desc); spin_lock_bh(&dwc->lock); - list_for_each_entry(child, &desc->tx_list, desc_node) + list_for_each_entry(child, &desc->txd.tx_list, desc_node) dev_vdbg(chan2dev(&dwc->chan), "moving child desc %p to freelist\n", child); - list_splice_init(&desc->tx_list, &dwc->free_list); + list_splice_init(&desc->txd.tx_list, &dwc->free_list); dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc); list_add(&desc->desc_node, &dwc->free_list); spin_unlock_bh(&dwc->lock); @@ -209,28 +209,19 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc) param = txd->callback_param; dwc_sync_desc_for_cpu(dwc, desc); - list_splice_init(&desc->tx_list, &dwc->free_list); + list_splice_init(&txd->tx_list, &dwc->free_list); list_move(&desc->desc_node, &dwc->free_list); - if (!dwc->chan.private) { - struct device *parent = chan2parent(&dwc->chan); - if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) - dma_unmap_single(parent, desc->lli.dar, - desc->len, DMA_FROM_DEVICE); - else - dma_unmap_page(parent, desc->lli.dar, - desc->len, DMA_FROM_DEVICE); - } - if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) - dma_unmap_single(parent, desc->lli.sar, - desc->len, DMA_TO_DEVICE); - else - dma_unmap_page(parent, desc->lli.sar, - desc->len, DMA_TO_DEVICE); - } - } + /* + * We use dma_unmap_page() regardless of how the buffers were + * mapped before they were submitted... 
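The completion-time unmap above is what the DMA_COMPL_SKIP_* control flags exist to suppress. A hedged sketch of a client that maps its own buffers and therefore opts out of both unmaps (illustrative function; the flags and the prep callback are from <linux/dmaengine.h>):

static dma_cookie_t example_memcpy(struct dma_chan *chan,
				   dma_addr_t dst, dma_addr_t src, size_t len)
{
	enum dma_ctrl_flags flags = DMA_CTRL_ACK |
				    DMA_COMPL_SKIP_DEST_UNMAP |
				    DMA_COMPL_SKIP_SRC_UNMAP;
	struct dma_async_tx_descriptor *tx;

	/* the caller keeps ownership of the mappings for dst and src */
	tx = chan->device->device_prep_dma_memcpy(chan, dst, src, len, flags);
	if (!tx)
		return -ENOMEM;
	return tx->tx_submit(tx);
}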
+ */ + if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) + dma_unmap_page(chan2parent(&dwc->chan), desc->lli.dar, + desc->len, DMA_FROM_DEVICE); + if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) + dma_unmap_page(chan2parent(&dwc->chan), desc->lli.sar, + desc->len, DMA_TO_DEVICE); /* * The API requires that no submissions are done from a @@ -298,7 +289,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) /* This one is currently in progress */ return; - list_for_each_entry(child, &desc->tx_list, desc_node) + list_for_each_entry(child, &desc->txd.tx_list, desc_node) if (child->lli.llp == llp) /* Currently in progress */ return; @@ -365,7 +356,7 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc) dev_printk(KERN_CRIT, chan2dev(&dwc->chan), " cookie: %d\n", bad_desc->txd.cookie); dwc_dump_lli(dwc, &bad_desc->lli); - list_for_each_entry(child, &bad_desc->tx_list, desc_node) + list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) dwc_dump_lli(dwc, &child->lli); /* Pretend the descriptor completed successfully */ @@ -617,7 +608,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); list_add_tail(&desc->desc_node, - &first->tx_list); + &first->txd.tx_list); } prev = desc; } @@ -667,6 +658,8 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, reg_width = dws->reg_width; prev = first = NULL; + sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction); + switch (direction) { case DMA_TO_DEVICE: ctllo = (DWC_DEFAULT_CTLLO @@ -707,7 +700,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, sizeof(prev->lli), DMA_TO_DEVICE); list_add_tail(&desc->desc_node, - &first->tx_list); + &first->txd.tx_list); } prev = desc; total_len += len; @@ -753,7 +746,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, sizeof(prev->lli), DMA_TO_DEVICE); list_add_tail(&desc->desc_node, - &first->tx_list); + &first->txd.tx_list); } prev = desc; total_len += len; @@ -909,7 +902,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) break; } - INIT_LIST_HEAD(&desc->tx_list); dma_async_tx_descriptor_init(&desc->txd, chan); desc->txd.tx_submit = dwc_tx_submit; desc->txd.flags = DMA_CTRL_ACK; diff --git a/trunk/drivers/dma/dw_dmac_regs.h b/trunk/drivers/dma/dw_dmac_regs.h index d9a939f67f46..13a580767031 100644 --- a/trunk/drivers/dma/dw_dmac_regs.h +++ b/trunk/drivers/dma/dw_dmac_regs.h @@ -217,7 +217,6 @@ struct dw_desc { /* THEN values for driver housekeeping */ struct list_head desc_node; - struct list_head tx_list; struct dma_async_tx_descriptor txd; size_t len; }; diff --git a/trunk/drivers/dma/fsldma.c b/trunk/drivers/dma/fsldma.c index 296f9e747fac..ef87a8984145 100644 --- a/trunk/drivers/dma/fsldma.c +++ b/trunk/drivers/dma/fsldma.c @@ -34,7 +34,6 @@ #include #include -#include #include "fsldma.h" static void dma_init(struct fsl_dma_chan *fsl_chan) @@ -280,41 +279,29 @@ static void fsl_chan_set_dest_loop_size(struct fsl_dma_chan *fsl_chan, int size) } } -/** - * fsl_chan_set_request_count - Set DMA Request Count for external control - * @fsl_chan : Freescale DMA channel - * @size : Number of bytes to transfer in a single request - * - * The Freescale DMA channel can be controlled by the external signal DREQ#. - * The DMA request count is how many bytes are allowed to transfer before - * pausing the channel, after which a new assertion of DREQ# resumes channel - * operation. - * - * A size of 0 disables external pause control. 
The maximum size is 1024. - */ -static void fsl_chan_set_request_count(struct fsl_dma_chan *fsl_chan, int size) -{ - BUG_ON(size > 1024); - DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, - DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) - | ((__ilog2(size) << 24) & 0x0f000000), - 32); -} - /** * fsl_chan_toggle_ext_pause - Toggle channel external pause status * @fsl_chan : Freescale DMA channel - * @enable : 0 is disabled, 1 is enabled. + * @size : Pause control size, 0 for disable external pause control. + * The maximum is 1024. * - * The Freescale DMA channel can be controlled by the external signal DREQ#. - * The DMA Request Count feature should be used in addition to this feature - * to set the number of bytes to transfer before pausing the channel. + * The Freescale DMA channel can be controlled by the external + * signal DREQ#. The pause control size is how many bytes are allowed + * to transfer before pausing the channel, after which a new assertion + * of DREQ# resumes channel operation. */ -static void fsl_chan_toggle_ext_pause(struct fsl_dma_chan *fsl_chan, int enable) +static void fsl_chan_toggle_ext_pause(struct fsl_dma_chan *fsl_chan, int size) { - if (enable) + if (size > 1024) + return; + + if (size) { + DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, + DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) + | ((__ilog2(size) << 24) & 0x0f000000), + 32); fsl_chan->feature |= FSL_DMA_CHAN_PAUSE_EXT; - else + } else fsl_chan->feature &= ~FSL_DMA_CHAN_PAUSE_EXT; } @@ -339,8 +326,7 @@ static void fsl_chan_toggle_ext_start(struct fsl_dma_chan *fsl_chan, int enable) static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx) { struct fsl_dma_chan *fsl_chan = to_fsl_chan(tx->chan); - struct fsl_desc_sw *desc = tx_to_fsl_desc(tx); - struct fsl_desc_sw *child; + struct fsl_desc_sw *desc; unsigned long flags; dma_cookie_t cookie; @@ -348,7 +334,7 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx) spin_lock_irqsave(&fsl_chan->desc_lock, flags); cookie = fsl_chan->common.cookie; - list_for_each_entry(child, &desc->tx_list, node) { + list_for_each_entry(desc, &tx->tx_list, node) { cookie++; if (cookie < 0) cookie = 1; @@ -357,8 +343,8 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx) } fsl_chan->common.cookie = cookie; - append_ld_queue(fsl_chan, desc); - list_splice_init(&desc->tx_list, fsl_chan->ld_queue.prev); + append_ld_queue(fsl_chan, tx_to_fsl_desc(tx)); + list_splice_init(&tx->tx_list, fsl_chan->ld_queue.prev); spin_unlock_irqrestore(&fsl_chan->desc_lock, flags); @@ -380,7 +366,6 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor( desc_sw = dma_pool_alloc(fsl_chan->desc_pool, GFP_ATOMIC, &pdesc); if (desc_sw) { memset(desc_sw, 0, sizeof(struct fsl_desc_sw)); - INIT_LIST_HEAD(&desc_sw->tx_list); dma_async_tx_descriptor_init(&desc_sw->async_tx, &fsl_chan->common); desc_sw->async_tx.tx_submit = fsl_dma_tx_submit; @@ -470,7 +455,7 @@ fsl_dma_prep_interrupt(struct dma_chan *chan, unsigned long flags) new->async_tx.flags = flags; /* Insert the link descriptor to the LD ring */ - list_add_tail(&new->node, &new->tx_list); + list_add_tail(&new->node, &new->async_tx.tx_list); /* Set End-of-link to the last link descriptor of new list*/ set_ld_eol(fsl_chan, new); @@ -528,7 +513,7 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy( dma_dest += copy; /* Insert the link descriptor to the LD ring */ - list_add_tail(&new->node, &first->tx_list); + list_add_tail(&new->node, &first->async_tx.tx_list); } while (len); new->async_tx.flags 
= flags; /* client is in control of this ack */ @@ -543,7 +528,7 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy( if (!first) return NULL; - list = &first->tx_list; + list = &first->async_tx.tx_list; list_for_each_entry_safe_reverse(new, prev, list, node) { list_del(&new->node); dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys); @@ -552,229 +537,6 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy( return NULL; } -/** - * fsl_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction - * @chan: DMA channel - * @sgl: scatterlist to transfer to/from - * @sg_len: number of entries in @scatterlist - * @direction: DMA direction - * @flags: DMAEngine flags - * - * Prepare a set of descriptors for a DMA_SLAVE transaction. Following the - * DMA_SLAVE API, this gets the device-specific information from the - * chan->private variable. - */ -static struct dma_async_tx_descriptor *fsl_dma_prep_slave_sg( - struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, - enum dma_data_direction direction, unsigned long flags) -{ - struct fsl_dma_chan *fsl_chan; - struct fsl_desc_sw *first = NULL, *prev = NULL, *new = NULL; - struct fsl_dma_slave *slave; - struct list_head *tx_list; - size_t copy; - - int i; - struct scatterlist *sg; - size_t sg_used; - size_t hw_used; - struct fsl_dma_hw_addr *hw; - dma_addr_t dma_dst, dma_src; - - if (!chan) - return NULL; - - if (!chan->private) - return NULL; - - fsl_chan = to_fsl_chan(chan); - slave = chan->private; - - if (list_empty(&slave->addresses)) - return NULL; - - hw = list_first_entry(&slave->addresses, struct fsl_dma_hw_addr, entry); - hw_used = 0; - - /* - * Build the hardware transaction to copy from the scatterlist to - * the hardware, or from the hardware to the scatterlist - * - * If you are copying from the hardware to the scatterlist and it - * takes two hardware entries to fill an entire page, then both - * hardware entries will be coalesced into the same page - * - * If you are copying from the scatterlist to the hardware and a - * single page can fill two hardware entries, then the data will - * be read out of the page into the first hardware entry, and so on - */ - for_each_sg(sgl, sg, sg_len, i) { - sg_used = 0; - - /* Loop until the entire scatterlist entry is used */ - while (sg_used < sg_dma_len(sg)) { - - /* - * If we've used up the current hardware address/length - * pair, we need to load a new one - * - * This is done in a while loop so that descriptors with - * length == 0 will be skipped - */ - while (hw_used >= hw->length) { - - /* - * If the current hardware entry is the last - * entry in the list, we're finished - */ - if (list_is_last(&hw->entry, &slave->addresses)) - goto finished; - - /* Get the next hardware address/length pair */ - hw = list_entry(hw->entry.next, - struct fsl_dma_hw_addr, entry); - hw_used = 0; - } - - /* Allocate the link descriptor from DMA pool */ - new = fsl_dma_alloc_descriptor(fsl_chan); - if (!new) { - dev_err(fsl_chan->dev, "No free memory for " - "link descriptor\n"); - goto fail; - } -#ifdef FSL_DMA_LD_DEBUG - dev_dbg(fsl_chan->dev, "new link desc alloc %p\n", new); -#endif - - /* - * Calculate the maximum number of bytes to transfer, - * making sure it is less than the DMA controller limit - */ - copy = min_t(size_t, sg_dma_len(sg) - sg_used, - hw->length - hw_used); - copy = min_t(size_t, copy, FSL_DMA_BCR_MAX_CNT); - - /* - * DMA_FROM_DEVICE - * from the hardware to the scatterlist - * - * DMA_TO_DEVICE - * from the scatterlist to the hardware - */ 
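As an aside, the (removed) fsl_dma_prep_slave_sg() walk here reduces to two cursors, sg_used and hw_used, advancing in lock-step, with one descriptor emitted per min(remaining) chunk and each chunk clamped to the controller limit. A minimal standalone C sketch of that chunking, with hypothetical sizes rather than real scatterlist entries:

#include <stddef.h>
#include <stdio.h>

#define BCR_MAX 4	/* stand-in for FSL_DMA_BCR_MAX_CNT */

static size_t min_sz(size_t a, size_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	size_t sg_len = 10, hw_len = 7;	/* one sg entry, one hw window */
	size_t sg_used = 0, hw_used = 0;

	while (sg_used < sg_len && hw_used < hw_len) {
		size_t copy = min_sz(sg_len - sg_used, hw_len - hw_used);

		copy = min_sz(copy, BCR_MAX);	/* controller ceiling */
		printf("descriptor: %zu bytes (sg offset %zu, hw offset %zu)\n",
		       copy, sg_used, hw_used);
		sg_used += copy;
		hw_used += copy;
	}
	return 0;
}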
- if (direction == DMA_FROM_DEVICE) { - dma_src = hw->address + hw_used; - dma_dst = sg_dma_address(sg) + sg_used; - } else { - dma_src = sg_dma_address(sg) + sg_used; - dma_dst = hw->address + hw_used; - } - - /* Fill in the descriptor */ - set_desc_cnt(fsl_chan, &new->hw, copy); - set_desc_src(fsl_chan, &new->hw, dma_src); - set_desc_dest(fsl_chan, &new->hw, dma_dst); - - /* - * If this is not the first descriptor, chain the - * current descriptor after the previous descriptor - */ - if (!first) { - first = new; - } else { - set_desc_next(fsl_chan, &prev->hw, - new->async_tx.phys); - } - - new->async_tx.cookie = 0; - async_tx_ack(&new->async_tx); - - prev = new; - sg_used += copy; - hw_used += copy; - - /* Insert the link descriptor into the LD ring */ - list_add_tail(&new->node, &first->tx_list); - } - } - -finished: - - /* All of the hardware address/length pairs had length == 0 */ - if (!first || !new) - return NULL; - - new->async_tx.flags = flags; - new->async_tx.cookie = -EBUSY; - - /* Set End-of-link to the last link descriptor of new list */ - set_ld_eol(fsl_chan, new); - - /* Enable extra controller features */ - if (fsl_chan->set_src_loop_size) - fsl_chan->set_src_loop_size(fsl_chan, slave->src_loop_size); - - if (fsl_chan->set_dest_loop_size) - fsl_chan->set_dest_loop_size(fsl_chan, slave->dst_loop_size); - - if (fsl_chan->toggle_ext_start) - fsl_chan->toggle_ext_start(fsl_chan, slave->external_start); - - if (fsl_chan->toggle_ext_pause) - fsl_chan->toggle_ext_pause(fsl_chan, slave->external_pause); - - if (fsl_chan->set_request_count) - fsl_chan->set_request_count(fsl_chan, slave->request_count); - - return &first->async_tx; - -fail: - /* If first was not set, then we failed to allocate the very first - * descriptor, and we're done */ - if (!first) - return NULL; - - /* - * First is set, so all of the descriptors we allocated have been added - * to first->tx_list, INCLUDING "first" itself. Therefore we - * must traverse the list backwards freeing each descriptor in turn - * - * We're re-using variables for the loop, oh well - */ - tx_list = &first->tx_list; - list_for_each_entry_safe_reverse(new, prev, tx_list, node) { - list_del_init(&new->node); - dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys); - } - - return NULL; -} - -static void fsl_dma_device_terminate_all(struct dma_chan *chan) -{ - struct fsl_dma_chan *fsl_chan; - struct fsl_desc_sw *desc, *tmp; - unsigned long flags; - - if (!chan) - return; - - fsl_chan = to_fsl_chan(chan); - - /* Halt the DMA engine */ - dma_halt(fsl_chan); - - spin_lock_irqsave(&fsl_chan->desc_lock, flags); - - /* Remove and free all of the descriptors in the LD queue */ - list_for_each_entry_safe(desc, tmp, &fsl_chan->ld_queue, node) { - list_del(&desc->node); - dma_pool_free(fsl_chan->desc_pool, desc, desc->async_tx.phys); - } - - spin_unlock_irqrestore(&fsl_chan->desc_lock, flags); -} - /** * fsl_dma_update_completed_cookie - Update the completed cookie. 
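Before the completed-cookie helper continues below, a note on the cookie scheme used by fsl_dma_tx_submit() above: each submitted descriptor gets the next positive cookie, and the counter skips non-positive values on wraparound because dmaengine reserves those for errors. A standalone sketch of the idiom (the kernel relies on -fno-strict-overflow and simply wraps; this version sidesteps signed overflow explicitly):

#include <limits.h>
#include <stdio.h>

typedef int dma_cookie_t;

static dma_cookie_t next_cookie(dma_cookie_t last)
{
	if (last >= INT_MAX || last < 0)
		return 1;	/* cookies stay positive after wraparound */
	return last + 1;
}

int main(void)
{
	dma_cookie_t cookie = INT_MAX - 1;

	for (int i = 0; i < 4; i++) {
		cookie = next_cookie(cookie);
		printf("cookie = %d\n", cookie);	/* INT_MAX, 1, 2, 3 */
	}
	return 0;
}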
* @fsl_chan : Freescale DMA channel @@ -1121,7 +883,6 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev, new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start; new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size; new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size; - new_fsl_chan->set_request_count = fsl_chan_set_request_count; } spin_lock_init(&new_fsl_chan->desc_lock); @@ -1201,15 +962,12 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev, dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask); dma_cap_set(DMA_INTERRUPT, fdev->common.cap_mask); - dma_cap_set(DMA_SLAVE, fdev->common.cap_mask); fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources; fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources; fdev->common.device_prep_dma_interrupt = fsl_dma_prep_interrupt; fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy; fdev->common.device_is_tx_complete = fsl_dma_is_complete; fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending; - fdev->common.device_prep_slave_sg = fsl_dma_prep_slave_sg; - fdev->common.device_terminate_all = fsl_dma_device_terminate_all; fdev->common.dev = &dev->dev; fdev->irq = irq_of_parse_and_map(dev->node, 0); diff --git a/trunk/drivers/dma/fsldma.h b/trunk/drivers/dma/fsldma.h index 0df14cbb8ca3..dc7f26865797 100644 --- a/trunk/drivers/dma/fsldma.h +++ b/trunk/drivers/dma/fsldma.h @@ -90,7 +90,6 @@ struct fsl_dma_ld_hw { struct fsl_desc_sw { struct fsl_dma_ld_hw hw; struct list_head node; - struct list_head tx_list; struct dma_async_tx_descriptor async_tx; struct list_head *ld; void *priv; @@ -144,11 +143,10 @@ struct fsl_dma_chan { struct tasklet_struct tasklet; u32 feature; - void (*toggle_ext_pause)(struct fsl_dma_chan *fsl_chan, int enable); + void (*toggle_ext_pause)(struct fsl_dma_chan *fsl_chan, int size); void (*toggle_ext_start)(struct fsl_dma_chan *fsl_chan, int enable); void (*set_src_loop_size)(struct fsl_dma_chan *fsl_chan, int size); void (*set_dest_loop_size)(struct fsl_dma_chan *fsl_chan, int size); - void (*set_request_count)(struct fsl_dma_chan *fsl_chan, int size); }; #define to_fsl_chan(chan) container_of(chan, struct fsl_dma_chan, common) diff --git a/trunk/drivers/dma/ioat.c b/trunk/drivers/dma/ioat.c new file mode 100644 index 000000000000..2225bb6ba3d1 --- /dev/null +++ b/trunk/drivers/dma/ioat.c @@ -0,0 +1,202 @@ +/* + * Intel I/OAT DMA Linux driver + * Copyright(c) 2007 - 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +/* + * This driver supports an Intel I/OAT DMA engine, which does asynchronous + * copy operations. 
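The re-added, consolidated ioat.c below opens with a PCI device ID table. As a rough, userspace illustration of how such a zero-terminated table is matched (the real matching is done by the PCI core against ioat_pci_tbl; the IDs here are hypothetical):

#include <stdio.h>

struct pci_id {
	unsigned int vendor;
	unsigned int device;
};

static const struct pci_id ioat_ids[] = {
	{ 0x8086, 0x1a38 },	/* hypothetical v1 id */
	{ 0x8086, 0x0402 },	/* hypothetical v2 id */
	{ 0, 0 }		/* zero terminator, as in ioat_pci_tbl */
};

static int id_match(unsigned int vendor, unsigned int device)
{
	const struct pci_id *p;

	for (p = ioat_ids; p->vendor; p++)
		if (p->vendor == vendor && p->device == device)
			return 1;
	return 0;
}

int main(void)
{
	printf("0x8086:0x0402 matches: %d\n", id_match(0x8086, 0x0402));
	printf("0x8086:0xffff matches: %d\n", id_match(0x8086, 0xffff));
	return 0;
}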
+ */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/dca.h> +#include "ioatdma.h" +#include "ioatdma_registers.h" +#include "ioatdma_hw.h" + +MODULE_VERSION(IOAT_DMA_VERSION); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); + +static struct pci_device_id ioat_pci_tbl[] = { + /* I/OAT v1 platforms */ + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) }, + { PCI_DEVICE(PCI_VENDOR_ID_UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) }, + + /* I/OAT v2 platforms */ + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) }, + + /* I/OAT v3 platforms */ + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) }, + { 0, } +}; + +struct ioat_device { + struct pci_dev *pdev; + void __iomem *iobase; + struct ioatdma_device *dma; + struct dca_provider *dca; +}; + +static int __devinit ioat_probe(struct pci_dev *pdev, + const struct pci_device_id *id); +static void __devexit ioat_remove(struct pci_dev *pdev); + +static int ioat_dca_enabled = 1; +module_param(ioat_dca_enabled, int, 0644); +MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)"); + +static struct pci_driver ioat_pci_driver = { + .name = "ioatdma", + .id_table = ioat_pci_tbl, + .probe = ioat_probe, + .remove = __devexit_p(ioat_remove), +}; + +static int __devinit ioat_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + void __iomem *iobase; + struct ioat_device *device; + unsigned long mmio_start, mmio_len; + int err; + + err = pci_enable_device(pdev); + if (err) + goto err_enable_device; + + err = pci_request_regions(pdev, ioat_pci_driver.name); + if (err) + goto err_request_regions; + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) + goto err_set_dma_mask; + + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) + goto err_set_dma_mask; + + mmio_start = pci_resource_start(pdev, 0); + mmio_len = pci_resource_len(pdev, 0); + iobase = ioremap(mmio_start, mmio_len); + if (!iobase) { + err = -ENOMEM; + goto err_ioremap; + } + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) { + err = -ENOMEM; + goto err_kzalloc; + } + device->pdev = pdev; + pci_set_drvdata(pdev, device); + device->iobase = iobase; + + pci_set_master(pdev); + + switch (readb(iobase + IOAT_VER_OFFSET)) { + case IOAT_VER_1_2: + device->dma = ioat_dma_probe(pdev, iobase); + if (device->dma && ioat_dca_enabled) + device->dca = ioat_dca_init(pdev, iobase); + break; + case IOAT_VER_2_0: + device->dma = ioat_dma_probe(pdev, iobase); + if (device->dma && ioat_dca_enabled) + device->dca = ioat2_dca_init(pdev, iobase); + break; + case IOAT_VER_3_0: + device->dma = ioat_dma_probe(pdev, iobase); + if (device->dma && ioat_dca_enabled) + device->dca = ioat3_dca_init(pdev, iobase); + break; + default: + err = -ENODEV; +
break; + } + if (!device->dma) + err = -ENODEV; + + if (err) + goto err_version; + + return 0; + +err_version: + kfree(device); +err_kzalloc: + iounmap(iobase); +err_ioremap: +err_set_dma_mask: + pci_release_regions(pdev); + pci_disable_device(pdev); +err_request_regions: +err_enable_device: + return err; +} + +static void __devexit ioat_remove(struct pci_dev *pdev) +{ + struct ioat_device *device = pci_get_drvdata(pdev); + + dev_err(&pdev->dev, "Removing dma and dca services\n"); + if (device->dca) { + unregister_dca_provider(device->dca); + free_dca_provider(device->dca); + device->dca = NULL; + } + + if (device->dma) { + ioat_dma_remove(device->dma); + device->dma = NULL; + } + + kfree(device); +} + +static int __init ioat_init_module(void) +{ + return pci_register_driver(&ioat_pci_driver); +} +module_init(ioat_init_module); + +static void __exit ioat_exit_module(void) +{ + pci_unregister_driver(&ioat_pci_driver); +} +module_exit(ioat_exit_module); diff --git a/trunk/drivers/dma/ioat/Makefile b/trunk/drivers/dma/ioat/Makefile deleted file mode 100644 index 8997d3fb9051..000000000000 --- a/trunk/drivers/dma/ioat/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o -ioatdma-objs := pci.o dma.o dma_v2.o dma_v3.o dca.o diff --git a/trunk/drivers/dma/ioat/dma.c b/trunk/drivers/dma/ioat/dma.c deleted file mode 100644 index c524d36d3c2e..000000000000 --- a/trunk/drivers/dma/ioat/dma.c +++ /dev/null @@ -1,1238 +0,0 @@ -/* - * Intel I/OAT DMA Linux driver - * Copyright(c) 2004 - 2009 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". - * - */ - -/* - * This driver supports an Intel I/OAT DMA engine, which does asynchronous - * copy operations. 
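ioat_probe() above is a textbook example of the goto-unwind error ladder: each acquisition gets a label, and a failure jumps past its own cleanup so resources are released in reverse order of acquisition. A standalone sketch of the pattern with stubbed steps in place of the PCI calls, purely illustrative:

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("%s %s\n", fail ? "FAIL" : "ok  ", name);
	return fail ? -1 : 0;
}

static int probe(int fail_at)
{
	int err;

	err = step("enable_device", fail_at == 1);
	if (err)
		goto err_enable_device;
	err = step("request_regions", fail_at == 2);
	if (err)
		goto err_request_regions;
	err = step("ioremap", fail_at == 3);
	if (err)
		goto err_ioremap;
	return 0;	/* later failures would unwind from here downward */

err_ioremap:
	step("release_regions (undo)", 0);
err_request_regions:
	step("disable_device (undo)", 0);
err_enable_device:
	return err;
}

int main(void)
{
	return probe(3) ? 1 : 0;	/* fail at ioremap and watch the unwind */
}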
- */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/pci.h> -#include <linux/interrupt.h> -#include <linux/dmaengine.h> -#include <linux/delay.h> -#include <linux/dma-mapping.h> -#include <linux/workqueue.h> -#include <linux/i7300_idle.h> -#include "dma.h" -#include "registers.h" -#include "hw.h" - -int ioat_pending_level = 4; -module_param(ioat_pending_level, int, 0644); -MODULE_PARM_DESC(ioat_pending_level, - "high-water mark for pushing ioat descriptors (default: 4)"); - -/* internal functions */ -static void ioat1_cleanup(struct ioat_dma_chan *ioat); -static void ioat1_dma_start_null_desc(struct ioat_dma_chan *ioat); - -/** - * ioat_dma_do_interrupt - handler used for single vector interrupt mode - * @irq: interrupt id - * @data: interrupt data - */ -static irqreturn_t ioat_dma_do_interrupt(int irq, void *data) -{ - struct ioatdma_device *instance = data; - struct ioat_chan_common *chan; - unsigned long attnstatus; - int bit; - u8 intrctrl; - - intrctrl = readb(instance->reg_base + IOAT_INTRCTRL_OFFSET); - - if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN)) - return IRQ_NONE; - - if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) { - writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET); - return IRQ_NONE; - } - - attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET); - for_each_bit(bit, &attnstatus, BITS_PER_LONG) { - chan = ioat_chan_by_index(instance, bit); - tasklet_schedule(&chan->cleanup_task); - } - - writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET); - return IRQ_HANDLED; -} - -/** - * ioat_dma_do_interrupt_msix - handler used for vector-per-channel interrupt mode - * @irq: interrupt id - * @data: interrupt data - */ -static irqreturn_t ioat_dma_do_interrupt_msix(int irq, void *data) -{ - struct ioat_chan_common *chan = data; - - tasklet_schedule(&chan->cleanup_task); - - return IRQ_HANDLED; -} - -static void ioat1_cleanup_tasklet(unsigned long data); - -/* common channel initialization */ -void ioat_init_channel(struct ioatdma_device *device, - struct ioat_chan_common *chan, int idx, - void (*timer_fn)(unsigned long), - void (*tasklet)(unsigned long), - unsigned long ioat) -{ - struct dma_device *dma = &device->common; - - chan->device = device; - chan->reg_base = device->reg_base + (0x80 * (idx + 1)); - spin_lock_init(&chan->cleanup_lock); - chan->common.device = dma; - list_add_tail(&chan->common.device_node, &dma->channels); - device->idx[idx] = chan; - init_timer(&chan->timer); - chan->timer.function = timer_fn; - chan->timer.data = ioat; - tasklet_init(&chan->cleanup_task, tasklet, ioat); - tasklet_disable(&chan->cleanup_task); -} - -static void ioat1_timer_event(unsigned long data); - -/** - * ioat1_dma_enumerate_channels - find and initialize the device's channels - * @device: the device to be enumerated - */ -static int ioat1_enumerate_channels(struct ioatdma_device *device) -{ - u8 xfercap_scale; - u32 xfercap; - int i; - struct ioat_dma_chan *ioat; - struct device *dev = &device->pdev->dev; - struct dma_device *dma = &device->common; - - INIT_LIST_HEAD(&dma->channels); - dma->chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET); - dma->chancnt &= 0x1f; /* bits [4:0] valid */ - if (dma->chancnt > ARRAY_SIZE(device->idx)) { - dev_warn(dev, "(%d) exceeds max supported channels (%zu)\n", - dma->chancnt, ARRAY_SIZE(device->idx)); - dma->chancnt = ARRAY_SIZE(device->idx); - } - xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET); - xfercap_scale &= 0x1f; /* bits [4:0] valid */ - xfercap = (xfercap_scale == 0 ?
-1 : (1UL << xfercap_scale)); - dev_dbg(dev, "%s: xfercap = %d\n", __func__, xfercap); - -#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL - if (i7300_idle_platform_probe(NULL, NULL, 1) == 0) - dma->chancnt--; -#endif - for (i = 0; i < dma->chancnt; i++) { - ioat = devm_kzalloc(dev, sizeof(*ioat), GFP_KERNEL); - if (!ioat) - break; - - ioat_init_channel(device, &ioat->base, i, - ioat1_timer_event, - ioat1_cleanup_tasklet, - (unsigned long) ioat); - ioat->xfercap = xfercap; - spin_lock_init(&ioat->desc_lock); - INIT_LIST_HEAD(&ioat->free_desc); - INIT_LIST_HEAD(&ioat->used_desc); - } - dma->chancnt = i; - return i; -} - -/** - * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended - * descriptors to hw - * @chan: DMA channel handle - */ -static inline void -__ioat1_dma_memcpy_issue_pending(struct ioat_dma_chan *ioat) -{ - void __iomem *reg_base = ioat->base.reg_base; - - dev_dbg(to_dev(&ioat->base), "%s: pending: %d\n", - __func__, ioat->pending); - ioat->pending = 0; - writeb(IOAT_CHANCMD_APPEND, reg_base + IOAT1_CHANCMD_OFFSET); -} - -static void ioat1_dma_memcpy_issue_pending(struct dma_chan *chan) -{ - struct ioat_dma_chan *ioat = to_ioat_chan(chan); - - if (ioat->pending > 0) { - spin_lock_bh(&ioat->desc_lock); - __ioat1_dma_memcpy_issue_pending(ioat); - spin_unlock_bh(&ioat->desc_lock); - } -} - -/** - * ioat1_reset_channel - restart a channel - * @ioat: IOAT DMA channel handle - */ -static void ioat1_reset_channel(struct ioat_dma_chan *ioat) -{ - struct ioat_chan_common *chan = &ioat->base; - void __iomem *reg_base = chan->reg_base; - u32 chansts, chanerr; - - dev_warn(to_dev(chan), "reset\n"); - chanerr = readl(reg_base + IOAT_CHANERR_OFFSET); - chansts = *chan->completion & IOAT_CHANSTS_STATUS; - if (chanerr) { - dev_err(to_dev(chan), - "chan%d, CHANSTS = 0x%08x CHANERR = 0x%04x, clearing\n", - chan_num(chan), chansts, chanerr); - writel(chanerr, reg_base + IOAT_CHANERR_OFFSET); - } - - /* - * whack it upside the head with a reset - * and wait for things to settle out. - * force the pending count to a really big negative - * to make sure no one forces an issue_pending - * while we're waiting. 
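A small standalone demonstration of why parking ioat->pending at INT_MIN (done just below) blocks submission during a reset: issue_pending only kicks the hardware for positive counts above the threshold, and no realistic number of concurrent increments brings INT_MIN back into range. Sketch only:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	int pending = INT_MIN;		/* sentinel set while a reset is queued */
	const int ioat_pending_level = 4;

	for (int i = 0; i < 100000; i++)
		pending++;		/* submissions racing with the reset */

	printf("issue to hardware? %s\n",
	       pending >= ioat_pending_level ? "yes" : "no");
	return 0;
}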
- */ - - ioat->pending = INT_MIN; - writeb(IOAT_CHANCMD_RESET, - reg_base + IOAT_CHANCMD_OFFSET(chan->device->version)); - set_bit(IOAT_RESET_PENDING, &chan->state); - mod_timer(&chan->timer, jiffies + RESET_DELAY); -} - -static dma_cookie_t ioat1_tx_submit(struct dma_async_tx_descriptor *tx) -{ - struct dma_chan *c = tx->chan; - struct ioat_dma_chan *ioat = to_ioat_chan(c); - struct ioat_desc_sw *desc = tx_to_ioat_desc(tx); - struct ioat_chan_common *chan = &ioat->base; - struct ioat_desc_sw *first; - struct ioat_desc_sw *chain_tail; - dma_cookie_t cookie; - - spin_lock_bh(&ioat->desc_lock); - /* cookie incr and addition to used_list must be atomic */ - cookie = c->cookie; - cookie++; - if (cookie < 0) - cookie = 1; - c->cookie = cookie; - tx->cookie = cookie; - dev_dbg(to_dev(&ioat->base), "%s: cookie: %d\n", __func__, cookie); - - /* write address into NextDescriptor field of last desc in chain */ - first = to_ioat_desc(desc->tx_list.next); - chain_tail = to_ioat_desc(ioat->used_desc.prev); - /* make descriptor updates globally visible before chaining */ - wmb(); - chain_tail->hw->next = first->txd.phys; - list_splice_tail_init(&desc->tx_list, &ioat->used_desc); - dump_desc_dbg(ioat, chain_tail); - dump_desc_dbg(ioat, first); - - if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state)) - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - - ioat->active += desc->hw->tx_cnt; - ioat->pending += desc->hw->tx_cnt; - if (ioat->pending >= ioat_pending_level) - __ioat1_dma_memcpy_issue_pending(ioat); - spin_unlock_bh(&ioat->desc_lock); - - return cookie; -} - -/** - * ioat_dma_alloc_descriptor - allocate and return a sw and hw descriptor pair - * @ioat: the channel supplying the memory pool for the descriptors - * @flags: allocation flags - */ -static struct ioat_desc_sw * -ioat_dma_alloc_descriptor(struct ioat_dma_chan *ioat, gfp_t flags) -{ - struct ioat_dma_descriptor *desc; - struct ioat_desc_sw *desc_sw; - struct ioatdma_device *ioatdma_device; - dma_addr_t phys; - - ioatdma_device = ioat->base.device; - desc = pci_pool_alloc(ioatdma_device->dma_pool, flags, &phys); - if (unlikely(!desc)) - return NULL; - - desc_sw = kzalloc(sizeof(*desc_sw), flags); - if (unlikely(!desc_sw)) { - pci_pool_free(ioatdma_device->dma_pool, desc, phys); - return NULL; - } - - memset(desc, 0, sizeof(*desc)); - - INIT_LIST_HEAD(&desc_sw->tx_list); - dma_async_tx_descriptor_init(&desc_sw->txd, &ioat->base.common); - desc_sw->txd.tx_submit = ioat1_tx_submit; - desc_sw->hw = desc; - desc_sw->txd.phys = phys; - set_desc_id(desc_sw, -1); - - return desc_sw; -} - -static int ioat_initial_desc_count = 256; -module_param(ioat_initial_desc_count, int, 0644); -MODULE_PARM_DESC(ioat_initial_desc_count, - "ioat1: initial descriptors per channel (default: 256)"); -/** - * ioat1_dma_alloc_chan_resources - returns the number of allocated descriptors - * @chan: the channel to be filled out - */ -static int ioat1_dma_alloc_chan_resources(struct dma_chan *c) -{ - struct ioat_dma_chan *ioat = to_ioat_chan(c); - struct ioat_chan_common *chan = &ioat->base; - struct ioat_desc_sw *desc; - u32 chanerr; - int i; - LIST_HEAD(tmp_list); - - /* have we already been set up? 
*/ - if (!list_empty(&ioat->free_desc)) - return ioat->desccount; - - /* Setup register to interrupt and write completion status on error */ - writew(IOAT_CHANCTRL_RUN, chan->reg_base + IOAT_CHANCTRL_OFFSET); - - chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET); - if (chanerr) { - dev_err(to_dev(chan), "CHANERR = %x, clearing\n", chanerr); - writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET); - } - - /* Allocate descriptors */ - for (i = 0; i < ioat_initial_desc_count; i++) { - desc = ioat_dma_alloc_descriptor(ioat, GFP_KERNEL); - if (!desc) { - dev_err(to_dev(chan), "Only %d initial descriptors\n", i); - break; - } - set_desc_id(desc, i); - list_add_tail(&desc->node, &tmp_list); - } - spin_lock_bh(&ioat->desc_lock); - ioat->desccount = i; - list_splice(&tmp_list, &ioat->free_desc); - spin_unlock_bh(&ioat->desc_lock); - - /* allocate a completion writeback area */ - /* doing 2 32bit writes to mmio since 1 64b write doesn't work */ - chan->completion = pci_pool_alloc(chan->device->completion_pool, - GFP_KERNEL, &chan->completion_dma); - memset(chan->completion, 0, sizeof(*chan->completion)); - writel(((u64) chan->completion_dma) & 0x00000000FFFFFFFF, - chan->reg_base + IOAT_CHANCMP_OFFSET_LOW); - writel(((u64) chan->completion_dma) >> 32, - chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH); - - tasklet_enable(&chan->cleanup_task); - ioat1_dma_start_null_desc(ioat); /* give chain to dma device */ - dev_dbg(to_dev(chan), "%s: allocated %d descriptors\n", - __func__, ioat->desccount); - return ioat->desccount; -} - -/** - * ioat1_dma_free_chan_resources - release all the descriptors - * @chan: the channel to be cleaned - */ -static void ioat1_dma_free_chan_resources(struct dma_chan *c) -{ - struct ioat_dma_chan *ioat = to_ioat_chan(c); - struct ioat_chan_common *chan = &ioat->base; - struct ioatdma_device *ioatdma_device = chan->device; - struct ioat_desc_sw *desc, *_desc; - int in_use_descs = 0; - - /* Before freeing channel resources first check - * if they have been previously allocated for this channel. - */ - if (ioat->desccount == 0) - return; - - tasklet_disable(&chan->cleanup_task); - del_timer_sync(&chan->timer); - ioat1_cleanup(ioat); - - /* Delay 100ms after reset to allow internal DMA logic to quiesce - * before removing DMA descriptor resources. 
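The completion writeback setup above programs a 64-bit DMA address with two 32-bit MMIO stores because, per the comment, a single 64-bit write does not work on this device. A minimal sketch of the split against a plain buffer; the mask and shift match the driver, but a real driver would use writel() on an ioremapped BAR:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void write_completion_addr(volatile uint32_t regs[2], uint64_t dma)
{
	regs[0] = (uint32_t)(dma & 0x00000000FFFFFFFFull);	/* low dword  */
	regs[1] = (uint32_t)(dma >> 32);			/* high dword */
}

int main(void)
{
	volatile uint32_t regs[2] = { 0, 0 };
	uint64_t addr = 0x0000001234abcd00ull;	/* hypothetical dma_addr_t */

	write_completion_addr(regs, addr);
	printf("low=%#" PRIx32 " high=%#" PRIx32 "\n", regs[0], regs[1]);
	return 0;
}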
- */ - writeb(IOAT_CHANCMD_RESET, - chan->reg_base + IOAT_CHANCMD_OFFSET(chan->device->version)); - mdelay(100); - - spin_lock_bh(&ioat->desc_lock); - list_for_each_entry_safe(desc, _desc, &ioat->used_desc, node) { - dev_dbg(to_dev(chan), "%s: freeing %d from used list\n", - __func__, desc_id(desc)); - dump_desc_dbg(ioat, desc); - in_use_descs++; - list_del(&desc->node); - pci_pool_free(ioatdma_device->dma_pool, desc->hw, - desc->txd.phys); - kfree(desc); - } - list_for_each_entry_safe(desc, _desc, - &ioat->free_desc, node) { - list_del(&desc->node); - pci_pool_free(ioatdma_device->dma_pool, desc->hw, - desc->txd.phys); - kfree(desc); - } - spin_unlock_bh(&ioat->desc_lock); - - pci_pool_free(ioatdma_device->completion_pool, - chan->completion, - chan->completion_dma); - - /* one is ok since we left it on there on purpose */ - if (in_use_descs > 1) - dev_err(to_dev(chan), "Freeing %d in use descriptors!\n", - in_use_descs - 1); - - chan->last_completion = 0; - chan->completion_dma = 0; - ioat->pending = 0; - ioat->desccount = 0; -} - -/** - * ioat1_dma_get_next_descriptor - return the next available descriptor - * @ioat: IOAT DMA channel handle - * - * Gets the next descriptor from the chain, and must be called with the - * channel's desc_lock held. Allocates more descriptors if the channel - * has run out. - */ -static struct ioat_desc_sw * -ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat) -{ - struct ioat_desc_sw *new; - - if (!list_empty(&ioat->free_desc)) { - new = to_ioat_desc(ioat->free_desc.next); - list_del(&new->node); - } else { - /* try to get another desc */ - new = ioat_dma_alloc_descriptor(ioat, GFP_ATOMIC); - if (!new) { - dev_err(to_dev(&ioat->base), "alloc failed\n"); - return NULL; - } - } - dev_dbg(to_dev(&ioat->base), "%s: allocated: %d\n", - __func__, desc_id(new)); - prefetch(new->hw); - return new; -} - -static struct dma_async_tx_descriptor * -ioat1_dma_prep_memcpy(struct dma_chan *c, dma_addr_t dma_dest, - dma_addr_t dma_src, size_t len, unsigned long flags) -{ - struct ioat_dma_chan *ioat = to_ioat_chan(c); - struct ioat_desc_sw *desc; - size_t copy; - LIST_HEAD(chain); - dma_addr_t src = dma_src; - dma_addr_t dest = dma_dest; - size_t total_len = len; - struct ioat_dma_descriptor *hw = NULL; - int tx_cnt = 0; - - spin_lock_bh(&ioat->desc_lock); - desc = ioat1_dma_get_next_descriptor(ioat); - do { - if (!desc) - break; - - tx_cnt++; - copy = min_t(size_t, len, ioat->xfercap); - - hw = desc->hw; - hw->size = copy; - hw->ctl = 0; - hw->src_addr = src; - hw->dst_addr = dest; - - list_add_tail(&desc->node, &chain); - - len -= copy; - dest += copy; - src += copy; - if (len) { - struct ioat_desc_sw *next; - - async_tx_ack(&desc->txd); - next = ioat1_dma_get_next_descriptor(ioat); - hw->next = next ? 
next->txd.phys : 0; - dump_desc_dbg(ioat, desc); - desc = next; - } else - hw->next = 0; - } while (len); - - if (!desc) { - struct ioat_chan_common *chan = &ioat->base; - - dev_err(to_dev(chan), - "chan%d - get_next_desc failed\n", chan_num(chan)); - list_splice(&chain, &ioat->free_desc); - spin_unlock_bh(&ioat->desc_lock); - return NULL; - } - spin_unlock_bh(&ioat->desc_lock); - - desc->txd.flags = flags; - desc->len = total_len; - list_splice(&chain, &desc->tx_list); - hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT); - hw->ctl_f.compl_write = 1; - hw->tx_cnt = tx_cnt; - dump_desc_dbg(ioat, desc); - - return &desc->txd; -} - -static void ioat1_cleanup_tasklet(unsigned long data) -{ - struct ioat_dma_chan *chan = (void *)data; - - ioat1_cleanup(chan); - writew(IOAT_CHANCTRL_RUN, chan->base.reg_base + IOAT_CHANCTRL_OFFSET); -} - -void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags, - size_t len, struct ioat_dma_descriptor *hw) -{ - struct pci_dev *pdev = chan->device->pdev; - size_t offset = len - hw->size; - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) - ioat_unmap(pdev, hw->dst_addr - offset, len, - PCI_DMA_FROMDEVICE, flags, 1); - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) - ioat_unmap(pdev, hw->src_addr - offset, len, - PCI_DMA_TODEVICE, flags, 0); -} - -unsigned long ioat_get_current_completion(struct ioat_chan_common *chan) -{ - unsigned long phys_complete; - u64 completion; - - completion = *chan->completion; - phys_complete = ioat_chansts_to_addr(completion); - - dev_dbg(to_dev(chan), "%s: phys_complete: %#llx\n", __func__, - (unsigned long long) phys_complete); - - if (is_ioat_halted(completion)) { - u32 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET); - dev_err(to_dev(chan), "Channel halted, chanerr = %x\n", - chanerr); - - /* TODO do something to salvage the situation */ - } - - return phys_complete; -} - -bool ioat_cleanup_preamble(struct ioat_chan_common *chan, - unsigned long *phys_complete) -{ - *phys_complete = ioat_get_current_completion(chan); - if (*phys_complete == chan->last_completion) - return false; - clear_bit(IOAT_COMPLETION_ACK, &chan->state); - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - - return true; -} - -static void __cleanup(struct ioat_dma_chan *ioat, unsigned long phys_complete) -{ - struct ioat_chan_common *chan = &ioat->base; - struct list_head *_desc, *n; - struct dma_async_tx_descriptor *tx; - - dev_dbg(to_dev(chan), "%s: phys_complete: %lx\n", - __func__, phys_complete); - list_for_each_safe(_desc, n, &ioat->used_desc) { - struct ioat_desc_sw *desc; - - prefetch(n); - desc = list_entry(_desc, typeof(*desc), node); - tx = &desc->txd; - /* - * Incoming DMA requests may use multiple descriptors, - * due to exceeding xfercap, perhaps. If so, only the - * last one will have a cookie, and require unmapping. - */ - dump_desc_dbg(ioat, desc); - if (tx->cookie) { - chan->completed_cookie = tx->cookie; - tx->cookie = 0; - ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw); - ioat->active -= desc->hw->tx_cnt; - if (tx->callback) { - tx->callback(tx->callback_param); - tx->callback = NULL; - } - } - - if (tx->phys != phys_complete) { - /* - * a completed entry, but not the last, so clean - * up if the client is done with the descriptor - */ - if (async_tx_test_ack(tx)) - list_move_tail(&desc->node, &ioat->free_desc); - } else { - /* - * last used desc. Do not remove, so we can - * append from it. 
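__cleanup() above relies on a chain invariant stated in its comment: when one transaction spans several descriptors (because it exceeded xfercap), only the last descriptor carries a cookie, so the completion callback and unmapping run once per transaction while earlier chunks are simply recycled. A simplified, array-based illustration of that walk:

#include <stdio.h>

struct desc {
	int cookie;	/* cookie == 0: intermediate chunk of a chain */
};

int main(void)
{
	/* one transaction split into three chunks (cookie on the tail),
	 * followed by a single-chunk transaction */
	struct desc done[] = { { 0 }, { 0 }, { 17 }, { 18 } };
	int last_completed = 0;

	for (unsigned int i = 0; i < sizeof(done) / sizeof(done[0]); i++) {
		if (!done[i].cookie)
			continue;	/* recycle silently */
		last_completed = done[i].cookie;
		printf("transaction %d complete: run callback, unmap\n",
		       done[i].cookie);
	}
	printf("completed_cookie = %d\n", last_completed);
	return 0;
}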
- */ - - /* if nothing else is pending, cancel the - * completion timeout - */ - if (n == &ioat->used_desc) { - dev_dbg(to_dev(chan), - "%s cancel completion timeout\n", - __func__); - clear_bit(IOAT_COMPLETION_PENDING, &chan->state); - } - - /* TODO check status bits? */ - break; - } - } - - chan->last_completion = phys_complete; -} - -/** - * ioat1_cleanup - cleanup up finished descriptors - * @chan: ioat channel to be cleaned up - * - * To prevent lock contention we defer cleanup when the locks are - * contended with a terminal timeout that forces cleanup and catches - * completion notification errors. - */ -static void ioat1_cleanup(struct ioat_dma_chan *ioat) -{ - struct ioat_chan_common *chan = &ioat->base; - unsigned long phys_complete; - - prefetch(chan->completion); - - if (!spin_trylock_bh(&chan->cleanup_lock)) - return; - - if (!ioat_cleanup_preamble(chan, &phys_complete)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - if (!spin_trylock_bh(&ioat->desc_lock)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - __cleanup(ioat, phys_complete); - - spin_unlock_bh(&ioat->desc_lock); - spin_unlock_bh(&chan->cleanup_lock); -} - -static void ioat1_timer_event(unsigned long data) -{ - struct ioat_dma_chan *ioat = (void *) data; - struct ioat_chan_common *chan = &ioat->base; - - dev_dbg(to_dev(chan), "%s: state: %lx\n", __func__, chan->state); - - spin_lock_bh(&chan->cleanup_lock); - if (test_and_clear_bit(IOAT_RESET_PENDING, &chan->state)) { - struct ioat_desc_sw *desc; - - spin_lock_bh(&ioat->desc_lock); - - /* restart active descriptors */ - desc = to_ioat_desc(ioat->used_desc.prev); - ioat_set_chainaddr(ioat, desc->txd.phys); - ioat_start(chan); - - ioat->pending = 0; - set_bit(IOAT_COMPLETION_PENDING, &chan->state); - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - spin_unlock_bh(&ioat->desc_lock); - } else if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) { - unsigned long phys_complete; - - spin_lock_bh(&ioat->desc_lock); - /* if we haven't made progress and we have already - * acknowledged a pending completion once, then be more - * forceful with a restart - */ - if (ioat_cleanup_preamble(chan, &phys_complete)) - __cleanup(ioat, phys_complete); - else if (test_bit(IOAT_COMPLETION_ACK, &chan->state)) - ioat1_reset_channel(ioat); - else { - u64 status = ioat_chansts(chan); - - /* manually update the last completion address */ - if (ioat_chansts_to_addr(status) != 0) - *chan->completion = status; - - set_bit(IOAT_COMPLETION_ACK, &chan->state); - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - } - spin_unlock_bh(&ioat->desc_lock); - } - spin_unlock_bh(&chan->cleanup_lock); -} - -static enum dma_status -ioat1_dma_is_complete(struct dma_chan *c, dma_cookie_t cookie, - dma_cookie_t *done, dma_cookie_t *used) -{ - struct ioat_dma_chan *ioat = to_ioat_chan(c); - - if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS) - return DMA_SUCCESS; - - ioat1_cleanup(ioat); - - return ioat_is_complete(c, cookie, done, used); -} - -static void ioat1_dma_start_null_desc(struct ioat_dma_chan *ioat) -{ - struct ioat_chan_common *chan = &ioat->base; - struct ioat_desc_sw *desc; - struct ioat_dma_descriptor *hw; - - spin_lock_bh(&ioat->desc_lock); - - desc = ioat1_dma_get_next_descriptor(ioat); - - if (!desc) { - dev_err(to_dev(chan), - "Unable to start null desc - get next desc failed\n"); - spin_unlock_bh(&ioat->desc_lock); - return; - } - - hw = desc->hw; - hw->ctl = 0; - hw->ctl_f.null = 1; - hw->ctl_f.int_en = 1; - hw->ctl_f.compl_write = 1; - /* set size 
to non-zero value (channel returns error when size is 0) */ - hw->size = NULL_DESC_BUFFER_SIZE; - hw->src_addr = 0; - hw->dst_addr = 0; - async_tx_ack(&desc->txd); - hw->next = 0; - list_add_tail(&desc->node, &ioat->used_desc); - dump_desc_dbg(ioat, desc); - - ioat_set_chainaddr(ioat, desc->txd.phys); - ioat_start(chan); - spin_unlock_bh(&ioat->desc_lock); -} - -/* - * Perform a IOAT transaction to verify the HW works. - */ -#define IOAT_TEST_SIZE 2000 - -static void __devinit ioat_dma_test_callback(void *dma_async_param) -{ - struct completion *cmp = dma_async_param; - - complete(cmp); -} - -/** - * ioat_dma_self_test - Perform a IOAT transaction to verify the HW works. - * @device: device to be tested - */ -int __devinit ioat_dma_self_test(struct ioatdma_device *device) -{ - int i; - u8 *src; - u8 *dest; - struct dma_device *dma = &device->common; - struct device *dev = &device->pdev->dev; - struct dma_chan *dma_chan; - struct dma_async_tx_descriptor *tx; - dma_addr_t dma_dest, dma_src; - dma_cookie_t cookie; - int err = 0; - struct completion cmp; - unsigned long tmo; - unsigned long flags; - - src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); - if (!src) - return -ENOMEM; - dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); - if (!dest) { - kfree(src); - return -ENOMEM; - } - - /* Fill in src buffer */ - for (i = 0; i < IOAT_TEST_SIZE; i++) - src[i] = (u8)i; - - /* Start copy, using first DMA channel */ - dma_chan = container_of(dma->channels.next, struct dma_chan, - device_node); - if (dma->device_alloc_chan_resources(dma_chan) < 1) { - dev_err(dev, "selftest cannot allocate chan resource\n"); - err = -ENODEV; - goto out; - } - - dma_src = dma_map_single(dev, src, IOAT_TEST_SIZE, DMA_TO_DEVICE); - dma_dest = dma_map_single(dev, dest, IOAT_TEST_SIZE, DMA_FROM_DEVICE); - flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_COMPL_DEST_UNMAP_SINGLE | - DMA_PREP_INTERRUPT; - tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src, - IOAT_TEST_SIZE, flags); - if (!tx) { - dev_err(dev, "Self-test prep failed, disabling\n"); - err = -ENODEV; - goto free_resources; - } - - async_tx_ack(tx); - init_completion(&cmp); - tx->callback = ioat_dma_test_callback; - tx->callback_param = &cmp; - cookie = tx->tx_submit(tx); - if (cookie < 0) { - dev_err(dev, "Self-test setup failed, disabling\n"); - err = -ENODEV; - goto free_resources; - } - dma->device_issue_pending(dma_chan); - - tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); - - if (tmo == 0 || - dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) - != DMA_SUCCESS) { - dev_err(dev, "Self-test copy timed out, disabling\n"); - err = -ENODEV; - goto free_resources; - } - if (memcmp(src, dest, IOAT_TEST_SIZE)) { - dev_err(dev, "Self-test copy failed compare, disabling\n"); - err = -ENODEV; - goto free_resources; - } - -free_resources: - dma->device_free_chan_resources(dma_chan); -out: - kfree(src); - kfree(dest); - return err; -} - -static char ioat_interrupt_style[32] = "msix"; -module_param_string(ioat_interrupt_style, ioat_interrupt_style, - sizeof(ioat_interrupt_style), 0644); -MODULE_PARM_DESC(ioat_interrupt_style, - "set ioat interrupt style: msix (default), " - "msix-single-vector, msi, intx)"); - -/** - * ioat_dma_setup_interrupts - setup interrupt handler - * @device: ioat device - */ -static int ioat_dma_setup_interrupts(struct ioatdma_device *device) -{ - struct ioat_chan_common *chan; - struct pci_dev *pdev = device->pdev; - struct device *dev = &pdev->dev; - struct msix_entry *msix; - int i, j, msixcnt; 
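The interrupt setup below tries MSI-X with one vector per channel, then single-vector MSI-X, then MSI, then legacy INTx, falling through on each failure. A standalone sketch of that fallback ladder with stubbed enable steps (illustrative only, not the driver's PCI calls):

#include <stdio.h>

enum { STYLE_MSIX, STYLE_MSIX_SINGLE, STYLE_MSI, STYLE_INTX, STYLE_NONE };

static const char *names[] = {
	"msix", "msix-single-vector", "msi", "intx", "none"
};

/* pretend every style below 'first_working' fails to enable */
static int try_enable(int style, int first_working)
{
	return style >= first_working ? 0 : -1;
}

static int setup_interrupts(int first_working)
{
	if (try_enable(STYLE_MSIX, first_working) == 0)
		return STYLE_MSIX;
	if (try_enable(STYLE_MSIX_SINGLE, first_working) == 0)
		return STYLE_MSIX_SINGLE;
	if (try_enable(STYLE_MSI, first_working) == 0)
		return STYLE_MSI;
	if (try_enable(STYLE_INTX, first_working) == 0)
		return STYLE_INTX;
	return STYLE_NONE;	/* no usable interrupts */
}

int main(void)
{
	printf("selected: %s\n", names[setup_interrupts(STYLE_MSI)]);
	return 0;
}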
- int err = -EINVAL; - u8 intrctrl = 0; - - if (!strcmp(ioat_interrupt_style, "msix")) - goto msix; - if (!strcmp(ioat_interrupt_style, "msix-single-vector")) - goto msix_single_vector; - if (!strcmp(ioat_interrupt_style, "msi")) - goto msi; - if (!strcmp(ioat_interrupt_style, "intx")) - goto intx; - dev_err(dev, "invalid ioat_interrupt_style %s\n", ioat_interrupt_style); - goto err_no_irq; - -msix: - /* The number of MSI-X vectors should equal the number of channels */ - msixcnt = device->common.chancnt; - for (i = 0; i < msixcnt; i++) - device->msix_entries[i].entry = i; - - err = pci_enable_msix(pdev, device->msix_entries, msixcnt); - if (err < 0) - goto msi; - if (err > 0) - goto msix_single_vector; - - for (i = 0; i < msixcnt; i++) { - msix = &device->msix_entries[i]; - chan = ioat_chan_by_index(device, i); - err = devm_request_irq(dev, msix->vector, - ioat_dma_do_interrupt_msix, 0, - "ioat-msix", chan); - if (err) { - for (j = 0; j < i; j++) { - msix = &device->msix_entries[j]; - chan = ioat_chan_by_index(device, j); - devm_free_irq(dev, msix->vector, chan); - } - goto msix_single_vector; - } - } - intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL; - goto done; - -msix_single_vector: - msix = &device->msix_entries[0]; - msix->entry = 0; - err = pci_enable_msix(pdev, device->msix_entries, 1); - if (err) - goto msi; - - err = devm_request_irq(dev, msix->vector, ioat_dma_do_interrupt, 0, - "ioat-msix", device); - if (err) { - pci_disable_msix(pdev); - goto msi; - } - goto done; - -msi: - err = pci_enable_msi(pdev); - if (err) - goto intx; - - err = devm_request_irq(dev, pdev->irq, ioat_dma_do_interrupt, 0, - "ioat-msi", device); - if (err) { - pci_disable_msi(pdev); - goto intx; - } - goto done; - -intx: - err = devm_request_irq(dev, pdev->irq, ioat_dma_do_interrupt, - IRQF_SHARED, "ioat-intx", device); - if (err) - goto err_no_irq; - -done: - if (device->intr_quirk) - device->intr_quirk(device); - intrctrl |= IOAT_INTRCTRL_MASTER_INT_EN; - writeb(intrctrl, device->reg_base + IOAT_INTRCTRL_OFFSET); - return 0; - -err_no_irq: - /* Disable all interrupt generation */ - writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET); - dev_err(dev, "no usable interrupts\n"); - return err; -} - -static void ioat_disable_interrupts(struct ioatdma_device *device) -{ - /* Disable all interrupt generation */ - writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET); -} - -int __devinit ioat_probe(struct ioatdma_device *device) -{ - int err = -ENODEV; - struct dma_device *dma = &device->common; - struct pci_dev *pdev = device->pdev; - struct device *dev = &pdev->dev; - - /* DMA coherent memory pool for DMA descriptor allocations */ - device->dma_pool = pci_pool_create("dma_desc_pool", pdev, - sizeof(struct ioat_dma_descriptor), - 64, 0); - if (!device->dma_pool) { - err = -ENOMEM; - goto err_dma_pool; - } - - device->completion_pool = pci_pool_create("completion_pool", pdev, - sizeof(u64), SMP_CACHE_BYTES, - SMP_CACHE_BYTES); - - if (!device->completion_pool) { - err = -ENOMEM; - goto err_completion_pool; - } - - device->enumerate_channels(device); - - dma_cap_set(DMA_MEMCPY, dma->cap_mask); - dma->dev = &pdev->dev; - - if (!dma->chancnt) { - dev_err(dev, "zero channels detected\n"); - goto err_setup_interrupts; - } - - err = ioat_dma_setup_interrupts(device); - if (err) - goto err_setup_interrupts; - - err = device->self_test(device); - if (err) - goto err_self_test; - - return 0; - -err_self_test: - ioat_disable_interrupts(device); -err_setup_interrupts: - pci_pool_destroy(device->completion_pool); 
-err_completion_pool: - pci_pool_destroy(device->dma_pool); -err_dma_pool: - return err; -} - -int __devinit ioat_register(struct ioatdma_device *device) -{ - int err = dma_async_device_register(&device->common); - - if (err) { - ioat_disable_interrupts(device); - pci_pool_destroy(device->completion_pool); - pci_pool_destroy(device->dma_pool); - } - - return err; -} - -/* ioat1_intr_quirk - fix up dma ctrl register to enable / disable msi */ -static void ioat1_intr_quirk(struct ioatdma_device *device) -{ - struct pci_dev *pdev = device->pdev; - u32 dmactrl; - - pci_read_config_dword(pdev, IOAT_PCI_DMACTRL_OFFSET, &dmactrl); - if (pdev->msi_enabled) - dmactrl |= IOAT_PCI_DMACTRL_MSI_EN; - else - dmactrl &= ~IOAT_PCI_DMACTRL_MSI_EN; - pci_write_config_dword(pdev, IOAT_PCI_DMACTRL_OFFSET, dmactrl); -} - -static ssize_t ring_size_show(struct dma_chan *c, char *page) -{ - struct ioat_dma_chan *ioat = to_ioat_chan(c); - - return sprintf(page, "%d\n", ioat->desccount); -} -static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size); - -static ssize_t ring_active_show(struct dma_chan *c, char *page) -{ - struct ioat_dma_chan *ioat = to_ioat_chan(c); - - return sprintf(page, "%d\n", ioat->active); -} -static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active); - -static ssize_t cap_show(struct dma_chan *c, char *page) -{ - struct dma_device *dma = c->device; - - return sprintf(page, "copy%s%s%s%s%s%s\n", - dma_has_cap(DMA_PQ, dma->cap_mask) ? " pq" : "", - dma_has_cap(DMA_PQ_VAL, dma->cap_mask) ? " pq_val" : "", - dma_has_cap(DMA_XOR, dma->cap_mask) ? " xor" : "", - dma_has_cap(DMA_XOR_VAL, dma->cap_mask) ? " xor_val" : "", - dma_has_cap(DMA_MEMSET, dma->cap_mask) ? " fill" : "", - dma_has_cap(DMA_INTERRUPT, dma->cap_mask) ? " intr" : ""); - -} -struct ioat_sysfs_entry ioat_cap_attr = __ATTR_RO(cap); - -static ssize_t version_show(struct dma_chan *c, char *page) -{ - struct dma_device *dma = c->device; - struct ioatdma_device *device = to_ioatdma_device(dma); - - return sprintf(page, "%d.%d\n", - device->version >> 4, device->version & 0xf); -} -struct ioat_sysfs_entry ioat_version_attr = __ATTR_RO(version); - -static struct attribute *ioat1_attrs[] = { - &ring_size_attr.attr, - &ring_active_attr.attr, - &ioat_cap_attr.attr, - &ioat_version_attr.attr, - NULL, -}; - -static ssize_t -ioat_attr_show(struct kobject *kobj, struct attribute *attr, char *page) -{ - struct ioat_sysfs_entry *entry; - struct ioat_chan_common *chan; - - entry = container_of(attr, struct ioat_sysfs_entry, attr); - chan = container_of(kobj, struct ioat_chan_common, kobj); - - if (!entry->show) - return -EIO; - return entry->show(&chan->common, page); -} - -struct sysfs_ops ioat_sysfs_ops = { - .show = ioat_attr_show, -}; - -static struct kobj_type ioat1_ktype = { - .sysfs_ops = &ioat_sysfs_ops, - .default_attrs = ioat1_attrs, -}; - -void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type) -{ - struct dma_device *dma = &device->common; - struct dma_chan *c; - - list_for_each_entry(c, &dma->channels, device_node) { - struct ioat_chan_common *chan = to_chan_common(c); - struct kobject *parent = &c->dev->device.kobj; - int err; - - err = kobject_init_and_add(&chan->kobj, type, parent, "quickdata"); - if (err) { - dev_warn(to_dev(chan), - "sysfs init error (%d), continuing...\n", err); - kobject_put(&chan->kobj); - set_bit(IOAT_KOBJ_INIT_FAIL, &chan->state); - } - } -} - -void ioat_kobject_del(struct ioatdma_device *device) -{ - struct dma_device *dma = &device->common; - struct dma_chan 
*c; - - list_for_each_entry(c, &dma->channels, device_node) { - struct ioat_chan_common *chan = to_chan_common(c); - - if (!test_bit(IOAT_KOBJ_INIT_FAIL, &chan->state)) { - kobject_del(&chan->kobj); - kobject_put(&chan->kobj); - } - } -} - -int __devinit ioat1_dma_probe(struct ioatdma_device *device, int dca) -{ - struct pci_dev *pdev = device->pdev; - struct dma_device *dma; - int err; - - device->intr_quirk = ioat1_intr_quirk; - device->enumerate_channels = ioat1_enumerate_channels; - device->self_test = ioat_dma_self_test; - dma = &device->common; - dma->device_prep_dma_memcpy = ioat1_dma_prep_memcpy; - dma->device_issue_pending = ioat1_dma_memcpy_issue_pending; - dma->device_alloc_chan_resources = ioat1_dma_alloc_chan_resources; - dma->device_free_chan_resources = ioat1_dma_free_chan_resources; - dma->device_is_tx_complete = ioat1_dma_is_complete; - - err = ioat_probe(device); - if (err) - return err; - ioat_set_tcp_copy_break(4096); - err = ioat_register(device); - if (err) - return err; - ioat_kobject_add(device, &ioat1_ktype); - - if (dca) - device->dca = ioat_dca_init(pdev, device->reg_base); - - return err; -} - -void __devexit ioat_dma_remove(struct ioatdma_device *device) -{ - struct dma_device *dma = &device->common; - - ioat_disable_interrupts(device); - - ioat_kobject_del(device); - - dma_async_device_unregister(dma); - - pci_pool_destroy(device->dma_pool); - pci_pool_destroy(device->completion_pool); - - INIT_LIST_HEAD(&dma->channels); -} diff --git a/trunk/drivers/dma/ioat/dma.h b/trunk/drivers/dma/ioat/dma.h deleted file mode 100644 index c14fdfeb7f33..000000000000 --- a/trunk/drivers/dma/ioat/dma.h +++ /dev/null @@ -1,337 +0,0 @@ -/* - * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. 
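For reference, the per-channel register banks sit at fixed 0x80-byte strides above the device base, with the device-global registers at offset 0 (see ioat_init_channel() earlier and the chan_num() macro in the header below). A pointer-arithmetic sketch of that layout:

#include <stdint.h>
#include <stdio.h>

#define CHAN_STRIDE 0x80

static uint8_t mmio[0x1000];	/* stand-in for the ioremapped BAR */

int main(void)
{
	for (int idx = 0; idx < 4; idx++) {
		uint8_t *chan_base = mmio + CHAN_STRIDE * (idx + 1);
		long offset = (long)(chan_base - mmio);

		printf("chan %d: bank at base + %#lx\n", idx, offset);
	}
	return 0;
}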
- */ -#ifndef IOATDMA_H -#define IOATDMA_H - -#include <linux/dmaengine.h> -#include "hw.h" -#include "registers.h" -#include <linux/init.h> -#include <linux/dmapool.h> -#include <linux/cache.h> -#include <linux/pci_ids.h> -#include <net/tcp.h> - -#define IOAT_DMA_VERSION "4.00" - -#define IOAT_LOW_COMPLETION_MASK 0xffffffc0 -#define IOAT_DMA_DCA_ANY_CPU ~0 - -#define to_ioatdma_device(dev) container_of(dev, struct ioatdma_device, common) -#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node) -#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, txd) -#define to_dev(ioat_chan) (&(ioat_chan)->device->pdev->dev) - -#define chan_num(ch) ((int)((ch)->reg_base - (ch)->device->reg_base) / 0x80) - -/* - * workaround for IOAT ver.3.0 null descriptor issue - * (channel returns error when size is 0) - */ -#define NULL_DESC_BUFFER_SIZE 1 - -/** - * struct ioatdma_device - internal representation of a IOAT device - * @pdev: PCI-Express device - * @reg_base: MMIO register space base address - * @dma_pool: for allocating DMA descriptors - * @common: embedded struct dma_device - * @version: version of ioatdma device - * @msix_entries: irq handlers - * @idx: per channel data - * @dca: direct cache access context - * @intr_quirk: interrupt setup quirk (for ioat_v1 devices) - * @enumerate_channels: hw version specific channel enumeration - * @cleanup_tasklet: select between the v2 and v3 cleanup routines - * @timer_fn: select between the v2 and v3 timer watchdog routines - * @self_test: hardware version specific self test for each supported op type - * - * Note: the v3 cleanup routine supports raid operations - */ -struct ioatdma_device { - struct pci_dev *pdev; - void __iomem *reg_base; - struct pci_pool *dma_pool; - struct pci_pool *completion_pool; - struct dma_device common; - u8 version; - struct msix_entry msix_entries[4]; - struct ioat_chan_common *idx[4]; - struct dca_provider *dca; - void (*intr_quirk)(struct ioatdma_device *device); - int (*enumerate_channels)(struct ioatdma_device *device); - void (*cleanup_tasklet)(unsigned long data); - void (*timer_fn)(unsigned long data); - int (*self_test)(struct ioatdma_device *device); -}; - -struct ioat_chan_common { - struct dma_chan common; - void __iomem *reg_base; - unsigned long last_completion; - spinlock_t cleanup_lock; - dma_cookie_t completed_cookie; - unsigned long state; - #define IOAT_COMPLETION_PENDING 0 - #define IOAT_COMPLETION_ACK 1 - #define IOAT_RESET_PENDING 2 - #define IOAT_KOBJ_INIT_FAIL 3 - struct timer_list timer; - #define COMPLETION_TIMEOUT msecs_to_jiffies(100) - #define IDLE_TIMEOUT msecs_to_jiffies(2000) - #define RESET_DELAY msecs_to_jiffies(100) - struct ioatdma_device *device; - dma_addr_t completion_dma; - u64 *completion; - struct tasklet_struct cleanup_task; - struct kobject kobj; -}; - -struct ioat_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct dma_chan *, char *); -}; - -/** - * struct ioat_dma_chan - internal representation of a DMA channel - */ -struct ioat_dma_chan { - struct ioat_chan_common base; - - size_t xfercap; /* XFERCAP register value expanded out */ - - spinlock_t desc_lock; - struct list_head free_desc; - struct list_head used_desc; - - int pending; - u16 desccount; - u16 active; -}; - -static inline struct ioat_chan_common *to_chan_common(struct dma_chan *c) -{ - return container_of(c, struct ioat_chan_common, common); -} - -static inline struct ioat_dma_chan *to_ioat_chan(struct dma_chan *c) -{ - struct ioat_chan_common *chan = to_chan_common(c); - - return container_of(chan, struct ioat_dma_chan, base); -} - -/** - * ioat_is_complete - poll the status of
an ioat transaction - * @c: channel handle - * @cookie: transaction identifier - * @done: if set, updated with last completed transaction - * @used: if set, updated with last used transaction - */ -static inline enum dma_status -ioat_is_complete(struct dma_chan *c, dma_cookie_t cookie, - dma_cookie_t *done, dma_cookie_t *used) -{ - struct ioat_chan_common *chan = to_chan_common(c); - dma_cookie_t last_used; - dma_cookie_t last_complete; - - last_used = c->cookie; - last_complete = chan->completed_cookie; - - if (done) - *done = last_complete; - if (used) - *used = last_used; - - return dma_async_is_complete(cookie, last_complete, last_used); -} - -/* wrapper around hardware descriptor format + additional software fields */ - -/** - * struct ioat_desc_sw - wrapper around hardware descriptor - * @hw: hardware DMA descriptor (for memcpy) - * @node: this descriptor will either be on the free list, - * or attached to a transaction list (tx_list) - * @txd: the generic software descriptor for all engines - * @id: identifier for debug - */ -struct ioat_desc_sw { - struct ioat_dma_descriptor *hw; - struct list_head node; - size_t len; - struct list_head tx_list; - struct dma_async_tx_descriptor txd; - #ifdef DEBUG - int id; - #endif -}; - -#ifdef DEBUG -#define set_desc_id(desc, i) ((desc)->id = (i)) -#define desc_id(desc) ((desc)->id) -#else -#define set_desc_id(desc, i) -#define desc_id(desc) (0) -#endif - -static inline void -__dump_desc_dbg(struct ioat_chan_common *chan, struct ioat_dma_descriptor *hw, - struct dma_async_tx_descriptor *tx, int id) -{ - struct device *dev = to_dev(chan); - - dev_dbg(dev, "desc[%d]: (%#llx->%#llx) cookie: %d flags: %#x" - " ctl: %#x (op: %d int_en: %d compl: %d)\n", id, - (unsigned long long) tx->phys, - (unsigned long long) hw->next, tx->cookie, tx->flags, - hw->ctl, hw->ctl_f.op, hw->ctl_f.int_en, hw->ctl_f.compl_write); -} - -#define dump_desc_dbg(c, d) \ - ({ if (d) __dump_desc_dbg(&c->base, d->hw, &d->txd, desc_id(d)); 0; }) - -static inline void ioat_set_tcp_copy_break(unsigned long copybreak) -{ - #ifdef CONFIG_NET_DMA - sysctl_tcp_dma_copybreak = copybreak; - #endif -} - -static inline struct ioat_chan_common * -ioat_chan_by_index(struct ioatdma_device *device, int index) -{ - return device->idx[index]; -} - -static inline u64 ioat_chansts(struct ioat_chan_common *chan) -{ - u8 ver = chan->device->version; - u64 status; - u32 status_lo; - - /* We need to read the low address first as this causes the - * chipset to latch the upper bits for the subsequent read - */ - status_lo = readl(chan->reg_base + IOAT_CHANSTS_OFFSET_LOW(ver)); - status = readl(chan->reg_base + IOAT_CHANSTS_OFFSET_HIGH(ver)); - status <<= 32; - status |= status_lo; - - return status; -} - -static inline void ioat_start(struct ioat_chan_common *chan) -{ - u8 ver = chan->device->version; - - writeb(IOAT_CHANCMD_START, chan->reg_base + IOAT_CHANCMD_OFFSET(ver)); -} - -static inline u64 ioat_chansts_to_addr(u64 status) -{ - return status & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR; -} - -static inline u32 ioat_chanerr(struct ioat_chan_common *chan) -{ - return readl(chan->reg_base + IOAT_CHANERR_OFFSET); -} - -static inline void ioat_suspend(struct ioat_chan_common *chan) -{ - u8 ver = chan->device->version; - - writeb(IOAT_CHANCMD_SUSPEND, chan->reg_base + IOAT_CHANCMD_OFFSET(ver)); -} - -static inline void ioat_set_chainaddr(struct ioat_dma_chan *ioat, u64 addr) -{ - struct ioat_chan_common *chan = &ioat->base; - - writel(addr & 0x00000000FFFFFFFF, - chan->reg_base + 
IOAT1_CHAINADDR_OFFSET_LOW); - writel(addr >> 32, - chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH); -} - -static inline bool is_ioat_active(unsigned long status) -{ - return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_ACTIVE); -} - -static inline bool is_ioat_idle(unsigned long status) -{ - return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_DONE); -} - -static inline bool is_ioat_halted(unsigned long status) -{ - return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_HALTED); -} - -static inline bool is_ioat_suspended(unsigned long status) -{ - return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_SUSPENDED); -} - -/* channel was fatally programmed */ -static inline bool is_ioat_bug(unsigned long err) -{ - return !!(err & (IOAT_CHANERR_SRC_ADDR_ERR|IOAT_CHANERR_DEST_ADDR_ERR| - IOAT_CHANERR_NEXT_ADDR_ERR|IOAT_CHANERR_CONTROL_ERR| - IOAT_CHANERR_LENGTH_ERR)); -} - -static inline void ioat_unmap(struct pci_dev *pdev, dma_addr_t addr, size_t len, - int direction, enum dma_ctrl_flags flags, bool dst) -{ - if ((dst && (flags & DMA_COMPL_DEST_UNMAP_SINGLE)) || - (!dst && (flags & DMA_COMPL_SRC_UNMAP_SINGLE))) - pci_unmap_single(pdev, addr, len, direction); - else - pci_unmap_page(pdev, addr, len, direction); -} - -int __devinit ioat_probe(struct ioatdma_device *device); -int __devinit ioat_register(struct ioatdma_device *device); -int __devinit ioat1_dma_probe(struct ioatdma_device *dev, int dca); -int __devinit ioat_dma_self_test(struct ioatdma_device *device); -void __devexit ioat_dma_remove(struct ioatdma_device *device); -struct dca_provider * __devinit ioat_dca_init(struct pci_dev *pdev, - void __iomem *iobase); -unsigned long ioat_get_current_completion(struct ioat_chan_common *chan); -void ioat_init_channel(struct ioatdma_device *device, - struct ioat_chan_common *chan, int idx, - void (*timer_fn)(unsigned long), - void (*tasklet)(unsigned long), - unsigned long ioat); -void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags, - size_t len, struct ioat_dma_descriptor *hw); -bool ioat_cleanup_preamble(struct ioat_chan_common *chan, - unsigned long *phys_complete); -void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type); -void ioat_kobject_del(struct ioatdma_device *device); -extern struct sysfs_ops ioat_sysfs_ops; -extern struct ioat_sysfs_entry ioat_version_attr; -extern struct ioat_sysfs_entry ioat_cap_attr; -#endif /* IOATDMA_H */ diff --git a/trunk/drivers/dma/ioat/dma_v2.c b/trunk/drivers/dma/ioat/dma_v2.c deleted file mode 100644 index 96ffab7d37a7..000000000000 --- a/trunk/drivers/dma/ioat/dma_v2.c +++ /dev/null @@ -1,871 +0,0 @@ -/* - * Intel I/OAT DMA Linux driver - * Copyright(c) 2004 - 2009 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". 
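
The register helpers in dma.h above read the 64-bit channel status as two 32-bit halves (low half first, so the chipset latches the upper bits) and split 64-bit address writes the same way. A standalone sketch of that access pattern, with hypothetical register offsets and accessors standing in for the kernel's readl()/writel():

#include <stdint.h>

/* Hypothetical register offsets, for illustration only. */
#define STATUS_LOW	0x00
#define STATUS_HIGH	0x04
#define CHAIN_LOW	0x08
#define CHAIN_HIGH	0x0c

static inline uint32_t read32(volatile void *base, unsigned int off)
{
	return *(volatile uint32_t *)((volatile char *)base + off);
}

static inline void write32(volatile void *base, unsigned int off, uint32_t val)
{
	*(volatile uint32_t *)((volatile char *)base + off) = val;
}

/* Read the low half first: the hardware latches the high half on the
 * low read, so both halves belong to the same sample (mirrors
 * ioat_chansts() above).
 */
static uint64_t read_status64(volatile void *base)
{
	uint64_t lo = read32(base, STATUS_LOW);
	uint64_t hi = read32(base, STATUS_HIGH);

	return (hi << 32) | lo;
}

/* A 64-bit address is programmed as two 32-bit stores because a single
 * 64-bit MMIO write is not guaranteed to reach the device atomically
 * (mirrors ioat_set_chainaddr() above).
 */
static void write_addr64(volatile void *base, uint64_t addr)
{
	write32(base, CHAIN_LOW, addr & 0xffffffff);
	write32(base, CHAIN_HIGH, addr >> 32);
}
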
- *
- */
-
-/*
- * This driver supports an Intel I/OAT DMA engine (versions >= 2), which
- * does asynchronous data movement and checksumming operations.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "dma.h"
-#include "dma_v2.h"
-#include "registers.h"
-#include "hw.h"
-
-int ioat_ring_alloc_order = 8;
-module_param(ioat_ring_alloc_order, int, 0644);
-MODULE_PARM_DESC(ioat_ring_alloc_order,
-		 "ioat2+: allocate 2^n descriptors per channel"
-		 " (default: 8 max: 16)");
-static int ioat_ring_max_alloc_order = IOAT_MAX_ORDER;
-module_param(ioat_ring_max_alloc_order, int, 0644);
-MODULE_PARM_DESC(ioat_ring_max_alloc_order,
-		 "ioat2+: upper limit for ring size (default: 16)");
-
-void __ioat2_issue_pending(struct ioat2_dma_chan *ioat)
-{
-	void __iomem *reg_base = ioat->base.reg_base;
-
-	ioat->pending = 0;
-	ioat->dmacount += ioat2_ring_pending(ioat);
-	ioat->issued = ioat->head;
-	/* make descriptor updates globally visible before notifying channel */
-	wmb();
-	writew(ioat->dmacount, reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
-	dev_dbg(to_dev(&ioat->base),
-		"%s: head: %#x tail: %#x issued: %#x count: %#x\n",
-		__func__, ioat->head, ioat->tail, ioat->issued, ioat->dmacount);
-}
-
-void ioat2_issue_pending(struct dma_chan *chan)
-{
-	struct ioat2_dma_chan *ioat = to_ioat2_chan(chan);
-
-	spin_lock_bh(&ioat->ring_lock);
-	if (ioat->pending == 1)
-		__ioat2_issue_pending(ioat);
-	spin_unlock_bh(&ioat->ring_lock);
-}
-
-/**
- * ioat2_update_pending - mark pending descriptors
- * @ioat: ioat2+ channel
- *
- * Set pending to '1' unless pending is already set to '2'; pending == 2
- * indicates that submission is temporarily blocked due to an in-flight
- * reset.  If we are already above the ioat_pending_level threshold then
- * just issue pending.
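
The pending protocol described in this comment amounts to deferred doorbell batching: prep routines only advance the head index, and a later doorbell write publishes everything at once, after wmb() has made the descriptor contents globally visible. A minimal model of that batching, with hypothetical names and the in-flight-reset (pending == 2) case omitted:

/* Sketch of deferred submission; indices grow monotonically as in the
 * driver, so head - issued is the count of unpublished work.
 */
struct chan_model {
	unsigned int head;	/* next free ring slot (producer) */
	unsigned int issued;	/* last slot published to hardware */
	int pending;		/* 1: work awaiting a doorbell write */
};

static unsigned int model_ring_pending(struct chan_model *c)
{
	return c->head - c->issued;
}

static void model_doorbell(struct chan_model *c)
{
	c->issued = c->head;
	c->pending = 0;
	/* the real driver does: wmb(); writew(dmacount, DMACOUNT reg); */
}

/* Called after each prep: ring the doorbell only once enough work has
 * accumulated (the threshold plays the role of ioat_pending_level).
 */
static void model_update_pending(struct chan_model *c, unsigned int threshold)
{
	if (model_ring_pending(c) > threshold)
		model_doorbell(c);
	else
		c->pending = 1;
}
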
- * - * called with ring_lock held - */ -static void ioat2_update_pending(struct ioat2_dma_chan *ioat) -{ - if (unlikely(ioat->pending == 2)) - return; - else if (ioat2_ring_pending(ioat) > ioat_pending_level) - __ioat2_issue_pending(ioat); - else - ioat->pending = 1; -} - -static void __ioat2_start_null_desc(struct ioat2_dma_chan *ioat) -{ - struct ioat_ring_ent *desc; - struct ioat_dma_descriptor *hw; - int idx; - - if (ioat2_ring_space(ioat) < 1) { - dev_err(to_dev(&ioat->base), - "Unable to start null desc - ring full\n"); - return; - } - - dev_dbg(to_dev(&ioat->base), "%s: head: %#x tail: %#x issued: %#x\n", - __func__, ioat->head, ioat->tail, ioat->issued); - idx = ioat2_desc_alloc(ioat, 1); - desc = ioat2_get_ring_ent(ioat, idx); - - hw = desc->hw; - hw->ctl = 0; - hw->ctl_f.null = 1; - hw->ctl_f.int_en = 1; - hw->ctl_f.compl_write = 1; - /* set size to non-zero value (channel returns error when size is 0) */ - hw->size = NULL_DESC_BUFFER_SIZE; - hw->src_addr = 0; - hw->dst_addr = 0; - async_tx_ack(&desc->txd); - ioat2_set_chainaddr(ioat, desc->txd.phys); - dump_desc_dbg(ioat, desc); - __ioat2_issue_pending(ioat); -} - -static void ioat2_start_null_desc(struct ioat2_dma_chan *ioat) -{ - spin_lock_bh(&ioat->ring_lock); - __ioat2_start_null_desc(ioat); - spin_unlock_bh(&ioat->ring_lock); -} - -static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete) -{ - struct ioat_chan_common *chan = &ioat->base; - struct dma_async_tx_descriptor *tx; - struct ioat_ring_ent *desc; - bool seen_current = false; - u16 active; - int i; - - dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n", - __func__, ioat->head, ioat->tail, ioat->issued); - - active = ioat2_ring_active(ioat); - for (i = 0; i < active && !seen_current; i++) { - prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1)); - desc = ioat2_get_ring_ent(ioat, ioat->tail + i); - tx = &desc->txd; - dump_desc_dbg(ioat, desc); - if (tx->cookie) { - ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw); - chan->completed_cookie = tx->cookie; - tx->cookie = 0; - if (tx->callback) { - tx->callback(tx->callback_param); - tx->callback = NULL; - } - } - - if (tx->phys == phys_complete) - seen_current = true; - } - ioat->tail += i; - BUG_ON(!seen_current); /* no active descs have written a completion? 
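
__cleanup() above retires descriptors by walking from tail toward head until it passes the descriptor whose physical address the hardware reported in the completion writeback area; the BUG_ON guards the invariant that an active ring must contain that descriptor. The walk, reduced to its index arithmetic (hypothetical types; unmap and callback details omitted):

struct ent_model {
	unsigned long phys;	/* descriptor bus address */
};

/* Returns how many entries to retire: everything up to and including
 * the entry matching the hardware's last-completed address.
 */
static unsigned int reclaim_count(struct ent_model **ring, unsigned int mask,
				  unsigned int tail, unsigned int active,
				  unsigned long phys_complete)
{
	unsigned int i;

	for (i = 0; i < active; i++) {
		struct ent_model *e = ring[(tail + i) & mask];

		/* a real cleanup would unmap and fire the callback here */
		if (e->phys == phys_complete)
			return i + 1;
	}
	return i;	/* caller asserts this case cannot happen */
}
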
*/ - - chan->last_completion = phys_complete; - if (ioat->head == ioat->tail) { - dev_dbg(to_dev(chan), "%s: cancel completion timeout\n", - __func__); - clear_bit(IOAT_COMPLETION_PENDING, &chan->state); - mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT); - } -} - -/** - * ioat2_cleanup - clean finished descriptors (advance tail pointer) - * @chan: ioat channel to be cleaned up - */ -static void ioat2_cleanup(struct ioat2_dma_chan *ioat) -{ - struct ioat_chan_common *chan = &ioat->base; - unsigned long phys_complete; - - prefetch(chan->completion); - - if (!spin_trylock_bh(&chan->cleanup_lock)) - return; - - if (!ioat_cleanup_preamble(chan, &phys_complete)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - if (!spin_trylock_bh(&ioat->ring_lock)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - __cleanup(ioat, phys_complete); - - spin_unlock_bh(&ioat->ring_lock); - spin_unlock_bh(&chan->cleanup_lock); -} - -void ioat2_cleanup_tasklet(unsigned long data) -{ - struct ioat2_dma_chan *ioat = (void *) data; - - ioat2_cleanup(ioat); - writew(IOAT_CHANCTRL_RUN, ioat->base.reg_base + IOAT_CHANCTRL_OFFSET); -} - -void __ioat2_restart_chan(struct ioat2_dma_chan *ioat) -{ - struct ioat_chan_common *chan = &ioat->base; - - /* set the tail to be re-issued */ - ioat->issued = ioat->tail; - ioat->dmacount = 0; - set_bit(IOAT_COMPLETION_PENDING, &chan->state); - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - - dev_dbg(to_dev(chan), - "%s: head: %#x tail: %#x issued: %#x count: %#x\n", - __func__, ioat->head, ioat->tail, ioat->issued, ioat->dmacount); - - if (ioat2_ring_pending(ioat)) { - struct ioat_ring_ent *desc; - - desc = ioat2_get_ring_ent(ioat, ioat->tail); - ioat2_set_chainaddr(ioat, desc->txd.phys); - __ioat2_issue_pending(ioat); - } else - __ioat2_start_null_desc(ioat); -} - -static void ioat2_restart_channel(struct ioat2_dma_chan *ioat) -{ - struct ioat_chan_common *chan = &ioat->base; - unsigned long phys_complete; - u32 status; - - status = ioat_chansts(chan); - if (is_ioat_active(status) || is_ioat_idle(status)) - ioat_suspend(chan); - while (is_ioat_active(status) || is_ioat_idle(status)) { - status = ioat_chansts(chan); - cpu_relax(); - } - - if (ioat_cleanup_preamble(chan, &phys_complete)) - __cleanup(ioat, phys_complete); - - __ioat2_restart_chan(ioat); -} - -void ioat2_timer_event(unsigned long data) -{ - struct ioat2_dma_chan *ioat = (void *) data; - struct ioat_chan_common *chan = &ioat->base; - - spin_lock_bh(&chan->cleanup_lock); - if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) { - unsigned long phys_complete; - u64 status; - - spin_lock_bh(&ioat->ring_lock); - status = ioat_chansts(chan); - - /* when halted due to errors check for channel - * programming errors before advancing the completion state - */ - if (is_ioat_halted(status)) { - u32 chanerr; - - chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET); - BUG_ON(is_ioat_bug(chanerr)); - } - - /* if we haven't made progress and we have already - * acknowledged a pending completion once, then be more - * forceful with a restart - */ - if (ioat_cleanup_preamble(chan, &phys_complete)) - __cleanup(ioat, phys_complete); - else if (test_bit(IOAT_COMPLETION_ACK, &chan->state)) - ioat2_restart_channel(ioat); - else { - set_bit(IOAT_COMPLETION_ACK, &chan->state); - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - } - spin_unlock_bh(&ioat->ring_lock); - } else { - u16 active; - - /* if the ring is idle, empty, and oversized try to step - * down the size - */ - spin_lock_bh(&ioat->ring_lock); - 
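
The idle path that follows shrinks an oversized ring one order at a time. Every occupancy check it relies on is plain power-of-two index arithmetic: head and tail increase monotonically and wraparound is handled by masking. A self-contained restatement (names are illustrative; the real helpers live in dma_v2.h):

struct ring_model {
	unsigned short head;	/* allocation point */
	unsigned short tail;	/* reclaim point */
	unsigned int order;	/* ring holds 1 << order entries */
};

static unsigned short ring_model_mask(const struct ring_model *r)
{
	return (1 << r->order) - 1;
}

/* In-flight count: correct across wraparound because the subtraction
 * is reduced modulo the power-of-two ring size.
 */
static unsigned short ring_model_active(const struct ring_model *r)
{
	return (r->head - r->tail) & ring_model_mask(r);
}

static unsigned short ring_model_space(const struct ring_model *r)
{
	return (ring_model_mask(r) + 1) - ring_model_active(r);
}
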
active = ioat2_ring_active(ioat); - if (active == 0 && ioat->alloc_order > ioat_get_alloc_order()) - reshape_ring(ioat, ioat->alloc_order-1); - spin_unlock_bh(&ioat->ring_lock); - - /* keep shrinking until we get back to our minimum - * default size - */ - if (ioat->alloc_order > ioat_get_alloc_order()) - mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT); - } - spin_unlock_bh(&chan->cleanup_lock); -} - -/** - * ioat2_enumerate_channels - find and initialize the device's channels - * @device: the device to be enumerated - */ -int ioat2_enumerate_channels(struct ioatdma_device *device) -{ - struct ioat2_dma_chan *ioat; - struct device *dev = &device->pdev->dev; - struct dma_device *dma = &device->common; - u8 xfercap_log; - int i; - - INIT_LIST_HEAD(&dma->channels); - dma->chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET); - dma->chancnt &= 0x1f; /* bits [4:0] valid */ - if (dma->chancnt > ARRAY_SIZE(device->idx)) { - dev_warn(dev, "(%d) exceeds max supported channels (%zu)\n", - dma->chancnt, ARRAY_SIZE(device->idx)); - dma->chancnt = ARRAY_SIZE(device->idx); - } - xfercap_log = readb(device->reg_base + IOAT_XFERCAP_OFFSET); - xfercap_log &= 0x1f; /* bits [4:0] valid */ - if (xfercap_log == 0) - return 0; - dev_dbg(dev, "%s: xfercap = %d\n", __func__, 1 << xfercap_log); - - /* FIXME which i/oat version is i7300? */ -#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL - if (i7300_idle_platform_probe(NULL, NULL, 1) == 0) - dma->chancnt--; -#endif - for (i = 0; i < dma->chancnt; i++) { - ioat = devm_kzalloc(dev, sizeof(*ioat), GFP_KERNEL); - if (!ioat) - break; - - ioat_init_channel(device, &ioat->base, i, - device->timer_fn, - device->cleanup_tasklet, - (unsigned long) ioat); - ioat->xfercap_log = xfercap_log; - spin_lock_init(&ioat->ring_lock); - } - dma->chancnt = i; - return i; -} - -static dma_cookie_t ioat2_tx_submit_unlock(struct dma_async_tx_descriptor *tx) -{ - struct dma_chan *c = tx->chan; - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - struct ioat_chan_common *chan = &ioat->base; - dma_cookie_t cookie = c->cookie; - - cookie++; - if (cookie < 0) - cookie = 1; - tx->cookie = cookie; - c->cookie = cookie; - dev_dbg(to_dev(&ioat->base), "%s: cookie: %d\n", __func__, cookie); - - if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state)) - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - ioat2_update_pending(ioat); - spin_unlock_bh(&ioat->ring_lock); - - return cookie; -} - -static struct ioat_ring_ent *ioat2_alloc_ring_ent(struct dma_chan *chan, gfp_t flags) -{ - struct ioat_dma_descriptor *hw; - struct ioat_ring_ent *desc; - struct ioatdma_device *dma; - dma_addr_t phys; - - dma = to_ioatdma_device(chan->device); - hw = pci_pool_alloc(dma->dma_pool, flags, &phys); - if (!hw) - return NULL; - memset(hw, 0, sizeof(*hw)); - - desc = kmem_cache_alloc(ioat2_cache, flags); - if (!desc) { - pci_pool_free(dma->dma_pool, hw, phys); - return NULL; - } - memset(desc, 0, sizeof(*desc)); - - dma_async_tx_descriptor_init(&desc->txd, chan); - desc->txd.tx_submit = ioat2_tx_submit_unlock; - desc->hw = hw; - desc->txd.phys = phys; - return desc; -} - -static void ioat2_free_ring_ent(struct ioat_ring_ent *desc, struct dma_chan *chan) -{ - struct ioatdma_device *dma; - - dma = to_ioatdma_device(chan->device); - pci_pool_free(dma->dma_pool, desc->hw, desc->txd.phys); - kmem_cache_free(ioat2_cache, desc); -} - -static struct ioat_ring_ent **ioat2_alloc_ring(struct dma_chan *c, int order, gfp_t flags) -{ - struct ioat_ring_ent **ring; - int descs = 1 << order; - int i; - - if (order > 
ioat_get_max_alloc_order()) - return NULL; - - /* allocate the array to hold the software ring */ - ring = kcalloc(descs, sizeof(*ring), flags); - if (!ring) - return NULL; - for (i = 0; i < descs; i++) { - ring[i] = ioat2_alloc_ring_ent(c, flags); - if (!ring[i]) { - while (i--) - ioat2_free_ring_ent(ring[i], c); - kfree(ring); - return NULL; - } - set_desc_id(ring[i], i); - } - - /* link descs */ - for (i = 0; i < descs-1; i++) { - struct ioat_ring_ent *next = ring[i+1]; - struct ioat_dma_descriptor *hw = ring[i]->hw; - - hw->next = next->txd.phys; - } - ring[i]->hw->next = ring[0]->txd.phys; - - return ring; -} - -/* ioat2_alloc_chan_resources - allocate/initialize ioat2 descriptor ring - * @chan: channel to be initialized - */ -int ioat2_alloc_chan_resources(struct dma_chan *c) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - struct ioat_chan_common *chan = &ioat->base; - struct ioat_ring_ent **ring; - u32 chanerr; - int order; - - /* have we already been set up? */ - if (ioat->ring) - return 1 << ioat->alloc_order; - - /* Setup register to interrupt and write completion status on error */ - writew(IOAT_CHANCTRL_RUN, chan->reg_base + IOAT_CHANCTRL_OFFSET); - - chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET); - if (chanerr) { - dev_err(to_dev(chan), "CHANERR = %x, clearing\n", chanerr); - writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET); - } - - /* allocate a completion writeback area */ - /* doing 2 32bit writes to mmio since 1 64b write doesn't work */ - chan->completion = pci_pool_alloc(chan->device->completion_pool, - GFP_KERNEL, &chan->completion_dma); - if (!chan->completion) - return -ENOMEM; - - memset(chan->completion, 0, sizeof(*chan->completion)); - writel(((u64) chan->completion_dma) & 0x00000000FFFFFFFF, - chan->reg_base + IOAT_CHANCMP_OFFSET_LOW); - writel(((u64) chan->completion_dma) >> 32, - chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH); - - order = ioat_get_alloc_order(); - ring = ioat2_alloc_ring(c, order, GFP_KERNEL); - if (!ring) - return -ENOMEM; - - spin_lock_bh(&ioat->ring_lock); - ioat->ring = ring; - ioat->head = 0; - ioat->issued = 0; - ioat->tail = 0; - ioat->pending = 0; - ioat->alloc_order = order; - spin_unlock_bh(&ioat->ring_lock); - - tasklet_enable(&chan->cleanup_task); - ioat2_start_null_desc(ioat); - - return 1 << ioat->alloc_order; -} - -bool reshape_ring(struct ioat2_dma_chan *ioat, int order) -{ - /* reshape differs from normal ring allocation in that we want - * to allocate a new software ring while only - * extending/truncating the hardware ring - */ - struct ioat_chan_common *chan = &ioat->base; - struct dma_chan *c = &chan->common; - const u16 curr_size = ioat2_ring_mask(ioat) + 1; - const u16 active = ioat2_ring_active(ioat); - const u16 new_size = 1 << order; - struct ioat_ring_ent **ring; - u16 i; - - if (order > ioat_get_max_alloc_order()) - return false; - - /* double check that we have at least 1 free descriptor */ - if (active == curr_size) - return false; - - /* when shrinking, verify that we can hold the current active - * set in the new ring - */ - if (active >= new_size) - return false; - - /* allocate the array to hold the software ring */ - ring = kcalloc(new_size, sizeof(*ring), GFP_NOWAIT); - if (!ring) - return false; - - /* allocate/trim descriptors as needed */ - if (new_size > curr_size) { - /* copy current descriptors to the new ring */ - for (i = 0; i < curr_size; i++) { - u16 curr_idx = (ioat->tail+i) & (curr_size-1); - u16 new_idx = (ioat->tail+i) & (new_size-1); - - ring[new_idx] = ioat->ring[curr_idx]; - 
set_desc_id(ring[new_idx], new_idx); - } - - /* add new descriptors to the ring */ - for (i = curr_size; i < new_size; i++) { - u16 new_idx = (ioat->tail+i) & (new_size-1); - - ring[new_idx] = ioat2_alloc_ring_ent(c, GFP_NOWAIT); - if (!ring[new_idx]) { - while (i--) { - u16 new_idx = (ioat->tail+i) & (new_size-1); - - ioat2_free_ring_ent(ring[new_idx], c); - } - kfree(ring); - return false; - } - set_desc_id(ring[new_idx], new_idx); - } - - /* hw link new descriptors */ - for (i = curr_size-1; i < new_size; i++) { - u16 new_idx = (ioat->tail+i) & (new_size-1); - struct ioat_ring_ent *next = ring[(new_idx+1) & (new_size-1)]; - struct ioat_dma_descriptor *hw = ring[new_idx]->hw; - - hw->next = next->txd.phys; - } - } else { - struct ioat_dma_descriptor *hw; - struct ioat_ring_ent *next; - - /* copy current descriptors to the new ring, dropping the - * removed descriptors - */ - for (i = 0; i < new_size; i++) { - u16 curr_idx = (ioat->tail+i) & (curr_size-1); - u16 new_idx = (ioat->tail+i) & (new_size-1); - - ring[new_idx] = ioat->ring[curr_idx]; - set_desc_id(ring[new_idx], new_idx); - } - - /* free deleted descriptors */ - for (i = new_size; i < curr_size; i++) { - struct ioat_ring_ent *ent; - - ent = ioat2_get_ring_ent(ioat, ioat->tail+i); - ioat2_free_ring_ent(ent, c); - } - - /* fix up hardware ring */ - hw = ring[(ioat->tail+new_size-1) & (new_size-1)]->hw; - next = ring[(ioat->tail+new_size) & (new_size-1)]; - hw->next = next->txd.phys; - } - - dev_dbg(to_dev(chan), "%s: allocated %d descriptors\n", - __func__, new_size); - - kfree(ioat->ring); - ioat->ring = ring; - ioat->alloc_order = order; - - return true; -} - -/** - * ioat2_alloc_and_lock - common descriptor alloc boilerplate for ioat2,3 ops - * @idx: gets starting descriptor index on successful allocation - * @ioat: ioat2,3 channel (ring) to operate on - * @num_descs: allocation length - */ -int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs) -{ - struct ioat_chan_common *chan = &ioat->base; - - spin_lock_bh(&ioat->ring_lock); - /* never allow the last descriptor to be consumed, we need at - * least one free at all times to allow for on-the-fly ring - * resizing. - */ - while (unlikely(ioat2_ring_space(ioat) <= num_descs)) { - if (reshape_ring(ioat, ioat->alloc_order + 1) && - ioat2_ring_space(ioat) > num_descs) - break; - - if (printk_ratelimit()) - dev_dbg(to_dev(chan), - "%s: ring full! 
num_descs: %d (%x:%x:%x)\n", - __func__, num_descs, ioat->head, ioat->tail, - ioat->issued); - spin_unlock_bh(&ioat->ring_lock); - - /* progress reclaim in the allocation failure case we - * may be called under bh_disabled so we need to trigger - * the timer event directly - */ - spin_lock_bh(&chan->cleanup_lock); - if (jiffies > chan->timer.expires && - timer_pending(&chan->timer)) { - struct ioatdma_device *device = chan->device; - - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - spin_unlock_bh(&chan->cleanup_lock); - device->timer_fn((unsigned long) ioat); - } else - spin_unlock_bh(&chan->cleanup_lock); - return -ENOMEM; - } - - dev_dbg(to_dev(chan), "%s: num_descs: %d (%x:%x:%x)\n", - __func__, num_descs, ioat->head, ioat->tail, ioat->issued); - - *idx = ioat2_desc_alloc(ioat, num_descs); - return 0; /* with ioat->ring_lock held */ -} - -struct dma_async_tx_descriptor * -ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest, - dma_addr_t dma_src, size_t len, unsigned long flags) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - struct ioat_dma_descriptor *hw; - struct ioat_ring_ent *desc; - dma_addr_t dst = dma_dest; - dma_addr_t src = dma_src; - size_t total_len = len; - int num_descs; - u16 idx; - int i; - - num_descs = ioat2_xferlen_to_descs(ioat, len); - if (likely(num_descs) && - ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0) - /* pass */; - else - return NULL; - i = 0; - do { - size_t copy = min_t(size_t, len, 1 << ioat->xfercap_log); - - desc = ioat2_get_ring_ent(ioat, idx + i); - hw = desc->hw; - - hw->size = copy; - hw->ctl = 0; - hw->src_addr = src; - hw->dst_addr = dst; - - len -= copy; - dst += copy; - src += copy; - dump_desc_dbg(ioat, desc); - } while (++i < num_descs); - - desc->txd.flags = flags; - desc->len = total_len; - hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT); - hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE); - hw->ctl_f.compl_write = 1; - dump_desc_dbg(ioat, desc); - /* we leave the channel locked to ensure in order submission */ - - return &desc->txd; -} - -/** - * ioat2_free_chan_resources - release all the descriptors - * @chan: the channel to be cleaned - */ -void ioat2_free_chan_resources(struct dma_chan *c) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - struct ioat_chan_common *chan = &ioat->base; - struct ioatdma_device *device = chan->device; - struct ioat_ring_ent *desc; - const u16 total_descs = 1 << ioat->alloc_order; - int descs; - int i; - - /* Before freeing channel resources first check - * if they have been previously allocated for this channel. - */ - if (!ioat->ring) - return; - - tasklet_disable(&chan->cleanup_task); - del_timer_sync(&chan->timer); - device->cleanup_tasklet((unsigned long) ioat); - - /* Delay 100ms after reset to allow internal DMA logic to quiesce - * before removing DMA descriptor resources. 
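
ioat2_dma_prep_memcpy_lock() above splits one logical copy into as many descriptors as the channel's maximum transfer size requires, and only the final descriptor carries the interrupt and completion-write bits. The chunking arithmetic in isolation (mirrors ioat2_xferlen_to_descs() and the prep loop; names are illustrative):

#include <stddef.h>

/* Descriptors needed for a transfer, given log2 of the channel's
 * maximum transfer size: a shift, plus one extra for any remainder.
 */
static int descs_for_len(size_t len, unsigned int xfercap_log)
{
	size_t mask = ((size_t)1 << xfercap_log) - 1;

	return (int)((len >> xfercap_log) + !!(len & mask));
}

/* Walk the chunks the prep loop would emit; each iteration would fill
 * in one hardware descriptor (src + off -> dst + off, size = copy).
 */
static void split_copy(size_t len, unsigned int xfercap_log)
{
	size_t cap = (size_t)1 << xfercap_log;
	size_t off = 0;

	while (len) {
		size_t copy = len < cap ? len : cap;

		off += copy;
		len -= copy;
	}
}
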
- */ - writeb(IOAT_CHANCMD_RESET, - chan->reg_base + IOAT_CHANCMD_OFFSET(chan->device->version)); - mdelay(100); - - spin_lock_bh(&ioat->ring_lock); - descs = ioat2_ring_space(ioat); - dev_dbg(to_dev(chan), "freeing %d idle descriptors\n", descs); - for (i = 0; i < descs; i++) { - desc = ioat2_get_ring_ent(ioat, ioat->head + i); - ioat2_free_ring_ent(desc, c); - } - - if (descs < total_descs) - dev_err(to_dev(chan), "Freeing %d in use descriptors!\n", - total_descs - descs); - - for (i = 0; i < total_descs - descs; i++) { - desc = ioat2_get_ring_ent(ioat, ioat->tail + i); - dump_desc_dbg(ioat, desc); - ioat2_free_ring_ent(desc, c); - } - - kfree(ioat->ring); - ioat->ring = NULL; - ioat->alloc_order = 0; - pci_pool_free(device->completion_pool, chan->completion, - chan->completion_dma); - spin_unlock_bh(&ioat->ring_lock); - - chan->last_completion = 0; - chan->completion_dma = 0; - ioat->pending = 0; - ioat->dmacount = 0; -} - -enum dma_status -ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie, - dma_cookie_t *done, dma_cookie_t *used) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - struct ioatdma_device *device = ioat->base.device; - - if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS) - return DMA_SUCCESS; - - device->cleanup_tasklet((unsigned long) ioat); - - return ioat_is_complete(c, cookie, done, used); -} - -static ssize_t ring_size_show(struct dma_chan *c, char *page) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - - return sprintf(page, "%d\n", (1 << ioat->alloc_order) & ~1); -} -static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size); - -static ssize_t ring_active_show(struct dma_chan *c, char *page) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - - /* ...taken outside the lock, no need to be precise */ - return sprintf(page, "%d\n", ioat2_ring_active(ioat)); -} -static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active); - -static struct attribute *ioat2_attrs[] = { - &ring_size_attr.attr, - &ring_active_attr.attr, - &ioat_cap_attr.attr, - &ioat_version_attr.attr, - NULL, -}; - -struct kobj_type ioat2_ktype = { - .sysfs_ops = &ioat_sysfs_ops, - .default_attrs = ioat2_attrs, -}; - -int __devinit ioat2_dma_probe(struct ioatdma_device *device, int dca) -{ - struct pci_dev *pdev = device->pdev; - struct dma_device *dma; - struct dma_chan *c; - struct ioat_chan_common *chan; - int err; - - device->enumerate_channels = ioat2_enumerate_channels; - device->cleanup_tasklet = ioat2_cleanup_tasklet; - device->timer_fn = ioat2_timer_event; - device->self_test = ioat_dma_self_test; - dma = &device->common; - dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock; - dma->device_issue_pending = ioat2_issue_pending; - dma->device_alloc_chan_resources = ioat2_alloc_chan_resources; - dma->device_free_chan_resources = ioat2_free_chan_resources; - dma->device_is_tx_complete = ioat2_is_complete; - - err = ioat_probe(device); - if (err) - return err; - ioat_set_tcp_copy_break(2048); - - list_for_each_entry(c, &dma->channels, device_node) { - chan = to_chan_common(c); - writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE | IOAT_DMA_DCA_ANY_CPU, - chan->reg_base + IOAT_DCACTRL_OFFSET); - } - - err = ioat_register(device); - if (err) - return err; - - ioat_kobject_add(device, &ioat2_ktype); - - if (dca) - device->dca = ioat2_dca_init(pdev, device->reg_base); - - return err; -} diff --git a/trunk/drivers/dma/ioat/dma_v2.h b/trunk/drivers/dma/ioat/dma_v2.h deleted file mode 100644 index 1d849ef74d5f..000000000000 --- a/trunk/drivers/dma/ioat/dma_v2.h +++ 
/dev/null @@ -1,190 +0,0 @@ -/* - * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ -#ifndef IOATDMA_V2_H -#define IOATDMA_V2_H - -#include -#include "dma.h" -#include "hw.h" - - -extern int ioat_pending_level; -extern int ioat_ring_alloc_order; - -/* - * workaround for IOAT ver.3.0 null descriptor issue - * (channel returns error when size is 0) - */ -#define NULL_DESC_BUFFER_SIZE 1 - -#define IOAT_MAX_ORDER 16 -#define ioat_get_alloc_order() \ - (min(ioat_ring_alloc_order, IOAT_MAX_ORDER)) -#define ioat_get_max_alloc_order() \ - (min(ioat_ring_max_alloc_order, IOAT_MAX_ORDER)) - -/* struct ioat2_dma_chan - ioat v2 / v3 channel attributes - * @base: common ioat channel parameters - * @xfercap_log; log2 of channel max transfer length (for fast division) - * @head: allocated index - * @issued: hardware notification point - * @tail: cleanup index - * @pending: lock free indicator for issued != head - * @dmacount: identical to 'head' except for occasionally resetting to zero - * @alloc_order: log2 of the number of allocated descriptors - * @ring: software ring buffer implementation of hardware ring - * @ring_lock: protects ring attributes - */ -struct ioat2_dma_chan { - struct ioat_chan_common base; - size_t xfercap_log; - u16 head; - u16 issued; - u16 tail; - u16 dmacount; - u16 alloc_order; - int pending; - struct ioat_ring_ent **ring; - spinlock_t ring_lock; -}; - -static inline struct ioat2_dma_chan *to_ioat2_chan(struct dma_chan *c) -{ - struct ioat_chan_common *chan = to_chan_common(c); - - return container_of(chan, struct ioat2_dma_chan, base); -} - -static inline u16 ioat2_ring_mask(struct ioat2_dma_chan *ioat) -{ - return (1 << ioat->alloc_order) - 1; -} - -/* count of descriptors in flight with the engine */ -static inline u16 ioat2_ring_active(struct ioat2_dma_chan *ioat) -{ - return (ioat->head - ioat->tail) & ioat2_ring_mask(ioat); -} - -/* count of descriptors pending submission to hardware */ -static inline u16 ioat2_ring_pending(struct ioat2_dma_chan *ioat) -{ - return (ioat->head - ioat->issued) & ioat2_ring_mask(ioat); -} - -static inline u16 ioat2_ring_space(struct ioat2_dma_chan *ioat) -{ - u16 num_descs = ioat2_ring_mask(ioat) + 1; - u16 active = ioat2_ring_active(ioat); - - BUG_ON(active > num_descs); - - return num_descs - active; -} - -/* assumes caller already checked space */ -static inline u16 ioat2_desc_alloc(struct ioat2_dma_chan *ioat, u16 len) -{ - ioat->head += len; - return ioat->head - len; -} - -static inline u16 ioat2_xferlen_to_descs(struct ioat2_dma_chan *ioat, size_t len) -{ - u16 num_descs = len >> ioat->xfercap_log; - - num_descs += !!(len & ((1 << ioat->xfercap_log) - 1)); - return num_descs; -} - -/** - * 
struct ioat_ring_ent - wrapper around hardware descriptor - * @hw: hardware DMA descriptor (for memcpy) - * @fill: hardware fill descriptor - * @xor: hardware xor descriptor - * @xor_ex: hardware xor extension descriptor - * @pq: hardware pq descriptor - * @pq_ex: hardware pq extension descriptor - * @pqu: hardware pq update descriptor - * @raw: hardware raw (un-typed) descriptor - * @txd: the generic software descriptor for all engines - * @len: total transaction length for unmap - * @result: asynchronous result of validate operations - * @id: identifier for debug - */ - -struct ioat_ring_ent { - union { - struct ioat_dma_descriptor *hw; - struct ioat_fill_descriptor *fill; - struct ioat_xor_descriptor *xor; - struct ioat_xor_ext_descriptor *xor_ex; - struct ioat_pq_descriptor *pq; - struct ioat_pq_ext_descriptor *pq_ex; - struct ioat_pq_update_descriptor *pqu; - struct ioat_raw_descriptor *raw; - }; - size_t len; - struct dma_async_tx_descriptor txd; - enum sum_check_flags *result; - #ifdef DEBUG - int id; - #endif -}; - -static inline struct ioat_ring_ent * -ioat2_get_ring_ent(struct ioat2_dma_chan *ioat, u16 idx) -{ - return ioat->ring[idx & ioat2_ring_mask(ioat)]; -} - -static inline void ioat2_set_chainaddr(struct ioat2_dma_chan *ioat, u64 addr) -{ - struct ioat_chan_common *chan = &ioat->base; - - writel(addr & 0x00000000FFFFFFFF, - chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW); - writel(addr >> 32, - chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH); -} - -int __devinit ioat2_dma_probe(struct ioatdma_device *dev, int dca); -int __devinit ioat3_dma_probe(struct ioatdma_device *dev, int dca); -struct dca_provider * __devinit ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase); -struct dca_provider * __devinit ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase); -int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs); -int ioat2_enumerate_channels(struct ioatdma_device *device); -struct dma_async_tx_descriptor * -ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest, - dma_addr_t dma_src, size_t len, unsigned long flags); -void ioat2_issue_pending(struct dma_chan *chan); -int ioat2_alloc_chan_resources(struct dma_chan *c); -void ioat2_free_chan_resources(struct dma_chan *c); -enum dma_status ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie, - dma_cookie_t *done, dma_cookie_t *used); -void __ioat2_restart_chan(struct ioat2_dma_chan *ioat); -bool reshape_ring(struct ioat2_dma_chan *ioat, int order); -void __ioat2_issue_pending(struct ioat2_dma_chan *ioat); -void ioat2_cleanup_tasklet(unsigned long data); -void ioat2_timer_event(unsigned long data); -extern struct kobj_type ioat2_ktype; -extern struct kmem_cache *ioat2_cache; -#endif /* IOATDMA_V2_H */ diff --git a/trunk/drivers/dma/ioat/dma_v3.c b/trunk/drivers/dma/ioat/dma_v3.c deleted file mode 100644 index 35d1e33afd5b..000000000000 --- a/trunk/drivers/dma/ioat/dma_v3.c +++ /dev/null @@ -1,1223 +0,0 @@ -/* - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". - * - * BSD LICENSE - * - * Copyright(c) 2004-2009 Intel Corporation. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
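
The v3 support code that follows drives xor and pq operations whose source addresses are spread across a base descriptor and an optional extended descriptor occupying the next ring slot. Per-index lookup tables decide, for each source, which descriptor to touch and which field within it. The selection logic restated standalone (table values copied from the code below; other names are illustrative):

#include <stdint.h>

/* Bit idx of the mask selects the base (0) vs extended (1) descriptor;
 * the field table gives the slot within that descriptor.
 */
static const uint8_t xor_desc_mask = 0xd0;
static const uint8_t xor_field_tbl[] = { 1, 4, 5, 6, 7, 0, 1, 2 };

struct raw_desc_model {
	uint64_t field[8];
};

static void set_xor_src_model(struct raw_desc_model *descs[2],
			      uint64_t addr, uint32_t offset, int idx)
{
	struct raw_desc_model *raw = descs[(xor_desc_mask >> idx) & 1];

	raw->field[xor_field_tbl[idx]] = addr + offset;
}
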
- */ - -/* - * Support routines for v3+ hardware - */ - -#include -#include -#include -#include "registers.h" -#include "hw.h" -#include "dma.h" -#include "dma_v2.h" - -/* ioat hardware assumes at least two sources for raid operations */ -#define src_cnt_to_sw(x) ((x) + 2) -#define src_cnt_to_hw(x) ((x) - 2) - -/* provide a lookup table for setting the source address in the base or - * extended descriptor of an xor or pq descriptor - */ -static const u8 xor_idx_to_desc __read_mostly = 0xd0; -static const u8 xor_idx_to_field[] __read_mostly = { 1, 4, 5, 6, 7, 0, 1, 2 }; -static const u8 pq_idx_to_desc __read_mostly = 0xf8; -static const u8 pq_idx_to_field[] __read_mostly = { 1, 4, 5, 0, 1, 2, 4, 5 }; - -static dma_addr_t xor_get_src(struct ioat_raw_descriptor *descs[2], int idx) -{ - struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1]; - - return raw->field[xor_idx_to_field[idx]]; -} - -static void xor_set_src(struct ioat_raw_descriptor *descs[2], - dma_addr_t addr, u32 offset, int idx) -{ - struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1]; - - raw->field[xor_idx_to_field[idx]] = addr + offset; -} - -static dma_addr_t pq_get_src(struct ioat_raw_descriptor *descs[2], int idx) -{ - struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1]; - - return raw->field[pq_idx_to_field[idx]]; -} - -static void pq_set_src(struct ioat_raw_descriptor *descs[2], - dma_addr_t addr, u32 offset, u8 coef, int idx) -{ - struct ioat_pq_descriptor *pq = (struct ioat_pq_descriptor *) descs[0]; - struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1]; - - raw->field[pq_idx_to_field[idx]] = addr + offset; - pq->coef[idx] = coef; -} - -static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat, - struct ioat_ring_ent *desc, int idx) -{ - struct ioat_chan_common *chan = &ioat->base; - struct pci_dev *pdev = chan->device->pdev; - size_t len = desc->len; - size_t offset = len - desc->hw->size; - struct dma_async_tx_descriptor *tx = &desc->txd; - enum dma_ctrl_flags flags = tx->flags; - - switch (desc->hw->ctl_f.op) { - case IOAT_OP_COPY: - if (!desc->hw->ctl_f.null) /* skip 'interrupt' ops */ - ioat_dma_unmap(chan, flags, len, desc->hw); - break; - case IOAT_OP_FILL: { - struct ioat_fill_descriptor *hw = desc->fill; - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) - ioat_unmap(pdev, hw->dst_addr - offset, len, - PCI_DMA_FROMDEVICE, flags, 1); - break; - } - case IOAT_OP_XOR_VAL: - case IOAT_OP_XOR: { - struct ioat_xor_descriptor *xor = desc->xor; - struct ioat_ring_ent *ext; - struct ioat_xor_ext_descriptor *xor_ex = NULL; - int src_cnt = src_cnt_to_sw(xor->ctl_f.src_cnt); - struct ioat_raw_descriptor *descs[2]; - int i; - - if (src_cnt > 5) { - ext = ioat2_get_ring_ent(ioat, idx + 1); - xor_ex = ext->xor_ex; - } - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - descs[0] = (struct ioat_raw_descriptor *) xor; - descs[1] = (struct ioat_raw_descriptor *) xor_ex; - for (i = 0; i < src_cnt; i++) { - dma_addr_t src = xor_get_src(descs, i); - - ioat_unmap(pdev, src - offset, len, - PCI_DMA_TODEVICE, flags, 0); - } - - /* dest is a source in xor validate operations */ - if (xor->ctl_f.op == IOAT_OP_XOR_VAL) { - ioat_unmap(pdev, xor->dst_addr - offset, len, - PCI_DMA_TODEVICE, flags, 1); - break; - } - } - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) - ioat_unmap(pdev, xor->dst_addr - offset, len, - PCI_DMA_FROMDEVICE, flags, 1); - break; - } - case IOAT_OP_PQ_VAL: - case IOAT_OP_PQ: { - struct ioat_pq_descriptor *pq = desc->pq; - struct ioat_ring_ent *ext; - struct 
ioat_pq_ext_descriptor *pq_ex = NULL; - int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt); - struct ioat_raw_descriptor *descs[2]; - int i; - - if (src_cnt > 3) { - ext = ioat2_get_ring_ent(ioat, idx + 1); - pq_ex = ext->pq_ex; - } - - /* in the 'continue' case don't unmap the dests as sources */ - if (dmaf_p_disabled_continue(flags)) - src_cnt--; - else if (dmaf_continue(flags)) - src_cnt -= 3; - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - descs[0] = (struct ioat_raw_descriptor *) pq; - descs[1] = (struct ioat_raw_descriptor *) pq_ex; - for (i = 0; i < src_cnt; i++) { - dma_addr_t src = pq_get_src(descs, i); - - ioat_unmap(pdev, src - offset, len, - PCI_DMA_TODEVICE, flags, 0); - } - - /* the dests are sources in pq validate operations */ - if (pq->ctl_f.op == IOAT_OP_XOR_VAL) { - if (!(flags & DMA_PREP_PQ_DISABLE_P)) - ioat_unmap(pdev, pq->p_addr - offset, - len, PCI_DMA_TODEVICE, flags, 0); - if (!(flags & DMA_PREP_PQ_DISABLE_Q)) - ioat_unmap(pdev, pq->q_addr - offset, - len, PCI_DMA_TODEVICE, flags, 0); - break; - } - } - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (!(flags & DMA_PREP_PQ_DISABLE_P)) - ioat_unmap(pdev, pq->p_addr - offset, len, - PCI_DMA_BIDIRECTIONAL, flags, 1); - if (!(flags & DMA_PREP_PQ_DISABLE_Q)) - ioat_unmap(pdev, pq->q_addr - offset, len, - PCI_DMA_BIDIRECTIONAL, flags, 1); - } - break; - } - default: - dev_err(&pdev->dev, "%s: unknown op type: %#x\n", - __func__, desc->hw->ctl_f.op); - } -} - -static bool desc_has_ext(struct ioat_ring_ent *desc) -{ - struct ioat_dma_descriptor *hw = desc->hw; - - if (hw->ctl_f.op == IOAT_OP_XOR || - hw->ctl_f.op == IOAT_OP_XOR_VAL) { - struct ioat_xor_descriptor *xor = desc->xor; - - if (src_cnt_to_sw(xor->ctl_f.src_cnt) > 5) - return true; - } else if (hw->ctl_f.op == IOAT_OP_PQ || - hw->ctl_f.op == IOAT_OP_PQ_VAL) { - struct ioat_pq_descriptor *pq = desc->pq; - - if (src_cnt_to_sw(pq->ctl_f.src_cnt) > 3) - return true; - } - - return false; -} - -/** - * __cleanup - reclaim used descriptors - * @ioat: channel (ring) to clean - * - * The difference from the dma_v2.c __cleanup() is that this routine - * handles extended descriptors and dma-unmapping raid operations. - */ -static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete) -{ - struct ioat_chan_common *chan = &ioat->base; - struct ioat_ring_ent *desc; - bool seen_current = false; - u16 active; - int i; - - dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n", - __func__, ioat->head, ioat->tail, ioat->issued); - - active = ioat2_ring_active(ioat); - for (i = 0; i < active && !seen_current; i++) { - struct dma_async_tx_descriptor *tx; - - prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1)); - desc = ioat2_get_ring_ent(ioat, ioat->tail + i); - dump_desc_dbg(ioat, desc); - tx = &desc->txd; - if (tx->cookie) { - chan->completed_cookie = tx->cookie; - ioat3_dma_unmap(ioat, desc, ioat->tail + i); - tx->cookie = 0; - if (tx->callback) { - tx->callback(tx->callback_param); - tx->callback = NULL; - } - } - - if (tx->phys == phys_complete) - seen_current = true; - - /* skip extended descriptors */ - if (desc_has_ext(desc)) { - BUG_ON(i + 1 >= active); - i++; - } - } - ioat->tail += i; - BUG_ON(!seen_current); /* no active descs have written a completion? 
*/ - chan->last_completion = phys_complete; - if (ioat->head == ioat->tail) { - dev_dbg(to_dev(chan), "%s: cancel completion timeout\n", - __func__); - clear_bit(IOAT_COMPLETION_PENDING, &chan->state); - mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT); - } -} - -static void ioat3_cleanup(struct ioat2_dma_chan *ioat) -{ - struct ioat_chan_common *chan = &ioat->base; - unsigned long phys_complete; - - prefetch(chan->completion); - - if (!spin_trylock_bh(&chan->cleanup_lock)) - return; - - if (!ioat_cleanup_preamble(chan, &phys_complete)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - if (!spin_trylock_bh(&ioat->ring_lock)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - __cleanup(ioat, phys_complete); - - spin_unlock_bh(&ioat->ring_lock); - spin_unlock_bh(&chan->cleanup_lock); -} - -static void ioat3_cleanup_tasklet(unsigned long data) -{ - struct ioat2_dma_chan *ioat = (void *) data; - - ioat3_cleanup(ioat); - writew(IOAT_CHANCTRL_RUN | IOAT3_CHANCTRL_COMPL_DCA_EN, - ioat->base.reg_base + IOAT_CHANCTRL_OFFSET); -} - -static void ioat3_restart_channel(struct ioat2_dma_chan *ioat) -{ - struct ioat_chan_common *chan = &ioat->base; - unsigned long phys_complete; - u32 status; - - status = ioat_chansts(chan); - if (is_ioat_active(status) || is_ioat_idle(status)) - ioat_suspend(chan); - while (is_ioat_active(status) || is_ioat_idle(status)) { - status = ioat_chansts(chan); - cpu_relax(); - } - - if (ioat_cleanup_preamble(chan, &phys_complete)) - __cleanup(ioat, phys_complete); - - __ioat2_restart_chan(ioat); -} - -static void ioat3_timer_event(unsigned long data) -{ - struct ioat2_dma_chan *ioat = (void *) data; - struct ioat_chan_common *chan = &ioat->base; - - spin_lock_bh(&chan->cleanup_lock); - if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) { - unsigned long phys_complete; - u64 status; - - spin_lock_bh(&ioat->ring_lock); - status = ioat_chansts(chan); - - /* when halted due to errors check for channel - * programming errors before advancing the completion state - */ - if (is_ioat_halted(status)) { - u32 chanerr; - - chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET); - BUG_ON(is_ioat_bug(chanerr)); - } - - /* if we haven't made progress and we have already - * acknowledged a pending completion once, then be more - * forceful with a restart - */ - if (ioat_cleanup_preamble(chan, &phys_complete)) - __cleanup(ioat, phys_complete); - else if (test_bit(IOAT_COMPLETION_ACK, &chan->state)) - ioat3_restart_channel(ioat); - else { - set_bit(IOAT_COMPLETION_ACK, &chan->state); - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - } - spin_unlock_bh(&ioat->ring_lock); - } else { - u16 active; - - /* if the ring is idle, empty, and oversized try to step - * down the size - */ - spin_lock_bh(&ioat->ring_lock); - active = ioat2_ring_active(ioat); - if (active == 0 && ioat->alloc_order > ioat_get_alloc_order()) - reshape_ring(ioat, ioat->alloc_order-1); - spin_unlock_bh(&ioat->ring_lock); - - /* keep shrinking until we get back to our minimum - * default size - */ - if (ioat->alloc_order > ioat_get_alloc_order()) - mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT); - } - spin_unlock_bh(&chan->cleanup_lock); -} - -static enum dma_status -ioat3_is_complete(struct dma_chan *c, dma_cookie_t cookie, - dma_cookie_t *done, dma_cookie_t *used) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - - if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS) - return DMA_SUCCESS; - - ioat3_cleanup(ioat); - - return ioat_is_complete(c, cookie, done, used); -} - -static struct 
dma_async_tx_descriptor * -ioat3_prep_memset_lock(struct dma_chan *c, dma_addr_t dest, int value, - size_t len, unsigned long flags) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - struct ioat_ring_ent *desc; - size_t total_len = len; - struct ioat_fill_descriptor *fill; - int num_descs; - u64 src_data = (0x0101010101010101ULL) * (value & 0xff); - u16 idx; - int i; - - num_descs = ioat2_xferlen_to_descs(ioat, len); - if (likely(num_descs) && - ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0) - /* pass */; - else - return NULL; - i = 0; - do { - size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log); - - desc = ioat2_get_ring_ent(ioat, idx + i); - fill = desc->fill; - - fill->size = xfer_size; - fill->src_data = src_data; - fill->dst_addr = dest; - fill->ctl = 0; - fill->ctl_f.op = IOAT_OP_FILL; - - len -= xfer_size; - dest += xfer_size; - dump_desc_dbg(ioat, desc); - } while (++i < num_descs); - - desc->txd.flags = flags; - desc->len = total_len; - fill->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT); - fill->ctl_f.fence = !!(flags & DMA_PREP_FENCE); - fill->ctl_f.compl_write = 1; - dump_desc_dbg(ioat, desc); - - /* we leave the channel locked to ensure in order submission */ - return &desc->txd; -} - -static struct dma_async_tx_descriptor * -__ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result, - dma_addr_t dest, dma_addr_t *src, unsigned int src_cnt, - size_t len, unsigned long flags) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - struct ioat_ring_ent *compl_desc; - struct ioat_ring_ent *desc; - struct ioat_ring_ent *ext; - size_t total_len = len; - struct ioat_xor_descriptor *xor; - struct ioat_xor_ext_descriptor *xor_ex = NULL; - struct ioat_dma_descriptor *hw; - u32 offset = 0; - int num_descs; - int with_ext; - int i; - u16 idx; - u8 op = result ? IOAT_OP_XOR_VAL : IOAT_OP_XOR; - - BUG_ON(src_cnt < 2); - - num_descs = ioat2_xferlen_to_descs(ioat, len); - /* we need 2x the number of descriptors to cover greater than 5 - * sources - */ - if (src_cnt > 5) { - with_ext = 1; - num_descs *= 2; - } else - with_ext = 0; - - /* completion writes from the raid engine may pass completion - * writes from the legacy engine, so we need one extra null - * (legacy) descriptor to ensure all completion writes arrive in - * order. 
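
Because the raid engine's completion writes can overtake the legacy engine's, every xor/pq chain built here is terminated by one extra null descriptor that carries the interrupt and completion-write control bits. Reduced to the fields that matter, a sketch modeled on the compl_desc setup below (the bitfield shape is illustrative):

/* Control bits the trailing null descriptor needs; names follow the
 * ctl_f bitfield of the hardware descriptor.
 */
struct null_ctl_model {
	unsigned int null : 1;		/* no data movement */
	unsigned int int_en : 1;	/* interrupt on completion */
	unsigned int compl_write : 1;	/* write the completion address */
	unsigned int size;		/* must be nonzero: the hardware
					 * rejects zero-length descriptors
					 * (hence NULL_DESC_BUFFER_SIZE) */
};

static void init_null_completion(struct null_ctl_model *ctl, int want_irq)
{
	ctl->null = 1;
	ctl->int_en = want_irq ? 1 : 0;	/* from DMA_PREP_INTERRUPT */
	ctl->compl_write = 1;
	ctl->size = 1;
}
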
- */ - if (likely(num_descs) && - ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0) - /* pass */; - else - return NULL; - i = 0; - do { - struct ioat_raw_descriptor *descs[2]; - size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log); - int s; - - desc = ioat2_get_ring_ent(ioat, idx + i); - xor = desc->xor; - - /* save a branch by unconditionally retrieving the - * extended descriptor xor_set_src() knows to not write - * to it in the single descriptor case - */ - ext = ioat2_get_ring_ent(ioat, idx + i + 1); - xor_ex = ext->xor_ex; - - descs[0] = (struct ioat_raw_descriptor *) xor; - descs[1] = (struct ioat_raw_descriptor *) xor_ex; - for (s = 0; s < src_cnt; s++) - xor_set_src(descs, src[s], offset, s); - xor->size = xfer_size; - xor->dst_addr = dest + offset; - xor->ctl = 0; - xor->ctl_f.op = op; - xor->ctl_f.src_cnt = src_cnt_to_hw(src_cnt); - - len -= xfer_size; - offset += xfer_size; - dump_desc_dbg(ioat, desc); - } while ((i += 1 + with_ext) < num_descs); - - /* last xor descriptor carries the unmap parameters and fence bit */ - desc->txd.flags = flags; - desc->len = total_len; - if (result) - desc->result = result; - xor->ctl_f.fence = !!(flags & DMA_PREP_FENCE); - - /* completion descriptor carries interrupt bit */ - compl_desc = ioat2_get_ring_ent(ioat, idx + i); - compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT; - hw = compl_desc->hw; - hw->ctl = 0; - hw->ctl_f.null = 1; - hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT); - hw->ctl_f.compl_write = 1; - hw->size = NULL_DESC_BUFFER_SIZE; - dump_desc_dbg(ioat, compl_desc); - - /* we leave the channel locked to ensure in order submission */ - return &desc->txd; -} - -static struct dma_async_tx_descriptor * -ioat3_prep_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, - unsigned int src_cnt, size_t len, unsigned long flags) -{ - return __ioat3_prep_xor_lock(chan, NULL, dest, src, src_cnt, len, flags); -} - -struct dma_async_tx_descriptor * -ioat3_prep_xor_val(struct dma_chan *chan, dma_addr_t *src, - unsigned int src_cnt, size_t len, - enum sum_check_flags *result, unsigned long flags) -{ - /* the cleanup routine only sets bits on validate failure, it - * does not clear bits on validate success... so clear it here - */ - *result = 0; - - return __ioat3_prep_xor_lock(chan, result, src[0], &src[1], - src_cnt - 1, len, flags); -} - -static void -dump_pq_desc_dbg(struct ioat2_dma_chan *ioat, struct ioat_ring_ent *desc, struct ioat_ring_ent *ext) -{ - struct device *dev = to_dev(&ioat->base); - struct ioat_pq_descriptor *pq = desc->pq; - struct ioat_pq_ext_descriptor *pq_ex = ext ? ext->pq_ex : NULL; - struct ioat_raw_descriptor *descs[] = { (void *) pq, (void *) pq_ex }; - int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt); - int i; - - dev_dbg(dev, "desc[%d]: (%#llx->%#llx) flags: %#x" - " sz: %#x ctl: %#x (op: %d int: %d compl: %d pq: '%s%s' src_cnt: %d)\n", - desc_id(desc), (unsigned long long) desc->txd.phys, - (unsigned long long) (pq_ex ? pq_ex->next : pq->next), - desc->txd.flags, pq->size, pq->ctl, pq->ctl_f.op, pq->ctl_f.int_en, - pq->ctl_f.compl_write, - pq->ctl_f.p_disable ? "" : "p", pq->ctl_f.q_disable ? 
"" : "q", - pq->ctl_f.src_cnt); - for (i = 0; i < src_cnt; i++) - dev_dbg(dev, "\tsrc[%d]: %#llx coef: %#x\n", i, - (unsigned long long) pq_get_src(descs, i), pq->coef[i]); - dev_dbg(dev, "\tP: %#llx\n", pq->p_addr); - dev_dbg(dev, "\tQ: %#llx\n", pq->q_addr); -} - -static struct dma_async_tx_descriptor * -__ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result, - const dma_addr_t *dst, const dma_addr_t *src, - unsigned int src_cnt, const unsigned char *scf, - size_t len, unsigned long flags) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - struct ioat_chan_common *chan = &ioat->base; - struct ioat_ring_ent *compl_desc; - struct ioat_ring_ent *desc; - struct ioat_ring_ent *ext; - size_t total_len = len; - struct ioat_pq_descriptor *pq; - struct ioat_pq_ext_descriptor *pq_ex = NULL; - struct ioat_dma_descriptor *hw; - u32 offset = 0; - int num_descs; - int with_ext; - int i, s; - u16 idx; - u8 op = result ? IOAT_OP_PQ_VAL : IOAT_OP_PQ; - - dev_dbg(to_dev(chan), "%s\n", __func__); - /* the engine requires at least two sources (we provide - * at least 1 implied source in the DMA_PREP_CONTINUE case) - */ - BUG_ON(src_cnt + dmaf_continue(flags) < 2); - - num_descs = ioat2_xferlen_to_descs(ioat, len); - /* we need 2x the number of descriptors to cover greater than 3 - * sources - */ - if (src_cnt > 3 || flags & DMA_PREP_CONTINUE) { - with_ext = 1; - num_descs *= 2; - } else - with_ext = 0; - - /* completion writes from the raid engine may pass completion - * writes from the legacy engine, so we need one extra null - * (legacy) descriptor to ensure all completion writes arrive in - * order. - */ - if (likely(num_descs) && - ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0) - /* pass */; - else - return NULL; - i = 0; - do { - struct ioat_raw_descriptor *descs[2]; - size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log); - - desc = ioat2_get_ring_ent(ioat, idx + i); - pq = desc->pq; - - /* save a branch by unconditionally retrieving the - * extended descriptor pq_set_src() knows to not write - * to it in the single descriptor case - */ - ext = ioat2_get_ring_ent(ioat, idx + i + with_ext); - pq_ex = ext->pq_ex; - - descs[0] = (struct ioat_raw_descriptor *) pq; - descs[1] = (struct ioat_raw_descriptor *) pq_ex; - - for (s = 0; s < src_cnt; s++) - pq_set_src(descs, src[s], offset, scf[s], s); - - /* see the comment for dma_maxpq in include/linux/dmaengine.h */ - if (dmaf_p_disabled_continue(flags)) - pq_set_src(descs, dst[1], offset, 1, s++); - else if (dmaf_continue(flags)) { - pq_set_src(descs, dst[0], offset, 0, s++); - pq_set_src(descs, dst[1], offset, 1, s++); - pq_set_src(descs, dst[1], offset, 0, s++); - } - pq->size = xfer_size; - pq->p_addr = dst[0] + offset; - pq->q_addr = dst[1] + offset; - pq->ctl = 0; - pq->ctl_f.op = op; - pq->ctl_f.src_cnt = src_cnt_to_hw(s); - pq->ctl_f.p_disable = !!(flags & DMA_PREP_PQ_DISABLE_P); - pq->ctl_f.q_disable = !!(flags & DMA_PREP_PQ_DISABLE_Q); - - len -= xfer_size; - offset += xfer_size; - } while ((i += 1 + with_ext) < num_descs); - - /* last pq descriptor carries the unmap parameters and fence bit */ - desc->txd.flags = flags; - desc->len = total_len; - if (result) - desc->result = result; - pq->ctl_f.fence = !!(flags & DMA_PREP_FENCE); - dump_pq_desc_dbg(ioat, desc, ext); - - /* completion descriptor carries interrupt bit */ - compl_desc = ioat2_get_ring_ent(ioat, idx + i); - compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT; - hw = compl_desc->hw; - hw->ctl = 0; - hw->ctl_f.null = 1; - hw->ctl_f.int_en = !!(flags & 
DMA_PREP_INTERRUPT); - hw->ctl_f.compl_write = 1; - hw->size = NULL_DESC_BUFFER_SIZE; - dump_desc_dbg(ioat, compl_desc); - - /* we leave the channel locked to ensure in order submission */ - return &desc->txd; -} - -static struct dma_async_tx_descriptor * -ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, - unsigned int src_cnt, const unsigned char *scf, size_t len, - unsigned long flags) -{ - /* handle the single source multiply case from the raid6 - * recovery path - */ - if (unlikely((flags & DMA_PREP_PQ_DISABLE_P) && src_cnt == 1)) { - dma_addr_t single_source[2]; - unsigned char single_source_coef[2]; - - BUG_ON(flags & DMA_PREP_PQ_DISABLE_Q); - single_source[0] = src[0]; - single_source[1] = src[0]; - single_source_coef[0] = scf[0]; - single_source_coef[1] = 0; - - return __ioat3_prep_pq_lock(chan, NULL, dst, single_source, 2, - single_source_coef, len, flags); - } else - return __ioat3_prep_pq_lock(chan, NULL, dst, src, src_cnt, scf, - len, flags); -} - -struct dma_async_tx_descriptor * -ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, - unsigned int src_cnt, const unsigned char *scf, size_t len, - enum sum_check_flags *pqres, unsigned long flags) -{ - /* the cleanup routine only sets bits on validate failure, it - * does not clear bits on validate success... so clear it here - */ - *pqres = 0; - - return __ioat3_prep_pq_lock(chan, pqres, pq, src, src_cnt, scf, len, - flags); -} - -static struct dma_async_tx_descriptor * -ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src, - unsigned int src_cnt, size_t len, unsigned long flags) -{ - unsigned char scf[src_cnt]; - dma_addr_t pq[2]; - - memset(scf, 0, src_cnt); - flags |= DMA_PREP_PQ_DISABLE_Q; - pq[0] = dst; - pq[1] = ~0; - - return __ioat3_prep_pq_lock(chan, NULL, pq, src, src_cnt, scf, len, - flags); -} - -struct dma_async_tx_descriptor * -ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src, - unsigned int src_cnt, size_t len, - enum sum_check_flags *result, unsigned long flags) -{ - unsigned char scf[src_cnt]; - dma_addr_t pq[2]; - - /* the cleanup routine only sets bits on validate failure, it - * does not clear bits on validate success... 
so clear it here - */ - *result = 0; - - memset(scf, 0, src_cnt); - flags |= DMA_PREP_PQ_DISABLE_Q; - pq[0] = src[0]; - pq[1] = ~0; - - return __ioat3_prep_pq_lock(chan, result, pq, &src[1], src_cnt - 1, scf, - len, flags); -} - -static struct dma_async_tx_descriptor * -ioat3_prep_interrupt_lock(struct dma_chan *c, unsigned long flags) -{ - struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - struct ioat_ring_ent *desc; - struct ioat_dma_descriptor *hw; - u16 idx; - - if (ioat2_alloc_and_lock(&idx, ioat, 1) == 0) - desc = ioat2_get_ring_ent(ioat, idx); - else - return NULL; - - hw = desc->hw; - hw->ctl = 0; - hw->ctl_f.null = 1; - hw->ctl_f.int_en = 1; - hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE); - hw->ctl_f.compl_write = 1; - hw->size = NULL_DESC_BUFFER_SIZE; - hw->src_addr = 0; - hw->dst_addr = 0; - - desc->txd.flags = flags; - desc->len = 1; - - dump_desc_dbg(ioat, desc); - - /* we leave the channel locked to ensure in order submission */ - return &desc->txd; -} - -static void __devinit ioat3_dma_test_callback(void *dma_async_param) -{ - struct completion *cmp = dma_async_param; - - complete(cmp); -} - -#define IOAT_NUM_SRC_TEST 6 /* must be <= 8 */ -static int __devinit ioat_xor_val_self_test(struct ioatdma_device *device) -{ - int i, src_idx; - struct page *dest; - struct page *xor_srcs[IOAT_NUM_SRC_TEST]; - struct page *xor_val_srcs[IOAT_NUM_SRC_TEST + 1]; - dma_addr_t dma_srcs[IOAT_NUM_SRC_TEST + 1]; - dma_addr_t dma_addr, dest_dma; - struct dma_async_tx_descriptor *tx; - struct dma_chan *dma_chan; - dma_cookie_t cookie; - u8 cmp_byte = 0; - u32 cmp_word; - u32 xor_val_result; - int err = 0; - struct completion cmp; - unsigned long tmo; - struct device *dev = &device->pdev->dev; - struct dma_device *dma = &device->common; - - dev_dbg(dev, "%s\n", __func__); - - if (!dma_has_cap(DMA_XOR, dma->cap_mask)) - return 0; - - for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) { - xor_srcs[src_idx] = alloc_page(GFP_KERNEL); - if (!xor_srcs[src_idx]) { - while (src_idx--) - __free_page(xor_srcs[src_idx]); - return -ENOMEM; - } - } - - dest = alloc_page(GFP_KERNEL); - if (!dest) { - while (src_idx--) - __free_page(xor_srcs[src_idx]); - return -ENOMEM; - } - - /* Fill in src buffers */ - for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) { - u8 *ptr = page_address(xor_srcs[src_idx]); - for (i = 0; i < PAGE_SIZE; i++) - ptr[i] = (1 << src_idx); - } - - for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) - cmp_byte ^= (u8) (1 << src_idx); - - cmp_word = (cmp_byte << 24) | (cmp_byte << 16) | - (cmp_byte << 8) | cmp_byte; - - memset(page_address(dest), 0, PAGE_SIZE); - - dma_chan = container_of(dma->channels.next, struct dma_chan, - device_node); - if (dma->device_alloc_chan_resources(dma_chan) < 1) { - err = -ENODEV; - goto out; - } - - /* test xor */ - dest_dma = dma_map_page(dev, dest, 0, PAGE_SIZE, DMA_FROM_DEVICE); - for (i = 0; i < IOAT_NUM_SRC_TEST; i++) - dma_srcs[i] = dma_map_page(dev, xor_srcs[i], 0, PAGE_SIZE, - DMA_TO_DEVICE); - tx = dma->device_prep_dma_xor(dma_chan, dest_dma, dma_srcs, - IOAT_NUM_SRC_TEST, PAGE_SIZE, - DMA_PREP_INTERRUPT); - - if (!tx) { - dev_err(dev, "Self-test xor prep failed\n"); - err = -ENODEV; - goto free_resources; - } - - async_tx_ack(tx); - init_completion(&cmp); - tx->callback = ioat3_dma_test_callback; - tx->callback_param = &cmp; - cookie = tx->tx_submit(tx); - if (cookie < 0) { - dev_err(dev, "Self-test xor setup failed\n"); - err = -ENODEV; - goto free_resources; - } - dma->device_issue_pending(dma_chan); - - tmo = 
wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); - - if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { - dev_err(dev, "Self-test xor timed out\n"); - err = -ENODEV; - goto free_resources; - } - - dma_sync_single_for_cpu(dev, dest_dma, PAGE_SIZE, DMA_FROM_DEVICE); - for (i = 0; i < (PAGE_SIZE / sizeof(u32)); i++) { - u32 *ptr = page_address(dest); - if (ptr[i] != cmp_word) { - dev_err(dev, "Self-test xor failed compare\n"); - err = -ENODEV; - goto free_resources; - } - } - dma_sync_single_for_device(dev, dest_dma, PAGE_SIZE, DMA_TO_DEVICE); - - /* skip validate if the capability is not present */ - if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask)) - goto free_resources; - - /* validate the sources with the destination page */ - for (i = 0; i < IOAT_NUM_SRC_TEST; i++) - xor_val_srcs[i] = xor_srcs[i]; - xor_val_srcs[i] = dest; - - xor_val_result = 1; - - for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++) - dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE, - DMA_TO_DEVICE); - tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs, - IOAT_NUM_SRC_TEST + 1, PAGE_SIZE, - &xor_val_result, DMA_PREP_INTERRUPT); - if (!tx) { - dev_err(dev, "Self-test zero prep failed\n"); - err = -ENODEV; - goto free_resources; - } - - async_tx_ack(tx); - init_completion(&cmp); - tx->callback = ioat3_dma_test_callback; - tx->callback_param = &cmp; - cookie = tx->tx_submit(tx); - if (cookie < 0) { - dev_err(dev, "Self-test zero setup failed\n"); - err = -ENODEV; - goto free_resources; - } - dma->device_issue_pending(dma_chan); - - tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); - - if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { - dev_err(dev, "Self-test validate timed out\n"); - err = -ENODEV; - goto free_resources; - } - - if (xor_val_result != 0) { - dev_err(dev, "Self-test validate failed compare\n"); - err = -ENODEV; - goto free_resources; - } - - /* skip memset if the capability is not present */ - if (!dma_has_cap(DMA_MEMSET, dma_chan->device->cap_mask)) - goto free_resources; - - /* test memset */ - dma_addr = dma_map_page(dev, dest, 0, - PAGE_SIZE, DMA_FROM_DEVICE); - tx = dma->device_prep_dma_memset(dma_chan, dma_addr, 0, PAGE_SIZE, - DMA_PREP_INTERRUPT); - if (!tx) { - dev_err(dev, "Self-test memset prep failed\n"); - err = -ENODEV; - goto free_resources; - } - - async_tx_ack(tx); - init_completion(&cmp); - tx->callback = ioat3_dma_test_callback; - tx->callback_param = &cmp; - cookie = tx->tx_submit(tx); - if (cookie < 0) { - dev_err(dev, "Self-test memset setup failed\n"); - err = -ENODEV; - goto free_resources; - } - dma->device_issue_pending(dma_chan); - - tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); - - if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { - dev_err(dev, "Self-test memset timed out\n"); - err = -ENODEV; - goto free_resources; - } - - for (i = 0; i < PAGE_SIZE/sizeof(u32); i++) { - u32 *ptr = page_address(dest); - if (ptr[i]) { - dev_err(dev, "Self-test memset failed compare\n"); - err = -ENODEV; - goto free_resources; - } - } - - /* test for non-zero parity sum */ - xor_val_result = 0; - for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++) - dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE, - DMA_TO_DEVICE); - tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs, - IOAT_NUM_SRC_TEST + 1, PAGE_SIZE, - &xor_val_result, DMA_PREP_INTERRUPT); - if (!tx) { - dev_err(dev, "Self-test 2nd zero prep failed\n"); - err = -ENODEV; - goto 
free_resources; - } - - async_tx_ack(tx); - init_completion(&cmp); - tx->callback = ioat3_dma_test_callback; - tx->callback_param = &cmp; - cookie = tx->tx_submit(tx); - if (cookie < 0) { - dev_err(dev, "Self-test 2nd zero setup failed\n"); - err = -ENODEV; - goto free_resources; - } - dma->device_issue_pending(dma_chan); - - tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); - - if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { - dev_err(dev, "Self-test 2nd validate timed out\n"); - err = -ENODEV; - goto free_resources; - } - - if (xor_val_result != SUM_CHECK_P_RESULT) { - dev_err(dev, "Self-test validate failed compare\n"); - err = -ENODEV; - goto free_resources; - } - -free_resources: - dma->device_free_chan_resources(dma_chan); -out: - src_idx = IOAT_NUM_SRC_TEST; - while (src_idx--) - __free_page(xor_srcs[src_idx]); - __free_page(dest); - return err; -} - -static int __devinit ioat3_dma_self_test(struct ioatdma_device *device) -{ - int rc = ioat_dma_self_test(device); - - if (rc) - return rc; - - rc = ioat_xor_val_self_test(device); - if (rc) - return rc; - - return 0; -} - -int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca) -{ - struct pci_dev *pdev = device->pdev; - struct dma_device *dma; - struct dma_chan *c; - struct ioat_chan_common *chan; - bool is_raid_device = false; - int err; - u16 dev_id; - u32 cap; - - device->enumerate_channels = ioat2_enumerate_channels; - device->self_test = ioat3_dma_self_test; - dma = &device->common; - dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock; - dma->device_issue_pending = ioat2_issue_pending; - dma->device_alloc_chan_resources = ioat2_alloc_chan_resources; - dma->device_free_chan_resources = ioat2_free_chan_resources; - - dma_cap_set(DMA_INTERRUPT, dma->cap_mask); - dma->device_prep_dma_interrupt = ioat3_prep_interrupt_lock; - - cap = readl(device->reg_base + IOAT_DMA_CAP_OFFSET); - if (cap & IOAT_CAP_XOR) { - is_raid_device = true; - dma->max_xor = 8; - dma->xor_align = 2; - - dma_cap_set(DMA_XOR, dma->cap_mask); - dma->device_prep_dma_xor = ioat3_prep_xor; - - dma_cap_set(DMA_XOR_VAL, dma->cap_mask); - dma->device_prep_dma_xor_val = ioat3_prep_xor_val; - } - if (cap & IOAT_CAP_PQ) { - is_raid_device = true; - dma_set_maxpq(dma, 8, 0); - dma->pq_align = 2; - - dma_cap_set(DMA_PQ, dma->cap_mask); - dma->device_prep_dma_pq = ioat3_prep_pq; - - dma_cap_set(DMA_PQ_VAL, dma->cap_mask); - dma->device_prep_dma_pq_val = ioat3_prep_pq_val; - - if (!(cap & IOAT_CAP_XOR)) { - dma->max_xor = 8; - dma->xor_align = 2; - - dma_cap_set(DMA_XOR, dma->cap_mask); - dma->device_prep_dma_xor = ioat3_prep_pqxor; - - dma_cap_set(DMA_XOR_VAL, dma->cap_mask); - dma->device_prep_dma_xor_val = ioat3_prep_pqxor_val; - } - } - if (is_raid_device && (cap & IOAT_CAP_FILL_BLOCK)) { - dma_cap_set(DMA_MEMSET, dma->cap_mask); - dma->device_prep_dma_memset = ioat3_prep_memset_lock; - } - - - if (is_raid_device) { - dma->device_is_tx_complete = ioat3_is_complete; - device->cleanup_tasklet = ioat3_cleanup_tasklet; - device->timer_fn = ioat3_timer_event; - } else { - dma->device_is_tx_complete = ioat2_is_complete; - device->cleanup_tasklet = ioat2_cleanup_tasklet; - device->timer_fn = ioat2_timer_event; - } - - /* -= IOAT ver.3 workarounds =- */ - /* Write CHANERRMSK_INT with 3E07h to mask out the errors - * that can cause stability issues for IOAT ver.3 - */ - pci_write_config_dword(pdev, IOAT_PCI_CHANERRMASK_INT_OFFSET, 0x3e07); - - /* Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit - * (workaround for 
spurious config parity error after restart) - */ - pci_read_config_word(pdev, IOAT_PCI_DEVICE_ID_OFFSET, &dev_id); - if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0) - pci_write_config_dword(pdev, IOAT_PCI_DMAUNCERRSTS_OFFSET, 0x10); - - err = ioat_probe(device); - if (err) - return err; - ioat_set_tcp_copy_break(262144); - - list_for_each_entry(c, &dma->channels, device_node) { - chan = to_chan_common(c); - writel(IOAT_DMA_DCA_ANY_CPU, - chan->reg_base + IOAT_DCACTRL_OFFSET); - } - - err = ioat_register(device); - if (err) - return err; - - ioat_kobject_add(device, &ioat2_ktype); - - if (dca) - device->dca = ioat3_dca_init(pdev, device->reg_base); - - return 0; -} diff --git a/trunk/drivers/dma/ioat/hw.h b/trunk/drivers/dma/ioat/hw.h deleted file mode 100644 index 99afb12bd409..000000000000 --- a/trunk/drivers/dma/ioat/hw.h +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ -#ifndef _IOAT_HW_H_ -#define _IOAT_HW_H_ - -/* PCI Configuration Space Values */ -#define IOAT_PCI_VID 0x8086 -#define IOAT_MMIO_BAR 0 - -/* CB device ID's */ -#define IOAT_PCI_DID_5000 0x1A38 -#define IOAT_PCI_DID_CNB 0x360B -#define IOAT_PCI_DID_SCNB 0x65FF -#define IOAT_PCI_DID_SNB 0x402F - -#define IOAT_PCI_RID 0x00 -#define IOAT_PCI_SVID 0x8086 -#define IOAT_PCI_SID 0x8086 -#define IOAT_VER_1_2 0x12 /* Version 1.2 */ -#define IOAT_VER_2_0 0x20 /* Version 2.0 */ -#define IOAT_VER_3_0 0x30 /* Version 3.0 */ -#define IOAT_VER_3_2 0x32 /* Version 3.2 */ - -struct ioat_dma_descriptor { - uint32_t size; - union { - uint32_t ctl; - struct { - unsigned int int_en:1; - unsigned int src_snoop_dis:1; - unsigned int dest_snoop_dis:1; - unsigned int compl_write:1; - unsigned int fence:1; - unsigned int null:1; - unsigned int src_brk:1; - unsigned int dest_brk:1; - unsigned int bundle:1; - unsigned int dest_dca:1; - unsigned int hint:1; - unsigned int rsvd2:13; - #define IOAT_OP_COPY 0x00 - unsigned int op:8; - } ctl_f; - }; - uint64_t src_addr; - uint64_t dst_addr; - uint64_t next; - uint64_t rsv1; - uint64_t rsv2; - /* store some driver data in an unused portion of the descriptor */ - union { - uint64_t user1; - uint64_t tx_cnt; - }; - uint64_t user2; -}; - -struct ioat_fill_descriptor { - uint32_t size; - union { - uint32_t ctl; - struct { - unsigned int int_en:1; - unsigned int rsvd:1; - unsigned int dest_snoop_dis:1; - unsigned int compl_write:1; - unsigned int fence:1; - unsigned int rsvd2:2; - unsigned int dest_brk:1; - unsigned int bundle:1; - unsigned int rsvd4:15; - #define IOAT_OP_FILL 0x01 - unsigned int op:8; - } ctl_f; - }; - uint64_t src_data; - uint64_t dst_addr; - uint64_t next; - uint64_t rsv1; - uint64_t next_dst_addr; - uint64_t 
user1; - uint64_t user2; -}; - -struct ioat_xor_descriptor { - uint32_t size; - union { - uint32_t ctl; - struct { - unsigned int int_en:1; - unsigned int src_snoop_dis:1; - unsigned int dest_snoop_dis:1; - unsigned int compl_write:1; - unsigned int fence:1; - unsigned int src_cnt:3; - unsigned int bundle:1; - unsigned int dest_dca:1; - unsigned int hint:1; - unsigned int rsvd:13; - #define IOAT_OP_XOR 0x87 - #define IOAT_OP_XOR_VAL 0x88 - unsigned int op:8; - } ctl_f; - }; - uint64_t src_addr; - uint64_t dst_addr; - uint64_t next; - uint64_t src_addr2; - uint64_t src_addr3; - uint64_t src_addr4; - uint64_t src_addr5; -}; - -struct ioat_xor_ext_descriptor { - uint64_t src_addr6; - uint64_t src_addr7; - uint64_t src_addr8; - uint64_t next; - uint64_t rsvd[4]; -}; - -struct ioat_pq_descriptor { - uint32_t size; - union { - uint32_t ctl; - struct { - unsigned int int_en:1; - unsigned int src_snoop_dis:1; - unsigned int dest_snoop_dis:1; - unsigned int compl_write:1; - unsigned int fence:1; - unsigned int src_cnt:3; - unsigned int bundle:1; - unsigned int dest_dca:1; - unsigned int hint:1; - unsigned int p_disable:1; - unsigned int q_disable:1; - unsigned int rsvd:11; - #define IOAT_OP_PQ 0x89 - #define IOAT_OP_PQ_VAL 0x8a - unsigned int op:8; - } ctl_f; - }; - uint64_t src_addr; - uint64_t p_addr; - uint64_t next; - uint64_t src_addr2; - uint64_t src_addr3; - uint8_t coef[8]; - uint64_t q_addr; -}; - -struct ioat_pq_ext_descriptor { - uint64_t src_addr4; - uint64_t src_addr5; - uint64_t src_addr6; - uint64_t next; - uint64_t src_addr7; - uint64_t src_addr8; - uint64_t rsvd[2]; -}; - -struct ioat_pq_update_descriptor { - uint32_t size; - union { - uint32_t ctl; - struct { - unsigned int int_en:1; - unsigned int src_snoop_dis:1; - unsigned int dest_snoop_dis:1; - unsigned int compl_write:1; - unsigned int fence:1; - unsigned int src_cnt:3; - unsigned int bundle:1; - unsigned int dest_dca:1; - unsigned int hint:1; - unsigned int p_disable:1; - unsigned int q_disable:1; - unsigned int rsvd:3; - unsigned int coef:8; - #define IOAT_OP_PQ_UP 0x8b - unsigned int op:8; - } ctl_f; - }; - uint64_t src_addr; - uint64_t p_addr; - uint64_t next; - uint64_t src_addr2; - uint64_t p_src; - uint64_t q_src; - uint64_t q_addr; -}; - -struct ioat_raw_descriptor { - uint64_t field[8]; -}; -#endif diff --git a/trunk/drivers/dma/ioat/pci.c b/trunk/drivers/dma/ioat/pci.c deleted file mode 100644 index d545fae30f37..000000000000 --- a/trunk/drivers/dma/ioat/pci.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Intel I/OAT DMA Linux driver - * Copyright(c) 2007 - 2009 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". - * - */ - -/* - * This driver supports an Intel I/OAT DMA engine, which does asynchronous - * copy operations. 
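- * The engine is exposed to in-kernel clients through the generic dmaengine API.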
- */ - -#include -#include -#include -#include -#include -#include "dma.h" -#include "dma_v2.h" -#include "registers.h" -#include "hw.h" - -MODULE_VERSION(IOAT_DMA_VERSION); -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Intel Corporation"); - -static struct pci_device_id ioat_pci_tbl[] = { - /* I/OAT v1 platforms */ - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) }, - { PCI_VDEVICE(UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) }, - - /* I/OAT v2 platforms */ - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) }, - - /* I/OAT v3 platforms */ - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) }, - - /* I/OAT v3.2 platforms */ - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF0) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF1) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF2) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF3) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF4) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF5) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF6) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF7) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF8) }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF9) }, - - { 0, } -}; -MODULE_DEVICE_TABLE(pci, ioat_pci_tbl); - -static int __devinit ioat_pci_probe(struct pci_dev *pdev, - const struct pci_device_id *id); -static void __devexit ioat_remove(struct pci_dev *pdev); - -static int ioat_dca_enabled = 1; -module_param(ioat_dca_enabled, int, 0644); -MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)"); - -struct kmem_cache *ioat2_cache; - -#define DRV_NAME "ioatdma" - -static struct pci_driver ioat_pci_driver = { - .name = DRV_NAME, - .id_table = ioat_pci_tbl, - .probe = ioat_pci_probe, - .remove = __devexit_p(ioat_remove), -}; - -static struct ioatdma_device * -alloc_ioatdma(struct pci_dev *pdev, void __iomem *iobase) -{ - struct device *dev = &pdev->dev; - struct ioatdma_device *d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL); - - if (!d) - return NULL; - d->pdev = pdev; - d->reg_base = iobase; - return d; -} - -static int __devinit ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - void __iomem * const *iomap; - struct device *dev = &pdev->dev; - struct ioatdma_device *device; - int err; - - err = pcim_enable_device(pdev); - if (err) - return err; - - err = pcim_iomap_regions(pdev, 1 << IOAT_MMIO_BAR, DRV_NAME); - if (err) - return err; - iomap = pcim_iomap_table(pdev); - if (!iomap) - return -ENOMEM; - - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) - return err; - - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) - return err; - - device = devm_kzalloc(dev, sizeof(*device), GFP_KERNEL); - if (!device) - return -ENOMEM; - - pci_set_master(pdev); - - device = alloc_ioatdma(pdev, iomap[IOAT_MMIO_BAR]); - if (!device) - return -ENOMEM; - 
pci_set_drvdata(pdev, device); - - device->version = readb(device->reg_base + IOAT_VER_OFFSET); - if (device->version == IOAT_VER_1_2) - err = ioat1_dma_probe(device, ioat_dca_enabled); - else if (device->version == IOAT_VER_2_0) - err = ioat2_dma_probe(device, ioat_dca_enabled); - else if (device->version >= IOAT_VER_3_0) - err = ioat3_dma_probe(device, ioat_dca_enabled); - else - return -ENODEV; - - if (err) { - dev_err(dev, "Intel(R) I/OAT DMA Engine init failed\n"); - return -ENODEV; - } - - return 0; -} - -static void __devexit ioat_remove(struct pci_dev *pdev) -{ - struct ioatdma_device *device = pci_get_drvdata(pdev); - - if (!device) - return; - - dev_err(&pdev->dev, "Removing dma and dca services\n"); - if (device->dca) { - unregister_dca_provider(device->dca, &pdev->dev); - free_dca_provider(device->dca); - device->dca = NULL; - } - ioat_dma_remove(device); -} - -static int __init ioat_init_module(void) -{ - int err; - - pr_info("%s: Intel(R) QuickData Technology Driver %s\n", - DRV_NAME, IOAT_DMA_VERSION); - - ioat2_cache = kmem_cache_create("ioat2", sizeof(struct ioat_ring_ent), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ioat2_cache) - return -ENOMEM; - - err = pci_register_driver(&ioat_pci_driver); - if (err) - kmem_cache_destroy(ioat2_cache); - - return err; -} -module_init(ioat_init_module); - -static void __exit ioat_exit_module(void) -{ - pci_unregister_driver(&ioat_pci_driver); - kmem_cache_destroy(ioat2_cache); -} -module_exit(ioat_exit_module); diff --git a/trunk/drivers/dma/ioat/dca.c b/trunk/drivers/dma/ioat_dca.c similarity index 98% rename from trunk/drivers/dma/ioat/dca.c rename to trunk/drivers/dma/ioat_dca.c index 69d02615c4d6..c012a1e15043 100644 --- a/trunk/drivers/dma/ioat/dca.c +++ b/trunk/drivers/dma/ioat_dca.c @@ -33,8 +33,8 @@ #define cpu_physical_id(cpu) (cpuid_ebx(1) >> 24) #endif -#include "dma.h" -#include "registers.h" +#include "ioatdma.h" +#include "ioatdma_registers.h" /* * Bit 7 of a tag map entry is the "valid" bit, if it is set then bits 0:6 @@ -242,8 +242,7 @@ static struct dca_ops ioat_dca_ops = { }; -struct dca_provider * __devinit -ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase) +struct dca_provider *ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase) { struct dca_provider *dca; struct ioat_dca_priv *ioatdca; @@ -408,8 +407,7 @@ static int ioat2_dca_count_dca_slots(void __iomem *iobase, u16 dca_offset) return slots; } -struct dca_provider * __devinit -ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase) +struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase) { struct dca_provider *dca; struct ioat_dca_priv *ioatdca; @@ -604,8 +602,7 @@ static int ioat3_dca_count_dca_slots(void *iobase, u16 dca_offset) return slots; } -struct dca_provider * __devinit -ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase) +struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase) { struct dca_provider *dca; struct ioat_dca_priv *ioatdca; diff --git a/trunk/drivers/dma/ioat_dma.c b/trunk/drivers/dma/ioat_dma.c new file mode 100644 index 000000000000..a600fc0f7962 --- /dev/null +++ b/trunk/drivers/dma/ioat_dma.c @@ -0,0 +1,1741 @@ +/* + * Intel I/OAT DMA Linux driver + * Copyright(c) 2004 - 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +/* + * This driver supports an Intel I/OAT DMA engine, which does asynchronous + * copy operations. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ioatdma.h" +#include "ioatdma_registers.h" +#include "ioatdma_hw.h" + +#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common) +#define to_ioatdma_device(dev) container_of(dev, struct ioatdma_device, common) +#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node) +#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, async_tx) + +#define chan_num(ch) ((int)((ch)->reg_base - (ch)->device->reg_base) / 0x80) +static int ioat_pending_level = 4; +module_param(ioat_pending_level, int, 0644); +MODULE_PARM_DESC(ioat_pending_level, + "high-water mark for pushing ioat descriptors (default: 4)"); + +#define RESET_DELAY msecs_to_jiffies(100) +#define WATCHDOG_DELAY round_jiffies(msecs_to_jiffies(2000)) +static void ioat_dma_chan_reset_part2(struct work_struct *work); +static void ioat_dma_chan_watchdog(struct work_struct *work); + +/* + * workaround for IOAT ver.3.0 null descriptor issue + * (channel returns error when size is 0) + */ +#define NULL_DESC_BUFFER_SIZE 1 + +/* internal functions */ +static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan); +static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan); + +static struct ioat_desc_sw * +ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan); +static struct ioat_desc_sw * +ioat2_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan); + +static inline struct ioat_dma_chan *ioat_lookup_chan_by_index( + struct ioatdma_device *device, + int index) +{ + return device->idx[index]; +} + +/** + * ioat_dma_do_interrupt - handler used for single vector interrupt mode + * @irq: interrupt id + * @data: interrupt data + */ +static irqreturn_t ioat_dma_do_interrupt(int irq, void *data) +{ + struct ioatdma_device *instance = data; + struct ioat_dma_chan *ioat_chan; + unsigned long attnstatus; + int bit; + u8 intrctrl; + + intrctrl = readb(instance->reg_base + IOAT_INTRCTRL_OFFSET); + + if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN)) + return IRQ_NONE; + + if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) { + writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET); + return IRQ_NONE; + } + + attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET); + for_each_bit(bit, &attnstatus, BITS_PER_LONG) { + ioat_chan = ioat_lookup_chan_by_index(instance, bit); + tasklet_schedule(&ioat_chan->cleanup_task); + } + + writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET); + return IRQ_HANDLED; +} + +/** + * ioat_dma_do_interrupt_msix - handler used for vector-per-channel interrupt mode + * @irq: interrupt id + * @data: interrupt data + */ +static irqreturn_t ioat_dma_do_interrupt_msix(int irq, void *data) +{ + struct ioat_dma_chan *ioat_chan = data; + + tasklet_schedule(&ioat_chan->cleanup_task); + + 
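+ /* each MSI-X vector maps to a single channel, so unlike
+ * ioat_dma_do_interrupt() there is no attnstatus register to walk
+ */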
return IRQ_HANDLED; +} + +static void ioat_dma_cleanup_tasklet(unsigned long data); + +/** + * ioat_dma_enumerate_channels - find and initialize the device's channels + * @device: the device to be enumerated + */ +static int ioat_dma_enumerate_channels(struct ioatdma_device *device) +{ + u8 xfercap_scale; + u32 xfercap; + int i; + struct ioat_dma_chan *ioat_chan; + + /* + * IOAT ver.3 workarounds + */ + if (device->version == IOAT_VER_3_0) { + u32 chan_err_mask; + u16 dev_id; + u32 dmauncerrsts; + + /* + * Write CHANERRMSK_INT with 3E07h to mask out the errors + * that can cause stability issues for IOAT ver.3 + */ + chan_err_mask = 0x3E07; + pci_write_config_dword(device->pdev, + IOAT_PCI_CHANERRMASK_INT_OFFSET, + chan_err_mask); + + /* + * Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit + * (workaround for spurious config parity error after restart) + */ + pci_read_config_word(device->pdev, + IOAT_PCI_DEVICE_ID_OFFSET, + &dev_id); + if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0) { + dmauncerrsts = 0x10; + pci_write_config_dword(device->pdev, + IOAT_PCI_DMAUNCERRSTS_OFFSET, + dmauncerrsts); + } + } + + device->common.chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET); + xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET); + xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale)); + +#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL + if (i7300_idle_platform_probe(NULL, NULL, 1) == 0) { + device->common.chancnt--; + } +#endif + for (i = 0; i < device->common.chancnt; i++) { + ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL); + if (!ioat_chan) { + device->common.chancnt = i; + break; + } + + ioat_chan->device = device; + ioat_chan->reg_base = device->reg_base + (0x80 * (i + 1)); + ioat_chan->xfercap = xfercap; + ioat_chan->desccount = 0; + INIT_DELAYED_WORK(&ioat_chan->work, ioat_dma_chan_reset_part2); + if (ioat_chan->device->version == IOAT_VER_2_0) + writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE | + IOAT_DMA_DCA_ANY_CPU, + ioat_chan->reg_base + IOAT_DCACTRL_OFFSET); + else if (ioat_chan->device->version == IOAT_VER_3_0) + writel(IOAT_DMA_DCA_ANY_CPU, + ioat_chan->reg_base + IOAT_DCACTRL_OFFSET); + spin_lock_init(&ioat_chan->cleanup_lock); + spin_lock_init(&ioat_chan->desc_lock); + INIT_LIST_HEAD(&ioat_chan->free_desc); + INIT_LIST_HEAD(&ioat_chan->used_desc); + /* This should be made common somewhere in dmaengine.c */ + ioat_chan->common.device = &device->common; + list_add_tail(&ioat_chan->common.device_node, + &device->common.channels); + device->idx[i] = ioat_chan; + tasklet_init(&ioat_chan->cleanup_task, + ioat_dma_cleanup_tasklet, + (unsigned long) ioat_chan); + tasklet_disable(&ioat_chan->cleanup_task); + } + return device->common.chancnt; +} + +/** + * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended + * descriptors to hw + * @chan: DMA channel handle + */ +static inline void __ioat1_dma_memcpy_issue_pending( + struct ioat_dma_chan *ioat_chan) +{ + ioat_chan->pending = 0; + writeb(IOAT_CHANCMD_APPEND, ioat_chan->reg_base + IOAT1_CHANCMD_OFFSET); +} + +static void ioat1_dma_memcpy_issue_pending(struct dma_chan *chan) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + + if (ioat_chan->pending > 0) { + spin_lock_bh(&ioat_chan->desc_lock); + __ioat1_dma_memcpy_issue_pending(ioat_chan); + spin_unlock_bh(&ioat_chan->desc_lock); + } +} + +static inline void __ioat2_dma_memcpy_issue_pending( + struct ioat_dma_chan *ioat_chan) +{ + ioat_chan->pending = 0; + writew(ioat_chan->dmacount, + ioat_chan->reg_base + IOAT_CHAN_DMACOUNT_OFFSET); +} + +static void 
ioat2_dma_memcpy_issue_pending(struct dma_chan *chan) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + + if (ioat_chan->pending > 0) { + spin_lock_bh(&ioat_chan->desc_lock); + __ioat2_dma_memcpy_issue_pending(ioat_chan); + spin_unlock_bh(&ioat_chan->desc_lock); + } +} + + +/** + * ioat_dma_chan_reset_part2 - reinit the channel after a reset + */ +static void ioat_dma_chan_reset_part2(struct work_struct *work) +{ + struct ioat_dma_chan *ioat_chan = + container_of(work, struct ioat_dma_chan, work.work); + struct ioat_desc_sw *desc; + + spin_lock_bh(&ioat_chan->cleanup_lock); + spin_lock_bh(&ioat_chan->desc_lock); + + ioat_chan->completion_virt->low = 0; + ioat_chan->completion_virt->high = 0; + ioat_chan->pending = 0; + + /* + * count the descriptors waiting, and be sure to do it + * right for both the CB1 line and the CB2 ring + */ + ioat_chan->dmacount = 0; + if (ioat_chan->used_desc.prev) { + desc = to_ioat_desc(ioat_chan->used_desc.prev); + do { + ioat_chan->dmacount++; + desc = to_ioat_desc(desc->node.next); + } while (&desc->node != ioat_chan->used_desc.next); + } + + /* + * write the new starting descriptor address + * this puts channel engine into ARMED state + */ + desc = to_ioat_desc(ioat_chan->used_desc.prev); + switch (ioat_chan->device->version) { + case IOAT_VER_1_2: + writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF, + ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW); + writel(((u64) desc->async_tx.phys) >> 32, + ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH); + + writeb(IOAT_CHANCMD_START, ioat_chan->reg_base + + IOAT_CHANCMD_OFFSET(ioat_chan->device->version)); + break; + case IOAT_VER_2_0: + writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF, + ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW); + writel(((u64) desc->async_tx.phys) >> 32, + ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH); + + /* tell the engine to go with what's left to be done */ + writew(ioat_chan->dmacount, + ioat_chan->reg_base + IOAT_CHAN_DMACOUNT_OFFSET); + + break; + } + dev_err(&ioat_chan->device->pdev->dev, + "chan%d reset - %d descs waiting, %d total desc\n", + chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount); + + spin_unlock_bh(&ioat_chan->desc_lock); + spin_unlock_bh(&ioat_chan->cleanup_lock); +} + +/** + * ioat_dma_reset_channel - restart a channel + * @ioat_chan: IOAT DMA channel handle + */ +static void ioat_dma_reset_channel(struct ioat_dma_chan *ioat_chan) +{ + u32 chansts, chanerr; + + if (!ioat_chan->used_desc.prev) + return; + + chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET); + chansts = (ioat_chan->completion_virt->low + & IOAT_CHANSTS_DMA_TRANSFER_STATUS); + if (chanerr) { + dev_err(&ioat_chan->device->pdev->dev, + "chan%d, CHANSTS = 0x%08x CHANERR = 0x%04x, clearing\n", + chan_num(ioat_chan), chansts, chanerr); + writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET); + } + + /* + * whack it upside the head with a reset + * and wait for things to settle out. + * force the pending count to a really big negative + * to make sure no one forces an issue_pending + * while we're waiting. 
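+ * (INT_MIN is safe here: the issue_pending paths only touch the
+ * hardware when pending > 0, and ioat_dma_chan_reset_part2 sets
+ * pending back to 0 once the channel has been rearmed.)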
+ */ + + spin_lock_bh(&ioat_chan->desc_lock); + ioat_chan->pending = INT_MIN; + writeb(IOAT_CHANCMD_RESET, + ioat_chan->reg_base + + IOAT_CHANCMD_OFFSET(ioat_chan->device->version)); + spin_unlock_bh(&ioat_chan->desc_lock); + + /* schedule the 2nd half instead of sleeping a long time */ + schedule_delayed_work(&ioat_chan->work, RESET_DELAY); +} + +/** + * ioat_dma_chan_watchdog - watch for stuck channels + */ +static void ioat_dma_chan_watchdog(struct work_struct *work) +{ + struct ioatdma_device *device = + container_of(work, struct ioatdma_device, work.work); + struct ioat_dma_chan *ioat_chan; + int i; + + union { + u64 full; + struct { + u32 low; + u32 high; + }; + } completion_hw; + unsigned long compl_desc_addr_hw; + + for (i = 0; i < device->common.chancnt; i++) { + ioat_chan = ioat_lookup_chan_by_index(device, i); + + if (ioat_chan->device->version == IOAT_VER_1_2 + /* have we started processing anything yet */ + && ioat_chan->last_completion + /* have we completed any since last watchdog cycle? */ + && (ioat_chan->last_completion == + ioat_chan->watchdog_completion) + /* has TCP stuck on one cookie since last watchdog? */ + && (ioat_chan->watchdog_tcp_cookie == + ioat_chan->watchdog_last_tcp_cookie) + && (ioat_chan->watchdog_tcp_cookie != + ioat_chan->completed_cookie) + /* is there something in the chain to be processed? */ + /* CB1 chain always has at least the last one processed */ + && (ioat_chan->used_desc.prev != ioat_chan->used_desc.next) + && ioat_chan->pending == 0) { + + /* + * check CHANSTS register for completed + * descriptor address. + * if it is different than completion writeback, + * it is not zero + * and it has changed since the last watchdog + * we can assume that channel + * is still working correctly + * and the problem is in completion writeback. 
+ * update completion writeback + * with actual CHANSTS value + * else + * try resetting the channel + */ + + completion_hw.low = readl(ioat_chan->reg_base + + IOAT_CHANSTS_OFFSET_LOW(ioat_chan->device->version)); + completion_hw.high = readl(ioat_chan->reg_base + + IOAT_CHANSTS_OFFSET_HIGH(ioat_chan->device->version)); +#if (BITS_PER_LONG == 64) + compl_desc_addr_hw = + completion_hw.full + & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR; +#else + compl_desc_addr_hw = + completion_hw.low & IOAT_LOW_COMPLETION_MASK; +#endif + + if ((compl_desc_addr_hw != 0) + && (compl_desc_addr_hw != ioat_chan->watchdog_completion) + && (compl_desc_addr_hw != ioat_chan->last_compl_desc_addr_hw)) { + ioat_chan->last_compl_desc_addr_hw = compl_desc_addr_hw; + ioat_chan->completion_virt->low = completion_hw.low; + ioat_chan->completion_virt->high = completion_hw.high; + } else { + ioat_dma_reset_channel(ioat_chan); + ioat_chan->watchdog_completion = 0; + ioat_chan->last_compl_desc_addr_hw = 0; + } + + /* + * for version 2.0 if there are descriptors yet to be processed + * and the last completed hasn't changed since the last watchdog + * if they haven't hit the pending level + * issue the pending to push them through + * else + * try resetting the channel + */ + } else if (ioat_chan->device->version == IOAT_VER_2_0 + && ioat_chan->used_desc.prev + && ioat_chan->last_completion + && ioat_chan->last_completion == ioat_chan->watchdog_completion) { + + if (ioat_chan->pending < ioat_pending_level) + ioat2_dma_memcpy_issue_pending(&ioat_chan->common); + else { + ioat_dma_reset_channel(ioat_chan); + ioat_chan->watchdog_completion = 0; + } + } else { + ioat_chan->last_compl_desc_addr_hw = 0; + ioat_chan->watchdog_completion + = ioat_chan->last_completion; + } + + ioat_chan->watchdog_last_tcp_cookie = + ioat_chan->watchdog_tcp_cookie; + } + + schedule_delayed_work(&device->work, WATCHDOG_DELAY); +} + +static dma_cookie_t ioat1_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan); + struct ioat_desc_sw *first = tx_to_ioat_desc(tx); + struct ioat_desc_sw *prev, *new; + struct ioat_dma_descriptor *hw; + dma_cookie_t cookie; + LIST_HEAD(new_chain); + u32 copy; + size_t len; + dma_addr_t src, dst; + unsigned long orig_flags; + unsigned int desc_count = 0; + + /* src and dest and len are stored in the initial descriptor */ + len = first->len; + src = first->src; + dst = first->dst; + orig_flags = first->async_tx.flags; + new = first; + + spin_lock_bh(&ioat_chan->desc_lock); + prev = to_ioat_desc(ioat_chan->used_desc.prev); + prefetch(prev->hw); + do { + copy = min_t(size_t, len, ioat_chan->xfercap); + + async_tx_ack(&new->async_tx); + + hw = new->hw; + hw->size = copy; + hw->ctl = 0; + hw->src_addr = src; + hw->dst_addr = dst; + hw->next = 0; + + /* chain together the physical address list for the HW */ + wmb(); + prev->hw->next = (u64) new->async_tx.phys; + + len -= copy; + dst += copy; + src += copy; + + list_add_tail(&new->node, &new_chain); + desc_count++; + prev = new; + } while (len && (new = ioat1_dma_get_next_descriptor(ioat_chan))); + + if (!new) { + dev_err(&ioat_chan->device->pdev->dev, + "tx submit failed\n"); + spin_unlock_bh(&ioat_chan->desc_lock); + return -ENOMEM; + } + + hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS; + if (first->async_tx.callback) { + hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_INT_GN; + if (first != new) { + /* move callback into the last desc */ + new->async_tx.callback = first->async_tx.callback; + new->async_tx.callback_param + = 
first->async_tx.callback_param; + first->async_tx.callback = NULL; + first->async_tx.callback_param = NULL; + } + } + + new->tx_cnt = desc_count; + new->async_tx.flags = orig_flags; /* client is in control of this ack */ + + /* store the original values for use in later cleanup */ + if (new != first) { + new->src = first->src; + new->dst = first->dst; + new->len = first->len; + } + + /* cookie incr and addition to used_list must be atomic */ + cookie = ioat_chan->common.cookie; + cookie++; + if (cookie < 0) + cookie = 1; + ioat_chan->common.cookie = new->async_tx.cookie = cookie; + + /* write address into NextDescriptor field of last desc in chain */ + to_ioat_desc(ioat_chan->used_desc.prev)->hw->next = + first->async_tx.phys; + list_splice_tail(&new_chain, &ioat_chan->used_desc); + + ioat_chan->dmacount += desc_count; + ioat_chan->pending += desc_count; + if (ioat_chan->pending >= ioat_pending_level) + __ioat1_dma_memcpy_issue_pending(ioat_chan); + spin_unlock_bh(&ioat_chan->desc_lock); + + return cookie; +} + +static dma_cookie_t ioat2_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan); + struct ioat_desc_sw *first = tx_to_ioat_desc(tx); + struct ioat_desc_sw *new; + struct ioat_dma_descriptor *hw; + dma_cookie_t cookie; + u32 copy; + size_t len; + dma_addr_t src, dst; + unsigned long orig_flags; + unsigned int desc_count = 0; + + /* src and dest and len are stored in the initial descriptor */ + len = first->len; + src = first->src; + dst = first->dst; + orig_flags = first->async_tx.flags; + new = first; + + /* + * ioat_chan->desc_lock is still in force in version 2 path + * it gets unlocked at end of this function + */ + do { + copy = min_t(size_t, len, ioat_chan->xfercap); + + async_tx_ack(&new->async_tx); + + hw = new->hw; + hw->size = copy; + hw->ctl = 0; + hw->src_addr = src; + hw->dst_addr = dst; + + len -= copy; + dst += copy; + src += copy; + desc_count++; + } while (len && (new = ioat2_dma_get_next_descriptor(ioat_chan))); + + if (!new) { + dev_err(&ioat_chan->device->pdev->dev, + "tx submit failed\n"); + spin_unlock_bh(&ioat_chan->desc_lock); + return -ENOMEM; + } + + hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_CP_STS; + if (first->async_tx.callback) { + hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_INT_GN; + if (first != new) { + /* move callback into the last desc */ + new->async_tx.callback = first->async_tx.callback; + new->async_tx.callback_param + = first->async_tx.callback_param; + first->async_tx.callback = NULL; + first->async_tx.callback_param = NULL; + } + } + + new->tx_cnt = desc_count; + new->async_tx.flags = orig_flags; /* client is in control of this ack */ + + /* store the original values for use in later cleanup */ + if (new != first) { + new->src = first->src; + new->dst = first->dst; + new->len = first->len; + } + + /* cookie incr and addition to used_list must be atomic */ + cookie = ioat_chan->common.cookie; + cookie++; + if (cookie < 0) + cookie = 1; + ioat_chan->common.cookie = new->async_tx.cookie = cookie; + + ioat_chan->dmacount += desc_count; + ioat_chan->pending += desc_count; + if (ioat_chan->pending >= ioat_pending_level) + __ioat2_dma_memcpy_issue_pending(ioat_chan); + spin_unlock_bh(&ioat_chan->desc_lock); + + return cookie; +} + +/** + * ioat_dma_alloc_descriptor - allocate and return a sw and hw descriptor pair + * @ioat_chan: the channel supplying the memory pool for the descriptors + * @flags: allocation flags + */ +static struct ioat_desc_sw *ioat_dma_alloc_descriptor( + struct ioat_dma_chan *ioat_chan, + 
gfp_t flags) +{ + struct ioat_dma_descriptor *desc; + struct ioat_desc_sw *desc_sw; + struct ioatdma_device *ioatdma_device; + dma_addr_t phys; + + ioatdma_device = to_ioatdma_device(ioat_chan->common.device); + desc = pci_pool_alloc(ioatdma_device->dma_pool, flags, &phys); + if (unlikely(!desc)) + return NULL; + + desc_sw = kzalloc(sizeof(*desc_sw), flags); + if (unlikely(!desc_sw)) { + pci_pool_free(ioatdma_device->dma_pool, desc, phys); + return NULL; + } + + memset(desc, 0, sizeof(*desc)); + dma_async_tx_descriptor_init(&desc_sw->async_tx, &ioat_chan->common); + switch (ioat_chan->device->version) { + case IOAT_VER_1_2: + desc_sw->async_tx.tx_submit = ioat1_tx_submit; + break; + case IOAT_VER_2_0: + case IOAT_VER_3_0: + desc_sw->async_tx.tx_submit = ioat2_tx_submit; + break; + } + + desc_sw->hw = desc; + desc_sw->async_tx.phys = phys; + + return desc_sw; +} + +static int ioat_initial_desc_count = 256; +module_param(ioat_initial_desc_count, int, 0644); +MODULE_PARM_DESC(ioat_initial_desc_count, + "initial descriptors per channel (default: 256)"); + +/** + * ioat2_dma_massage_chan_desc - link the descriptors into a circle + * @ioat_chan: the channel to be massaged + */ +static void ioat2_dma_massage_chan_desc(struct ioat_dma_chan *ioat_chan) +{ + struct ioat_desc_sw *desc, *_desc; + + /* setup used_desc */ + ioat_chan->used_desc.next = ioat_chan->free_desc.next; + ioat_chan->used_desc.prev = NULL; + + /* pull free_desc out of the circle so that every node is a hw + * descriptor, but leave it pointing to the list + */ + ioat_chan->free_desc.prev->next = ioat_chan->free_desc.next; + ioat_chan->free_desc.next->prev = ioat_chan->free_desc.prev; + + /* circle link the hw descriptors */ + desc = to_ioat_desc(ioat_chan->free_desc.next); + desc->hw->next = to_ioat_desc(desc->node.next)->async_tx.phys; + list_for_each_entry_safe(desc, _desc, ioat_chan->free_desc.next, node) { + desc->hw->next = to_ioat_desc(desc->node.next)->async_tx.phys; + } +} + +/** + * ioat_dma_alloc_chan_resources - returns the number of allocated descriptors + * @chan: the channel to be filled out + */ +static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + struct ioat_desc_sw *desc; + u16 chanctrl; + u32 chanerr; + int i; + LIST_HEAD(tmp_list); + + /* have we already been set up? 
*/ + if (!list_empty(&ioat_chan->free_desc)) + return ioat_chan->desccount; + + /* Setup register to interrupt and write completion status on error */ + chanctrl = IOAT_CHANCTRL_ERR_INT_EN | + IOAT_CHANCTRL_ANY_ERR_ABORT_EN | + IOAT_CHANCTRL_ERR_COMPLETION_EN; + writew(chanctrl, ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET); + + chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET); + if (chanerr) { + dev_err(&ioat_chan->device->pdev->dev, + "CHANERR = %x, clearing\n", chanerr); + writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET); + } + + /* Allocate descriptors */ + for (i = 0; i < ioat_initial_desc_count; i++) { + desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL); + if (!desc) { + dev_err(&ioat_chan->device->pdev->dev, + "Only %d initial descriptors\n", i); + break; + } + list_add_tail(&desc->node, &tmp_list); + } + spin_lock_bh(&ioat_chan->desc_lock); + ioat_chan->desccount = i; + list_splice(&tmp_list, &ioat_chan->free_desc); + if (ioat_chan->device->version != IOAT_VER_1_2) + ioat2_dma_massage_chan_desc(ioat_chan); + spin_unlock_bh(&ioat_chan->desc_lock); + + /* allocate a completion writeback area */ + /* doing 2 32bit writes to mmio since 1 64b write doesn't work */ + ioat_chan->completion_virt = + pci_pool_alloc(ioat_chan->device->completion_pool, + GFP_KERNEL, + &ioat_chan->completion_addr); + memset(ioat_chan->completion_virt, 0, + sizeof(*ioat_chan->completion_virt)); + writel(((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF, + ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_LOW); + writel(((u64) ioat_chan->completion_addr) >> 32, + ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH); + + tasklet_enable(&ioat_chan->cleanup_task); + ioat_dma_start_null_desc(ioat_chan); /* give chain to dma device */ + return ioat_chan->desccount; +} + +/** + * ioat_dma_free_chan_resources - release all the descriptors + * @chan: the channel to be cleaned + */ +static void ioat_dma_free_chan_resources(struct dma_chan *chan) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + struct ioatdma_device *ioatdma_device = to_ioatdma_device(chan->device); + struct ioat_desc_sw *desc, *_desc; + int in_use_descs = 0; + + /* Before freeing channel resources first check + * if they have been previously allocated for this channel. + */ + if (ioat_chan->desccount == 0) + return; + + tasklet_disable(&ioat_chan->cleanup_task); + ioat_dma_memcpy_cleanup(ioat_chan); + + /* Delay 100ms after reset to allow internal DMA logic to quiesce + * before removing DMA descriptor resources. 
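+ * (this matches RESET_DELAY, the same 100ms the driver waits before
+ * running the second half of a deferred channel reset.)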
+ */ + writeb(IOAT_CHANCMD_RESET, + ioat_chan->reg_base + + IOAT_CHANCMD_OFFSET(ioat_chan->device->version)); + mdelay(100); + + spin_lock_bh(&ioat_chan->desc_lock); + switch (ioat_chan->device->version) { + case IOAT_VER_1_2: + list_for_each_entry_safe(desc, _desc, + &ioat_chan->used_desc, node) { + in_use_descs++; + list_del(&desc->node); + pci_pool_free(ioatdma_device->dma_pool, desc->hw, + desc->async_tx.phys); + kfree(desc); + } + list_for_each_entry_safe(desc, _desc, + &ioat_chan->free_desc, node) { + list_del(&desc->node); + pci_pool_free(ioatdma_device->dma_pool, desc->hw, + desc->async_tx.phys); + kfree(desc); + } + break; + case IOAT_VER_2_0: + case IOAT_VER_3_0: + list_for_each_entry_safe(desc, _desc, + ioat_chan->free_desc.next, node) { + list_del(&desc->node); + pci_pool_free(ioatdma_device->dma_pool, desc->hw, + desc->async_tx.phys); + kfree(desc); + } + desc = to_ioat_desc(ioat_chan->free_desc.next); + pci_pool_free(ioatdma_device->dma_pool, desc->hw, + desc->async_tx.phys); + kfree(desc); + INIT_LIST_HEAD(&ioat_chan->free_desc); + INIT_LIST_HEAD(&ioat_chan->used_desc); + break; + } + spin_unlock_bh(&ioat_chan->desc_lock); + + pci_pool_free(ioatdma_device->completion_pool, + ioat_chan->completion_virt, + ioat_chan->completion_addr); + + /* one is ok since we left it on there on purpose */ + if (in_use_descs > 1) + dev_err(&ioat_chan->device->pdev->dev, + "Freeing %d in use descriptors!\n", + in_use_descs - 1); + + ioat_chan->last_completion = ioat_chan->completion_addr = 0; + ioat_chan->pending = 0; + ioat_chan->dmacount = 0; + ioat_chan->desccount = 0; + ioat_chan->watchdog_completion = 0; + ioat_chan->last_compl_desc_addr_hw = 0; + ioat_chan->watchdog_tcp_cookie = + ioat_chan->watchdog_last_tcp_cookie = 0; +} + +/** + * ioat_dma_get_next_descriptor - return the next available descriptor + * @ioat_chan: IOAT DMA channel handle + * + * Gets the next descriptor from the chain, and must be called with the + * channel's desc_lock held. Allocates more descriptors if the channel + * has run out. 
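+ * Returns NULL if the GFP_ATOMIC fallback allocation fails, so
+ * callers must check the result.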
+ */ +static struct ioat_desc_sw * +ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan) +{ + struct ioat_desc_sw *new; + + if (!list_empty(&ioat_chan->free_desc)) { + new = to_ioat_desc(ioat_chan->free_desc.next); + list_del(&new->node); + } else { + /* try to get another desc */ + new = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC); + if (!new) { + dev_err(&ioat_chan->device->pdev->dev, + "alloc failed\n"); + return NULL; + } + } + + prefetch(new->hw); + return new; +} + +static struct ioat_desc_sw * +ioat2_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan) +{ + struct ioat_desc_sw *new; + + /* + * used.prev points to where to start processing + * used.next points to next free descriptor + * if used.prev == NULL, there are none waiting to be processed + * if used.next == used.prev.prev, there is only one free descriptor, + * and we need to use it to as a noop descriptor before + * linking in a new set of descriptors, since the device + * has probably already read the pointer to it + */ + if (ioat_chan->used_desc.prev && + ioat_chan->used_desc.next == ioat_chan->used_desc.prev->prev) { + + struct ioat_desc_sw *desc; + struct ioat_desc_sw *noop_desc; + int i; + + /* set up the noop descriptor */ + noop_desc = to_ioat_desc(ioat_chan->used_desc.next); + /* set size to non-zero value (channel returns error when size is 0) */ + noop_desc->hw->size = NULL_DESC_BUFFER_SIZE; + noop_desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL; + noop_desc->hw->src_addr = 0; + noop_desc->hw->dst_addr = 0; + + ioat_chan->used_desc.next = ioat_chan->used_desc.next->next; + ioat_chan->pending++; + ioat_chan->dmacount++; + + /* try to get a few more descriptors */ + for (i = 16; i; i--) { + desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC); + if (!desc) { + dev_err(&ioat_chan->device->pdev->dev, + "alloc failed\n"); + break; + } + list_add_tail(&desc->node, ioat_chan->used_desc.next); + + desc->hw->next + = to_ioat_desc(desc->node.next)->async_tx.phys; + to_ioat_desc(desc->node.prev)->hw->next + = desc->async_tx.phys; + ioat_chan->desccount++; + } + + ioat_chan->used_desc.next = noop_desc->node.next; + } + new = to_ioat_desc(ioat_chan->used_desc.next); + prefetch(new); + ioat_chan->used_desc.next = new->node.next; + + if (ioat_chan->used_desc.prev == NULL) + ioat_chan->used_desc.prev = &new->node; + + prefetch(new->hw); + return new; +} + +static struct ioat_desc_sw *ioat_dma_get_next_descriptor( + struct ioat_dma_chan *ioat_chan) +{ + if (!ioat_chan) + return NULL; + + switch (ioat_chan->device->version) { + case IOAT_VER_1_2: + return ioat1_dma_get_next_descriptor(ioat_chan); + case IOAT_VER_2_0: + case IOAT_VER_3_0: + return ioat2_dma_get_next_descriptor(ioat_chan); + } + return NULL; +} + +static struct dma_async_tx_descriptor *ioat1_dma_prep_memcpy( + struct dma_chan *chan, + dma_addr_t dma_dest, + dma_addr_t dma_src, + size_t len, + unsigned long flags) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + struct ioat_desc_sw *new; + + spin_lock_bh(&ioat_chan->desc_lock); + new = ioat_dma_get_next_descriptor(ioat_chan); + spin_unlock_bh(&ioat_chan->desc_lock); + + if (new) { + new->len = len; + new->dst = dma_dest; + new->src = dma_src; + new->async_tx.flags = flags; + return &new->async_tx; + } else { + dev_err(&ioat_chan->device->pdev->dev, + "chan%d - get_next_desc failed: %d descs waiting, %d total desc\n", + chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount); + return NULL; + } +} + +static struct dma_async_tx_descriptor *ioat2_dma_prep_memcpy( + struct dma_chan 
*chan, + dma_addr_t dma_dest, + dma_addr_t dma_src, + size_t len, + unsigned long flags) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + struct ioat_desc_sw *new; + + spin_lock_bh(&ioat_chan->desc_lock); + new = ioat2_dma_get_next_descriptor(ioat_chan); + + /* + * leave ioat_chan->desc_lock set in ioat 2 path + * it will get unlocked at end of tx_submit + */ + + if (new) { + new->len = len; + new->dst = dma_dest; + new->src = dma_src; + new->async_tx.flags = flags; + return &new->async_tx; + } else { + spin_unlock_bh(&ioat_chan->desc_lock); + dev_err(&ioat_chan->device->pdev->dev, + "chan%d - get_next_desc failed: %d descs waiting, %d total desc\n", + chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount); + return NULL; + } +} + +static void ioat_dma_cleanup_tasklet(unsigned long data) +{ + struct ioat_dma_chan *chan = (void *)data; + ioat_dma_memcpy_cleanup(chan); + writew(IOAT_CHANCTRL_INT_DISABLE, + chan->reg_base + IOAT_CHANCTRL_OFFSET); +} + +static void +ioat_dma_unmap(struct ioat_dma_chan *ioat_chan, struct ioat_desc_sw *desc) +{ + if (!(desc->async_tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) { + if (desc->async_tx.flags & DMA_COMPL_DEST_UNMAP_SINGLE) + pci_unmap_single(ioat_chan->device->pdev, + pci_unmap_addr(desc, dst), + pci_unmap_len(desc, len), + PCI_DMA_FROMDEVICE); + else + pci_unmap_page(ioat_chan->device->pdev, + pci_unmap_addr(desc, dst), + pci_unmap_len(desc, len), + PCI_DMA_FROMDEVICE); + } + + if (!(desc->async_tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + if (desc->async_tx.flags & DMA_COMPL_SRC_UNMAP_SINGLE) + pci_unmap_single(ioat_chan->device->pdev, + pci_unmap_addr(desc, src), + pci_unmap_len(desc, len), + PCI_DMA_TODEVICE); + else + pci_unmap_page(ioat_chan->device->pdev, + pci_unmap_addr(desc, src), + pci_unmap_len(desc, len), + PCI_DMA_TODEVICE); + } +} + +/** + * ioat_dma_memcpy_cleanup - cleanup up finished descriptors + * @chan: ioat channel to be cleaned up + */ +static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan) +{ + unsigned long phys_complete; + struct ioat_desc_sw *desc, *_desc; + dma_cookie_t cookie = 0; + unsigned long desc_phys; + struct ioat_desc_sw *latest_desc; + + prefetch(ioat_chan->completion_virt); + + if (!spin_trylock_bh(&ioat_chan->cleanup_lock)) + return; + + /* The completion writeback can happen at any time, + so reads by the driver need to be atomic operations + The descriptor physical addresses are limited to 32-bits + when the CPU can only do a 32-bit mov */ + +#if (BITS_PER_LONG == 64) + phys_complete = + ioat_chan->completion_virt->full + & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR; +#else + phys_complete = + ioat_chan->completion_virt->low & IOAT_LOW_COMPLETION_MASK; +#endif + + if ((ioat_chan->completion_virt->full + & IOAT_CHANSTS_DMA_TRANSFER_STATUS) == + IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) { + dev_err(&ioat_chan->device->pdev->dev, + "Channel halted, chanerr = %x\n", + readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET)); + + /* TODO do something to salvage the situation */ + } + + if (phys_complete == ioat_chan->last_completion) { + spin_unlock_bh(&ioat_chan->cleanup_lock); + /* + * perhaps we're stuck so hard that the watchdog can't go off? 
+ * try to catch it after 2 seconds + */ + if (ioat_chan->device->version != IOAT_VER_3_0) { + if (time_after(jiffies, + ioat_chan->last_completion_time + HZ*WATCHDOG_DELAY)) { + ioat_dma_chan_watchdog(&(ioat_chan->device->work.work)); + ioat_chan->last_completion_time = jiffies; + } + } + return; + } + ioat_chan->last_completion_time = jiffies; + + cookie = 0; + if (!spin_trylock_bh(&ioat_chan->desc_lock)) { + spin_unlock_bh(&ioat_chan->cleanup_lock); + return; + } + + switch (ioat_chan->device->version) { + case IOAT_VER_1_2: + list_for_each_entry_safe(desc, _desc, + &ioat_chan->used_desc, node) { + + /* + * Incoming DMA requests may use multiple descriptors, + * due to exceeding xfercap, perhaps. If so, only the + * last one will have a cookie, and require unmapping. + */ + if (desc->async_tx.cookie) { + cookie = desc->async_tx.cookie; + ioat_dma_unmap(ioat_chan, desc); + if (desc->async_tx.callback) { + desc->async_tx.callback(desc->async_tx.callback_param); + desc->async_tx.callback = NULL; + } + } + + if (desc->async_tx.phys != phys_complete) { + /* + * a completed entry, but not the last, so clean + * up if the client is done with the descriptor + */ + if (async_tx_test_ack(&desc->async_tx)) { + list_move_tail(&desc->node, + &ioat_chan->free_desc); + } else + desc->async_tx.cookie = 0; + } else { + /* + * last used desc. Do not remove, so we can + * append from it, but don't look at it next + * time, either + */ + desc->async_tx.cookie = 0; + + /* TODO check status bits? */ + break; + } + } + break; + case IOAT_VER_2_0: + case IOAT_VER_3_0: + /* has some other thread already cleaned up? */ + if (ioat_chan->used_desc.prev == NULL) + break; + + /* work backwards to find latest finished desc */ + desc = to_ioat_desc(ioat_chan->used_desc.next); + latest_desc = NULL; + do { + desc = to_ioat_desc(desc->node.prev); + desc_phys = (unsigned long)desc->async_tx.phys + & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR; + if (desc_phys == phys_complete) { + latest_desc = desc; + break; + } + } while (&desc->node != ioat_chan->used_desc.prev); + + if (latest_desc != NULL) { + + /* work forwards to clear finished descriptors */ + for (desc = to_ioat_desc(ioat_chan->used_desc.prev); + &desc->node != latest_desc->node.next && + &desc->node != ioat_chan->used_desc.next; + desc = to_ioat_desc(desc->node.next)) { + if (desc->async_tx.cookie) { + cookie = desc->async_tx.cookie; + desc->async_tx.cookie = 0; + ioat_dma_unmap(ioat_chan, desc); + if (desc->async_tx.callback) { + desc->async_tx.callback(desc->async_tx.callback_param); + desc->async_tx.callback = NULL; + } + } + } + + /* move used.prev up beyond those that are finished */ + if (&desc->node == ioat_chan->used_desc.next) + ioat_chan->used_desc.prev = NULL; + else + ioat_chan->used_desc.prev = &desc->node; + } + break; + } + + spin_unlock_bh(&ioat_chan->desc_lock); + + ioat_chan->last_completion = phys_complete; + if (cookie != 0) + ioat_chan->completed_cookie = cookie; + + spin_unlock_bh(&ioat_chan->cleanup_lock); +} + +/** + * ioat_dma_is_complete - poll the status of an IOAT DMA transaction + * @chan: IOAT DMA channel handle + * @cookie: DMA transaction identifier + * @done: if not %NULL, updated with last completed transaction + * @used: if not %NULL, updated with last used transaction + */ +static enum dma_status ioat_dma_is_complete(struct dma_chan *chan, + dma_cookie_t cookie, + dma_cookie_t *done, + dma_cookie_t *used) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + dma_cookie_t last_used; + dma_cookie_t last_complete; + enum dma_status ret;
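+ + /* completion status is checked twice: once up front, then again below after forcing a cleanup pass */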
+ last_used = chan->cookie; + last_complete = ioat_chan->completed_cookie; + ioat_chan->watchdog_tcp_cookie = cookie; + + if (done) + *done = last_complete; + if (used) + *used = last_used; + + ret = dma_async_is_complete(cookie, last_complete, last_used); + if (ret == DMA_SUCCESS) + return ret; + + ioat_dma_memcpy_cleanup(ioat_chan); + + last_used = chan->cookie; + last_complete = ioat_chan->completed_cookie; + + if (done) + *done = last_complete; + if (used) + *used = last_used; + + return dma_async_is_complete(cookie, last_complete, last_used); +} + +static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan) +{ + struct ioat_desc_sw *desc; + + spin_lock_bh(&ioat_chan->desc_lock); + + desc = ioat_dma_get_next_descriptor(ioat_chan); + + if (!desc) { + dev_err(&ioat_chan->device->pdev->dev, + "Unable to start null desc - get next desc failed\n"); + spin_unlock_bh(&ioat_chan->desc_lock); + return; + } + + desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL + | IOAT_DMA_DESCRIPTOR_CTL_INT_GN + | IOAT_DMA_DESCRIPTOR_CTL_CP_STS; + /* set size to non-zero value (channel returns error when size is 0) */ + desc->hw->size = NULL_DESC_BUFFER_SIZE; + desc->hw->src_addr = 0; + desc->hw->dst_addr = 0; + async_tx_ack(&desc->async_tx); + switch (ioat_chan->device->version) { + case IOAT_VER_1_2: + desc->hw->next = 0; + list_add_tail(&desc->node, &ioat_chan->used_desc); + + writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF, + ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW); + writel(((u64) desc->async_tx.phys) >> 32, + ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH); + + writeb(IOAT_CHANCMD_START, ioat_chan->reg_base + + IOAT_CHANCMD_OFFSET(ioat_chan->device->version)); + break; + case IOAT_VER_2_0: + case IOAT_VER_3_0: + writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF, + ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW); + writel(((u64) desc->async_tx.phys) >> 32, + ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH); + + ioat_chan->dmacount++; + __ioat2_dma_memcpy_issue_pending(ioat_chan); + break; + } + spin_unlock_bh(&ioat_chan->desc_lock); +} + +/* + * Perform an IOAT transaction to verify the HW works. + */ +#define IOAT_TEST_SIZE 2000 + +static void ioat_dma_test_callback(void *dma_async_param) +{ + struct completion *cmp = dma_async_param; + + complete(cmp); +} +
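+/* the self-test below runs a single IOAT_TEST_SIZE-byte memcpy through the first channel and verifies the result with memcmp */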
+/** + * ioat_dma_self_test - Perform an IOAT transaction to verify the HW works. + * @device: device to be tested + */ +static int ioat_dma_self_test(struct ioatdma_device *device) +{ + int i; + u8 *src; + u8 *dest; + struct dma_chan *dma_chan; + struct dma_async_tx_descriptor *tx; + dma_addr_t dma_dest, dma_src; + dma_cookie_t cookie; + int err = 0; + struct completion cmp; + unsigned long tmo; + unsigned long flags; + + src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); + if (!src) + return -ENOMEM; + dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); + if (!dest) { + kfree(src); + return -ENOMEM; + } + + /* Fill in src buffer */ + for (i = 0; i < IOAT_TEST_SIZE; i++) + src[i] = (u8)i; + + /* Start copy, using first DMA channel */ + dma_chan = container_of(device->common.channels.next, + struct dma_chan, + device_node); + if (device->common.device_alloc_chan_resources(dma_chan) < 1) { + dev_err(&device->pdev->dev, + "selftest cannot allocate chan resource\n"); + err = -ENODEV; + goto out; + } + + dma_src = dma_map_single(dma_chan->device->dev, src, IOAT_TEST_SIZE, + DMA_TO_DEVICE); + dma_dest = dma_map_single(dma_chan->device->dev, dest, IOAT_TEST_SIZE, + DMA_FROM_DEVICE); + flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_COMPL_DEST_UNMAP_SINGLE; + tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src, + IOAT_TEST_SIZE, flags); + if (!tx) { + dev_err(&device->pdev->dev, + "Self-test prep failed, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + async_tx_ack(tx); + init_completion(&cmp); + tx->callback = ioat_dma_test_callback; + tx->callback_param = &cmp; + cookie = tx->tx_submit(tx); + if (cookie < 0) { + dev_err(&device->pdev->dev, + "Self-test setup failed, disabling\n"); + err = -ENODEV; + goto free_resources; + } + device->common.device_issue_pending(dma_chan); + + tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); + + if (tmo == 0 || + device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL) + != DMA_SUCCESS) { + dev_err(&device->pdev->dev, + "Self-test copy timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + if (memcmp(src, dest, IOAT_TEST_SIZE)) { + dev_err(&device->pdev->dev, + "Self-test copy failed compare, disabling\n"); + err = -ENODEV; + goto free_resources; + } + +free_resources: + device->common.device_free_chan_resources(dma_chan); +out: + kfree(src); + kfree(dest); + return err; +} + +static char ioat_interrupt_style[32] = "msix"; +module_param_string(ioat_interrupt_style, ioat_interrupt_style, + sizeof(ioat_interrupt_style), 0644); +MODULE_PARM_DESC(ioat_interrupt_style, + "set ioat interrupt style: msix (default), " + "msix-single-vector, msi, intx"); + +/** + * ioat_dma_setup_interrupts - setup interrupt handler + * @device: ioat device + */ +static int ioat_dma_setup_interrupts(struct ioatdma_device *device) +{ + struct ioat_dma_chan *ioat_chan; + int err, i, j, msixcnt; + u8 intrctrl = 0; + + if (!strcmp(ioat_interrupt_style, "msix")) + goto msix; + if (!strcmp(ioat_interrupt_style, "msix-single-vector")) + goto msix_single_vector; + if (!strcmp(ioat_interrupt_style, "msi")) + goto msi; + if (!strcmp(ioat_interrupt_style, "intx")) + goto intx; + dev_err(&device->pdev->dev, "invalid ioat_interrupt_style %s\n", + ioat_interrupt_style); + goto err_no_irq; + +msix: + /* The number of MSI-X vectors should equal the number of channels */ + msixcnt = device->common.chancnt; + for (i = 0; i < msixcnt; i++) + device->msix_entries[i].entry = i; + + err = pci_enable_msix(device->pdev, device->msix_entries, msixcnt); + if (err < 0) + goto msi; + if (err > 0) + goto
msix_single_vector; + + for (i = 0; i < msixcnt; i++) { + ioat_chan = ioat_lookup_chan_by_index(device, i); + err = request_irq(device->msix_entries[i].vector, + ioat_dma_do_interrupt_msix, + 0, "ioat-msix", ioat_chan); + if (err) { + for (j = 0; j < i; j++) { + ioat_chan = + ioat_lookup_chan_by_index(device, j); + free_irq(device->msix_entries[j].vector, + ioat_chan); + } + goto msix_single_vector; + } + } + intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL; + device->irq_mode = msix_multi_vector; + goto done; + +msix_single_vector: + device->msix_entries[0].entry = 0; + err = pci_enable_msix(device->pdev, device->msix_entries, 1); + if (err) + goto msi; + + err = request_irq(device->msix_entries[0].vector, ioat_dma_do_interrupt, + 0, "ioat-msix", device); + if (err) { + pci_disable_msix(device->pdev); + goto msi; + } + device->irq_mode = msix_single_vector; + goto done; + +msi: + err = pci_enable_msi(device->pdev); + if (err) + goto intx; + + err = request_irq(device->pdev->irq, ioat_dma_do_interrupt, + 0, "ioat-msi", device); + if (err) { + pci_disable_msi(device->pdev); + goto intx; + } + /* + * CB 1.2 devices need a bit set in configuration space to enable MSI + */ + if (device->version == IOAT_VER_1_2) { + u32 dmactrl; + pci_read_config_dword(device->pdev, + IOAT_PCI_DMACTRL_OFFSET, &dmactrl); + dmactrl |= IOAT_PCI_DMACTRL_MSI_EN; + pci_write_config_dword(device->pdev, + IOAT_PCI_DMACTRL_OFFSET, dmactrl); + } + device->irq_mode = msi; + goto done; + +intx: + err = request_irq(device->pdev->irq, ioat_dma_do_interrupt, + IRQF_SHARED, "ioat-intx", device); + if (err) + goto err_no_irq; + device->irq_mode = intx; + +done: + intrctrl |= IOAT_INTRCTRL_MASTER_INT_EN; + writeb(intrctrl, device->reg_base + IOAT_INTRCTRL_OFFSET); + return 0; + +err_no_irq: + /* Disable all interrupt generation */ + writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET); + dev_err(&device->pdev->dev, "no usable interrupts\n"); + device->irq_mode = none; + return -1; +} + +/** + * ioat_dma_remove_interrupts - remove whatever interrupts were set + * @device: ioat device + */ +static void ioat_dma_remove_interrupts(struct ioatdma_device *device) +{ + struct ioat_dma_chan *ioat_chan; + int i; + + /* Disable all interrupt generation */ + writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET); + + switch (device->irq_mode) { + case msix_multi_vector: + for (i = 0; i < device->common.chancnt; i++) { + ioat_chan = ioat_lookup_chan_by_index(device, i); + free_irq(device->msix_entries[i].vector, ioat_chan); + } + pci_disable_msix(device->pdev); + break; + case msix_single_vector: + free_irq(device->msix_entries[0].vector, device); + pci_disable_msix(device->pdev); + break; + case msi: + free_irq(device->pdev->irq, device); + pci_disable_msi(device->pdev); + break; + case intx: + free_irq(device->pdev->irq, device); + break; + case none: + dev_warn(&device->pdev->dev, + "call to %s without interrupts setup\n", __func__); + } + device->irq_mode = none; +} + +struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev, + void __iomem *iobase) +{ + int err; + struct ioatdma_device *device; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) { + err = -ENOMEM; + goto err_kzalloc; + } + device->pdev = pdev; + device->reg_base = iobase; + device->version = readb(device->reg_base + IOAT_VER_OFFSET); + + /* DMA coherent memory pool for DMA descriptor allocations */ + device->dma_pool = pci_pool_create("dma_desc_pool", pdev, + sizeof(struct ioat_dma_descriptor), + 64, 0); + if (!device->dma_pool) { + err = -ENOMEM; + goto 
err_dma_pool; + } + + device->completion_pool = pci_pool_create("completion_pool", pdev, + sizeof(u64), SMP_CACHE_BYTES, + SMP_CACHE_BYTES); + if (!device->completion_pool) { + err = -ENOMEM; + goto err_completion_pool; + } + + INIT_LIST_HEAD(&device->common.channels); + ioat_dma_enumerate_channels(device); + + device->common.device_alloc_chan_resources = + ioat_dma_alloc_chan_resources; + device->common.device_free_chan_resources = + ioat_dma_free_chan_resources; + device->common.dev = &pdev->dev; + + dma_cap_set(DMA_MEMCPY, device->common.cap_mask); + device->common.device_is_tx_complete = ioat_dma_is_complete; + switch (device->version) { + case IOAT_VER_1_2: + device->common.device_prep_dma_memcpy = ioat1_dma_prep_memcpy; + device->common.device_issue_pending = + ioat1_dma_memcpy_issue_pending; + break; + case IOAT_VER_2_0: + case IOAT_VER_3_0: + device->common.device_prep_dma_memcpy = ioat2_dma_prep_memcpy; + device->common.device_issue_pending = + ioat2_dma_memcpy_issue_pending; + break; + } + + dev_err(&device->pdev->dev, + "Intel(R) I/OAT DMA Engine found," + " %d channels, device version 0x%02x, driver version %s\n", + device->common.chancnt, device->version, IOAT_DMA_VERSION); + + if (!device->common.chancnt) { + dev_err(&device->pdev->dev, + "Intel(R) I/OAT DMA Engine problem found: " + "zero channels detected\n"); + goto err_setup_interrupts; + } + + err = ioat_dma_setup_interrupts(device); + if (err) + goto err_setup_interrupts; + + err = ioat_dma_self_test(device); + if (err) + goto err_self_test; + + ioat_set_tcp_copy_break(device); + + dma_async_device_register(&device->common); + + if (device->version != IOAT_VER_3_0) { + INIT_DELAYED_WORK(&device->work, ioat_dma_chan_watchdog); + schedule_delayed_work(&device->work, + WATCHDOG_DELAY); + } + + return device; + +err_self_test: + ioat_dma_remove_interrupts(device); +err_setup_interrupts: + pci_pool_destroy(device->completion_pool); +err_completion_pool: + pci_pool_destroy(device->dma_pool); +err_dma_pool: + kfree(device); +err_kzalloc: + dev_err(&pdev->dev, + "Intel(R) I/OAT DMA Engine initialization failed\n"); + return NULL; +} + +void ioat_dma_remove(struct ioatdma_device *device) +{ + struct dma_chan *chan, *_chan; + struct ioat_dma_chan *ioat_chan; + + if (device->version != IOAT_VER_3_0) + cancel_delayed_work(&device->work); + + ioat_dma_remove_interrupts(device); + + dma_async_device_unregister(&device->common); + + pci_pool_destroy(device->dma_pool); + pci_pool_destroy(device->completion_pool); + + iounmap(device->reg_base); + pci_release_regions(device->pdev); + pci_disable_device(device->pdev); + + list_for_each_entry_safe(chan, _chan, + &device->common.channels, device_node) { + ioat_chan = to_ioat_chan(chan); + list_del(&chan->device_node); + kfree(ioat_chan); + } + kfree(device); +} + diff --git a/trunk/drivers/dma/ioatdma.h b/trunk/drivers/dma/ioatdma.h new file mode 100644 index 000000000000..a52ff4bd4601 --- /dev/null +++ b/trunk/drivers/dma/ioatdma.h @@ -0,0 +1,165 @@ +/* + * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#ifndef IOATDMA_H +#define IOATDMA_H + +#include <linux/dmaengine.h> +#include "ioatdma_hw.h" +#include <linux/init.h> +#include <linux/dmapool.h> +#include <linux/cache.h> +#include <linux/pci_ids.h> +#include <net/tcp.h> + +#define IOAT_DMA_VERSION "3.64" + +enum ioat_interrupt { + none = 0, + msix_multi_vector = 1, + msix_single_vector = 2, + msi = 3, + intx = 4, +}; + +#define IOAT_LOW_COMPLETION_MASK 0xffffffc0 +#define IOAT_DMA_DCA_ANY_CPU ~0 +#define IOAT_WATCHDOG_PERIOD (2 * HZ) + + +/** + * struct ioatdma_device - internal representation of an IOAT device + * @pdev: PCI-Express device + * @reg_base: MMIO register space base address + * @dma_pool: for allocating DMA descriptors + * @completion_pool: for allocating the per-channel completion writeback area + * @common: embedded struct dma_device + * @version: version of ioatdma device + * @irq_mode: which style irq to use + * @work: delayed work used by the channel watchdog + * @msix_entries: irq handlers + * @idx: per channel data + */ +struct ioatdma_device { + struct pci_dev *pdev; + void __iomem *reg_base; + struct pci_pool *dma_pool; + struct pci_pool *completion_pool; + struct dma_device common; + u8 version; + enum ioat_interrupt irq_mode; + struct delayed_work work; + struct msix_entry msix_entries[4]; + struct ioat_dma_chan *idx[4]; +}; + +/** + * struct ioat_dma_chan - internal representation of a DMA channel + */ +struct ioat_dma_chan { + + void __iomem *reg_base; + + dma_cookie_t completed_cookie; + unsigned long last_completion; + unsigned long last_completion_time; + + size_t xfercap; /* XFERCAP register value expanded out */ + + spinlock_t cleanup_lock; + spinlock_t desc_lock; + struct list_head free_desc; + struct list_head used_desc; + unsigned long watchdog_completion; + int watchdog_tcp_cookie; + u32 watchdog_last_tcp_cookie; + struct delayed_work work; + + int pending; + int dmacount; + int desccount; + + struct ioatdma_device *device; + struct dma_chan common; + + dma_addr_t completion_addr; + union { + u64 full; /* HW completion writeback */ + struct { + u32 low; + u32 high; + }; + } *completion_virt; + unsigned long last_compl_desc_addr_hw; + struct tasklet_struct cleanup_task; +}; + +/* wrapper around hardware descriptor format + additional software fields */ + +/** + * struct ioat_desc_sw - wrapper around hardware descriptor + * @hw: hardware DMA descriptor + * @node: this descriptor will either be on the free list, + * or attached to a transaction list (async_tx.tx_list) + * @tx_cnt: number of descriptors required to complete the transaction + * @async_tx: the generic software descriptor for all engines + */ +struct ioat_desc_sw { + struct ioat_dma_descriptor *hw; + struct list_head node; + int tx_cnt; + size_t len; + dma_addr_t src; + dma_addr_t dst; + struct dma_async_tx_descriptor async_tx; +}; + +static inline void ioat_set_tcp_copy_break(struct ioatdma_device *dev) +{ + #ifdef CONFIG_NET_DMA + switch (dev->version) { + case IOAT_VER_1_2: + sysctl_tcp_dma_copybreak = 4096; + break; + case IOAT_VER_2_0: + sysctl_tcp_dma_copybreak = 2048; + break; + case IOAT_VER_3_0: + sysctl_tcp_dma_copybreak = 262144; + break; + } + #endif +} + +#if defined(CONFIG_INTEL_IOATDMA) || defined(CONFIG_INTEL_IOATDMA_MODULE) +struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev, + void __iomem *iobase); +void ioat_dma_remove(struct ioatdma_device *device); +struct
dca_provider *ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase); +struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase); +struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase); +#else +#define ioat_dma_probe(pdev, iobase) NULL +#define ioat_dma_remove(device) do { } while (0) +#define ioat_dca_init(pdev, iobase) NULL +#define ioat2_dca_init(pdev, iobase) NULL +#define ioat3_dca_init(pdev, iobase) NULL +#endif + +#endif /* IOATDMA_H */ diff --git a/trunk/drivers/dma/ioatdma_hw.h b/trunk/drivers/dma/ioatdma_hw.h new file mode 100644 index 000000000000..afa57eef86c9 --- /dev/null +++ b/trunk/drivers/dma/ioatdma_hw.h @@ -0,0 +1,70 @@ +/* + * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#ifndef _IOAT_HW_H_ +#define _IOAT_HW_H_ + +/* PCI Configuration Space Values */ +#define IOAT_PCI_VID 0x8086 + +/* CB device ID's */ +#define IOAT_PCI_DID_5000 0x1A38 +#define IOAT_PCI_DID_CNB 0x360B +#define IOAT_PCI_DID_SCNB 0x65FF +#define IOAT_PCI_DID_SNB 0x402F + +#define IOAT_PCI_RID 0x00 +#define IOAT_PCI_SVID 0x8086 +#define IOAT_PCI_SID 0x8086 +#define IOAT_VER_1_2 0x12 /* Version 1.2 */ +#define IOAT_VER_2_0 0x20 /* Version 2.0 */ +#define IOAT_VER_3_0 0x30 /* Version 3.0 */ + +struct ioat_dma_descriptor { + uint32_t size; + uint32_t ctl; + uint64_t src_addr; + uint64_t dst_addr; + uint64_t next; + uint64_t rsv1; + uint64_t rsv2; + uint64_t user1; + uint64_t user2; +}; + +#define IOAT_DMA_DESCRIPTOR_CTL_INT_GN 0x00000001 +#define IOAT_DMA_DESCRIPTOR_CTL_SRC_SN 0x00000002 +#define IOAT_DMA_DESCRIPTOR_CTL_DST_SN 0x00000004 +#define IOAT_DMA_DESCRIPTOR_CTL_CP_STS 0x00000008 +#define IOAT_DMA_DESCRIPTOR_CTL_FRAME 0x00000010 +#define IOAT_DMA_DESCRIPTOR_NUL 0x00000020 +#define IOAT_DMA_DESCRIPTOR_CTL_SP_BRK 0x00000040 +#define IOAT_DMA_DESCRIPTOR_CTL_DP_BRK 0x00000080 +#define IOAT_DMA_DESCRIPTOR_CTL_BNDL 0x00000100 +#define IOAT_DMA_DESCRIPTOR_CTL_DCA 0x00000200 +#define IOAT_DMA_DESCRIPTOR_CTL_BUFHINT 0x00000400 + +#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_CONTEXT 0xFF000000 +#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_DMA 0x00000000 + +#define IOAT_DMA_DESCRIPTOR_CTL_CONTEXT_DCA 0x00000001 +#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_MASK 0xFF000000 + +#endif diff --git a/trunk/drivers/dma/ioat/registers.h b/trunk/drivers/dma/ioatdma_registers.h similarity index 84% rename from trunk/drivers/dma/ioat/registers.h rename to trunk/drivers/dma/ioatdma_registers.h index 63038e18ab03..49bc277424f8 100644 --- a/trunk/drivers/dma/ioat/registers.h +++ b/trunk/drivers/dma/ioatdma_registers.h @@ -64,37 +64,18 @@ #define IOAT_DEVICE_STATUS_OFFSET 0x0E /* 16-bit */ #define IOAT_DEVICE_STATUS_DEGRADED_MODE 0x0001 -#define 
IOAT_DEVICE_MMIO_RESTRICTED 0x0002 -#define IOAT_DEVICE_MEMORY_BYPASS 0x0004 -#define IOAT_DEVICE_ADDRESS_REMAPPING 0x0008 - -#define IOAT_DMA_CAP_OFFSET 0x10 /* 32-bit */ -#define IOAT_CAP_PAGE_BREAK 0x00000001 -#define IOAT_CAP_CRC 0x00000002 -#define IOAT_CAP_SKIP_MARKER 0x00000004 -#define IOAT_CAP_DCA 0x00000010 -#define IOAT_CAP_CRC_MOVE 0x00000020 -#define IOAT_CAP_FILL_BLOCK 0x00000040 -#define IOAT_CAP_APIC 0x00000080 -#define IOAT_CAP_XOR 0x00000100 -#define IOAT_CAP_PQ 0x00000200 #define IOAT_CHANNEL_MMIO_SIZE 0x80 /* Each Channel MMIO space is this size */ /* DMA Channel Registers */ #define IOAT_CHANCTRL_OFFSET 0x00 /* 16-bit Channel Control Register */ #define IOAT_CHANCTRL_CHANNEL_PRIORITY_MASK 0xF000 -#define IOAT3_CHANCTRL_COMPL_DCA_EN 0x0200 #define IOAT_CHANCTRL_CHANNEL_IN_USE 0x0100 #define IOAT_CHANCTRL_DESCRIPTOR_ADDR_SNOOP_CONTROL 0x0020 #define IOAT_CHANCTRL_ERR_INT_EN 0x0010 #define IOAT_CHANCTRL_ANY_ERR_ABORT_EN 0x0008 #define IOAT_CHANCTRL_ERR_COMPLETION_EN 0x0004 -#define IOAT_CHANCTRL_INT_REARM 0x0001 -#define IOAT_CHANCTRL_RUN (IOAT_CHANCTRL_INT_REARM |\ - IOAT_CHANCTRL_ERR_COMPLETION_EN |\ - IOAT_CHANCTRL_ANY_ERR_ABORT_EN |\ - IOAT_CHANCTRL_ERR_INT_EN) +#define IOAT_CHANCTRL_INT_DISABLE 0x0001 #define IOAT_DMA_COMP_OFFSET 0x02 /* 16-bit DMA channel compatibility */ #define IOAT_DMA_COMP_V1 0x0001 /* Compatibility with DMA version 1 */ @@ -113,14 +94,14 @@ #define IOAT2_CHANSTS_OFFSET_HIGH 0x0C #define IOAT_CHANSTS_OFFSET_HIGH(ver) ((ver) < IOAT_VER_2_0 \ ? IOAT1_CHANSTS_OFFSET_HIGH : IOAT2_CHANSTS_OFFSET_HIGH) -#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR (~0x3fULL) -#define IOAT_CHANSTS_SOFT_ERR 0x10ULL -#define IOAT_CHANSTS_UNAFFILIATED_ERR 0x8ULL -#define IOAT_CHANSTS_STATUS 0x7ULL -#define IOAT_CHANSTS_ACTIVE 0x0 -#define IOAT_CHANSTS_DONE 0x1 -#define IOAT_CHANSTS_SUSPENDED 0x2 -#define IOAT_CHANSTS_HALTED 0x3 +#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR ~0x3F +#define IOAT_CHANSTS_SOFT_ERR 0x0000000000000010 +#define IOAT_CHANSTS_UNAFFILIATED_ERR 0x0000000000000008 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS 0x0000000000000007 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE 0x0 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE 0x1 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_SUSPENDED 0x2 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED 0x3 @@ -223,27 +204,22 @@ #define IOAT_CDAR_OFFSET_HIGH 0x24 #define IOAT_CHANERR_OFFSET 0x28 /* 32-bit Channel Error Register */ -#define IOAT_CHANERR_SRC_ADDR_ERR 0x0001 -#define IOAT_CHANERR_DEST_ADDR_ERR 0x0002 -#define IOAT_CHANERR_NEXT_ADDR_ERR 0x0004 -#define IOAT_CHANERR_NEXT_DESC_ALIGN_ERR 0x0008 +#define IOAT_CHANERR_DMA_TRANSFER_SRC_ADDR_ERR 0x0001 +#define IOAT_CHANERR_DMA_TRANSFER_DEST_ADDR_ERR 0x0002 +#define IOAT_CHANERR_NEXT_DESCRIPTOR_ADDR_ERR 0x0004 +#define IOAT_CHANERR_NEXT_DESCRIPTOR_ALIGNMENT_ERR 0x0008 #define IOAT_CHANERR_CHAIN_ADDR_VALUE_ERR 0x0010 #define IOAT_CHANERR_CHANCMD_ERR 0x0020 #define IOAT_CHANERR_CHIPSET_UNCORRECTABLE_DATA_INTEGRITY_ERR 0x0040 #define IOAT_CHANERR_DMA_UNCORRECTABLE_DATA_INTEGRITY_ERR 0x0080 #define IOAT_CHANERR_READ_DATA_ERR 0x0100 #define IOAT_CHANERR_WRITE_DATA_ERR 0x0200 -#define IOAT_CHANERR_CONTROL_ERR 0x0400 -#define IOAT_CHANERR_LENGTH_ERR 0x0800 +#define IOAT_CHANERR_DESCRIPTOR_CONTROL_ERR 0x0400 +#define IOAT_CHANERR_DESCRIPTOR_LENGTH_ERR 0x0800 #define IOAT_CHANERR_COMPLETION_ADDR_ERR 0x1000 #define IOAT_CHANERR_INT_CONFIGURATION_ERR 0x2000 #define IOAT_CHANERR_SOFT_ERR 0x4000 #define IOAT_CHANERR_UNAFFILIATED_ERR 0x8000 -#define IOAT_CHANERR_XOR_P_OR_CRC_ERR 
0x10000 -#define IOAT_CHANERR_XOR_Q_ERR 0x20000 -#define IOAT_CHANERR_DESCRIPTOR_COUNT_ERR 0x40000 - -#define IOAT_CHANERR_HANDLE_MASK (IOAT_CHANERR_XOR_P_OR_CRC_ERR | IOAT_CHANERR_XOR_Q_ERR) #define IOAT_CHANERR_MASK_OFFSET 0x2C /* 32-bit Channel Error Register */ diff --git a/trunk/drivers/dma/iop-adma.c b/trunk/drivers/dma/iop-adma.c index 645ca8d54ec4..2f052265122f 100644 --- a/trunk/drivers/dma/iop-adma.c +++ b/trunk/drivers/dma/iop-adma.c @@ -31,7 +31,6 @@ #include #include #include -#include #include @@ -58,110 +57,65 @@ static void iop_adma_free_slots(struct iop_adma_desc_slot *slot) } } -static void -iop_desc_unmap(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc) -{ - struct dma_async_tx_descriptor *tx = &desc->async_tx; - struct iop_adma_desc_slot *unmap = desc->group_head; - struct device *dev = &iop_chan->device->pdev->dev; - u32 len = unmap->unmap_len; - enum dma_ctrl_flags flags = tx->flags; - u32 src_cnt; - dma_addr_t addr; - dma_addr_t dest; - - src_cnt = unmap->unmap_src_cnt; - dest = iop_desc_get_dest_addr(unmap, iop_chan); - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - enum dma_data_direction dir; - - if (src_cnt > 1) /* is xor? */ - dir = DMA_BIDIRECTIONAL; - else - dir = DMA_FROM_DEVICE; - - dma_unmap_page(dev, dest, len, dir); - } - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - while (src_cnt--) { - addr = iop_desc_get_src_addr(unmap, iop_chan, src_cnt); - if (addr == dest) - continue; - dma_unmap_page(dev, addr, len, DMA_TO_DEVICE); - } - } - desc->group_head = NULL; -} - -static void -iop_desc_unmap_pq(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc) -{ - struct dma_async_tx_descriptor *tx = &desc->async_tx; - struct iop_adma_desc_slot *unmap = desc->group_head; - struct device *dev = &iop_chan->device->pdev->dev; - u32 len = unmap->unmap_len; - enum dma_ctrl_flags flags = tx->flags; - u32 src_cnt = unmap->unmap_src_cnt; - dma_addr_t pdest = iop_desc_get_dest_addr(unmap, iop_chan); - dma_addr_t qdest = iop_desc_get_qdest_addr(unmap, iop_chan); - int i; - - if (tx->flags & DMA_PREP_CONTINUE) - src_cnt -= 3; - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP) && !desc->pq_check_result) { - dma_unmap_page(dev, pdest, len, DMA_BIDIRECTIONAL); - dma_unmap_page(dev, qdest, len, DMA_BIDIRECTIONAL); - } - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - dma_addr_t addr; - - for (i = 0; i < src_cnt; i++) { - addr = iop_desc_get_src_addr(unmap, iop_chan, i); - dma_unmap_page(dev, addr, len, DMA_TO_DEVICE); - } - if (desc->pq_check_result) { - dma_unmap_page(dev, pdest, len, DMA_TO_DEVICE); - dma_unmap_page(dev, qdest, len, DMA_TO_DEVICE); - } - } - - desc->group_head = NULL; -} - - static dma_cookie_t iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, struct iop_adma_chan *iop_chan, dma_cookie_t cookie) { - struct dma_async_tx_descriptor *tx = &desc->async_tx; - - BUG_ON(tx->cookie < 0); - if (tx->cookie > 0) { - cookie = tx->cookie; - tx->cookie = 0; + BUG_ON(desc->async_tx.cookie < 0); + if (desc->async_tx.cookie > 0) { + cookie = desc->async_tx.cookie; + desc->async_tx.cookie = 0; /* call the callback (must not sleep or submit new * operations to this channel) */ - if (tx->callback) - tx->callback(tx->callback_param); + if (desc->async_tx.callback) + desc->async_tx.callback( + desc->async_tx.callback_param); /* unmap dma addresses * (unmap_single vs unmap_page?) 
*/ if (desc->group_head && desc->unmap_len) { - if (iop_desc_is_pq(desc)) - iop_desc_unmap_pq(iop_chan, desc); - else - iop_desc_unmap(iop_chan, desc); + struct iop_adma_desc_slot *unmap = desc->group_head; + struct device *dev = + &iop_chan->device->pdev->dev; + u32 len = unmap->unmap_len; + enum dma_ctrl_flags flags = desc->async_tx.flags; + u32 src_cnt; + dma_addr_t addr; + dma_addr_t dest; + + src_cnt = unmap->unmap_src_cnt; + dest = iop_desc_get_dest_addr(unmap, iop_chan); + if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { + enum dma_data_direction dir; + + if (src_cnt > 1) /* is xor? */ + dir = DMA_BIDIRECTIONAL; + else + dir = DMA_FROM_DEVICE; + + dma_unmap_page(dev, dest, len, dir); + } + + if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + while (src_cnt--) { + addr = iop_desc_get_src_addr(unmap, + iop_chan, + src_cnt); + if (addr == dest) + continue; + dma_unmap_page(dev, addr, len, + DMA_TO_DEVICE); + } + } + desc->group_head = NULL; } } /* run dependent operations */ - dma_run_dependencies(tx); + dma_run_dependencies(&desc->async_tx); return cookie; } @@ -333,12 +287,7 @@ static void iop_adma_tasklet(unsigned long data) { struct iop_adma_chan *iop_chan = (struct iop_adma_chan *) data; - /* lockdep will flag depedency submissions as potentially - * recursive locking, this is not the case as a dependency - * submission will never recurse a channels submit routine. - * There are checks in async_tx.c to prevent this. - */ - spin_lock_nested(&iop_chan->lock, SINGLE_DEPTH_NESTING); + spin_lock(&iop_chan->lock); __iop_adma_slot_cleanup(iop_chan); spin_unlock(&iop_chan->lock); } @@ -421,7 +370,7 @@ iop_adma_alloc_slots(struct iop_adma_chan *iop_chan, int num_slots, } alloc_tail->group_head = alloc_start; alloc_tail->async_tx.cookie = -EBUSY; - list_splice(&chain, &alloc_tail->tx_list); + list_splice(&chain, &alloc_tail->async_tx.tx_list); iop_chan->last_used = last_used; iop_desc_clear_next_desc(alloc_start); iop_desc_clear_next_desc(alloc_tail); @@ -480,7 +429,7 @@ iop_adma_tx_submit(struct dma_async_tx_descriptor *tx) old_chain_tail = list_entry(iop_chan->chain.prev, struct iop_adma_desc_slot, chain_node); - list_splice_init(&sw_desc->tx_list, + list_splice_init(&sw_desc->async_tx.tx_list, &old_chain_tail->chain_node); /* fix up the hardware chain */ @@ -547,7 +496,6 @@ static int iop_adma_alloc_chan_resources(struct dma_chan *chan) dma_async_tx_descriptor_init(&slot->async_tx, chan); slot->async_tx.tx_submit = iop_adma_tx_submit; - INIT_LIST_HEAD(&slot->tx_list); INIT_LIST_HEAD(&slot->chain_node); INIT_LIST_HEAD(&slot->slot_node); hw_desc = (char *) iop_chan->device->dma_desc_pool; @@ -712,9 +660,9 @@ iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest, } static struct dma_async_tx_descriptor * -iop_adma_prep_dma_xor_val(struct dma_chan *chan, dma_addr_t *dma_src, - unsigned int src_cnt, size_t len, u32 *result, - unsigned long flags) +iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src, + unsigned int src_cnt, size_t len, u32 *result, + unsigned long flags) { struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); struct iop_adma_desc_slot *sw_desc, *grp_start; @@ -748,118 +696,6 @@ iop_adma_prep_dma_xor_val(struct dma_chan *chan, dma_addr_t *dma_src, return sw_desc ? 
&sw_desc->async_tx : NULL; } -static struct dma_async_tx_descriptor * -iop_adma_prep_dma_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, - unsigned int src_cnt, const unsigned char *scf, size_t len, - unsigned long flags) -{ - struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); - struct iop_adma_desc_slot *sw_desc, *g; - int slot_cnt, slots_per_op; - int continue_srcs; - - if (unlikely(!len)) - return NULL; - BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT); - - dev_dbg(iop_chan->device->common.dev, - "%s src_cnt: %d len: %u flags: %lx\n", - __func__, src_cnt, len, flags); - - if (dmaf_p_disabled_continue(flags)) - continue_srcs = 1+src_cnt; - else if (dmaf_continue(flags)) - continue_srcs = 3+src_cnt; - else - continue_srcs = 0+src_cnt; - - spin_lock_bh(&iop_chan->lock); - slot_cnt = iop_chan_pq_slot_count(len, continue_srcs, &slots_per_op); - sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); - if (sw_desc) { - int i; - - g = sw_desc->group_head; - iop_desc_set_byte_count(g, iop_chan, len); - - /* even if P is disabled its destination address (bits - * [3:0]) must match Q. It is ok if P points to an - * invalid address, it won't be written. - */ - if (flags & DMA_PREP_PQ_DISABLE_P) - dst[0] = dst[1] & 0x7; - - iop_desc_set_pq_addr(g, dst); - sw_desc->unmap_src_cnt = src_cnt; - sw_desc->unmap_len = len; - sw_desc->async_tx.flags = flags; - for (i = 0; i < src_cnt; i++) - iop_desc_set_pq_src_addr(g, i, src[i], scf[i]); - - /* if we are continuing a previous operation factor in - * the old p and q values, see the comment for dma_maxpq - * in include/linux/dmaengine.h - */ - if (dmaf_p_disabled_continue(flags)) - iop_desc_set_pq_src_addr(g, i++, dst[1], 1); - else if (dmaf_continue(flags)) { - iop_desc_set_pq_src_addr(g, i++, dst[0], 0); - iop_desc_set_pq_src_addr(g, i++, dst[1], 1); - iop_desc_set_pq_src_addr(g, i++, dst[1], 0); - } - iop_desc_init_pq(g, i, flags); - } - spin_unlock_bh(&iop_chan->lock); - - return sw_desc ? &sw_desc->async_tx : NULL; -} - -static struct dma_async_tx_descriptor * -iop_adma_prep_dma_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, - unsigned int src_cnt, const unsigned char *scf, - size_t len, enum sum_check_flags *pqres, - unsigned long flags) -{ - struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); - struct iop_adma_desc_slot *sw_desc, *g; - int slot_cnt, slots_per_op; - - if (unlikely(!len)) - return NULL; - BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT); - - dev_dbg(iop_chan->device->common.dev, "%s src_cnt: %d len: %u\n", - __func__, src_cnt, len); - - spin_lock_bh(&iop_chan->lock); - slot_cnt = iop_chan_pq_zero_sum_slot_count(len, src_cnt + 2, &slots_per_op); - sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); - if (sw_desc) { - /* for validate operations p and q are tagged onto the - * end of the source list - */ - int pq_idx = src_cnt; - - g = sw_desc->group_head; - iop_desc_init_pq_zero_sum(g, src_cnt+2, flags); - iop_desc_set_pq_zero_sum_byte_count(g, len); - g->pq_check_result = pqres; - pr_debug("\t%s: g->pq_check_result: %p\n", - __func__, g->pq_check_result); - sw_desc->unmap_src_cnt = src_cnt+2; - sw_desc->unmap_len = len; - sw_desc->async_tx.flags = flags; - while (src_cnt--) - iop_desc_set_pq_zero_sum_src_addr(g, src_cnt, - src[src_cnt], - scf[src_cnt]); - iop_desc_set_pq_zero_sum_addr(g, pq_idx, src); - } - spin_unlock_bh(&iop_chan->lock); - - return sw_desc ? 
&sw_desc->async_tx : NULL; -} - static void iop_adma_free_chan_resources(struct dma_chan *chan) { struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); @@ -1070,7 +906,7 @@ static int __devinit iop_adma_memcpy_self_test(struct iop_adma_device *device) #define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */ static int __devinit -iop_adma_xor_val_self_test(struct iop_adma_device *device) +iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) { int i, src_idx; struct page *dest; @@ -1166,7 +1002,7 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device) PAGE_SIZE, DMA_TO_DEVICE); /* skip zero sum if the capability is not present */ - if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask)) + if (!dma_has_cap(DMA_ZERO_SUM, dma_chan->device->cap_mask)) goto free_resources; /* zero sum the sources with the destintation page */ @@ -1180,10 +1016,10 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device) dma_srcs[i] = dma_map_page(dma_chan->device->dev, zero_sum_srcs[i], 0, PAGE_SIZE, DMA_TO_DEVICE); - tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs, - IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, - &zero_sum_result, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs, + IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, + &zero_sum_result, + DMA_PREP_INTERRUPT | DMA_CTRL_ACK); cookie = iop_adma_tx_submit(tx); iop_adma_issue_pending(dma_chan); @@ -1236,10 +1072,10 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device) dma_srcs[i] = dma_map_page(dma_chan->device->dev, zero_sum_srcs[i], 0, PAGE_SIZE, DMA_TO_DEVICE); - tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs, - IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, - &zero_sum_result, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs, + IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, + &zero_sum_result, + DMA_PREP_INTERRUPT | DMA_CTRL_ACK); cookie = iop_adma_tx_submit(tx); iop_adma_issue_pending(dma_chan); @@ -1269,170 +1105,6 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device) return err; } -#ifdef CONFIG_MD_RAID6_PQ -static int __devinit -iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device) -{ - /* combined sources, software pq results, and extra hw pq results */ - struct page *pq[IOP_ADMA_NUM_SRC_TEST+2+2]; - /* ptr to the extra hw pq buffers defined above */ - struct page **pq_hw = &pq[IOP_ADMA_NUM_SRC_TEST+2]; - /* address conversion buffers (dma_map / page_address) */ - void *pq_sw[IOP_ADMA_NUM_SRC_TEST+2]; - dma_addr_t pq_src[IOP_ADMA_NUM_SRC_TEST]; - dma_addr_t pq_dest[2]; - - int i; - struct dma_async_tx_descriptor *tx; - struct dma_chan *dma_chan; - dma_cookie_t cookie; - u32 zero_sum_result; - int err = 0; - struct device *dev; - - dev_dbg(device->common.dev, "%s\n", __func__); - - for (i = 0; i < ARRAY_SIZE(pq); i++) { - pq[i] = alloc_page(GFP_KERNEL); - if (!pq[i]) { - while (i--) - __free_page(pq[i]); - return -ENOMEM; - } - } - - /* Fill in src buffers */ - for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++) { - pq_sw[i] = page_address(pq[i]); - memset(pq_sw[i], 0x11111111 * (1<common.channels.next, - struct dma_chan, - device_node); - if (iop_adma_alloc_chan_resources(dma_chan) < 1) { - err = -ENODEV; - goto out; - } - - dev = dma_chan->device->dev; - - /* initialize the dests */ - memset(page_address(pq_hw[0]), 0 , PAGE_SIZE); - memset(page_address(pq_hw[1]), 0 , PAGE_SIZE); - - /* test pq */ - pq_dest[0] = dma_map_page(dev, pq_hw[0], 0, PAGE_SIZE, DMA_FROM_DEVICE); - pq_dest[1] = dma_map_page(dev, pq_hw[1], 0, PAGE_SIZE, DMA_FROM_DEVICE); - for (i = 
0; i < IOP_ADMA_NUM_SRC_TEST; i++) - pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE, - DMA_TO_DEVICE); - - tx = iop_adma_prep_dma_pq(dma_chan, pq_dest, pq_src, - IOP_ADMA_NUM_SRC_TEST, (u8 *)raid6_gfexp, - PAGE_SIZE, - DMA_PREP_INTERRUPT | - DMA_CTRL_ACK); - - cookie = iop_adma_tx_submit(tx); - iop_adma_issue_pending(dma_chan); - msleep(8); - - if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != - DMA_SUCCESS) { - dev_err(dev, "Self-test pq timed out, disabling\n"); - err = -ENODEV; - goto free_resources; - } - - raid6_call.gen_syndrome(IOP_ADMA_NUM_SRC_TEST+2, PAGE_SIZE, pq_sw); - - if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST], - page_address(pq_hw[0]), PAGE_SIZE) != 0) { - dev_err(dev, "Self-test p failed compare, disabling\n"); - err = -ENODEV; - goto free_resources; - } - if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST+1], - page_address(pq_hw[1]), PAGE_SIZE) != 0) { - dev_err(dev, "Self-test q failed compare, disabling\n"); - err = -ENODEV; - goto free_resources; - } - - /* test correct zero sum using the software generated pq values */ - for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++) - pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE, - DMA_TO_DEVICE); - - zero_sum_result = ~0; - tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST], - pq_src, IOP_ADMA_NUM_SRC_TEST, - raid6_gfexp, PAGE_SIZE, &zero_sum_result, - DMA_PREP_INTERRUPT|DMA_CTRL_ACK); - - cookie = iop_adma_tx_submit(tx); - iop_adma_issue_pending(dma_chan); - msleep(8); - - if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != - DMA_SUCCESS) { - dev_err(dev, "Self-test pq-zero-sum timed out, disabling\n"); - err = -ENODEV; - goto free_resources; - } - - if (zero_sum_result != 0) { - dev_err(dev, "Self-test pq-zero-sum failed to validate: %x\n", - zero_sum_result); - err = -ENODEV; - goto free_resources; - } - - /* test incorrect zero sum */ - i = IOP_ADMA_NUM_SRC_TEST; - memset(pq_sw[i] + 100, 0, 100); - memset(pq_sw[i+1] + 200, 0, 200); - for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++) - pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE, - DMA_TO_DEVICE); - - zero_sum_result = 0; - tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST], - pq_src, IOP_ADMA_NUM_SRC_TEST, - raid6_gfexp, PAGE_SIZE, &zero_sum_result, - DMA_PREP_INTERRUPT|DMA_CTRL_ACK); - - cookie = iop_adma_tx_submit(tx); - iop_adma_issue_pending(dma_chan); - msleep(8); - - if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != - DMA_SUCCESS) { - dev_err(dev, "Self-test !pq-zero-sum timed out, disabling\n"); - err = -ENODEV; - goto free_resources; - } - - if (zero_sum_result != (SUM_CHECK_P_RESULT | SUM_CHECK_Q_RESULT)) { - dev_err(dev, "Self-test !pq-zero-sum failed to validate: %x\n", - zero_sum_result); - err = -ENODEV; - goto free_resources; - } - -free_resources: - iop_adma_free_chan_resources(dma_chan); -out: - i = ARRAY_SIZE(pq); - while (i--) - __free_page(pq[i]); - return err; -} -#endif - static int __devexit iop_adma_remove(struct platform_device *dev) { struct iop_adma_device *device = platform_get_drvdata(dev); @@ -1520,16 +1192,9 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) dma_dev->max_xor = iop_adma_get_max_xor(); dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor; } - if (dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask)) - dma_dev->device_prep_dma_xor_val = - iop_adma_prep_dma_xor_val; - if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) { - dma_set_maxpq(dma_dev, iop_adma_get_max_pq(), 0); - dma_dev->device_prep_dma_pq = iop_adma_prep_dma_pq; - } - if (dma_has_cap(DMA_PQ_VAL, 
dma_dev->cap_mask)) - dma_dev->device_prep_dma_pq_val = - iop_adma_prep_dma_pq_val; + if (dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask)) + dma_dev->device_prep_dma_zero_sum = + iop_adma_prep_dma_zero_sum; if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask)) dma_dev->device_prep_dma_interrupt = iop_adma_prep_dma_interrupt; @@ -1583,35 +1248,23 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) } if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) || - dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) { - ret = iop_adma_xor_val_self_test(adev); + dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) { + ret = iop_adma_xor_zero_sum_self_test(adev); dev_dbg(&pdev->dev, "xor self test returned %d\n", ret); if (ret) goto err_free_iop_chan; } - if (dma_has_cap(DMA_PQ, dma_dev->cap_mask) && - dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask)) { - #ifdef CONFIG_MD_RAID6_PQ - ret = iop_adma_pq_zero_sum_self_test(adev); - dev_dbg(&pdev->dev, "pq self test returned %d\n", ret); - #else - /* can not test raid6, so do not publish capability */ - dma_cap_clear(DMA_PQ, dma_dev->cap_mask); - dma_cap_clear(DMA_PQ_VAL, dma_dev->cap_mask); - ret = 0; - #endif - if (ret) - goto err_free_iop_chan; - } - dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: " - "( %s%s%s%s%s%s%s)\n", - dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "", - dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "", + "( %s%s%s%s%s%s%s%s%s%s)\n", + dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "", + dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "", + dma_has_cap(DMA_PQ_ZERO_SUM, dma_dev->cap_mask) ? "pq_zero_sum " : "", dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", - dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask) ? "xor_val " : "", + dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "", + dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask) ? "xor_zero_sum " : "", dma_has_cap(DMA_MEMSET, dma_dev->cap_mask) ? "fill " : "", + dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "", dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "", dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? 
"intr " : ""); @@ -1643,7 +1296,7 @@ static void iop_chan_start_null_memcpy(struct iop_adma_chan *iop_chan) if (sw_desc) { grp_start = sw_desc->group_head; - list_splice_init(&sw_desc->tx_list, &iop_chan->chain); + list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain); async_tx_ack(&sw_desc->async_tx); iop_desc_init_memcpy(grp_start, 0); iop_desc_set_byte_count(grp_start, iop_chan, 0); @@ -1699,7 +1352,7 @@ static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan) sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); if (sw_desc) { grp_start = sw_desc->group_head; - list_splice_init(&sw_desc->tx_list, &iop_chan->chain); + list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain); async_tx_ack(&sw_desc->async_tx); iop_desc_init_null_xor(grp_start, 2, 0); iop_desc_set_byte_count(grp_start, iop_chan, 0); diff --git a/trunk/drivers/dma/iovlock.c b/trunk/drivers/dma/iovlock.c index c0a272c73682..9f6fe46a9b87 100644 --- a/trunk/drivers/dma/iovlock.c +++ b/trunk/drivers/dma/iovlock.c @@ -183,11 +183,6 @@ dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, iov_byte_offset, kdata, copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } len -= copy; iov[iovec_idx].iov_len -= copy; @@ -253,11 +248,6 @@ dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, page, offset, copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } len -= copy; iov[iovec_idx].iov_len -= copy; diff --git a/trunk/drivers/dma/mv_xor.c b/trunk/drivers/dma/mv_xor.c index 466ab10c1ff1..3f23eabe09f2 100644 --- a/trunk/drivers/dma/mv_xor.c +++ b/trunk/drivers/dma/mv_xor.c @@ -517,7 +517,7 @@ mv_xor_alloc_slots(struct mv_xor_chan *mv_chan, int num_slots, } alloc_tail->group_head = alloc_start; alloc_tail->async_tx.cookie = -EBUSY; - list_splice(&chain, &alloc_tail->tx_list); + list_splice(&chain, &alloc_tail->async_tx.tx_list); mv_chan->last_used = last_used; mv_desc_clear_next_desc(alloc_start); mv_desc_clear_next_desc(alloc_tail); @@ -565,14 +565,14 @@ mv_xor_tx_submit(struct dma_async_tx_descriptor *tx) cookie = mv_desc_assign_cookie(mv_chan, sw_desc); if (list_empty(&mv_chan->chain)) - list_splice_init(&sw_desc->tx_list, &mv_chan->chain); + list_splice_init(&sw_desc->async_tx.tx_list, &mv_chan->chain); else { new_hw_chain = 0; old_chain_tail = list_entry(mv_chan->chain.prev, struct mv_xor_desc_slot, chain_node); - list_splice_init(&grp_start->tx_list, + list_splice_init(&grp_start->async_tx.tx_list, &old_chain_tail->chain_node); if (!mv_can_chain(grp_start)) @@ -632,7 +632,6 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan) slot->async_tx.tx_submit = mv_xor_tx_submit; INIT_LIST_HEAD(&slot->chain_node); INIT_LIST_HEAD(&slot->slot_node); - INIT_LIST_HEAD(&slot->tx_list); hw_desc = (char *) mv_chan->device->dma_desc_pool; slot->async_tx.phys = (dma_addr_t) &hw_desc[idx * MV_XOR_SLOT_SIZE]; diff --git a/trunk/drivers/dma/mv_xor.h b/trunk/drivers/dma/mv_xor.h index 977b592e976b..06cafe1ef521 100644 --- a/trunk/drivers/dma/mv_xor.h +++ b/trunk/drivers/dma/mv_xor.h @@ -126,8 +126,9 @@ struct mv_xor_chan { * @idx: pool index * @unmap_src_cnt: number of xor sources * @unmap_len: transaction bytecount - * @tx_list: list of slots that make up a multi-descriptor transaction * @async_tx: support for the async_tx api + * @group_list: list of slots that make up a multi-descriptor transaction + * for example 
transfer lengths larger than the supported hw max * @xor_check_result: result of zero sum * @crc32_result: result crc calculation */ @@ -144,7 +145,6 @@ struct mv_xor_desc_slot { u16 unmap_src_cnt; u32 value; size_t unmap_len; - struct list_head tx_list; struct dma_async_tx_descriptor async_tx; union { u32 *xor_check_result; diff --git a/trunk/drivers/dma/shdma.c b/trunk/drivers/dma/shdma.c deleted file mode 100644 index b3b065c4e5c1..000000000000 --- a/trunk/drivers/dma/shdma.c +++ /dev/null @@ -1,786 +0,0 @@ -/* - * Renesas SuperH DMA Engine support - * - * base is drivers/dma/flsdma.c - * - * Copyright (C) 2009 Nobuhiro Iwamatsu - * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved. - * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved. - * - * This is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * - DMA of SuperH does not have Hardware DMA chain mode. - * - MAX DMA size is 16MB. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "shdma.h" - -/* DMA descriptor control */ -#define DESC_LAST (-1) -#define DESC_COMP (1) -#define DESC_NCOMP (0) - -#define NR_DESCS_PER_CHANNEL 32 -/* - * Define the default configuration for dual address memory-memory transfer. - * The 0x400 value represents auto-request, external->external. - * - * And this driver set 4byte burst mode. - * If you want to change mode, you need to change RS_DEFAULT of value. - * (ex 1byte burst mode -> (RS_DUAL & ~TS_32) - */ -#define RS_DEFAULT (RS_DUAL) - -#define SH_DMAC_CHAN_BASE(id) (dma_base_addr[id]) -static void sh_dmae_writel(struct sh_dmae_chan *sh_dc, u32 data, u32 reg) -{ - ctrl_outl(data, (SH_DMAC_CHAN_BASE(sh_dc->id) + reg)); -} - -static u32 sh_dmae_readl(struct sh_dmae_chan *sh_dc, u32 reg) -{ - return ctrl_inl((SH_DMAC_CHAN_BASE(sh_dc->id) + reg)); -} - -static void dmae_init(struct sh_dmae_chan *sh_chan) -{ - u32 chcr = RS_DEFAULT; /* default is DUAL mode */ - sh_dmae_writel(sh_chan, chcr, CHCR); -} - -/* - * Reset DMA controller - * - * SH7780 has two DMAOR register - */ -static void sh_dmae_ctl_stop(int id) -{ - unsigned short dmaor = dmaor_read_reg(id); - - dmaor &= ~(DMAOR_NMIF | DMAOR_AE); - dmaor_write_reg(id, dmaor); -} - -static int sh_dmae_rst(int id) -{ - unsigned short dmaor; - - sh_dmae_ctl_stop(id); - dmaor = (dmaor_read_reg(id)|DMAOR_INIT); - - dmaor_write_reg(id, dmaor); - if ((dmaor_read_reg(id) & (DMAOR_AE | DMAOR_NMIF))) { - pr_warning(KERN_ERR "dma-sh: Can't initialize DMAOR.\n"); - return -EINVAL; - } - return 0; -} - -static int dmae_is_idle(struct sh_dmae_chan *sh_chan) -{ - u32 chcr = sh_dmae_readl(sh_chan, CHCR); - if (chcr & CHCR_DE) { - if (!(chcr & CHCR_TE)) - return -EBUSY; /* working */ - } - return 0; /* waiting */ -} - -static inline unsigned int calc_xmit_shift(struct sh_dmae_chan *sh_chan) -{ - u32 chcr = sh_dmae_readl(sh_chan, CHCR); - return ts_shift[(chcr & CHCR_TS_MASK) >> CHCR_TS_SHIFT]; -} - -static void dmae_set_reg(struct sh_dmae_chan *sh_chan, struct sh_dmae_regs hw) -{ - sh_dmae_writel(sh_chan, hw.sar, SAR); - sh_dmae_writel(sh_chan, hw.dar, DAR); - sh_dmae_writel(sh_chan, - (hw.tcr >> calc_xmit_shift(sh_chan)), TCR); -} - -static void dmae_start(struct sh_dmae_chan *sh_chan) -{ - u32 chcr = sh_dmae_readl(sh_chan, CHCR); - - chcr |= (CHCR_DE|CHCR_IE); - 
sh_dmae_writel(sh_chan, chcr, CHCR); -} - -static void dmae_halt(struct sh_dmae_chan *sh_chan) -{ - u32 chcr = sh_dmae_readl(sh_chan, CHCR); - - chcr &= ~(CHCR_DE | CHCR_TE | CHCR_IE); - sh_dmae_writel(sh_chan, chcr, CHCR); -} - -static int dmae_set_chcr(struct sh_dmae_chan *sh_chan, u32 val) -{ - int ret = dmae_is_idle(sh_chan); - /* When DMA was working, can not set data to CHCR */ - if (ret) - return ret; - - sh_dmae_writel(sh_chan, val, CHCR); - return 0; -} - -#define DMARS1_ADDR 0x04 -#define DMARS2_ADDR 0x08 -#define DMARS_SHIFT 8 -#define DMARS_CHAN_MSK 0x01 -static int dmae_set_dmars(struct sh_dmae_chan *sh_chan, u16 val) -{ - u32 addr; - int shift = 0; - int ret = dmae_is_idle(sh_chan); - if (ret) - return ret; - - if (sh_chan->id & DMARS_CHAN_MSK) - shift = DMARS_SHIFT; - - switch (sh_chan->id) { - /* DMARS0 */ - case 0: - case 1: - addr = SH_DMARS_BASE; - break; - /* DMARS1 */ - case 2: - case 3: - addr = (SH_DMARS_BASE + DMARS1_ADDR); - break; - /* DMARS2 */ - case 4: - case 5: - addr = (SH_DMARS_BASE + DMARS2_ADDR); - break; - default: - return -EINVAL; - } - - ctrl_outw((val << shift) | - (ctrl_inw(addr) & (shift ? 0xFF00 : 0x00FF)), - addr); - - return 0; -} - -static dma_cookie_t sh_dmae_tx_submit(struct dma_async_tx_descriptor *tx) -{ - struct sh_desc *desc = tx_to_sh_desc(tx); - struct sh_dmae_chan *sh_chan = to_sh_chan(tx->chan); - dma_cookie_t cookie; - - spin_lock_bh(&sh_chan->desc_lock); - - cookie = sh_chan->common.cookie; - cookie++; - if (cookie < 0) - cookie = 1; - - /* If desc only in the case of 1 */ - if (desc->async_tx.cookie != -EBUSY) - desc->async_tx.cookie = cookie; - sh_chan->common.cookie = desc->async_tx.cookie; - - list_splice_init(&desc->tx_list, sh_chan->ld_queue.prev); - - spin_unlock_bh(&sh_chan->desc_lock); - - return cookie; -} - -static struct sh_desc *sh_dmae_get_desc(struct sh_dmae_chan *sh_chan) -{ - struct sh_desc *desc, *_desc, *ret = NULL; - - spin_lock_bh(&sh_chan->desc_lock); - list_for_each_entry_safe(desc, _desc, &sh_chan->ld_free, node) { - if (async_tx_test_ack(&desc->async_tx)) { - list_del(&desc->node); - ret = desc; - break; - } - } - spin_unlock_bh(&sh_chan->desc_lock); - - return ret; -} - -static void sh_dmae_put_desc(struct sh_dmae_chan *sh_chan, struct sh_desc *desc) -{ - if (desc) { - spin_lock_bh(&sh_chan->desc_lock); - - list_splice_init(&desc->tx_list, &sh_chan->ld_free); - list_add(&desc->node, &sh_chan->ld_free); - - spin_unlock_bh(&sh_chan->desc_lock); - } -} - -static int sh_dmae_alloc_chan_resources(struct dma_chan *chan) -{ - struct sh_dmae_chan *sh_chan = to_sh_chan(chan); - struct sh_desc *desc; - - spin_lock_bh(&sh_chan->desc_lock); - while (sh_chan->descs_allocated < NR_DESCS_PER_CHANNEL) { - spin_unlock_bh(&sh_chan->desc_lock); - desc = kzalloc(sizeof(struct sh_desc), GFP_KERNEL); - if (!desc) { - spin_lock_bh(&sh_chan->desc_lock); - break; - } - dma_async_tx_descriptor_init(&desc->async_tx, - &sh_chan->common); - desc->async_tx.tx_submit = sh_dmae_tx_submit; - desc->async_tx.flags = DMA_CTRL_ACK; - INIT_LIST_HEAD(&desc->tx_list); - sh_dmae_put_desc(sh_chan, desc); - - spin_lock_bh(&sh_chan->desc_lock); - sh_chan->descs_allocated++; - } - spin_unlock_bh(&sh_chan->desc_lock); - - return sh_chan->descs_allocated; -} - -/* - * sh_dma_free_chan_resources - Free all resources of the channel. 
-
-/*
- * sh_dmae_free_chan_resources - Free all resources of the channel.
- */
-static void sh_dmae_free_chan_resources(struct dma_chan *chan)
-{
-	struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
-	struct sh_desc *desc, *_desc;
-	LIST_HEAD(list);
-
-	BUG_ON(!list_empty(&sh_chan->ld_queue));
-	spin_lock_bh(&sh_chan->desc_lock);
-
-	list_splice_init(&sh_chan->ld_free, &list);
-	sh_chan->descs_allocated = 0;
-
-	spin_unlock_bh(&sh_chan->desc_lock);
-
-	list_for_each_entry_safe(desc, _desc, &list, node)
-		kfree(desc);
-}
-
-static struct dma_async_tx_descriptor *sh_dmae_prep_memcpy(
-	struct dma_chan *chan, dma_addr_t dma_dest, dma_addr_t dma_src,
-	size_t len, unsigned long flags)
-{
-	struct sh_dmae_chan *sh_chan;
-	struct sh_desc *first = NULL, *prev = NULL, *new;
-	size_t copy_size;
-
-	if (!chan)
-		return NULL;
-
-	if (!len)
-		return NULL;
-
-	sh_chan = to_sh_chan(chan);
-
-	do {
-		/* Take a link descriptor from the free list */
-		new = sh_dmae_get_desc(sh_chan);
-		if (!new) {
-			dev_err(sh_chan->dev,
-				"No free memory for link descriptor\n");
-			goto err_get_desc;
-		}
-
-		copy_size = min(len, (size_t)SH_DMA_TCR_MAX);
-
-		new->hw.sar = dma_src;
-		new->hw.dar = dma_dest;
-		new->hw.tcr = copy_size;
-		if (!first)
-			first = new;
-
-		new->mark = DESC_NCOMP;
-		async_tx_ack(&new->async_tx);
-
-		prev = new;
-		len -= copy_size;
-		dma_src += copy_size;
-		dma_dest += copy_size;
-		/* Insert the link descriptor into the LD ring */
-		list_add_tail(&new->node, &first->tx_list);
-	} while (len);
-
-	new->async_tx.flags = flags; /* client is in control of this ack */
-	new->async_tx.cookie = -EBUSY; /* Last desc */
-
-	return &first->async_tx;
-
-err_get_desc:
-	sh_dmae_put_desc(sh_chan, first);
-	return NULL;
-
-}
-
-/*
- * sh_dmae_chan_ld_cleanup - Clean up link descriptors
- *
- * This function cleans up the ld_queue of the DMA channel.
- */
-static void sh_dmae_chan_ld_cleanup(struct sh_dmae_chan *sh_chan)
-{
-	struct sh_desc *desc, *_desc;
-
-	spin_lock_bh(&sh_chan->desc_lock);
-	list_for_each_entry_safe(desc, _desc, &sh_chan->ld_queue, node) {
-		dma_async_tx_callback callback;
-		void *callback_param;
-
-		/* descriptor not yet completed */
-		if (desc->mark == DESC_NCOMP)
-			break;
-
-		/* completed descriptor */
-		callback = desc->async_tx.callback;
-		callback_param = desc->async_tx.callback_param;
-
-		/* Remove from ld_queue list */
-		list_splice_init(&desc->tx_list, &sh_chan->ld_free);
-
-		dev_dbg(sh_chan->dev, "link descriptor %p will be recycled.\n",
-				desc);
-
-		list_move(&desc->node, &sh_chan->ld_free);
-		/* Run the link descriptor callback function */
-		if (callback) {
-			spin_unlock_bh(&sh_chan->desc_lock);
-			dev_dbg(sh_chan->dev, "link descriptor %p callback\n",
-					desc);
-			callback(callback_param);
-			spin_lock_bh(&sh_chan->desc_lock);
-		}
-	}
-	spin_unlock_bh(&sh_chan->desc_lock);
-}
-
-static void sh_chan_xfer_ld_queue(struct sh_dmae_chan *sh_chan)
-{
-	struct list_head *ld_node;
-	struct sh_dmae_regs hw;
-
-	/* nothing to do while the channel is still transferring */
-	if (dmae_is_idle(sh_chan))
-		return;
-
-	/* Find the first untransferred descriptor */
-	for (ld_node = sh_chan->ld_queue.next;
-		(ld_node != &sh_chan->ld_queue)
-			&& (to_sh_desc(ld_node)->mark == DESC_COMP);
-		ld_node = ld_node->next)
-		cpu_relax();
-
-	if (ld_node != &sh_chan->ld_queue) {
-		/* Get the ld start address from ld_queue */
-		hw = to_sh_desc(ld_node)->hw;
-		dmae_set_reg(sh_chan, hw);
-		dmae_start(sh_chan);
-	}
-}
-
-static void sh_dmae_memcpy_issue_pending(struct dma_chan *chan)
-{
-	struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
-	sh_chan_xfer_ld_queue(sh_chan);
-}
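sh_dmae_prep_memcpy() above splits each request into hardware-sized pieces, since one descriptor can move at most SH_DMA_TCR_MAX bytes (the 16MB limit noted in the file header). The chunking loop on its own, as a user-space sketch that prints one line per descriptor it would build:

#include <stdio.h>
#include <stddef.h>

#define SH_DMA_TCR_MAX	0x00FFFFFF	/* max bytes per descriptor */

static size_t min_size(size_t a, size_t b) { return a < b ? a : b; }

/* emit one (src, dest, len) tuple per hardware descriptor */
static int split_memcpy(unsigned long src, unsigned long dest, size_t len)
{
	int descs = 0;

	while (len) {
		size_t copy = min_size(len, (size_t)SH_DMA_TCR_MAX);

		printf("desc %d: %#lx -> %#lx, %zu bytes\n",
		       descs++, src, dest, copy);
		src += copy;
		dest += copy;
		len -= copy;
	}
	return descs;
}

int main(void)
{
	split_memcpy(0x10000000, 0x20000000, 40 << 20);	/* 40 MB -> 3 descs */
	return 0;
}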
-
-static enum dma_status sh_dmae_is_complete(struct dma_chan *chan,
-					dma_cookie_t cookie,
-					dma_cookie_t *done,
-					dma_cookie_t *used)
-{
-	struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
-	dma_cookie_t last_used;
-	dma_cookie_t last_complete;
-
-	sh_dmae_chan_ld_cleanup(sh_chan);
-
-	last_used = chan->cookie;
-	last_complete = sh_chan->completed_cookie;
-	if (last_complete == -EBUSY)
-		last_complete = last_used;
-
-	if (done)
-		*done = last_complete;
-
-	if (used)
-		*used = last_used;
-
-	return dma_async_is_complete(cookie, last_complete, last_used);
-}
-
-static irqreturn_t sh_dmae_interrupt(int irq, void *data)
-{
-	irqreturn_t ret = IRQ_NONE;
-	struct sh_dmae_chan *sh_chan = (struct sh_dmae_chan *)data;
-	u32 chcr = sh_dmae_readl(sh_chan, CHCR);
-
-	if (chcr & CHCR_TE) {
-		/* DMA stop */
-		dmae_halt(sh_chan);
-
-		ret = IRQ_HANDLED;
-		tasklet_schedule(&sh_chan->tasklet);
-	}
-
-	return ret;
-}
-
-#if defined(CONFIG_CPU_SH4)
-static irqreturn_t sh_dmae_err(int irq, void *data)
-{
-	int err = 0;
-	struct sh_dmae_device *shdev = (struct sh_dmae_device *)data;
-
-	/* IRQ Multi */
-	if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
-		int cnt = 0;
-		switch (irq) {
-#if defined(DMTE6_IRQ) && defined(DMAE1_IRQ)
-		case DMTE6_IRQ:
-			cnt++;
-			/* fall through */
-#endif
-		case DMTE0_IRQ:
-			if (dmaor_read_reg(cnt) & (DMAOR_NMIF | DMAOR_AE)) {
-				disable_irq(irq);
-				return IRQ_HANDLED;
-			}
-		default:
-			return IRQ_NONE;
-		}
-	} else {
-		/* reset dma controller */
-		err = sh_dmae_rst(0);
-		if (err)
-			return err;
-		if (shdev->pdata.mode & SHDMA_DMAOR1) {
-			err = sh_dmae_rst(1);
-			if (err)
-				return err;
-		}
-		disable_irq(irq);
-		return IRQ_HANDLED;
-	}
-}
-#endif
-
-static void dmae_do_tasklet(unsigned long data)
-{
-	struct sh_dmae_chan *sh_chan = (struct sh_dmae_chan *)data;
-	struct sh_desc *desc, *_desc, *cur_desc = NULL;
-	u32 sar_buf = sh_dmae_readl(sh_chan, SAR);
-	list_for_each_entry_safe(desc, _desc,
-					&sh_chan->ld_queue, node) {
-		if ((desc->hw.sar + desc->hw.tcr) == sar_buf) {
-			cur_desc = desc;
-			break;
-		}
-	}
-
-	if (cur_desc) {
-		switch (cur_desc->async_tx.cookie) {
-		case 0: /* other desc data */
-			break;
-		case -EBUSY: /* last desc */
-			sh_chan->completed_cookie =
-				cur_desc->async_tx.cookie;
-			break;
-		default: /* first desc (cookie > 0) */
-			sh_chan->completed_cookie =
-				cur_desc->async_tx.cookie - 1;
-			break;
-		}
-		cur_desc->mark = DESC_COMP;
-	}
-	/* Next desc */
-	sh_chan_xfer_ld_queue(sh_chan);
-	sh_dmae_chan_ld_cleanup(sh_chan);
-}
-
-static unsigned int get_dmae_irq(unsigned int id)
-{
-	unsigned int irq = 0;
-	if (id < ARRAY_SIZE(dmte_irq_map))
-		irq = dmte_irq_map[id];
-	return irq;
-}
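sh_dmae_is_complete() above reports progress purely in terms of dmaengine cookies: signed values that grow monotonically and wrap from INT_MAX back to 1, so "is this cookie done?" becomes a window test between the last completed and the last issued cookie. A stand-alone model of that comparison (a sketch patterned on the dma_async_is_complete() helper of this era, not the kernel header itself):

#include <stdio.h>

enum dma_status { DMA_IN_PROGRESS, DMA_SUCCESS };

static enum dma_status is_complete(int cookie, int last_complete, int last_used)
{
	if (last_complete <= last_used) {
		/* no wrap between completed and issued cookies */
		if (cookie <= last_complete || cookie > last_used)
			return DMA_SUCCESS;
	} else {
		/* issued cookies wrapped past INT_MAX back to 1 */
		if (cookie <= last_complete && cookie > last_used)
			return DMA_SUCCESS;
	}
	return DMA_IN_PROGRESS;
}

int main(void)
{
	printf("%d\n", is_complete(5, 7, 9));	/* 1: already completed */
	printf("%d\n", is_complete(8, 7, 9));	/* 0: still in flight */
	return 0;
}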
-
-static int __devinit sh_dmae_chan_probe(struct sh_dmae_device *shdev, int id)
-{
-	int err;
-	unsigned int irq = get_dmae_irq(id);
-	unsigned long irqflags = IRQF_DISABLED;
-	struct sh_dmae_chan *new_sh_chan;
-
-	/* alloc channel */
-	new_sh_chan = kzalloc(sizeof(struct sh_dmae_chan), GFP_KERNEL);
-	if (!new_sh_chan) {
-		dev_err(shdev->common.dev, "No free memory for allocating "
-				"dma channels!\n");
-		return -ENOMEM;
-	}
-
-	new_sh_chan->dev = shdev->common.dev;
-	new_sh_chan->id = id;
-
-	/* Init DMA tasklet */
-	tasklet_init(&new_sh_chan->tasklet, dmae_do_tasklet,
-			(unsigned long)new_sh_chan);
-
-	/* Init the channel */
-	dmae_init(new_sh_chan);
-
-	spin_lock_init(&new_sh_chan->desc_lock);
-
-	/* Init descriptor management lists */
-	INIT_LIST_HEAD(&new_sh_chan->ld_queue);
-	INIT_LIST_HEAD(&new_sh_chan->ld_free);
-
-	/* copy struct dma_device */
-	new_sh_chan->common.device = &shdev->common;
-
-	/* Add the channel to DMA device channel list */
-	list_add_tail(&new_sh_chan->common.device_node,
-			&shdev->common.channels);
-	shdev->common.chancnt++;
-
-	if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
-		irqflags = IRQF_SHARED;
-#if defined(DMTE6_IRQ)
-		if (irq >= DMTE6_IRQ)
-			irq = DMTE6_IRQ;
-		else
-#endif
-			irq = DMTE0_IRQ;
-	}
-
-	snprintf(new_sh_chan->dev_id, sizeof(new_sh_chan->dev_id),
-			"sh-dmae%d", new_sh_chan->id);
-
-	/* set up channel irq */
-	err = request_irq(irq, &sh_dmae_interrupt,
-		irqflags, new_sh_chan->dev_id, new_sh_chan);
-	if (err) {
-		dev_err(shdev->common.dev, "DMA channel %d request_irq error "
-			"with return %d\n", id, err);
-		goto err_no_irq;
-	}
-
-	/* CHCR register control function */
-	new_sh_chan->set_chcr = dmae_set_chcr;
-	/* DMARS register control function */
-	new_sh_chan->set_dmars = dmae_set_dmars;
-
-	shdev->chan[id] = new_sh_chan;
-	return 0;
-
-err_no_irq:
-	/* remove from dmaengine device node */
-	list_del(&new_sh_chan->common.device_node);
-	kfree(new_sh_chan);
-	return err;
-}
-
-static void sh_dmae_chan_remove(struct sh_dmae_device *shdev)
-{
-	int i;
-
-	for (i = shdev->common.chancnt - 1 ; i >= 0 ; i--) {
-		if (shdev->chan[i]) {
-			struct sh_dmae_chan *shchan = shdev->chan[i];
-			if (!(shdev->pdata.mode & SHDMA_MIX_IRQ))
-				free_irq(dmte_irq_map[i], shchan);
-
-			list_del(&shchan->common.device_node);
-			kfree(shchan);
-			shdev->chan[i] = NULL;
-		}
-	}
-	shdev->common.chancnt = 0;
-}
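sh_dmae_probe() just below is the classic goto-unwind shape: each acquisition that fails jumps to a label that releases, in reverse order, exactly what had been acquired before it. The pattern reduced to a sketch with made-up acquire()/release() helpers standing in for the device, IRQ and channel resources:

#include <stdlib.h>

static void *acquire(void)	{ return malloc(1); }
static void release(void *p)	{ free(p); }

static int demo_probe(void)
{
	void *dev, *irq, *chan;
	int err = -1;

	dev = acquire();
	if (!dev)
		goto dev_err;
	irq = acquire();
	if (!irq)
		goto irq_err;
	chan = acquire();
	if (!chan)
		goto chan_err;
	return 0;	/* success: everything stays acquired */

chan_err:
	release(irq);
irq_err:
	release(dev);
dev_err:
	return err;
}

int main(void) { return demo_probe(); }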
-
-static int __init sh_dmae_probe(struct platform_device *pdev)
-{
-	int err = 0, cnt, ecnt;
-	unsigned long irqflags = IRQF_DISABLED;
-#if defined(CONFIG_CPU_SH4)
-	int eirq[] = { DMAE0_IRQ,
-#if defined(DMAE1_IRQ)
-			DMAE1_IRQ
-#endif
-		};
-#endif
-	struct sh_dmae_device *shdev;
-
-	shdev = kzalloc(sizeof(struct sh_dmae_device), GFP_KERNEL);
-	if (!shdev) {
-		dev_err(&pdev->dev, "Not enough memory\n");
-		err = -ENOMEM;
-		goto shdev_err;
-	}
-
-	/* get platform data */
-	if (!pdev->dev.platform_data) {
-		err = -ENODEV;
-		goto rst_err;
-	}
-
-	/* platform data */
-	memcpy(&shdev->pdata, pdev->dev.platform_data,
-			sizeof(struct sh_dmae_pdata));
-
-	/* reset dma controller */
-	err = sh_dmae_rst(0);
-	if (err)
-		goto rst_err;
-
-	/* SH7780/85/23 has DMAOR1 */
-	if (shdev->pdata.mode & SHDMA_DMAOR1) {
-		err = sh_dmae_rst(1);
-		if (err)
-			goto rst_err;
-	}
-
-	INIT_LIST_HEAD(&shdev->common.channels);
-
-	dma_cap_set(DMA_MEMCPY, shdev->common.cap_mask);
-	shdev->common.device_alloc_chan_resources
-		= sh_dmae_alloc_chan_resources;
-	shdev->common.device_free_chan_resources = sh_dmae_free_chan_resources;
-	shdev->common.device_prep_dma_memcpy = sh_dmae_prep_memcpy;
-	shdev->common.device_is_tx_complete = sh_dmae_is_complete;
-	shdev->common.device_issue_pending = sh_dmae_memcpy_issue_pending;
-	shdev->common.dev = &pdev->dev;
-
-#if defined(CONFIG_CPU_SH4)
-	/* In mixed IRQ mode (SH7722/SH7730 etc.) the error IRQ lines
-	 * are shared with the DMTE lines */
-	if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
-		irqflags = IRQF_SHARED;
-		eirq[0] = DMTE0_IRQ;
-#if defined(DMTE6_IRQ) && defined(DMAE1_IRQ)
-		eirq[1] = DMTE6_IRQ;
-#endif
-	}
-
-	for (ecnt = 0 ; ecnt < ARRAY_SIZE(eirq); ecnt++) {
-		err = request_irq(eirq[ecnt], sh_dmae_err,
-			irqflags, "DMAC Address Error", shdev);
-		if (err) {
-			dev_err(&pdev->dev, "DMA device request_irq error "
-				"(irq %d) with return %d\n",
-				eirq[ecnt], err);
-			goto eirq_err;
-		}
-	}
-#endif /* CONFIG_CPU_SH4 */
-
-	/* Create DMA channels */
-	for (cnt = 0 ; cnt < MAX_DMA_CHANNELS ; cnt++) {
-		err = sh_dmae_chan_probe(shdev, cnt);
-		if (err)
-			goto chan_probe_err;
-	}
-
-	platform_set_drvdata(pdev, shdev);
-	dma_async_device_register(&shdev->common);
-
-	return err;
-
-chan_probe_err:
-	sh_dmae_chan_remove(shdev);
-
-eirq_err:
-	for (ecnt-- ; ecnt >= 0; ecnt--)
-		free_irq(eirq[ecnt], shdev);
-
-rst_err:
-	kfree(shdev);
-
-shdev_err:
-	return err;
-}
-
-static int __exit sh_dmae_remove(struct platform_device *pdev)
-{
-	struct sh_dmae_device *shdev = platform_get_drvdata(pdev);
-
-	dma_async_device_unregister(&shdev->common);
-
-	if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
-		free_irq(DMTE0_IRQ, shdev);
-#if defined(DMTE6_IRQ)
-		free_irq(DMTE6_IRQ, shdev);
-#endif
-	}
-
-	/* channel data remove */
-	sh_dmae_chan_remove(shdev);
-
-	if (!(shdev->pdata.mode & SHDMA_MIX_IRQ)) {
-		free_irq(DMAE0_IRQ, shdev);
-#if defined(DMAE1_IRQ)
-		free_irq(DMAE1_IRQ, shdev);
-#endif
-	}
-	kfree(shdev);
-
-	return 0;
-}
-
-static void sh_dmae_shutdown(struct platform_device *pdev)
-{
-	struct sh_dmae_device *shdev = platform_get_drvdata(pdev);
-	sh_dmae_ctl_stop(0);
-	if (shdev->pdata.mode & SHDMA_DMAOR1)
-		sh_dmae_ctl_stop(1);
-}
-
-static struct platform_driver sh_dmae_driver = {
-	.remove		= __exit_p(sh_dmae_remove),
-	.shutdown	= sh_dmae_shutdown,
-	.driver = {
-		.name	= "sh-dma-engine",
-	},
-};
-
-static int __init sh_dmae_init(void)
-{
-	return platform_driver_probe(&sh_dmae_driver, sh_dmae_probe);
-}
-module_init(sh_dmae_init);
-
-static void __exit sh_dmae_exit(void)
-{
-	platform_driver_unregister(&sh_dmae_driver);
-}
-module_exit(sh_dmae_exit);
-
-MODULE_AUTHOR("Nobuhiro Iwamatsu ");
-MODULE_DESCRIPTION("Renesas SH DMA Engine driver");
-MODULE_LICENSE("GPL");
diff --git a/trunk/drivers/dma/shdma.h b/trunk/drivers/dma/shdma.h
deleted file mode 100644
index 2b4bc15a2c0a..000000000000
--- a/trunk/drivers/dma/shdma.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Renesas SuperH DMA Engine support
- *
- * Copyright (C) 2009 Nobuhiro Iwamatsu
- * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
- *
- * This is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * - */ -#ifndef __DMA_SHDMA_H -#define __DMA_SHDMA_H - -#include -#include -#include - -#define SH_DMA_TCR_MAX 0x00FFFFFF /* 16MB */ - -struct sh_dmae_regs { - u32 sar; /* SAR / source address */ - u32 dar; /* DAR / destination address */ - u32 tcr; /* TCR / transfer count */ -}; - -struct sh_desc { - struct list_head tx_list; - struct sh_dmae_regs hw; - struct list_head node; - struct dma_async_tx_descriptor async_tx; - int mark; -}; - -struct sh_dmae_chan { - dma_cookie_t completed_cookie; /* The maximum cookie completed */ - spinlock_t desc_lock; /* Descriptor operation lock */ - struct list_head ld_queue; /* Link descriptors queue */ - struct list_head ld_free; /* Link descriptors free */ - struct dma_chan common; /* DMA common channel */ - struct device *dev; /* Channel device */ - struct tasklet_struct tasklet; /* Tasklet */ - int descs_allocated; /* desc count */ - int id; /* Raw id of this channel */ - char dev_id[16]; /* unique name per DMAC of channel */ - - /* Set chcr */ - int (*set_chcr)(struct sh_dmae_chan *sh_chan, u32 regs); - /* Set DMA resource */ - int (*set_dmars)(struct sh_dmae_chan *sh_chan, u16 res); -}; - -struct sh_dmae_device { - struct dma_device common; - struct sh_dmae_chan *chan[MAX_DMA_CHANNELS]; - struct sh_dmae_pdata pdata; -}; - -#define to_sh_chan(chan) container_of(chan, struct sh_dmae_chan, common) -#define to_sh_desc(lh) container_of(lh, struct sh_desc, node) -#define tx_to_sh_desc(tx) container_of(tx, struct sh_desc, async_tx) - -#endif /* __DMA_SHDMA_H */ diff --git a/trunk/drivers/dma/txx9dmac.c b/trunk/drivers/dma/txx9dmac.c index fb6bb64e8861..7837930146a4 100644 --- a/trunk/drivers/dma/txx9dmac.c +++ b/trunk/drivers/dma/txx9dmac.c @@ -180,8 +180,9 @@ static struct txx9dmac_desc *txx9dmac_first_queued(struct txx9dmac_chan *dc) static struct txx9dmac_desc *txx9dmac_last_child(struct txx9dmac_desc *desc) { - if (!list_empty(&desc->tx_list)) - desc = list_entry(desc->tx_list.prev, typeof(*desc), desc_node); + if (!list_empty(&desc->txd.tx_list)) + desc = list_entry(desc->txd.tx_list.prev, + struct txx9dmac_desc, desc_node); return desc; } @@ -196,7 +197,6 @@ static struct txx9dmac_desc *txx9dmac_desc_alloc(struct txx9dmac_chan *dc, desc = kzalloc(sizeof(*desc), flags); if (!desc) return NULL; - INIT_LIST_HEAD(&desc->tx_list); dma_async_tx_descriptor_init(&desc->txd, &dc->chan); desc->txd.tx_submit = txx9dmac_tx_submit; /* txd.flags will be overwritten in prep funcs */ @@ -245,7 +245,7 @@ static void txx9dmac_sync_desc_for_cpu(struct txx9dmac_chan *dc, struct txx9dmac_dev *ddev = dc->ddev; struct txx9dmac_desc *child; - list_for_each_entry(child, &desc->tx_list, desc_node) + list_for_each_entry(child, &desc->txd.tx_list, desc_node) dma_sync_single_for_cpu(chan2parent(&dc->chan), child->txd.phys, ddev->descsize, DMA_TO_DEVICE); @@ -267,11 +267,11 @@ static void txx9dmac_desc_put(struct txx9dmac_chan *dc, txx9dmac_sync_desc_for_cpu(dc, desc); spin_lock_bh(&dc->lock); - list_for_each_entry(child, &desc->tx_list, desc_node) + list_for_each_entry(child, &desc->txd.tx_list, desc_node) dev_vdbg(chan2dev(&dc->chan), "moving child desc %p to freelist\n", child); - list_splice_init(&desc->tx_list, &dc->free_list); + list_splice_init(&desc->txd.tx_list, &dc->free_list); dev_vdbg(chan2dev(&dc->chan), "moving desc %p to freelist\n", desc); list_add(&desc->desc_node, &dc->free_list); @@ -429,7 +429,7 @@ txx9dmac_descriptor_complete(struct txx9dmac_chan *dc, param = txd->callback_param; txx9dmac_sync_desc_for_cpu(dc, desc); - list_splice_init(&desc->tx_list, 
&dc->free_list); + list_splice_init(&txd->tx_list, &dc->free_list); list_move(&desc->desc_node, &dc->free_list); if (!ds) { @@ -571,7 +571,7 @@ static void txx9dmac_handle_error(struct txx9dmac_chan *dc, u32 csr) "Bad descriptor submitted for DMA! (cookie: %d)\n", bad_desc->txd.cookie); txx9dmac_dump_desc(dc, &bad_desc->hwdesc); - list_for_each_entry(child, &bad_desc->tx_list, desc_node) + list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) txx9dmac_dump_desc(dc, &child->hwdesc); /* Pretend the descriptor completed successfully */ txx9dmac_descriptor_complete(dc, bad_desc); @@ -613,7 +613,7 @@ static void txx9dmac_scan_descriptors(struct txx9dmac_chan *dc) return; } - list_for_each_entry(child, &desc->tx_list, desc_node) + list_for_each_entry(child, &desc->txd.tx_list, desc_node) if (desc_read_CHAR(dc, child) == chain) { /* Currently in progress */ if (csr & TXX9_DMA_CSR_ABCHC) @@ -823,7 +823,8 @@ txx9dmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, dma_sync_single_for_device(chan2parent(&dc->chan), prev->txd.phys, ddev->descsize, DMA_TO_DEVICE); - list_add_tail(&desc->desc_node, &first->tx_list); + list_add_tail(&desc->desc_node, + &first->txd.tx_list); } prev = desc; } @@ -918,7 +919,8 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, prev->txd.phys, ddev->descsize, DMA_TO_DEVICE); - list_add_tail(&desc->desc_node, &first->tx_list); + list_add_tail(&desc->desc_node, + &first->txd.tx_list); } prev = desc; } diff --git a/trunk/drivers/dma/txx9dmac.h b/trunk/drivers/dma/txx9dmac.h index 365d42366b9f..c907ff01d276 100644 --- a/trunk/drivers/dma/txx9dmac.h +++ b/trunk/drivers/dma/txx9dmac.h @@ -231,7 +231,6 @@ struct txx9dmac_desc { /* THEN values for driver housekeeping */ struct list_head desc_node ____cacheline_aligned; - struct list_head tx_list; struct dma_async_tx_descriptor txd; size_t len; }; diff --git a/trunk/drivers/edac/Kconfig b/trunk/drivers/edac/Kconfig index 02127e59fe8e..a3ca18e2d7cf 100644 --- a/trunk/drivers/edac/Kconfig +++ b/trunk/drivers/edac/Kconfig @@ -133,13 +133,6 @@ config EDAC_I3000 Support for error detection and correction on the Intel 3000 and 3010 server chipsets. -config EDAC_I3200 - tristate "Intel 3200" - depends on EDAC_MM_EDAC && PCI && X86 && EXPERIMENTAL - help - Support for error detection and correction on the Intel - 3200 and 3210 server chipsets. - config EDAC_X38 tristate "Intel X38" depends on EDAC_MM_EDAC && PCI && X86 @@ -183,11 +176,11 @@ config EDAC_I5100 San Clemente MCH. 
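The txx9dmac hunks above move the child-descriptor list from struct txx9dmac_desc into the embedded struct dma_async_tx_descriptor (desc->txd.tx_list). Either layout relies on the kernel's intrusive lists: each descriptor embeds its own list node, and container_of() recovers the descriptor from a pointer to the node. The idiom in a self-contained miniature:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct list_node { struct list_node *next; };

struct desc {
	int cookie;
	struct list_node node;	/* embedded linkage, like desc_node */
};

int main(void)
{
	struct desc a = { .cookie = 42 };
	struct list_node *p = &a.node;

	/* recover the enclosing descriptor from its embedded node */
	printf("%d\n", container_of(p, struct desc, node)->cookie);
	return 0;
}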
config EDAC_MPC85XX - tristate "Freescale MPC83xx / MPC85xx" - depends on EDAC_MM_EDAC && FSL_SOC && (PPC_83xx || MPC85xx) + tristate "Freescale MPC85xx" + depends on EDAC_MM_EDAC && FSL_SOC && MPC85xx help Support for error detection and correction on the Freescale - MPC8349, MPC8560, MPC8540, MPC8548 + MPC8560, MPC8540, MPC8548 config EDAC_MV64X60 tristate "Marvell MV64x60" diff --git a/trunk/drivers/edac/Makefile b/trunk/drivers/edac/Makefile index 7a473bbe8abd..cfa033ce53a7 100644 --- a/trunk/drivers/edac/Makefile +++ b/trunk/drivers/edac/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_EDAC_I82443BXGX) += i82443bxgx_edac.o obj-$(CONFIG_EDAC_I82875P) += i82875p_edac.o obj-$(CONFIG_EDAC_I82975X) += i82975x_edac.o obj-$(CONFIG_EDAC_I3000) += i3000_edac.o -obj-$(CONFIG_EDAC_I3200) += i3200_edac.o obj-$(CONFIG_EDAC_X38) += x38_edac.o obj-$(CONFIG_EDAC_I82860) += i82860_edac.o obj-$(CONFIG_EDAC_R82600) += r82600_edac.o @@ -50,4 +49,3 @@ obj-$(CONFIG_EDAC_CELL) += cell_edac.o obj-$(CONFIG_EDAC_PPC4XX) += ppc4xx_edac.o obj-$(CONFIG_EDAC_AMD8111) += amd8111_edac.o obj-$(CONFIG_EDAC_AMD8131) += amd8131_edac.o - diff --git a/trunk/drivers/edac/cpc925_edac.c b/trunk/drivers/edac/cpc925_edac.c index 3d50274f1348..8c54196b5aba 100644 --- a/trunk/drivers/edac/cpc925_edac.c +++ b/trunk/drivers/edac/cpc925_edac.c @@ -885,14 +885,14 @@ static int __devinit cpc925_probe(struct platform_device *pdev) if (!devm_request_mem_region(&pdev->dev, r->start, - resource_size(r), + r->end - r->start + 1, pdev->name)) { cpc925_printk(KERN_ERR, "Unable to request mem region\n"); res = -EBUSY; goto err1; } - vbase = devm_ioremap(&pdev->dev, r->start, resource_size(r)); + vbase = devm_ioremap(&pdev->dev, r->start, r->end - r->start + 1); if (!vbase) { cpc925_printk(KERN_ERR, "Unable to ioremap device\n"); res = -ENOMEM; @@ -953,7 +953,7 @@ static int __devinit cpc925_probe(struct platform_device *pdev) cpc925_mc_exit(mci); edac_mc_free(mci); err2: - devm_release_mem_region(&pdev->dev, r->start, resource_size(r)); + devm_release_mem_region(&pdev->dev, r->start, r->end-r->start+1); err1: devres_release_group(&pdev->dev, cpc925_probe); out: diff --git a/trunk/drivers/edac/edac_device.c b/trunk/drivers/edac/edac_device.c index d5e13c94714f..b02a6a69a8f0 100644 --- a/trunk/drivers/edac/edac_device.c +++ b/trunk/drivers/edac/edac_device.c @@ -356,6 +356,7 @@ static void complete_edac_device_list_del(struct rcu_head *head) edac_dev = container_of(head, struct edac_device_ctl_info, rcu); INIT_LIST_HEAD(&edac_dev->link); + complete(&edac_dev->removal_complete); } /* @@ -368,8 +369,10 @@ static void del_edac_device_from_global_list(struct edac_device_ctl_info *edac_device) { list_del_rcu(&edac_device->link); + + init_completion(&edac_device->removal_complete); call_rcu(&edac_device->rcu, complete_edac_device_list_del); - rcu_barrier(); + wait_for_completion(&edac_device->removal_complete); } /* diff --git a/trunk/drivers/edac/edac_mc.c b/trunk/drivers/edac/edac_mc.c index b629c41756f0..335b7ebdb11c 100644 --- a/trunk/drivers/edac/edac_mc.c +++ b/trunk/drivers/edac/edac_mc.c @@ -418,14 +418,16 @@ static void complete_mc_list_del(struct rcu_head *head) mci = container_of(head, struct mem_ctl_info, rcu); INIT_LIST_HEAD(&mci->link); + complete(&mci->complete); } static void del_mc_from_global_list(struct mem_ctl_info *mci) { atomic_dec(&edac_handlers); list_del_rcu(&mci->link); + init_completion(&mci->complete); call_rcu(&mci->rcu, complete_mc_list_del); - rcu_barrier(); + wait_for_completion(&mci->complete); } /** diff --git 
a/trunk/drivers/edac/edac_pci.c b/trunk/drivers/edac/edac_pci.c index efb5d5650783..30b585b1d60b 100644 --- a/trunk/drivers/edac/edac_pci.c +++ b/trunk/drivers/edac/edac_pci.c @@ -174,6 +174,7 @@ static void complete_edac_pci_list_del(struct rcu_head *head) pci = container_of(head, struct edac_pci_ctl_info, rcu); INIT_LIST_HEAD(&pci->link); + complete(&pci->complete); } /* @@ -184,8 +185,9 @@ static void complete_edac_pci_list_del(struct rcu_head *head) static void del_edac_pci_from_global_list(struct edac_pci_ctl_info *pci) { list_del_rcu(&pci->link); + init_completion(&pci->complete); call_rcu(&pci->rcu, complete_edac_pci_list_del); - rcu_barrier(); + wait_for_completion(&pci->complete); } #if 0 diff --git a/trunk/drivers/edac/i3200_edac.c b/trunk/drivers/edac/i3200_edac.c deleted file mode 100644 index fde4db91c4d2..000000000000 --- a/trunk/drivers/edac/i3200_edac.c +++ /dev/null @@ -1,527 +0,0 @@ -/* - * Intel 3200/3210 Memory Controller kernel module - * Copyright (C) 2008-2009 Akamai Technologies, Inc. - * Portions by Hitoshi Mitake . - * - * This file may be distributed under the terms of the - * GNU General Public License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "edac_core.h" - -#define I3200_REVISION "1.1" - -#define EDAC_MOD_STR "i3200_edac" - -#define PCI_DEVICE_ID_INTEL_3200_HB 0x29f0 - -#define I3200_RANKS 8 -#define I3200_RANKS_PER_CHANNEL 4 -#define I3200_CHANNELS 2 - -/* Intel 3200 register addresses - device 0 function 0 - DRAM Controller */ - -#define I3200_MCHBAR_LOW 0x48 /* MCH Memory Mapped Register BAR */ -#define I3200_MCHBAR_HIGH 0x4c -#define I3200_MCHBAR_MASK 0xfffffc000ULL /* bits 35:14 */ -#define I3200_MMR_WINDOW_SIZE 16384 - -#define I3200_TOM 0xa0 /* Top of Memory (16b) - * - * 15:10 reserved - * 9:0 total populated physical memory - */ -#define I3200_TOM_MASK 0x3ff /* bits 9:0 */ -#define I3200_TOM_SHIFT 26 /* 64MiB grain */ - -#define I3200_ERRSTS 0xc8 /* Error Status Register (16b) - * - * 15 reserved - * 14 Isochronous TBWRR Run Behind FIFO Full - * (ITCV) - * 13 Isochronous TBWRR Run Behind FIFO Put - * (ITSTV) - * 12 reserved - * 11 MCH Thermal Sensor Event - * for SMI/SCI/SERR (GTSE) - * 10 reserved - * 9 LOCK to non-DRAM Memory Flag (LCKF) - * 8 reserved - * 7 DRAM Throttle Flag (DTF) - * 6:2 reserved - * 1 Multi-bit DRAM ECC Error Flag (DMERR) - * 0 Single-bit DRAM ECC Error Flag (DSERR) - */ -#define I3200_ERRSTS_UE 0x0002 -#define I3200_ERRSTS_CE 0x0001 -#define I3200_ERRSTS_BITS (I3200_ERRSTS_UE | I3200_ERRSTS_CE) - - -/* Intel MMIO register space - device 0 function 0 - MMR space */ - -#define I3200_C0DRB 0x200 /* Channel 0 DRAM Rank Boundary (16b x 4) - * - * 15:10 reserved - * 9:0 Channel 0 DRAM Rank Boundary Address - */ -#define I3200_C1DRB 0x600 /* Channel 1 DRAM Rank Boundary (16b x 4) */ -#define I3200_DRB_MASK 0x3ff /* bits 9:0 */ -#define I3200_DRB_SHIFT 26 /* 64MiB grain */ - -#define I3200_C0ECCERRLOG 0x280 /* Channel 0 ECC Error Log (64b) - * - * 63:48 Error Column Address (ERRCOL) - * 47:32 Error Row Address (ERRROW) - * 31:29 Error Bank Address (ERRBANK) - * 28:27 Error Rank Address (ERRRANK) - * 26:24 reserved - * 23:16 Error Syndrome (ERRSYND) - * 15: 2 reserved - * 1 Multiple Bit Error Status (MERRSTS) - * 0 Correctable Error Status (CERRSTS) - */ -#define I3200_C1ECCERRLOG 0x680 /* Chan 1 ECC Error Log (64b) */ -#define I3200_ECCERRLOG_CE 0x1 -#define I3200_ECCERRLOG_UE 0x2 -#define I3200_ECCERRLOG_RANK_BITS 0x18000000 -#define I3200_ECCERRLOG_RANK_SHIFT 27 -#define 
I3200_ECCERRLOG_SYNDROME_BITS	0xff0000
-#define I3200_ECCERRLOG_SYNDROME_SHIFT	16
-#define I3200_CAPID0	0xe0	/* P.95 of spec for details */
-
-struct i3200_priv {
-	void __iomem *window;
-};
-
-static int nr_channels;
-
-static int how_many_channels(struct pci_dev *pdev)
-{
-	unsigned char capid0_8b; /* byte 8 (0-based) of CAPID0 */
-
-	pci_read_config_byte(pdev, I3200_CAPID0 + 8, &capid0_8b);
-	if (capid0_8b & 0x20) {	/* check DCD: Dual Channel Disable */
-		debugf0("In single channel mode.\n");
-		return 1;
-	} else {
-		debugf0("In dual channel mode.\n");
-		return 2;
-	}
-}
-
-static unsigned long eccerrlog_syndrome(u64 log)
-{
-	return (log & I3200_ECCERRLOG_SYNDROME_BITS) >>
-		I3200_ECCERRLOG_SYNDROME_SHIFT;
-}
-
-static int eccerrlog_row(int channel, u64 log)
-{
-	u64 rank = ((log & I3200_ECCERRLOG_RANK_BITS) >>
-		I3200_ECCERRLOG_RANK_SHIFT);
-	return rank | (channel * I3200_RANKS_PER_CHANNEL);
-}
-
-enum i3200_chips {
-	I3200 = 0,
-};
-
-struct i3200_dev_info {
-	const char *ctl_name;
-};
-
-struct i3200_error_info {
-	u16 errsts;
-	u16 errsts2;
-	u64 eccerrlog[I3200_CHANNELS];
-};
-
-static const struct i3200_dev_info i3200_devs[] = {
-	[I3200] = {
-		.ctl_name = "i3200"
-	},
-};
-
-static struct pci_dev *mci_pdev;
-static int i3200_registered = 1;
-
-
-static void i3200_clear_error_info(struct mem_ctl_info *mci)
-{
-	struct pci_dev *pdev;
-
-	pdev = to_pci_dev(mci->dev);
-
-	/*
-	 * Clear any error bits.
-	 * (Yes, we really clear bits by writing 1 to them.)
-	 */
-	pci_write_bits16(pdev, I3200_ERRSTS, I3200_ERRSTS_BITS,
-			 I3200_ERRSTS_BITS);
-}
-
-static void i3200_get_and_clear_error_info(struct mem_ctl_info *mci,
-		struct i3200_error_info *info)
-{
-	struct pci_dev *pdev;
-	struct i3200_priv *priv = mci->pvt_info;
-	void __iomem *window = priv->window;
-
-	pdev = to_pci_dev(mci->dev);
-
-	/*
-	 * This is a mess because there is no atomic way to read all the
-	 * registers at once, and the registers can transition, e.g. a CE
-	 * can be overwritten by a UE.
-	 */
-	pci_read_config_word(pdev, I3200_ERRSTS, &info->errsts);
-	if (!(info->errsts & I3200_ERRSTS_BITS))
-		return;
-
-	info->eccerrlog[0] = readq(window + I3200_C0ECCERRLOG);
-	if (nr_channels == 2)
-		info->eccerrlog[1] = readq(window + I3200_C1ECCERRLOG);
-
-	pci_read_config_word(pdev, I3200_ERRSTS, &info->errsts2);
-
-	/*
-	 * If the error is the same for both reads then the first set
-	 * of reads is valid.  If there is a change then there is a CE
-	 * with no info and the second set of reads is valid and
-	 * should be UE info.
-	 */
-	if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) {
-		info->eccerrlog[0] = readq(window + I3200_C0ECCERRLOG);
-		if (nr_channels == 2)
-			info->eccerrlog[1] = readq(window + I3200_C1ECCERRLOG);
-	}
-
-	i3200_clear_error_info(mci);
-}
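The comment above spells out a double-read protocol for racy error registers: read the status word, read the logs, then re-read the status; if the two status reads differ, a new error landed in between and the second pass over the logs is the one to trust. The same control flow in isolation, as a sketch: read_errsts() and read_ecclog() are hypothetical stand-ins for the driver's PCI config and MMIO accesses:

#include <stdio.h>
#include <stdint.h>

#define ERRSTS_BITS	0x0003	/* CE | UE, as in I3200_ERRSTS_BITS */

/* fake registers so the sketch links; the driver reads PCI config
 * space and MMIO instead */
static uint16_t errsts_reg = 0x0001;	/* pretend a CE is pending */
static uint64_t ecclog_reg = 0x1;

static uint16_t read_errsts(void) { return errsts_reg; }
static uint64_t read_ecclog(void) { return ecclog_reg; }

static int read_error_info(uint16_t *errsts, uint64_t *log)
{
	uint16_t again;

	*errsts = read_errsts();
	if (!(*errsts & ERRSTS_BITS))
		return 0;		/* nothing pending */

	*log = read_ecclog();
	again = read_errsts();

	/* a new error landed between the two status reads, so the
	 * first log snapshot may be stale: read it once more */
	if ((*errsts ^ again) & ERRSTS_BITS)
		*log = read_ecclog();

	return 1;
}

int main(void)
{
	uint16_t sts;
	uint64_t log = 0;

	if (read_error_info(&sts, &log))
		printf("errsts=%#x log=%#llx\n", sts, (unsigned long long)log);
	return 0;
}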
-
-static void i3200_process_error_info(struct mem_ctl_info *mci,
-		struct i3200_error_info *info)
-{
-	int channel;
-	u64 log;
-
-	if (!(info->errsts & I3200_ERRSTS_BITS))
-		return;
-
-	if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) {
-		edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
-		info->errsts = info->errsts2;
-	}
-
-	for (channel = 0; channel < nr_channels; channel++) {
-		log = info->eccerrlog[channel];
-		if (log & I3200_ECCERRLOG_UE) {
-			edac_mc_handle_ue(mci, 0, 0,
-				eccerrlog_row(channel, log),
-				"i3200 UE");
-		} else if (log & I3200_ECCERRLOG_CE) {
-			edac_mc_handle_ce(mci, 0, 0,
-				eccerrlog_syndrome(log),
-				eccerrlog_row(channel, log), 0,
-				"i3200 CE");
-		}
-	}
-}
-
-static void i3200_check(struct mem_ctl_info *mci)
-{
-	struct i3200_error_info info;
-
-	debugf1("MC%d: %s()\n", mci->mc_idx, __func__);
-	i3200_get_and_clear_error_info(mci, &info);
-	i3200_process_error_info(mci, &info);
-}
-
-
-static void __iomem *i3200_map_mchbar(struct pci_dev *pdev)
-{
-	union {
-		u64 mchbar;
-		struct {
-			u32 mchbar_low;
-			u32 mchbar_high;
-		};
-	} u;
-	void __iomem *window;
-
-	pci_read_config_dword(pdev, I3200_MCHBAR_LOW, &u.mchbar_low);
-	pci_read_config_dword(pdev, I3200_MCHBAR_HIGH, &u.mchbar_high);
-	u.mchbar &= I3200_MCHBAR_MASK;
-
-	if (u.mchbar != (resource_size_t)u.mchbar) {
-		printk(KERN_ERR
-			"i3200: mmio space beyond accessible range (0x%llx)\n",
-			(unsigned long long)u.mchbar);
-		return NULL;
-	}
-
-	window = ioremap_nocache(u.mchbar, I3200_MMR_WINDOW_SIZE);
-	if (!window)
-		printk(KERN_ERR "i3200: cannot map mmio space at 0x%llx\n",
-			(unsigned long long)u.mchbar);
-
-	return window;
-}
-
-
-static void i3200_get_drbs(void __iomem *window,
-	u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL])
-{
-	int i;
-
-	for (i = 0; i < I3200_RANKS_PER_CHANNEL; i++) {
-		drbs[0][i] = readw(window + I3200_C0DRB + 2*i) & I3200_DRB_MASK;
-		drbs[1][i] = readw(window + I3200_C1DRB + 2*i) & I3200_DRB_MASK;
-	}
-}
-
-static bool i3200_is_stacked(struct pci_dev *pdev,
-	u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL])
-{
-	u16 tom;
-
-	pci_read_config_word(pdev, I3200_TOM, &tom);
-	tom &= I3200_TOM_MASK;
-
-	return drbs[I3200_CHANNELS - 1][I3200_RANKS_PER_CHANNEL - 1] == tom;
-}
-
-static unsigned long drb_to_nr_pages(
-	u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL], bool stacked,
-	int channel, int rank)
-{
-	int n;
-
-	n = drbs[channel][rank];
-	if (rank > 0)
-		n -= drbs[channel][rank - 1];
-	if (stacked && (channel == 1) &&
-	    drbs[channel][rank] == drbs[channel][I3200_RANKS_PER_CHANNEL - 1])
-		n -= drbs[0][I3200_RANKS_PER_CHANNEL - 1];
-
-	n <<= (I3200_DRB_SHIFT - PAGE_SHIFT);
-	return n;
-}
-
-static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
-{
-	int rc;
-	int i;
-	struct mem_ctl_info *mci = NULL;
-	unsigned long last_page;
-	u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL];
-	bool stacked;
-	void __iomem *window;
-	struct i3200_priv *priv;
-
-	debugf0("MC: %s()\n", __func__);
-
-	window = i3200_map_mchbar(pdev);
-	if (!window)
-		return -ENODEV;
-
-	i3200_get_drbs(window, drbs);
-	nr_channels = how_many_channels(pdev);
-
-	mci = edac_mc_alloc(sizeof(struct i3200_priv), I3200_RANKS,
-		nr_channels, 0);
-	if (!mci)
-		return -ENOMEM;
-
-	debugf3("MC: %s(): init mci\n", __func__);
-
-	mci->dev = &pdev->dev;
-	mci->mtype_cap = 
MEM_FLAG_DDR2; - - mci->edac_ctl_cap = EDAC_FLAG_SECDED; - mci->edac_cap = EDAC_FLAG_SECDED; - - mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = I3200_REVISION; - mci->ctl_name = i3200_devs[dev_idx].ctl_name; - mci->dev_name = pci_name(pdev); - mci->edac_check = i3200_check; - mci->ctl_page_to_phys = NULL; - priv = mci->pvt_info; - priv->window = window; - - stacked = i3200_is_stacked(pdev, drbs); - - /* - * The dram rank boundary (DRB) reg values are boundary addresses - * for each DRAM rank with a granularity of 64MB. DRB regs are - * cumulative; the last one will contain the total memory - * contained in all ranks. - */ - last_page = -1UL; - for (i = 0; i < mci->nr_csrows; i++) { - unsigned long nr_pages; - struct csrow_info *csrow = &mci->csrows[i]; - - nr_pages = drb_to_nr_pages(drbs, stacked, - i / I3200_RANKS_PER_CHANNEL, - i % I3200_RANKS_PER_CHANNEL); - - if (nr_pages == 0) { - csrow->mtype = MEM_EMPTY; - continue; - } - - csrow->first_page = last_page + 1; - last_page += nr_pages; - csrow->last_page = last_page; - csrow->nr_pages = nr_pages; - - csrow->grain = nr_pages << PAGE_SHIFT; - csrow->mtype = MEM_DDR2; - csrow->dtype = DEV_UNKNOWN; - csrow->edac_mode = EDAC_UNKNOWN; - } - - i3200_clear_error_info(mci); - - rc = -ENODEV; - if (edac_mc_add_mc(mci)) { - debugf3("MC: %s(): failed edac_mc_add_mc()\n", __func__); - goto fail; - } - - /* get this far and it's successful */ - debugf3("MC: %s(): success\n", __func__); - return 0; - -fail: - iounmap(window); - if (mci) - edac_mc_free(mci); - - return rc; -} - -static int __devinit i3200_init_one(struct pci_dev *pdev, - const struct pci_device_id *ent) -{ - int rc; - - debugf0("MC: %s()\n", __func__); - - if (pci_enable_device(pdev) < 0) - return -EIO; - - rc = i3200_probe1(pdev, ent->driver_data); - if (!mci_pdev) - mci_pdev = pci_dev_get(pdev); - - return rc; -} - -static void __devexit i3200_remove_one(struct pci_dev *pdev) -{ - struct mem_ctl_info *mci; - struct i3200_priv *priv; - - debugf0("%s()\n", __func__); - - mci = edac_mc_del_mc(&pdev->dev); - if (!mci) - return; - - priv = mci->pvt_info; - iounmap(priv->window); - - edac_mc_free(mci); -} - -static const struct pci_device_id i3200_pci_tbl[] __devinitdata = { - { - PCI_VEND_DEV(INTEL, 3200_HB), PCI_ANY_ID, PCI_ANY_ID, 0, 0, - I3200}, - { - 0, - } /* 0 terminated list. 
*/ -}; - -MODULE_DEVICE_TABLE(pci, i3200_pci_tbl); - -static struct pci_driver i3200_driver = { - .name = EDAC_MOD_STR, - .probe = i3200_init_one, - .remove = __devexit_p(i3200_remove_one), - .id_table = i3200_pci_tbl, -}; - -static int __init i3200_init(void) -{ - int pci_rc; - - debugf3("MC: %s()\n", __func__); - - /* Ensure that the OPSTATE is set correctly for POLL or NMI */ - opstate_init(); - - pci_rc = pci_register_driver(&i3200_driver); - if (pci_rc < 0) - goto fail0; - - if (!mci_pdev) { - i3200_registered = 0; - mci_pdev = pci_get_device(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_3200_HB, NULL); - if (!mci_pdev) { - debugf0("i3200 pci_get_device fail\n"); - pci_rc = -ENODEV; - goto fail1; - } - - pci_rc = i3200_init_one(mci_pdev, i3200_pci_tbl); - if (pci_rc < 0) { - debugf0("i3200 init fail\n"); - pci_rc = -ENODEV; - goto fail1; - } - } - - return 0; - -fail1: - pci_unregister_driver(&i3200_driver); - -fail0: - if (mci_pdev) - pci_dev_put(mci_pdev); - - return pci_rc; -} - -static void __exit i3200_exit(void) -{ - debugf3("MC: %s()\n", __func__); - - pci_unregister_driver(&i3200_driver); - if (!i3200_registered) { - i3200_remove_one(mci_pdev); - pci_dev_put(mci_pdev); - } -} - -module_init(i3200_init); -module_exit(i3200_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Akamai Technologies, Inc."); -MODULE_DESCRIPTION("MC support for Intel 3200 memory hub controllers"); - -module_param(edac_op_state, int, 0444); -MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI"); diff --git a/trunk/drivers/edac/mpc85xx_edac.c b/trunk/drivers/edac/mpc85xx_edac.c index 157f6504f25e..3f2ccfc6407c 100644 --- a/trunk/drivers/edac/mpc85xx_edac.c +++ b/trunk/drivers/edac/mpc85xx_edac.c @@ -41,9 +41,7 @@ static u32 orig_pci_err_en; #endif static u32 orig_l2_err_disable; -#ifdef CONFIG_MPC85xx static u32 orig_hid1[2]; -#endif /************************ MC SYSFS parts ***********************************/ @@ -648,7 +646,6 @@ static struct of_device_id mpc85xx_l2_err_of_match[] = { { .compatible = "fsl,mpc8560-l2-cache-controller", }, { .compatible = "fsl,mpc8568-l2-cache-controller", }, { .compatible = "fsl,mpc8572-l2-cache-controller", }, - { .compatible = "fsl,p2020-l2-cache-controller", }, {}, }; @@ -791,20 +788,19 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci) csrow = &mci->csrows[index]; cs_bnds = in_be32(pdata->mc_vbase + MPC85XX_MC_CS_BNDS_0 + (index * MPC85XX_MC_CS_BNDS_OFS)); - - start = (cs_bnds & 0xffff0000) >> 16; - end = (cs_bnds & 0x0000ffff); + start = (cs_bnds & 0xfff0000) << 4; + end = ((cs_bnds & 0xfff) << 20); + if (start) + start |= 0xfffff; + if (end) + end |= 0xfffff; if (start == end) continue; /* not populated */ - start <<= (24 - PAGE_SHIFT); - end <<= (24 - PAGE_SHIFT); - end |= (1 << (24 - PAGE_SHIFT)) - 1; - csrow->first_page = start >> PAGE_SHIFT; csrow->last_page = end >> PAGE_SHIFT; - csrow->nr_pages = end + 1 - start; + csrow->nr_pages = csrow->last_page + 1 - csrow->first_page; csrow->grain = 8; csrow->mtype = mtype; csrow->dtype = DEV_UNKNOWN; @@ -988,8 +984,6 @@ static struct of_device_id mpc85xx_mc_err_of_match[] = { { .compatible = "fsl,mpc8560-memory-controller", }, { .compatible = "fsl,mpc8568-memory-controller", }, { .compatible = "fsl,mpc8572-memory-controller", }, - { .compatible = "fsl,mpc8349-memory-controller", }, - { .compatible = "fsl,p2020-memory-controller", }, {}, }; @@ -1005,13 +999,13 @@ static struct of_platform_driver mpc85xx_mc_err_driver = { }, }; -#ifdef CONFIG_MPC85xx + static void __init 
mpc85xx_mc_clear_rfxe(void *data) { orig_hid1[smp_processor_id()] = mfspr(SPRN_HID1); mtspr(SPRN_HID1, (orig_hid1[smp_processor_id()] & ~0x20000)); } -#endif + static int __init mpc85xx_mc_init(void) { @@ -1044,32 +1038,26 @@ static int __init mpc85xx_mc_init(void) printk(KERN_WARNING EDAC_MOD_STR "PCI fails to register\n"); #endif -#ifdef CONFIG_MPC85xx /* * need to clear HID1[RFXE] to disable machine check int * so we can catch it */ if (edac_op_state == EDAC_OPSTATE_INT) on_each_cpu(mpc85xx_mc_clear_rfxe, NULL, 0); -#endif return 0; } module_init(mpc85xx_mc_init); -#ifdef CONFIG_MPC85xx static void __exit mpc85xx_mc_restore_hid1(void *data) { mtspr(SPRN_HID1, orig_hid1[smp_processor_id()]); } -#endif static void __exit mpc85xx_mc_exit(void) { -#ifdef CONFIG_MPC85xx on_each_cpu(mpc85xx_mc_restore_hid1, NULL, 0); -#endif #ifdef CONFIG_PCI of_unregister_platform_driver(&mpc85xx_pci_err_driver); #endif diff --git a/trunk/drivers/edac/mv64x60_edac.c b/trunk/drivers/edac/mv64x60_edac.c index a6b9fec13a74..5131aaae8e03 100644 --- a/trunk/drivers/edac/mv64x60_edac.c +++ b/trunk/drivers/edac/mv64x60_edac.c @@ -90,7 +90,7 @@ static int __init mv64x60_pci_fixup(struct platform_device *pdev) return -ENOENT; } - pci_serr = ioremap(r->start, resource_size(r)); + pci_serr = ioremap(r->start, r->end - r->start + 1); if (!pci_serr) return -ENOMEM; @@ -140,7 +140,7 @@ static int __devinit mv64x60_pci_err_probe(struct platform_device *pdev) if (!devm_request_mem_region(&pdev->dev, r->start, - resource_size(r), + r->end - r->start + 1, pdata->name)) { printk(KERN_ERR "%s: Error while requesting mem region\n", __func__); @@ -150,7 +150,7 @@ static int __devinit mv64x60_pci_err_probe(struct platform_device *pdev) pdata->pci_vbase = devm_ioremap(&pdev->dev, r->start, - resource_size(r)); + r->end - r->start + 1); if (!pdata->pci_vbase) { printk(KERN_ERR "%s: Unable to setup PCI err regs\n", __func__); res = -ENOMEM; @@ -306,7 +306,7 @@ static int __devinit mv64x60_sram_err_probe(struct platform_device *pdev) if (!devm_request_mem_region(&pdev->dev, r->start, - resource_size(r), + r->end - r->start + 1, pdata->name)) { printk(KERN_ERR "%s: Error while request mem region\n", __func__); @@ -316,7 +316,7 @@ static int __devinit mv64x60_sram_err_probe(struct platform_device *pdev) pdata->sram_vbase = devm_ioremap(&pdev->dev, r->start, - resource_size(r)); + r->end - r->start + 1); if (!pdata->sram_vbase) { printk(KERN_ERR "%s: Unable to setup SRAM err regs\n", __func__); @@ -474,7 +474,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev) if (!devm_request_mem_region(&pdev->dev, r->start, - resource_size(r), + r->end - r->start + 1, pdata->name)) { printk(KERN_ERR "%s: Error while requesting mem region\n", __func__); @@ -484,7 +484,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev) pdata->cpu_vbase[0] = devm_ioremap(&pdev->dev, r->start, - resource_size(r)); + r->end - r->start + 1); if (!pdata->cpu_vbase[0]) { printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__); res = -ENOMEM; @@ -501,7 +501,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev) if (!devm_request_mem_region(&pdev->dev, r->start, - resource_size(r), + r->end - r->start + 1, pdata->name)) { printk(KERN_ERR "%s: Error while requesting mem region\n", __func__); @@ -511,7 +511,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev) pdata->cpu_vbase[1] = devm_ioremap(&pdev->dev, r->start, - resource_size(r)); + r->end - r->start + 1); if 
(!pdata->cpu_vbase[1]) { printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__); res = -ENOMEM; @@ -726,7 +726,7 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev) if (!devm_request_mem_region(&pdev->dev, r->start, - resource_size(r), + r->end - r->start + 1, pdata->name)) { printk(KERN_ERR "%s: Error while requesting mem region\n", __func__); @@ -736,7 +736,7 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev) pdata->mc_vbase = devm_ioremap(&pdev->dev, r->start, - resource_size(r)); + r->end - r->start + 1); if (!pdata->mc_vbase) { printk(KERN_ERR "%s: Unable to setup MC err regs\n", __func__); res = -ENOMEM; diff --git a/trunk/drivers/idle/i7300_idle.c b/trunk/drivers/idle/i7300_idle.c index 1f20a042a4f5..949c97ff57e3 100644 --- a/trunk/drivers/idle/i7300_idle.c +++ b/trunk/drivers/idle/i7300_idle.c @@ -29,8 +29,8 @@ #include -#include "../dma/ioat/hw.h" -#include "../dma/ioat/registers.h" +#include "../dma/ioatdma_hw.h" +#include "../dma/ioatdma_registers.h" #define I7300_IDLE_DRIVER_VERSION "1.55" #define I7300_PRINT "i7300_idle:" @@ -126,9 +126,9 @@ static void i7300_idle_ioat_stop(void) udelay(10); sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & - IOAT_CHANSTS_STATUS; + IOAT_CHANSTS_DMA_TRANSFER_STATUS; - if (sts != IOAT_CHANSTS_ACTIVE) + if (sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) break; } @@ -160,9 +160,9 @@ static int __init i7300_idle_ioat_selftest(u8 *ctl, udelay(1000); chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & - IOAT_CHANSTS_STATUS; + IOAT_CHANSTS_DMA_TRANSFER_STATUS; - if (chan_sts != IOAT_CHANSTS_DONE) { + if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE) { /* Not complete, reset the channel */ writeb(IOAT_CHANCMD_RESET, ioat_chanbase + IOAT1_CHANCMD_OFFSET); @@ -288,9 +288,9 @@ static void __exit i7300_idle_ioat_exit(void) ioat_chanbase + IOAT1_CHANCMD_OFFSET); chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & - IOAT_CHANSTS_STATUS; + IOAT_CHANSTS_DMA_TRANSFER_STATUS; - if (chan_sts != IOAT_CHANSTS_ACTIVE) { + if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) { writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET); break; } @@ -298,14 +298,14 @@ static void __exit i7300_idle_ioat_exit(void) } chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & - IOAT_CHANSTS_STATUS; + IOAT_CHANSTS_DMA_TRANSFER_STATUS; /* * We tried to reset multiple times. If IO A/T channel is still active * flag an error and return without cleanup. Memory leak is better * than random corruption in that extreme error situation. */ - if (chan_sts == IOAT_CHANSTS_ACTIVE) { + if (chan_sts == IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) { printk(KERN_ERR I7300_PRINT "Unable to stop IO A/T channels." 
" Not freeing resources\n"); return; diff --git a/trunk/drivers/input/misc/Kconfig b/trunk/drivers/input/misc/Kconfig index 02f4f8f1db6f..76d6751f89a7 100644 --- a/trunk/drivers/input/misc/Kconfig +++ b/trunk/drivers/input/misc/Kconfig @@ -225,7 +225,6 @@ config INPUT_SGI_BTNS config INPUT_WINBOND_CIR tristate "Winbond IR remote control" depends on X86 && PNP - select NEW_LEDS select LEDS_CLASS select BITREVERSE help diff --git a/trunk/drivers/md/Kconfig b/trunk/drivers/md/Kconfig index 2158377a1359..020f9573fd82 100644 --- a/trunk/drivers/md/Kconfig +++ b/trunk/drivers/md/Kconfig @@ -124,8 +124,6 @@ config MD_RAID456 select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR - select ASYNC_PQ - select ASYNC_RAID6_RECOV ---help--- A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure @@ -154,33 +152,9 @@ config MD_RAID456 If unsure, say Y. -config MULTICORE_RAID456 - bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" - depends on MD_RAID456 - depends on SMP - depends on EXPERIMENTAL - ---help--- - Enable the raid456 module to dispatch per-stripe raid operations to a - thread pool. - - If unsure, say N. - config MD_RAID6_PQ tristate -config ASYNC_RAID6_TEST - tristate "Self test for hardware accelerated raid6 recovery" - depends on MD_RAID6_PQ - select ASYNC_RAID6_RECOV - ---help--- - This is a one-shot self test that permutes through the - recovery of all the possible two disk failure scenarios for a - N-disk array. Recovery is performed with the asynchronous - raid6 recovery routines, and will optionally use an offload - engine if one is available. - - If unsure, say N. - config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/trunk/drivers/md/bitmap.c b/trunk/drivers/md/bitmap.c index 6986b0059d23..3319c2fec28e 100644 --- a/trunk/drivers/md/bitmap.c +++ b/trunk/drivers/md/bitmap.c @@ -108,8 +108,6 @@ static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page) * allocated while we're using it */ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create) -__releases(bitmap->lock) -__acquires(bitmap->lock) { unsigned char *mappage; @@ -327,6 +325,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) return 0; bad_alignment: + rcu_read_unlock(); return -EINVAL; } @@ -1208,8 +1207,6 @@ void bitmap_daemon_work(struct bitmap *bitmap) static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, sector_t offset, int *blocks, int create) -__releases(bitmap->lock) -__acquires(bitmap->lock) { /* If 'create', we might release the lock and reclaim it. * The lock must have been taken with interrupts enabled. 
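The MD_RAID456 help text above puts the usable capacity of a RAID-5 set at C * (N - 1) MB, and it is worth sanity-checking with concrete numbers: an array of N = 4 drives at C = 500 MB each has 4 * 500 = 2000 MB of raw space, stores 500 * (4 - 1) = 1500 MB of data, and spends the remaining 500 MB (one drive's worth, distributed as parity across all members) on redundancy. A RAID-6 array, which survives two simultaneous failures, would instead give C * (N - 2) = 1000 MB:

	usable_raid5 = C * (N - 1)	/* N = 4, C = 500 -> 1500 MB */
	usable_raid6 = C * (N - 2)	/* N = 4, C = 500 -> 1000 MB */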
diff --git a/trunk/drivers/md/linear.c b/trunk/drivers/md/linear.c index 1ceceb334d5e..ea4842905444 100644 --- a/trunk/drivers/md/linear.c +++ b/trunk/drivers/md/linear.c @@ -108,9 +108,6 @@ static int linear_congested(void *data, int bits) linear_conf_t *conf; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - rcu_read_lock(); conf = rcu_dereference(mddev->private); diff --git a/trunk/drivers/md/md.c b/trunk/drivers/md/md.c index 26ba42a79129..6aa497e4baf8 100644 --- a/trunk/drivers/md/md.c +++ b/trunk/drivers/md/md.c @@ -262,12 +262,6 @@ static void mddev_resume(mddev_t *mddev) mddev->pers->quiesce(mddev, 0); } -int mddev_congested(mddev_t *mddev, int bits) -{ - return mddev->suspended; -} -EXPORT_SYMBOL(mddev_congested); - static inline mddev_t *mddev_get(mddev_t *mddev) { @@ -4224,7 +4218,7 @@ static int do_md_run(mddev_t * mddev) set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "resync"); + "%s_resync"); if (!mddev->sync_thread) { printk(KERN_ERR "%s: could not start resync" " thread...\n", @@ -4581,10 +4575,10 @@ static int get_version(void __user * arg) static int get_array_info(mddev_t * mddev, void __user * arg) { mdu_array_info_t info; - int nr,working,insync,failed,spare; + int nr,working,active,failed,spare; mdk_rdev_t *rdev; - nr=working=insync=failed=spare=0; + nr=working=active=failed=spare=0; list_for_each_entry(rdev, &mddev->disks, same_set) { nr++; if (test_bit(Faulty, &rdev->flags)) @@ -4592,7 +4586,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) else { working++; if (test_bit(In_sync, &rdev->flags)) - insync++; + active++; else spare++; } @@ -4617,7 +4611,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.state = (1<bitmap && mddev->bitmap_offset) info.state = (1<disks)) { mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, mdk_rdev_t, same_set); - err = super_types[mddev->major_version] + int err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { printk(KERN_WARNING @@ -5637,10 +5631,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, thread->run = run; thread->mddev = mddev; thread->timeout = MAX_SCHEDULE_TIMEOUT; - thread->tsk = kthread_run(md_thread, thread, - "%s_%s", - mdname(thread->mddev), - name ?: mddev->pers->name); + thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; @@ -6754,7 +6745,7 @@ void md_check_recovery(mddev_t *mddev) } mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "resync"); + "%s_resync"); if (!mddev->sync_thread) { printk(KERN_ERR "%s: could not start resync" " thread...\n", diff --git a/trunk/drivers/md/md.h b/trunk/drivers/md/md.h index f184b69ef337..f55d2ff95133 100644 --- a/trunk/drivers/md/md.h +++ b/trunk/drivers/md/md.h @@ -430,7 +430,6 @@ extern void md_write_end(mddev_t *mddev); extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); -extern int mddev_congested(mddev_t *mddev, int bits); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); diff --git a/trunk/drivers/md/multipath.c b/trunk/drivers/md/multipath.c index ee7646f974a0..d2d3fd54cc68 100644 --- a/trunk/drivers/md/multipath.c +++ b/trunk/drivers/md/multipath.c @@ -150,6 +150,7 @@ static int multipath_make_request (struct request_queue *q, struct 
bio * bio) } mp_bh = mempool_alloc(conf->pool, GFP_NOIO); + memset(mp_bh, 0, sizeof(*mp_bh)); mp_bh->master_bio = bio; mp_bh->mddev = mddev; @@ -198,9 +199,6 @@ static int multipath_congested(void *data, int bits) multipath_conf_t *conf = mddev->private; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - rcu_read_lock(); for (i = 0; i < mddev->raid_disks ; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); @@ -506,7 +504,7 @@ static int multipath_run (mddev_t *mddev) } { - mddev->thread = md_register_thread(multipathd, mddev, NULL); + mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); if (!mddev->thread) { printk(KERN_ERR "multipath: couldn't allocate thread" " for %s\n", mdname(mddev)); diff --git a/trunk/drivers/md/raid0.c b/trunk/drivers/md/raid0.c index d3a4ce06015a..f845ed98fec9 100644 --- a/trunk/drivers/md/raid0.c +++ b/trunk/drivers/md/raid0.c @@ -44,9 +44,6 @@ static int raid0_congested(void *data, int bits) mdk_rdev_t **devlist = conf->devlist; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); @@ -89,7 +86,7 @@ static void dump_zones(mddev_t *mddev) static int create_strip_zones(mddev_t *mddev) { - int i, c, err; + int i, c, j, err; sector_t curr_zone_end, sectors; mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; struct strip_zone *zone; @@ -201,8 +198,6 @@ static int create_strip_zones(mddev_t *mddev) /* now do the other zones */ for (i = 1; i < conf->nr_strip_zones; i++) { - int j; - zone = conf->strip_zone + i; dev = conf->devlist + i * mddev->raid_disks; @@ -212,6 +207,7 @@ static int create_strip_zones(mddev_t *mddev) c = 0; for (j=0; jdevlist[j]; printk(KERN_INFO "raid0: checking %s ...", bdevname(rdev->bdev, b)); diff --git a/trunk/drivers/md/raid1.c b/trunk/drivers/md/raid1.c index d1b9bd5fd4f6..ff7ed3335995 100644 --- a/trunk/drivers/md/raid1.c +++ b/trunk/drivers/md/raid1.c @@ -576,9 +576,6 @@ static int raid1_congested(void *data, int bits) conf_t *conf = mddev->private; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - rcu_read_lock(); for (i = 0; i < mddev->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); @@ -854,7 +851,7 @@ static int make_request(struct request_queue *q, struct bio * bio) read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r1_bio; generic_make_request(read_bio); @@ -946,8 +943,7 @@ static int make_request(struct request_queue *q, struct bio * bio) mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) | - (do_sync << BIO_RW_SYNCIO); + mbio->bi_rw = WRITE | do_barriers | do_sync; mbio->bi_private = r1_bio; if (behind_pages) { @@ -1627,8 +1623,7 @@ static void raid1d(mddev_t *mddev) conf->mirrors[i].rdev->data_offset; bio->bi_bdev = conf->mirrors[i].rdev->bdev; bio->bi_end_io = raid1_end_write_request; - bio->bi_rw = WRITE | - (do_sync << BIO_RW_SYNCIO); + bio->bi_rw = WRITE | do_sync; bio->bi_private = r1_bio; r1_bio->bios[i] = bio; generic_make_request(bio); @@ -1677,7 +1672,7 @@ static void raid1d(mddev_t *mddev) bio->bi_sector = r1_bio->sector + 
rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + bio->bi_rw = READ | do_sync; bio->bi_private = r1_bio; unplug = 1; generic_make_request(bio); @@ -2052,7 +2047,7 @@ static int run(mddev_t *mddev) conf->last_used = j; - mddev->thread = md_register_thread(raid1d, mddev, NULL); + mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); if (!mddev->thread) { printk(KERN_ERR "raid1: couldn't allocate thread for %s\n", diff --git a/trunk/drivers/md/raid10.c b/trunk/drivers/md/raid10.c index 51c4c5c4d87a..d0a2152e064f 100644 --- a/trunk/drivers/md/raid10.c +++ b/trunk/drivers/md/raid10.c @@ -631,8 +631,6 @@ static int raid10_congested(void *data, int bits) conf_t *conf = mddev->private; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; rcu_read_lock(); for (i = 0; i < mddev->raid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); @@ -884,7 +882,7 @@ static int make_request(struct request_queue *q, struct bio * bio) mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; generic_make_request(read_bio); @@ -952,7 +950,7 @@ static int make_request(struct request_queue *q, struct bio * bio) conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO); + mbio->bi_rw = WRITE | do_sync; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1625,7 +1623,7 @@ static void raid10d(mddev_t *mddev) bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + bio->bi_rw = READ | do_sync; bio->bi_private = r10_bio; bio->bi_end_io = raid10_end_read_request; unplug = 1; @@ -1775,7 +1773,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* recovery... 
the complicated one */ - int j, k; + int i, j, k; r10_bio = NULL; for (i=0 ; iraid_disks; i++) @@ -2190,7 +2188,7 @@ static int run(mddev_t *mddev) } - mddev->thread = md_register_thread(raid10d, mddev, NULL); + mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); if (!mddev->thread) { printk(KERN_ERR "raid10: couldn't allocate thread for %s\n", diff --git a/trunk/drivers/md/raid5.c b/trunk/drivers/md/raid5.c index 94829804ab7f..826eb3467357 100644 --- a/trunk/drivers/md/raid5.c +++ b/trunk/drivers/md/raid5.c @@ -47,9 +47,7 @@ #include #include #include -#include #include -#include #include "md.h" #include "raid5.h" #include "bitmap.h" @@ -501,18 +499,11 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, struct page *bio_page; int i; int page_offset; - struct async_submit_ctl submit; - enum async_tx_flags flags = 0; if (bio->bi_sector >= sector) page_offset = (signed)(bio->bi_sector - sector) * 512; else page_offset = (signed)(sector - bio->bi_sector) * -512; - - if (frombio) - flags |= ASYNC_TX_FENCE; - init_async_submit(&submit, flags, tx, NULL, NULL, NULL); - bio_for_each_segment(bvl, bio, i) { int len = bio_iovec_idx(bio, i)->bv_len; int clen; @@ -534,14 +525,15 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, bio_page = bio_iovec_idx(bio, i)->bv_page; if (frombio) tx = async_memcpy(page, bio_page, page_offset, - b_offset, clen, &submit); + b_offset, clen, + ASYNC_TX_DEP_ACK, + tx, NULL, NULL); else tx = async_memcpy(bio_page, page, b_offset, - page_offset, clen, &submit); + page_offset, clen, + ASYNC_TX_DEP_ACK, + tx, NULL, NULL); } - /* chain the operations */ - submit.depend_tx = tx; - if (clen < len) /* hit end of page */ break; page_offset += len; @@ -600,7 +592,6 @@ static void ops_run_biofill(struct stripe_head *sh) { struct dma_async_tx_descriptor *tx = NULL; raid5_conf_t *conf = sh->raid_conf; - struct async_submit_ctl submit; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -624,34 +615,22 @@ static void ops_run_biofill(struct stripe_head *sh) } atomic_inc(&sh->count); - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); - async_trigger_callback(&submit); + async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, + ops_complete_biofill, sh); } -static void mark_target_uptodate(struct stripe_head *sh, int target) -{ - struct r5dev *tgt; - - if (target < 0) - return; - - tgt = &sh->dev[target]; - set_bit(R5_UPTODATE, &tgt->flags); - BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - clear_bit(R5_Wantcompute, &tgt->flags); -} - -static void ops_complete_compute(void *stripe_head_ref) +static void ops_complete_compute5(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; + int target = sh->ops.target; + struct r5dev *tgt = &sh->dev[target]; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - /* mark the computed target(s) as uptodate */ - mark_target_uptodate(sh, sh->ops.target); - mark_target_uptodate(sh, sh->ops.target2); - + set_bit(R5_UPTODATE, &tgt->flags); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + clear_bit(R5_Wantcompute, &tgt->flags); clear_bit(STRIPE_COMPUTE_RUN, &sh->state); if (sh->check_state == check_state_compute_run) sh->check_state = check_state_compute_result; @@ -659,24 +638,16 @@ static void ops_complete_compute(void *stripe_head_ref) release_stripe(sh); } -/* return a pointer to the address conversion region of the scribble buffer */ -static addr_conv_t *to_addr_conv(struct stripe_head *sh, - struct raid5_percpu *percpu) -{ - return 
percpu->scribble + sizeof(struct page *) * (sh->disks + 2); -} - -static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) +static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) { + /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page **xor_srcs = percpu->scribble; + struct page *xor_srcs[disks]; int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; struct page *xor_dest = tgt->page; int count = 0; struct dma_async_tx_descriptor *tx; - struct async_submit_ctl submit; int i; pr_debug("%s: stripe %llu block: %d\n", @@ -689,215 +660,17 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) atomic_inc(&sh->count); - init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, - ops_complete_compute, sh, to_addr_conv(sh, percpu)); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); - else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); - - return tx; -} - -/* set_syndrome_sources - populate source buffers for gen_syndrome - * @srcs - (struct page *) array of size sh->disks - * @sh - stripe_head to parse - * - * Populates srcs in proper layout order for the stripe and returns the - * 'count' of sources to be used in a call to async_gen_syndrome. The P - * destination buffer is recorded in srcs[count] and the Q destination - * is recorded in srcs[count+1]]. - */ -static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) -{ - int disks = sh->disks; - int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); - int d0_idx = raid6_d0(sh); - int count; - int i; - - for (i = 0; i < disks; i++) - srcs[i] = (void *)raid6_empty_zero_page; - - count = 0; - i = d0_idx; - do { - int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); - - srcs[slot] = sh->dev[i].page; - i = raid6_next_disk(i, disks); - } while (i != d0_idx); - BUG_ON(count != syndrome_disks); - - return count; -} - -static struct dma_async_tx_descriptor * -ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) -{ - int disks = sh->disks; - struct page **blocks = percpu->scribble; - int target; - int qd_idx = sh->qd_idx; - struct dma_async_tx_descriptor *tx; - struct async_submit_ctl submit; - struct r5dev *tgt; - struct page *dest; - int i; - int count; - - if (sh->ops.target < 0) - target = sh->ops.target2; - else if (sh->ops.target2 < 0) - target = sh->ops.target; + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, + 0, NULL, ops_complete_compute5, sh); else - /* we should only have one valid target */ - BUG(); - BUG_ON(target < 0); - pr_debug("%s: stripe %llu block: %d\n", - __func__, (unsigned long long)sh->sector, target); - - tgt = &sh->dev[target]; - BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - dest = tgt->page; - - atomic_inc(&sh->count); - - if (target == qd_idx) { - count = set_syndrome_sources(blocks, sh); - blocks[count] = NULL; /* regenerating p is not necessary */ - BUG_ON(blocks[count+1] != dest); /* q should already be set */ - init_async_submit(&submit, ASYNC_TX_FENCE, NULL, - ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); - } else { - /* Compute any data- or p-drive using XOR */ - count = 0; - for (i = disks; i-- ; ) { - if (i == target || i == qd_idx) - continue; - blocks[count++] = sh->dev[i].page; - } - - init_async_submit(&submit, 
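The set_syndrome_sources comment above pins down a precise array shape: data pages occupy slots 0..count-1, the P destination sits at srcs[count], and Q at srcs[count+1]. A small stand-alone illustration of that layout, with plain char buffers instead of struct page:

#include <stdio.h>

int main(void)
{
        enum { DATA_DISKS = 4 };
        char d[DATA_DISKS][8], p[8], q[8];
        char *srcs[DATA_DISKS + 2];
        int count = 0, i;

        for (i = 0; i < DATA_DISKS; i++)
                srcs[count++] = d[i];   /* slots 0..count-1: data pages   */
        srcs[count] = p;                /* P destination at srcs[count]   */
        srcs[count + 1] = q;            /* Q destination at srcs[count+1] */

        printf("count=%d, P slot=%d, Q slot=%d\n", count, count, count + 1);
        return 0;
}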
ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, - NULL, ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); - } + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + ASYNC_TX_XOR_ZERO_DST, NULL, + ops_complete_compute5, sh); return tx; } -static struct dma_async_tx_descriptor * -ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) -{ - int i, count, disks = sh->disks; - int syndrome_disks = sh->ddf_layout ? disks : disks-2; - int d0_idx = raid6_d0(sh); - int faila = -1, failb = -1; - int target = sh->ops.target; - int target2 = sh->ops.target2; - struct r5dev *tgt = &sh->dev[target]; - struct r5dev *tgt2 = &sh->dev[target2]; - struct dma_async_tx_descriptor *tx; - struct page **blocks = percpu->scribble; - struct async_submit_ctl submit; - - pr_debug("%s: stripe %llu block1: %d block2: %d\n", - __func__, (unsigned long long)sh->sector, target, target2); - BUG_ON(target < 0 || target2 < 0); - BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); - - /* we need to open-code set_syndrome_sources to handle the - * slot number conversion for 'faila' and 'failb' - */ - for (i = 0; i < disks ; i++) - blocks[i] = (void *)raid6_empty_zero_page; - count = 0; - i = d0_idx; - do { - int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); - - blocks[slot] = sh->dev[i].page; - - if (i == target) - faila = slot; - if (i == target2) - failb = slot; - i = raid6_next_disk(i, disks); - } while (i != d0_idx); - BUG_ON(count != syndrome_disks); - - BUG_ON(faila == failb); - if (failb < faila) - swap(faila, failb); - pr_debug("%s: stripe: %llu faila: %d failb: %d\n", - __func__, (unsigned long long)sh->sector, faila, failb); - - atomic_inc(&sh->count); - - if (failb == syndrome_disks+1) { - /* Q disk is one of the missing disks */ - if (faila == syndrome_disks) { - /* Missing P+Q, just recompute */ - init_async_submit(&submit, ASYNC_TX_FENCE, NULL, - ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - return async_gen_syndrome(blocks, 0, count+2, - STRIPE_SIZE, &submit); - } else { - struct page *dest; - int data_target; - int qd_idx = sh->qd_idx; - - /* Missing D+Q: recompute D from P, then recompute Q */ - if (target == qd_idx) - data_target = target2; - else - data_target = target; - - count = 0; - for (i = disks; i-- ; ) { - if (i == data_target || i == qd_idx) - continue; - blocks[count++] = sh->dev[i].page; - } - dest = sh->dev[data_target].page; - init_async_submit(&submit, - ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, - NULL, NULL, NULL, - to_addr_conv(sh, percpu)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, - &submit); - - count = set_syndrome_sources(blocks, sh); - init_async_submit(&submit, ASYNC_TX_FENCE, tx, - ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - return async_gen_syndrome(blocks, 0, count+2, - STRIPE_SIZE, &submit); - } - } else { - init_async_submit(&submit, ASYNC_TX_FENCE, NULL, - ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - if (failb == syndrome_disks) { - /* We're missing D+P. */ - return async_raid6_datap_recov(syndrome_disks+2, - STRIPE_SIZE, faila, - blocks, &submit); - } else { - /* We're missing D+D. 
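ops_run_compute6_2 above classifies a double failure by where the two bad slots land: failb == syndrome_disks+1 means Q is missing (with P+Q and D+Q sub-cases), failb == syndrome_disks means D+P, and anything else is D+D. A compact sketch of just that case analysis; the slot convention follows the hunk, the rest is illustrative:

#include <stdio.h>

/* Classify a RAID-6 double failure by syndrome slot, echoing the
 * branch structure above; 'sd' is syndrome_disks, faila < failb. */
static const char *recovery_plan(int faila, int failb, int sd)
{
        if (failb == sd + 1)            /* Q is one of the failures */
                return faila == sd ?
                        "P+Q: regenerate both via gen_syndrome" :
                        "D+Q: XOR-rebuild D, then regenerate Q";
        if (failb == sd)                /* P is one of the failures */
                return "D+P: datap recovery";
        return "D+D: two-data recovery";
}

int main(void)
{
        int sd = 4;                     /* e.g. four data disks */

        printf("%s\n", recovery_plan(1, 3, sd));
        printf("%s\n", recovery_plan(2, sd, sd));
        printf("%s\n", recovery_plan(1, sd + 1, sd));
        printf("%s\n", recovery_plan(sd, sd + 1, sd));
        return 0;
}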
*/ - return async_raid6_2data_recov(syndrome_disks+2, - STRIPE_SIZE, faila, failb, - blocks, &submit); - } - } -} - - static void ops_complete_prexor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; @@ -907,13 +680,12 @@ static void ops_complete_prexor(void *stripe_head_ref) } static struct dma_async_tx_descriptor * -ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) +ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { + /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page **xor_srcs = percpu->scribble; + struct page *xor_srcs[disks]; int count = 0, pd_idx = sh->pd_idx, i; - struct async_submit_ctl submit; /* existing parity data subtracted */ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; @@ -928,9 +700,9 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, xor_srcs[count++] = dev->page; } - init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, - ops_complete_prexor, sh, to_addr_conv(sh, percpu)); - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, + ops_complete_prexor, sh); return tx; } @@ -970,21 +742,17 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) return tx; } -static void ops_complete_reconstruct(void *stripe_head_ref) +static void ops_complete_postxor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int disks = sh->disks; - int pd_idx = sh->pd_idx; - int qd_idx = sh->qd_idx; - int i; + int disks = sh->disks, i, pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - - if (dev->written || i == pd_idx || i == qd_idx) + if (dev->written || i == pd_idx) set_bit(R5_UPTODATE, &dev->flags); } @@ -1002,12 +770,12 @@ static void ops_complete_reconstruct(void *stripe_head_ref) } static void -ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) +ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { + /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page **xor_srcs = percpu->scribble; - struct async_submit_ctl submit; + struct page *xor_srcs[disks]; + int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; int prexor = 0; @@ -1041,36 +809,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST * for the synchronous xor case */ - flags = ASYNC_TX_ACK | + flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | (prexor ? 
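The prexor/postxor pairing above is the read-modify-write parity update: first XOR the old parity with the old contents of the blocks being rewritten, then XOR in the new data, relying on P_new = P_old ^ D_old ^ D_new. A one-screen demonstration of that identity on bytes (values invented):

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned char d_old = 0x5a, d_other = 0x33, d_new = 0xc7;
        unsigned char p_old, prexor, p_new;

        p_old  = d_old ^ d_other;       /* parity before the write   */
        prexor = p_old ^ d_old;         /* prexor: subtract old data */
        p_new  = prexor ^ d_new;        /* postxor: add new data     */

        assert(p_new == (unsigned char)(d_new ^ d_other));
        printf("p_old=%#x p_new=%#x\n", p_old, p_new);
        return 0;
}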
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); atomic_inc(&sh->count); - init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, - to_addr_conv(sh, percpu)); - if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); - else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); -} - -static void -ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) -{ - struct async_submit_ctl submit; - struct page **blocks = percpu->scribble; - int count; - - pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - - count = set_syndrome_sources(blocks, sh); - - atomic_inc(&sh->count); - - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, - sh, to_addr_conv(sh, percpu)); - async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + if (unlikely(count == 1)) { + flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, + flags, tx, ops_complete_postxor, sh); + } else + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + flags, tx, ops_complete_postxor, sh); } static void ops_complete_check(void *stripe_head_ref) @@ -1085,115 +835,63 @@ static void ops_complete_check(void *stripe_head_ref) release_stripe(sh); } -static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) +static void ops_run_check(struct stripe_head *sh) { + /* kernel stack size limits the total number of disks */ int disks = sh->disks; - int pd_idx = sh->pd_idx; - int qd_idx = sh->qd_idx; - struct page *xor_dest; - struct page **xor_srcs = percpu->scribble; + struct page *xor_srcs[disks]; struct dma_async_tx_descriptor *tx; - struct async_submit_ctl submit; - int count; - int i; + + int count = 0, pd_idx = sh->pd_idx, i; + struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - count = 0; - xor_dest = sh->dev[pd_idx].page; - xor_srcs[count++] = xor_dest; for (i = disks; i--; ) { - if (i == pd_idx || i == qd_idx) - continue; - xor_srcs[count++] = sh->dev[i].page; + struct r5dev *dev = &sh->dev[i]; + if (i != pd_idx) + xor_srcs[count++] = dev->page; } - init_async_submit(&submit, 0, NULL, NULL, NULL, - to_addr_conv(sh, percpu)); - tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - &sh->ops.zero_sum_result, &submit); - - atomic_inc(&sh->count); - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); - tx = async_trigger_callback(&submit); -} - -static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) -{ - struct page **srcs = percpu->scribble; - struct async_submit_ctl submit; - int count; - - pr_debug("%s: stripe %llu checkp: %d\n", __func__, - (unsigned long long)sh->sector, checkp); - - count = set_syndrome_sources(srcs, sh); - if (!checkp) - srcs[count] = NULL; + tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); atomic_inc(&sh->count); - init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, - sh, to_addr_conv(sh, percpu)); - async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, - &sh->ops.zero_sum_result, percpu->spare_page, &submit); + tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, + ops_complete_check, sh); } -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +static void raid5_run_ops(struct stripe_head 
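The check path above leans on a simple invariant: XORing the parity block with every data block must leave all zeroes when the stripe is in sync, and zero_sum_result records whether any bit survived. A minimal sketch of that test with a toy two-disk stripe:

#include <stdio.h>

#define DEMO_STRIPE 8

int main(void)
{
        unsigned char d0[DEMO_STRIPE] = "raid5!!";
        unsigned char d1[DEMO_STRIPE] = "parity.";
        unsigned char p[DEMO_STRIPE], sum;
        int i, zero_sum_result = 0;

        for (i = 0; i < DEMO_STRIPE; i++)
                p[i] = d0[i] ^ d1[i];           /* write-side parity */

        for (i = 0; i < DEMO_STRIPE; i++) {
                sum = p[i] ^ d0[i] ^ d1[i];     /* check-side xor    */
                zero_sum_result |= sum;
        }
        printf("zero_sum_result=%d (%s)\n", zero_sum_result,
               zero_sum_result ? "parity mismatch" : "in sync");
        return 0;
}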
*sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; - raid5_conf_t *conf = sh->raid_conf; - int level = conf->level; - struct raid5_percpu *percpu; - unsigned long cpu; - cpu = get_cpu(); - percpu = per_cpu_ptr(conf->percpu, cpu); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { - if (level < 6) - tx = ops_run_compute5(sh, percpu); - else { - if (sh->ops.target2 < 0 || sh->ops.target < 0) - tx = ops_run_compute6_1(sh, percpu); - else - tx = ops_run_compute6_2(sh, percpu); - } - /* terminate the chain if reconstruct is not set to be run */ - if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) + tx = ops_run_compute5(sh); + /* terminate the chain if postxor is not set to be run */ + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) async_tx_ack(tx); } if (test_bit(STRIPE_OP_PREXOR, &ops_request)) - tx = ops_run_prexor(sh, percpu, tx); + tx = ops_run_prexor(sh, tx); if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); overlap_clear++; } - if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { - if (level < 6) - ops_run_reconstruct5(sh, percpu, tx); - else - ops_run_reconstruct6(sh, percpu, tx); - } + if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) + ops_run_postxor(sh, tx); - if (test_bit(STRIPE_OP_CHECK, &ops_request)) { - if (sh->check_state == check_state_run) - ops_run_check_p(sh, percpu); - else if (sh->check_state == check_state_run_q) - ops_run_check_pq(sh, percpu, 0); - else if (sh->check_state == check_state_run_pq) - ops_run_check_pq(sh, percpu, 1); - else - BUG(); - } + if (test_bit(STRIPE_OP_CHECK, &ops_request)) + ops_run_check(sh); if (overlap_clear) for (i = disks; i--; ) { @@ -1201,7 +899,6 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - put_cpu(); } static int grow_one_stripe(raid5_conf_t *conf) @@ -1251,28 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num) return 0; } -/** - * scribble_len - return the required size of the scribble region - * @num - total number of disks in the array - * - * The size must be enough to contain: - * 1/ a struct page pointer for each device in the array +2 - * 2/ room to convert each entry in (1) to its corresponding dma - * (dma_map_page()) or page (page_address()) address. - * - * Note: the +2 is for the destination buffers of the ddf/raid6 case where we - * calculate over all devices (not just the data blocks), using zeros in place - * of the P and Q blocks. - */ -static size_t scribble_len(int num) -{ - size_t len; - - len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); - - return len; -} - static int resize_stripes(raid5_conf_t *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. @@ -1301,7 +976,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - unsigned long cpu; int err; struct kmem_cache *sc; int i; @@ -1367,7 +1041,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) /* Step 3. * At this point, we are holding all the stripes so the array * is completely stalled, so now is a good time to resize - * conf->disks and the scribble region + * conf->disks. 
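The scribble_len comment removed above states its own sizing rule: a page pointer for each device plus two, and equal room for the corresponding address conversions. Restated in user space, with addr_conv_t stubbed as a pointer-sized type (an assumption; the real typedef lives in the async_tx headers):

#include <stdio.h>
#include <stddef.h>

typedef void *demo_addr_conv_t;  /* stand-in for the async_tx addr_conv_t */

static size_t demo_scribble_len(int num)
{
        /* page pointers for every device +2 (the P and Q slots),
         * plus an address-conversion entry of the same count */
        return sizeof(void *) * (num + 2) +
               sizeof(demo_addr_conv_t) * (num + 2);
}

int main(void)
{
        printf("scribble for 8 disks: %zu bytes\n", demo_scribble_len(8));
        return 0;
}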
*/ ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); if (ndisks) { @@ -1378,30 +1052,10 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) } else err = -ENOMEM; - get_online_cpus(); - conf->scribble_len = scribble_len(newsize); - for_each_present_cpu(cpu) { - struct raid5_percpu *percpu; - void *scribble; - - percpu = per_cpu_ptr(conf->percpu, cpu); - scribble = kmalloc(conf->scribble_len, GFP_NOIO); - - if (scribble) { - kfree(percpu->scribble); - percpu->scribble = scribble; - } else { - err = -ENOMEM; - break; - } - } - put_online_cpus(); - /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); list_del_init(&nsh->lru); - for (i=conf->raid_disks; i < newsize; i++) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); @@ -1940,13 +1594,258 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) } + +/* + * Copy data between a page in the stripe cache, and one or more bion + * The page could align with the middle of the bio, or there could be + * several bion, each with several bio_vecs, which cover part of the page + * Multiple bion are linked together on bi_next. There may be extras + * at the end of this list. We ignore them. + */ +static void copy_data(int frombio, struct bio *bio, + struct page *page, + sector_t sector) +{ + char *pa = page_address(page); + struct bio_vec *bvl; + int i; + int page_offset; + + if (bio->bi_sector >= sector) + page_offset = (signed)(bio->bi_sector - sector) * 512; + else + page_offset = (signed)(sector - bio->bi_sector) * -512; + bio_for_each_segment(bvl, bio, i) { + int len = bio_iovec_idx(bio,i)->bv_len; + int clen; + int b_offset = 0; + + if (page_offset < 0) { + b_offset = -page_offset; + page_offset += b_offset; + len -= b_offset; + } + + if (len > 0 && page_offset + len > STRIPE_SIZE) + clen = STRIPE_SIZE - page_offset; + else clen = len; + + if (clen > 0) { + char *ba = __bio_kmap_atomic(bio, i, KM_USER0); + if (frombio) + memcpy(pa+page_offset, ba+b_offset, clen); + else + memcpy(ba+b_offset, pa+page_offset, clen); + __bio_kunmap_atomic(ba, KM_USER0); + } + if (clen < len) /* hit end of page */ + break; + page_offset += len; + } +} + +#define check_xor() do { \ + if (count == MAX_XOR_BLOCKS) { \ + xor_blocks(count, STRIPE_SIZE, dest, ptr);\ + count = 0; \ + } \ + } while(0) + +static void compute_parity6(struct stripe_head *sh, int method) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + int syndrome_disks = sh->ddf_layout ? 
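copy_data above turns the sector delta between bio and stripe into a signed byte offset (512 bytes per sector) and clips each segment to the page. The same offset/clip arithmetic in isolation, with demo constants:

#include <stdio.h>

#define DEMO_STRIPE_SIZE 4096

/* Returns how many bytes of a 'len'-byte segment land inside the page,
 * setting *b_offset for a segment that starts before the page. */
static int demo_clip(int page_offset, int len, int *b_offset)
{
        *b_offset = 0;
        if (page_offset < 0) {          /* bio starts before this page */
                *b_offset = -page_offset;
                page_offset = 0;
                len -= *b_offset;
        }
        if (len > 0 && page_offset + len > DEMO_STRIPE_SIZE)
                return DEMO_STRIPE_SIZE - page_offset;
        return len > 0 ? len : 0;
}

int main(void)
{
        long bi_sector = 96, sector = 100;      /* bio begins 4 sectors early */
        int page_offset = (int)((bi_sector - sector) * 512);
        int b_offset, clen = demo_clip(page_offset, 4096, &b_offset);

        printf("page_offset=%d b_offset=%d clen=%d\n",
               page_offset, b_offset, clen);
        return 0;
}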
disks : (disks - 2); + struct bio *chosen; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[syndrome_disks+2]; + + pd_idx = sh->pd_idx; + qd_idx = sh->qd_idx; + d0_idx = raid6_d0(sh); + + pr_debug("compute_parity, stripe %llu, method %d\n", + (unsigned long long)sh->sector, method); + + switch(method) { + case READ_MODIFY_WRITE: + BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ + case RECONSTRUCT_WRITE: + for (i= disks; i-- ;) + if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { + chosen = sh->dev[i].towrite; + sh->dev[i].towrite = NULL; + + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + + BUG_ON(sh->dev[i].written); + sh->dev[i].written = chosen; + } + break; + case CHECK_PARITY: + BUG(); /* Not implemented yet */ + } + + for (i = disks; i--;) + if (sh->dev[i].written) { + sector_t sector = sh->dev[i].sector; + struct bio *wbi = sh->dev[i].written; + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { + copy_data(1, wbi, sh->dev[i].page, sector); + wbi = r5_next_bio(wbi, sector); + } + + set_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(R5_UPTODATE, &sh->dev[i].flags); + } + + /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ + + for (i = 0; i < disks; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + ptrs[slot] = page_address(sh->dev[i].page); + if (slot < syndrome_disks && + !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { + printk(KERN_ERR "block %d/%d not uptodate " + "on parity calc\n", i, count); + BUG(); + } + + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); + + raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); + + switch(method) { + case RECONSTRUCT_WRITE: + set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); + break; + case UPDATE_PARITY: + set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); + break; + } +} + + +/* Compute one missing block */ +static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) +{ + int i, count, disks = sh->disks; + void *ptr[MAX_XOR_BLOCKS], *dest, *p; + int qd_idx = sh->qd_idx; + + pr_debug("compute_block_1, stripe %llu, idx %d\n", + (unsigned long long)sh->sector, dd_idx); + + if ( dd_idx == qd_idx ) { + /* We're actually computing the Q drive */ + compute_parity6(sh, UPDATE_PARITY); + } else { + dest = page_address(sh->dev[dd_idx].page); + if (!nozero) memset(dest, 0, STRIPE_SIZE); + count = 0; + for (i = disks ; i--; ) { + if (i == dd_idx || i == qd_idx) + continue; + p = page_address(sh->dev[i].page); + if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) + ptr[count++] = p; + else + printk("compute_block() %d, stripe %llu, %d" + " not present\n", dd_idx, + (unsigned long long)sh->sector, i); + + check_xor(); + } + if (count) + xor_blocks(count, STRIPE_SIZE, dest, ptr); + if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); + else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); + } +} + +/* Compute two missing blocks */ +static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) +{ + int i, count, disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? 
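compute_block_1 above rebuilds a single missing data block by XORing P with every surviving data block, the exact inverse of parity generation. In miniature:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned char d[3] = { 0x11, 0x22, 0x33 };
        unsigned char p = d[0] ^ d[1] ^ d[2];   /* parity at write time */
        unsigned char rebuilt;

        /* 'lose' d[1], then rebuild it from the survivors plus P */
        rebuilt = p ^ d[0] ^ d[2];
        assert(rebuilt == d[1]);
        printf("rebuilt=%#x\n", rebuilt);
        return 0;
}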
disks : disks-2; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[syndrome_disks+2]; + + for (i = 0; i < disks ; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + ptrs[slot] = page_address(sh->dev[i].page); + + if (i == dd_idx1) + faila = slot; + if (i == dd_idx2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); + + BUG_ON(faila == failb); + if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } + + pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", + (unsigned long long)sh->sector, dd_idx1, dd_idx2, + faila, failb); + + if (failb == syndrome_disks+1) { + /* Q disk is one of the missing disks */ + if (faila == syndrome_disks) { + /* Missing P+Q, just recompute */ + compute_parity6(sh, UPDATE_PARITY); + return; + } else { + /* We're missing D+Q; recompute D from P */ + compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? + dd_idx2 : dd_idx1), + 0); + compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ + return; + } + } + + /* We're missing D+P or D+D; */ + if (failb == syndrome_disks) { + /* We're missing D+P. */ + raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); + } else { + /* We're missing D+D. */ + raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, + ptrs); + } + + /* Both the above update both missing blocks */ + set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); + set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); +} + static void -schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, +schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; - raid5_conf_t *conf = sh->raid_conf; - int level = conf->level; if (rcw) { /* if we are not expanding this is a proper write request, and @@ -1959,7 +1858,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, } else sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1972,18 +1871,17 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } - if (s->locked + conf->max_degraded == disks) + if (s->locked + 1 == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) - atomic_inc(&conf->pending_full_writes); + atomic_inc(&sh->raid_conf->pending_full_writes); } else { - BUG_ON(level == 6); BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); sh->reconstruct_state = reconstruct_state_prexor_drain_run; set_bit(STRIPE_OP_PREXOR, &s->ops_request); set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -2001,22 +1899,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, } } - /* keep the parity disk(s) locked while asynchronous operations + /* keep the parity disk locked while asynchronous operations * are in flight */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); s->locked++; - if (level == 6) { - int qd_idx = sh->qd_idx; - struct r5dev *dev = 
&sh->dev[qd_idx]; - - set_bit(R5_LOCKED, &dev->flags); - clear_bit(R5_UPTODATE, &dev->flags); - s->locked++; - } - pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, s->locked, s->ops_request); @@ -2097,6 +1986,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in static void end_reshape(raid5_conf_t *conf); +static int page_is_zero(struct page *p) +{ + char *a = page_address(p); + return ((*(u32*)a) == 0 && + memcmp(a, a+4, STRIPE_SIZE-4)==0); +} + static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, struct stripe_head *sh) { @@ -2236,10 +2132,9 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; - sh->ops.target2 = -1; s->req_compute = 1; /* Careful: from this point on 'uptodate' is in the eye - * of raid_run_ops which services 'compute' operations + * of raid5_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will * be R5_UPTODATE by the time it is needed for a * subsequent operation. @@ -2278,104 +2173,61 @@ static void handle_stripe_fill5(struct stripe_head *sh, set_bit(STRIPE_HANDLE, &sh->state); } -/* fetch_block6 - checks the given member device to see if its data needs - * to be read or computed to satisfy a request. - * - * Returns 1 when no more member devices need to be checked, otherwise returns - * 0 to tell the loop in handle_stripe_fill6 to continue - */ -static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, - struct r6_state *r6s, int disk_idx, int disks) -{ - struct r5dev *dev = &sh->dev[disk_idx]; - struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], - &sh->dev[r6s->failed_num[1]] }; - - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->failed >= 1 && - (fdev[0]->toread || s->to_write)) || - (s->failed >= 2 && - (fdev[1]->toread || s->to_write)))) { - /* we would like to get this block, possibly by computing it, - * otherwise read it if the backing disk is insync - */ - BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); - BUG_ON(test_bit(R5_Wantread, &dev->flags)); - if ((s->uptodate == disks - 1) && - (s->failed && (disk_idx == r6s->failed_num[0] || - disk_idx == r6s->failed_num[1]))) { - /* have disk failed, and we're requested to fetch it; - * do compute it - */ - pr_debug("Computing stripe %llu block %d\n", - (unsigned long long)sh->sector, disk_idx); - set_bit(STRIPE_COMPUTE_RUN, &sh->state); - set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); - set_bit(R5_Wantcompute, &dev->flags); - sh->ops.target = disk_idx; - sh->ops.target2 = -1; /* no 2nd target */ - s->req_compute = 1; - s->uptodate++; - return 1; - } else if (s->uptodate == disks-2 && s->failed >= 2) { - /* Computing 2-failure is *very* expensive; only - * do it if failed >= 2 - */ - int other; - for (other = disks; other--; ) { - if (other == disk_idx) - continue; - if (!test_bit(R5_UPTODATE, - &sh->dev[other].flags)) - break; - } - BUG_ON(other < 0); - pr_debug("Computing stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, - disk_idx, other); - set_bit(STRIPE_COMPUTE_RUN, &sh->state); - set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); - set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); - set_bit(R5_Wantcompute, &sh->dev[other].flags); - 
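page_is_zero above uses a compact idiom: if the first word is zero and the page compares equal to itself shifted by four bytes, then by induction every byte is zero. The same trick in user space, with the page size shrunk and the word read done via memcpy to sidestep alignment concerns:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define DEMO_PAGE 64

/* first word zero + buffer equal to itself shifted by 4
 * => every byte equals the one 4 bytes earlier => all zero */
static int demo_is_zero(const char *a)
{
        uint32_t head;

        memcpy(&head, a, sizeof(head));
        return head == 0 && memcmp(a, a + 4, DEMO_PAGE - 4) == 0;
}

int main(void)
{
        char buf[DEMO_PAGE] = { 0 };

        printf("all-zero page: %d\n", demo_is_zero(buf));
        buf[DEMO_PAGE - 1] = 1;
        printf("tainted page:  %d\n", demo_is_zero(buf));
        return 0;
}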
sh->ops.target = disk_idx; - sh->ops.target2 = other; - s->uptodate += 2; - s->req_compute = 1; - return 1; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - pr_debug("Reading block %d (sync=%d)\n", - disk_idx, s->syncing); - } - } - - return 0; -} - -/** - * handle_stripe_fill6 - read or compute data to satisfy pending requests. - */ static void handle_stripe_fill6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { int i; - - /* look for blocks to read/compute, skip this if a compute - * is already in flight, or if the stripe contents are in the - * midst of changing due to a write - */ - if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !sh->reconstruct_state) - for (i = disks; i--; ) - if (fetch_block6(sh, s, r6s, i, disks)) - break; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || (dev->towrite && + !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed >= 1 && + (sh->dev[r6s->failed_num[0]].toread || + s->to_write)) || + (s->failed >= 2 && + (sh->dev[r6s->failed_num[1]].toread || + s->to_write)))) { + /* we would like to get this block, possibly + * by computing it, but we might not be able to + */ + if ((s->uptodate == disks - 1) && + (s->failed && (i == r6s->failed_num[0] || + i == r6s->failed_num[1]))) { + pr_debug("Computing stripe %llu block %d\n", + (unsigned long long)sh->sector, i); + compute_block_1(sh, i, 0); + s->uptodate++; + } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { + /* Computing 2-failure is *very* expensive; only + * do it if failed >= 2 + */ + int other; + for (other = disks; other--; ) { + if (other == i) + continue; + if (!test_bit(R5_UPTODATE, + &sh->dev[other].flags)) + break; + } + BUG_ON(other < 0); + pr_debug("Computing stripe %llu blocks %d,%d\n", + (unsigned long long)sh->sector, + i, other); + compute_block_2(sh, i, other); + s->uptodate += 2; + } else if (test_bit(R5_Insync, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", + i, s->syncing); + } + } + } set_bit(STRIPE_HANDLE, &sh->state); } @@ -2509,61 +2361,114 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, */ /* since handle_stripe can be called at any time we need to handle the * case where a compute block operation has been submitted and then a - * subsequent call wants to start a write request. raid_run_ops only - * handles the case where compute block and reconstruct are requested + * subsequent call wants to start a write request. raid5_run_ops only + * handles the case where compute block and postxor are requested * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. 
*/ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state))) - schedule_reconstruction(sh, s, rcw == 0, 0); + schedule_reconstruction5(sh, s, rcw == 0, 0); } static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { - int rcw = 0, pd_idx = sh->pd_idx, i; + int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; int qd_idx = sh->qd_idx; - - set_bit(STRIPE_HANDLE, &sh->state); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - /* check if we haven't enough data */ - if (!test_bit(R5_OVERWRITE, &dev->flags) && - i != pd_idx && i != qd_idx && - !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - rcw++; - if (!test_bit(R5_Insync, &dev->flags)) - continue; /* it's a failed drive */ - - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - } else { - pr_debug("Request delayed stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); + /* Would I have to read this buffer for reconstruct_write */ + if (!test_bit(R5_OVERWRITE, &dev->flags) + && i != pd_idx && i != qd_idx + && (!test_bit(R5_LOCKED, &dev->flags) + ) && + !test_bit(R5_UPTODATE, &dev->flags)) { + if (test_bit(R5_Insync, &dev->flags)) rcw++; + else { + pr_debug("raid6: must_compute: " + "disk %d flags=%#lx\n", i, dev->flags); + must_compute++; + } + } + } + pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", + (unsigned long long)sh->sector, rcw, must_compute); + set_bit(STRIPE_HANDLE, &sh->state); + + if (rcw > 0) + /* want reconstruct write, but need to get some data */ + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (!test_bit(R5_OVERWRITE, &dev->flags) + && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) + && !test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && + test_bit(R5_Insync, &dev->flags)) { + if ( + test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + pr_debug("Read_old stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + } else { + pr_debug("Request delayed stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } } } - } /* now if nothing is locked, and if we have enough data, we can start a * write request */ - if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && - s->locked == 0 && rcw == 0 && + if (s->locked == 0 && rcw == 0 && !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - schedule_reconstruction(sh, s, 1, 0); + if (must_compute > 0) { + /* We have failed blocks and need to compute them */ + switch (s->failed) { + case 0: + BUG(); + case 1: + compute_block_1(sh, r6s->failed_num[0], 0); + break; + case 2: + compute_block_2(sh, r6s->failed_num[0], + r6s->failed_num[1]); + break; + default: /* This request should have been failed? 
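handle_stripe_dirtying6 above makes one pass over the devices, counting blocks that would need reading for a reconstruct-write (rcw) and blocks on failed disks that must be computed instead (must_compute). A stripped-down model of that classification, with the flag tests reduced to three booleans:

#include <stdio.h>

struct demo_dev { int overwrite, uptodate, insync; };

int main(void)
{
        /* four data devices in mixed states; parity devices excluded */
        struct demo_dev dev[4] = {
                { 1, 0, 1 },    /* fully overwritten: no read needed     */
                { 0, 1, 1 },    /* already uptodate: no read needed      */
                { 0, 0, 1 },    /* needs reading from an in-sync disk    */
                { 0, 0, 0 },    /* failed disk: must be computed instead */
        };
        int i, rcw = 0, must_compute = 0;

        for (i = 0; i < 4; i++) {
                if (dev[i].overwrite || dev[i].uptodate)
                        continue;
                if (dev[i].insync)
                        rcw++;
                else
                        must_compute++;
        }
        printf("rcw=%d must_compute=%d\n", rcw, must_compute);
        return 0;
}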
*/ + BUG(); + } + } + + pr_debug("Computing parity for stripe %llu\n", + (unsigned long long)sh->sector); + compute_parity6(sh, RECONSTRUCT_WRITE); + /* now every locked buffer is ready to be written */ + for (i = disks; i--; ) + if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { + pr_debug("Writing stripe %llu block %d\n", + (unsigned long long)sh->sector, i); + s->locked++; + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + if (s->locked == disks) + if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) + atomic_inc(&conf->pending_full_writes); + /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ + set_bit(STRIPE_INSYNC, &sh->state); + + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } } } @@ -2622,7 +2527,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, * we are done. Otherwise update the mismatch count and repair * parity if !MD_RECOVERY_CHECK */ - if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) + if (sh->ops.zero_sum_result == 0) /* parity is correct (on disc, * not in buffer any more) */ @@ -2639,7 +2544,6 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_Wantcompute, &sh->dev[sh->pd_idx].flags); sh->ops.target = sh->pd_idx; - sh->ops.target2 = -1; s->uptodate++; } } @@ -2656,74 +2560,67 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, - struct stripe_head_state *s, - struct r6_state *r6s, int disks) + struct stripe_head_state *s, + struct r6_state *r6s, struct page *tmp_page, + int disks) { + int update_p = 0, update_q = 0; + struct r5dev *dev; int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; - struct r5dev *dev; set_bit(STRIPE_HANDLE, &sh->state); BUG_ON(s->failed > 2); - + BUG_ON(s->uptodate < disks); /* Want to check and possibly repair P and Q. * However there could be one 'failed' device, in which * case we can only check one of them, possibly using the * other to generate missing data */ - switch (sh->check_state) { - case check_state_idle: - /* start a new check operation if there are < 2 failures */ + /* If !tmp_page, we cannot do the calculations, + * but as we have set STRIPE_HANDLE, we will soon be called + * by stripe_handle with a tmp_page - just wait until then. + */ + if (tmp_page) { if (s->failed == r6s->q_failed) { - /* The only possible failed device holds Q, so it + /* The only possible failed device holds 'Q', so it * makes sense to check P (If anything else were failed, * we would have used P to recreate it). 
*/ - sh->check_state = check_state_run; + compute_block_1(sh, pd_idx, 1); + if (!page_is_zero(sh->dev[pd_idx].page)) { + compute_block_1(sh, pd_idx, 0); + update_p = 1; + } } if (!r6s->q_failed && s->failed < 2) { - /* Q is not failed, and we didn't use it to generate + /* q is not failed, and we didn't use it to generate * anything, so it makes sense to check it */ - if (sh->check_state == check_state_run) - sh->check_state = check_state_run_pq; - else - sh->check_state = check_state_run_q; - } - - /* discard potentially stale zero_sum_result */ - sh->ops.zero_sum_result = 0; - - if (sh->check_state == check_state_run) { - /* async_xor_zero_sum destroys the contents of P */ - clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - s->uptodate--; + memcpy(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE); + compute_parity6(sh, UPDATE_PARITY); + if (memcmp(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE) != 0) { + clear_bit(STRIPE_INSYNC, &sh->state); + update_q = 1; + } } - if (sh->check_state >= check_state_run && - sh->check_state <= check_state_run_pq) { - /* async_syndrome_zero_sum preserves P and Q, so - * no need to mark them !uptodate here - */ - set_bit(STRIPE_OP_CHECK, &s->ops_request); - break; + if (update_p || update_q) { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + update_p = update_q = 0; } - /* we have 2-disk failure */ - BUG_ON(s->failed != 2); - /* fall through */ - case check_state_compute_result: - sh->check_state = check_state_idle; - - /* check that a write has not made the stripe insync */ - if (test_bit(STRIPE_INSYNC, &sh->state)) - break; - /* now write out any block on a failed drive, - * or P or Q if they were recomputed + * or P or Q if they need it */ - BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ + if (s->failed == 2) { dev = &sh->dev[r6s->failed_num[1]]; s->locked++; @@ -2736,13 +2633,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } - if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { + + if (update_p) { dev = &sh->dev[pd_idx]; s->locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } - if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { + if (update_q) { dev = &sh->dev[qd_idx]; s->locked++; set_bit(R5_LOCKED, &dev->flags); @@ -2751,70 +2649,6 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, clear_bit(STRIPE_DEGRADED, &sh->state); set_bit(STRIPE_INSYNC, &sh->state); - break; - case check_state_run: - case check_state_run_q: - case check_state_run_pq: - break; /* we will be called again upon completion */ - case check_state_check_result: - sh->check_state = check_state_idle; - - /* handle a successful check operation, if parity is correct - * we are done. 
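The fallback Q check above is literally "save Q aside, regenerate parity, memcmp": if the recomputed Q differs from the saved copy, the stripe needs repair. The same shape in miniature, with plain shift-and-XOR standing in for the real GF(2^8) Q syndrome (a deliberate simplification):

#include <stdio.h>
#include <string.h>

#define DEMO_STRIPE 8

/* stand-in Q: shift-and-xor; the real syndrome uses GF(2^8) math */
static void regen_q(unsigned char *q,
                    const unsigned char *d0, const unsigned char *d1)
{
        int i;

        for (i = 0; i < DEMO_STRIPE; i++)
                q[i] = d0[i] ^ (unsigned char)(d1[i] << 1);
}

int main(void)
{
        unsigned char d0[DEMO_STRIPE] = "dataone";
        unsigned char d1[DEMO_STRIPE] = "datatwo";
        unsigned char q[DEMO_STRIPE], tmp[DEMO_STRIPE];

        regen_q(q, d0, d1);             /* Q as it sits on disk      */
        memcpy(tmp, q, DEMO_STRIPE);    /* save it, as into tmp_page */
        d1[3] ^= 0xff;                  /* corrupt one data block    */
        regen_q(q, d0, d1);             /* recompute from data       */

        printf("update_q=%d\n", memcmp(tmp, q, DEMO_STRIPE) != 0);
        return 0;
}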
Otherwise update the mismatch count and repair - * parity if !MD_RECOVERY_CHECK - */ - if (sh->ops.zero_sum_result == 0) { - /* both parities are correct */ - if (!s->failed) - set_bit(STRIPE_INSYNC, &sh->state); - else { - /* in contrast to the raid5 case we can validate - * parity, but still have a failure to write - * back - */ - sh->check_state = check_state_compute_result; - /* Returning at this point means that we may go - * off and bring p and/or q uptodate again so - * we make sure to check zero_sum_result again - * to verify if p or q need writeback - */ - } - } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - int *target = &sh->ops.target; - - sh->ops.target = -1; - sh->ops.target2 = -1; - sh->check_state = check_state_compute_run; - set_bit(STRIPE_COMPUTE_RUN, &sh->state); - set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); - if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { - set_bit(R5_Wantcompute, - &sh->dev[pd_idx].flags); - *target = pd_idx; - target = &sh->ops.target2; - s->uptodate++; - } - if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { - set_bit(R5_Wantcompute, - &sh->dev[qd_idx].flags); - *target = qd_idx; - s->uptodate++; - } - } - } - break; - case check_state_compute_run: - break; - default: - printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", - __func__, sh->check_state, - (unsigned long long) sh->sector); - BUG(); } } @@ -2832,7 +2666,6 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, if (i != sh->pd_idx && i != sh->qd_idx) { int dd_idx, j; struct stripe_head *sh2; - struct async_submit_ctl submit; sector_t bn = compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, @@ -2852,10 +2685,9 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, } /* place all the copies on one channel */ - init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, STRIPE_SIZE, - &submit); + sh->dev[i].page, 0, 0, STRIPE_SIZE, + ASYNC_TX_DEP_ACK, tx, NULL, NULL); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); @@ -2924,8 +2756,7 @@ static bool handle_stripe5(struct stripe_head *sh) rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; - - dev = &sh->dev[i]; + struct r5dev *dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); pr_debug("check %d: state 0x%lx toread %p read %p write %p " @@ -3142,7 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh) /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); - schedule_reconstruction(sh, &s, 1, 1); + schedule_reconstruction5(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -3162,7 +2993,7 @@ static bool handle_stripe5(struct stripe_head *sh) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); if (s.ops_request) - raid_run_ops(sh, s.ops_request); + raid5_run_ops(sh, s.ops_request); ops_run_io(sh, &s); @@ -3171,7 +3002,7 @@ static bool handle_stripe5(struct stripe_head *sh) return blocked_rdev == NULL; } -static bool handle_stripe6(struct stripe_head *sh) +static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; @@ 
-3183,10 +3014,9 @@ static bool handle_stripe6(struct stripe_head *sh) mdk_rdev_t *blocked_rdev = NULL; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " - "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", + "pd_idx=%d, qd_idx=%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), pd_idx, qd_idx, - sh->check_state, sh->reconstruct_state); + atomic_read(&sh->count), pd_idx, qd_idx); memset(&s, 0, sizeof(s)); spin_lock(&sh->lock); @@ -3206,26 +3036,35 @@ static bool handle_stripe6(struct stripe_head *sh) pr_debug("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read - * - * new wantfill requests are only permitted while - * ops_complete_biofill is guaranteed to be inactive - */ - if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) - set_bit(R5_Wantfill, &dev->flags); + /* maybe we can reply to a read */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { + struct bio *rbi, *rbi2; + pr_debug("Return read for disc %d\n", i); + spin_lock_irq(&conf->device_lock); + rbi = dev->toread; + dev->toread = NULL; + if (test_and_clear_bit(R5_Overlap, &dev->flags)) + wake_up(&conf->wait_for_overlap); + spin_unlock_irq(&conf->device_lock); + while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { + copy_data(0, rbi, dev->page, dev->sector); + rbi2 = r5_next_bio(rbi, dev->sector); + spin_lock_irq(&conf->device_lock); + if (!raid5_dec_bi_phys_segments(rbi)) { + rbi->bi_next = return_bi; + return_bi = rbi; + } + spin_unlock_irq(&conf->device_lock); + rbi = rbi2; + } + } /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; - if (test_bit(R5_Wantcompute, &dev->flags)) { - s.compute++; - BUG_ON(s.compute > 2); - } - if (test_bit(R5_Wantfill, &dev->flags)) { - s.to_fill++; - } else if (dev->toread) + + if (dev->toread) s.to_read++; if (dev->towrite) { s.to_write++; @@ -3266,11 +3105,6 @@ static bool handle_stripe6(struct stripe_head *sh) blocked_rdev = NULL; } - if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { - set_bit(STRIPE_OP_BIOFILL, &s.ops_request); - set_bit(STRIPE_BIOFILL_RUN, &sh->state); - } - pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, @@ -3311,62 +3145,19 @@ static bool handle_stripe6(struct stripe_head *sh) * or to load a block that is being partially written. 
*/ if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || - (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) + (s.syncing && (s.uptodate < disks)) || s.expanding) handle_stripe_fill6(sh, &s, &r6s, disks); - /* Now we check to see if any write operations have recently - * completed - */ - if (sh->reconstruct_state == reconstruct_state_drain_result) { - int qd_idx = sh->qd_idx; - - sh->reconstruct_state = reconstruct_state_idle; - /* All the 'written' buffers and the parity blocks are ready to - * be written back to disk - */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); - for (i = disks; i--; ) { - dev = &sh->dev[i]; - if (test_bit(R5_LOCKED, &dev->flags) && - (i == sh->pd_idx || i == qd_idx || - dev->written)) { - pr_debug("Writing block %d\n", i); - BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); - set_bit(R5_Wantwrite, &dev->flags); - if (!test_bit(R5_Insync, &dev->flags) || - ((i == sh->pd_idx || i == qd_idx) && - s.failed == 0)) - set_bit(STRIPE_INSYNC, &sh->state); - } - } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - } - - /* Now to consider new write requests and what else, if anything - * should be read. We do not handle new writes when: - * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. - * 2/ A 'check' operation is in flight, as it may clobber the parity - * block. - */ - if (s.to_write && !sh->reconstruct_state && !sh->check_state) + /* now to consider writing and what else, if anything should be read */ + if (s.to_write) handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough - * data is available. The parity check is held off while parity - * dependent operations are in flight. 
+ * data is available */ - if (sh->check_state || - (s.syncing && s.locked == 0 && - !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && - !test_bit(STRIPE_INSYNC, &sh->state))) - handle_parity_checks6(conf, sh, &s, &r6s, disks); + if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) + handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS,1); @@ -3387,29 +3178,15 @@ static bool handle_stripe6(struct stripe_head *sh) set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); - s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); set_bit(R5_LOCKED, &dev->flags); - s.locked++; } } } - /* Finish reconstruct operations initiated by the expansion process */ - if (sh->reconstruct_state == reconstruct_state_result) { - sh->reconstruct_state = reconstruct_state_idle; - clear_bit(STRIPE_EXPANDING, &sh->state); - for (i = conf->raid_disks; i--; ) { - set_bit(R5_Wantwrite, &sh->dev[i].flags); - set_bit(R5_LOCKED, &sh->dev[i].flags); - s.locked++; - } - } - - if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && - !sh->reconstruct_state) { + if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { struct stripe_head *sh2 = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { @@ -3430,8 +3207,14 @@ static bool handle_stripe6(struct stripe_head *sh) /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); - schedule_reconstruction(sh, &s, 1, 1); - } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { + compute_parity6(sh, RECONSTRUCT_WRITE); + for (i = conf->raid_disks ; i-- ; ) { + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + clear_bit(STRIPE_EXPANDING, &sh->state); + } else if (s.expanded) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -3449,9 +3232,6 @@ static bool handle_stripe6(struct stripe_head *sh) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - if (s.ops_request) - raid_run_ops(sh, s.ops_request); - ops_run_io(sh, &s); return_io(return_bi); @@ -3460,14 +3240,16 @@ static bool handle_stripe6(struct stripe_head *sh) } /* returns true if the stripe was handled */ -static bool handle_stripe(struct stripe_head *sh) +static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) { if (sh->raid_conf->level == 6) - return handle_stripe6(sh); + return handle_stripe6(sh, tmp_page); else return handle_stripe5(sh); } + + static void raid5_activate_delayed(raid5_conf_t *conf) { if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { @@ -3549,9 +3331,6 @@ static int raid5_congested(void *data, int bits) /* No difference between reads and writes. 
Just check * how busy the stripe_cache is */ - - if (mddev_congested(mddev, bits)) - return 1; if (conf->inactive_blocked) return 1; if (conf->quiesce) @@ -4101,7 +3880,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped INIT_LIST_HEAD(&stripes); for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; - int skipped_disk = 0; + int skipped = 0; sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); @@ -4117,14 +3896,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped continue; s = compute_blocknr(sh, j, 0); if (s < raid5_size(mddev, 0, 0)) { - skipped_disk = 1; + skipped = 1; continue; } memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); set_bit(R5_Expanded, &sh->dev[j].flags); set_bit(R5_UPTODATE, &sh->dev[j].flags); } - if (!skipped_disk) { + if (!skipped) { set_bit(STRIPE_EXPAND_READY, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } @@ -4278,7 +4057,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski spin_unlock(&sh->lock); /* wait for any blocked device to be handled */ - while (unlikely(!handle_stripe(sh))) + while(unlikely(!handle_stripe(sh, NULL))) ; release_stripe(sh); @@ -4335,7 +4114,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) return handled; } - handle_stripe(sh); + handle_stripe(sh, NULL); release_stripe(sh); handled++; } @@ -4349,36 +4128,6 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) return handled; } -#ifdef CONFIG_MULTICORE_RAID456 -static void __process_stripe(void *param, async_cookie_t cookie) -{ - struct stripe_head *sh = param; - - handle_stripe(sh); - release_stripe(sh); -} - -static void process_stripe(struct stripe_head *sh, struct list_head *domain) -{ - async_schedule_domain(__process_stripe, sh, domain); -} - -static void synchronize_stripe_processing(struct list_head *domain) -{ - async_synchronize_full_domain(domain); -} -#else -static void process_stripe(struct stripe_head *sh, struct list_head *domain) -{ - handle_stripe(sh); - release_stripe(sh); - cond_resched(); -} - -static void synchronize_stripe_processing(struct list_head *domain) -{ -} -#endif /* @@ -4393,7 +4142,6 @@ static void raid5d(mddev_t *mddev) struct stripe_head *sh; raid5_conf_t *conf = mddev->private; int handled; - LIST_HEAD(raid_domain); pr_debug("+++ raid5d active\n"); @@ -4430,7 +4178,8 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); handled++; - process_stripe(sh, &raid_domain); + handle_stripe(sh, conf->spare_page); + release_stripe(sh); spin_lock_irq(&conf->device_lock); } @@ -4438,7 +4187,6 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); - synchronize_stripe_processing(&raid_domain); async_tx_issue_pending_all(); unplug_slaves(mddev); @@ -4571,118 +4319,15 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) return sectors * (raid_disks - conf->max_degraded); } -static void raid5_free_percpu(raid5_conf_t *conf) -{ - struct raid5_percpu *percpu; - unsigned long cpu; - - if (!conf->percpu) - return; - - get_online_cpus(); - for_each_possible_cpu(cpu) { - percpu = per_cpu_ptr(conf->percpu, cpu); - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - } -#ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&conf->cpu_notify); -#endif - put_online_cpus(); - - free_percpu(conf->percpu); -} - static void free_conf(raid5_conf_t *conf) { shrink_stripes(conf); - 
raid5_free_percpu(conf); + safe_put_page(conf->spare_page); kfree(conf->disks); kfree(conf->stripe_hashtbl); kfree(conf); } -#ifdef CONFIG_HOTPLUG_CPU -static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); - long cpu = (long)hcpu; - struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (conf->level == 6 && !percpu->spare_page) - percpu->spare_page = alloc_page(GFP_KERNEL); - if (!percpu->scribble) - percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); - - if (!percpu->scribble || - (conf->level == 6 && !percpu->spare_page)) { - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - pr_err("%s: failed memory allocation for cpu%ld\n", - __func__, cpu); - return NOTIFY_BAD; - } - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - percpu->spare_page = NULL; - percpu->scribble = NULL; - break; - default: - break; - } - return NOTIFY_OK; -} -#endif - -static int raid5_alloc_percpu(raid5_conf_t *conf) -{ - unsigned long cpu; - struct page *spare_page; - struct raid5_percpu *allcpus; - void *scribble; - int err; - - allcpus = alloc_percpu(struct raid5_percpu); - if (!allcpus) - return -ENOMEM; - conf->percpu = allcpus; - - get_online_cpus(); - err = 0; - for_each_present_cpu(cpu) { - if (conf->level == 6) { - spare_page = alloc_page(GFP_KERNEL); - if (!spare_page) { - err = -ENOMEM; - break; - } - per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; - } - scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); - if (!scribble) { - err = -ENOMEM; - break; - } - per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; - } -#ifdef CONFIG_HOTPLUG_CPU - conf->cpu_notify.notifier_call = raid456_cpu_notify; - conf->cpu_notify.priority = 0; - if (err == 0) - err = register_cpu_notifier(&conf->cpu_notify); -#endif - put_online_cpus(); - - return err; -} - static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; @@ -4724,7 +4369,6 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) goto abort; conf->raid_disks = mddev->raid_disks; - conf->scribble_len = scribble_len(conf->raid_disks); if (mddev->reshape_position == MaxSector) conf->previous_raid_disks = mddev->raid_disks; else @@ -4740,10 +4384,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; - conf->level = mddev->new_level; - if (raid5_alloc_percpu(conf) != 0) - goto abort; - + if (mddev->new_level == 6) { + conf->spare_page = alloc_page(GFP_KERNEL); + if (!conf->spare_page) + goto abort; + } spin_lock_init(&conf->device_lock); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); @@ -4802,7 +4447,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, mdname(mddev)); - conf->thread = md_register_thread(raid5d, mddev, NULL); + conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); if (!conf->thread) { printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", @@ -4968,7 +4613,7 @@ static int run(mddev_t *mddev) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + "%s_reshape"); } /* read-ahead size must cover two whole stripes, which is @@ -5386,7 +5031,7 @@ 
static int raid5_start_reshape(mddev_t *mddev) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + "%s_reshape"); if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); diff --git a/trunk/drivers/md/raid5.h b/trunk/drivers/md/raid5.h index 2390e0e83daf..9459689c4ea0 100644 --- a/trunk/drivers/md/raid5.h +++ b/trunk/drivers/md/raid5.h @@ -2,7 +2,6 @@ #define _RAID5_H #include -#include /* * @@ -176,9 +175,7 @@ */ enum check_states { check_state_idle = 0, - check_state_run, /* xor parity check */ - check_state_run_q, /* q-parity check */ - check_state_run_pq, /* pq dual parity check */ + check_state_run, /* parity check */ check_state_check_result, check_state_compute_run, /* parity repair */ check_state_compute_result, @@ -218,8 +215,8 @@ struct stripe_head { * @target - STRIPE_OP_COMPUTE_BLK target */ struct stripe_operations { - int target, target2; - enum sum_check_flags zero_sum_result; + int target; + u32 zero_sum_result; } ops; struct r5dev { struct bio req; @@ -301,7 +298,7 @@ struct r6_state { #define STRIPE_OP_COMPUTE_BLK 1 #define STRIPE_OP_PREXOR 2 #define STRIPE_OP_BIODRAIN 3 -#define STRIPE_OP_RECONSTRUCT 4 +#define STRIPE_OP_POSTXOR 4 #define STRIPE_OP_CHECK 5 /* @@ -388,21 +385,8 @@ struct raid5_private_data { * (fresh device added). * Cleared when a sync completes. */ - /* per cpu variables */ - struct raid5_percpu { - struct page *spare_page; /* Used when checking P/Q in raid6 */ - void *scribble; /* space for constructing buffer - * lists and performing address - * conversions - */ - } *percpu; - size_t scribble_len; /* size of scribble region must be - * associated with conf to handle - * cpu hotplug while reshaping - */ -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif + + struct page *spare_page; /* Used when checking P/Q in raid6 */ /* * Free stripes pool diff --git a/trunk/drivers/media/dvb/dvb-core/dvbdev.h b/trunk/drivers/media/dvb/dvb-core/dvbdev.h index 01fc70484743..895e2efca8a9 100644 --- a/trunk/drivers/media/dvb/dvb-core/dvbdev.h +++ b/trunk/drivers/media/dvb/dvb-core/dvbdev.h @@ -31,9 +31,10 @@ #define DVB_MAJOR 212 #if defined(CONFIG_DVB_MAX_ADAPTERS) && CONFIG_DVB_MAX_ADAPTERS > 0 - #define DVB_MAX_ADAPTERS CONFIG_DVB_MAX_ADAPTERS +#define DVB_MAX_ADAPTERS CONFIG_DVB_MAX_ADAPTERS #else - #define DVB_MAX_ADAPTERS 8 +#warning invalid CONFIG_DVB_MAX_ADAPTERS value +#define DVB_MAX_ADAPTERS 8 #endif #define DVB_UNSET (-1) diff --git a/trunk/drivers/media/dvb/dvb-usb/Kconfig b/trunk/drivers/media/dvb/dvb-usb/Kconfig index 9744b0692417..0e4b97fba384 100644 --- a/trunk/drivers/media/dvb/dvb-usb/Kconfig +++ b/trunk/drivers/media/dvb/dvb-usb/Kconfig @@ -75,7 +75,7 @@ config DVB_USB_DIB0700 select DVB_DIB3000MC if !DVB_FE_CUSTOMISE select DVB_S5H1411 if !DVB_FE_CUSTOMISE select DVB_LGDT3305 if !DVB_FE_CUSTOMISE - select DVB_TUNER_DIB0070 + select DVB_TUNER_DIB0070 if !DVB_FE_CUSTOMISE select MEDIA_TUNER_MT2060 if !MEDIA_TUNER_CUSTOMISE select MEDIA_TUNER_MT2266 if !MEDIA_TUNER_CUSTOMISE select MEDIA_TUNER_XC2028 if !MEDIA_TUNER_CUSTOMISE diff --git a/trunk/drivers/media/video/saa7164/saa7164-api.c b/trunk/drivers/media/video/saa7164/saa7164-api.c index 6f094a96ac81..bb6df1b276be 100644 --- a/trunk/drivers/media/video/saa7164/saa7164-api.c +++ b/trunk/drivers/media/video/saa7164/saa7164-api.c @@ -415,7 +415,7 @@ int saa7164_api_enum_subdevs(struct saa7164_dev *dev) goto out; } - if (saa_debug & 
DBGLVL_API) + if (debug & DBGLVL_API) saa7164_dumphex16(dev, buf, (buflen/16)*16); saa7164_api_dump_subdevs(dev, buf, buflen); @@ -480,7 +480,7 @@ int saa7164_api_i2c_read(struct saa7164_i2c *bus, u8 addr, u32 reglen, u8 *reg, dprintk(DBGLVL_API, "%s() len = %d bytes\n", __func__, len); - if (saa_debug & DBGLVL_I2C) + if (debug & DBGLVL_I2C) saa7164_dumphex16(dev, buf, 2 * 16); ret = saa7164_cmd_send(bus->dev, unitid, GET_CUR, @@ -488,7 +488,7 @@ int saa7164_api_i2c_read(struct saa7164_i2c *bus, u8 addr, u32 reglen, u8 *reg, if (ret != SAA_OK) printk(KERN_ERR "%s() error, ret(2) = 0x%x\n", __func__, ret); else { - if (saa_debug & DBGLVL_I2C) + if (debug & DBGLVL_I2C) saa7164_dumphex16(dev, buf, sizeof(buf)); memcpy(data, (buf + 2 * sizeof(u32) + reglen), datalen); } @@ -548,7 +548,7 @@ int saa7164_api_i2c_write(struct saa7164_i2c *bus, u8 addr, u32 datalen, *((u32 *)(buf + 1 * sizeof(u32))) = datalen - reglen; memcpy((buf + 2 * sizeof(u32)), data, datalen); - if (saa_debug & DBGLVL_I2C) + if (debug & DBGLVL_I2C) saa7164_dumphex16(dev, buf, sizeof(buf)); ret = saa7164_cmd_send(bus->dev, unitid, SET_CUR, diff --git a/trunk/drivers/media/video/saa7164/saa7164-cmd.c b/trunk/drivers/media/video/saa7164/saa7164-cmd.c index c45966edc0cf..e097f1a0969a 100644 --- a/trunk/drivers/media/video/saa7164/saa7164-cmd.c +++ b/trunk/drivers/media/video/saa7164/saa7164-cmd.c @@ -250,7 +250,7 @@ int saa7164_cmd_wait(struct saa7164_dev *dev, u8 seqno) unsigned long stamp; int r; - if (saa_debug >= 4) + if (debug >= 4) saa7164_bus_dump(dev); dprintk(DBGLVL_CMD, "%s(seqno=%d)\n", __func__, seqno); diff --git a/trunk/drivers/media/video/saa7164/saa7164-core.c b/trunk/drivers/media/video/saa7164/saa7164-core.c index 709affc31042..f0dbead188c8 100644 --- a/trunk/drivers/media/video/saa7164/saa7164-core.c +++ b/trunk/drivers/media/video/saa7164/saa7164-core.c @@ -45,8 +45,8 @@ MODULE_LICENSE("GPL"); 32 bus */ -unsigned int saa_debug; -module_param_named(debug, saa_debug, int, 0644); +unsigned int debug; +module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "enable debug messages"); unsigned int waitsecs = 10; @@ -653,7 +653,7 @@ static int __devinit saa7164_initdev(struct pci_dev *pci_dev, printk(KERN_ERR "%s() Unsupported board detected, " "registering without firmware\n", __func__); - dprintk(1, "%s() parameter debug = %d\n", __func__, saa_debug); + dprintk(1, "%s() parameter debug = %d\n", __func__, debug); dprintk(1, "%s() parameter waitsecs = %d\n", __func__, waitsecs); fail_fw: diff --git a/trunk/drivers/media/video/saa7164/saa7164.h b/trunk/drivers/media/video/saa7164/saa7164.h index 42660b546f0e..6753008a9c9b 100644 --- a/trunk/drivers/media/video/saa7164/saa7164.h +++ b/trunk/drivers/media/video/saa7164/saa7164.h @@ -375,9 +375,9 @@ extern int saa7164_buffer_dealloc(struct saa7164_tsport *port, /* ----------------------------------------------------------- */ -extern unsigned int saa_debug; +extern unsigned int debug; #define dprintk(level, fmt, arg...)\ - do { if (saa_debug & level)\ + do { if (debug & level)\ printk(KERN_DEBUG "%s: " fmt, dev->name, ## arg);\ } while (0) diff --git a/trunk/drivers/memstick/core/memstick.c b/trunk/drivers/memstick/core/memstick.c index b3bf1c44d74d..a5b448ea4eab 100644 --- a/trunk/drivers/memstick/core/memstick.c +++ b/trunk/drivers/memstick/core/memstick.c @@ -339,9 +339,9 @@ static int h_memstick_read_dev_id(struct memstick_dev *card, card->id.type = id_reg.type; card->id.category = id_reg.category; card->id.class = id_reg.class; - dev_dbg(&card->dev, "if_mode = 
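A note on the saa7164 rename running through the hunks above: module_param_named() lets a driver keep a distinctive C symbol (saa_debug) while still exposing the parameter to userspace as plain "debug"; the reverted module_param() form requires the global itself to be named debug. A minimal, hypothetical module sketch of the idiom (not part of this patch):

/* Hypothetical module: the user-visible parameter is "debug", but the
 * C identifier stays driver-specific, avoiding a bare global "debug". */
#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned int demo_debug;
module_param_named(debug, demo_debug, uint, 0644);
MODULE_PARM_DESC(debug, "enable debug messages");
MODULE_LICENSE("GPL");

Loaded with "debug=4", both forms look identical from userspace; the difference is purely namespace hygiene inside the kernel image.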
%02x\n", id_reg.if_mode); } complete(&card->mrq_complete); + dev_dbg(&card->dev, "if_mode = %02x\n", id_reg.if_mode); return -EAGAIN; } } diff --git a/trunk/drivers/misc/sgi-gru/grukservices.c b/trunk/drivers/misc/sgi-gru/grukservices.c index 766e21e15574..79689b10f937 100644 --- a/trunk/drivers/misc/sgi-gru/grukservices.c +++ b/trunk/drivers/misc/sgi-gru/grukservices.c @@ -937,8 +937,6 @@ static int quicktest1(unsigned long arg) /* Need 1K cacheline aligned that does not cross page boundary */ p = kmalloc(4096, 0); - if (p == NULL) - return -ENOMEM; mq = ALIGNUP(p, 1024); memset(mes, 0xee, sizeof(mes)); dw = mq; diff --git a/trunk/drivers/misc/sgi-gru/gruprocfs.c b/trunk/drivers/misc/sgi-gru/gruprocfs.c index ccd4408a26c7..9cbf95bedce6 100644 --- a/trunk/drivers/misc/sgi-gru/gruprocfs.c +++ b/trunk/drivers/misc/sgi-gru/gruprocfs.c @@ -340,9 +340,10 @@ static struct proc_dir_entry *proc_gru __read_mostly; static int create_proc_file(struct proc_entry *p) { - p->entry = proc_create(p->name, p->mode, proc_gru, p->fops); + p->entry = create_proc_entry(p->name, p->mode, proc_gru); if (!p->entry) return -1; + p->entry->proc_fops = p->fops; return 0; } diff --git a/trunk/drivers/mmc/host/atmel-mci.c b/trunk/drivers/mmc/host/atmel-mci.c index fc25586b7ee1..065fa818be57 100644 --- a/trunk/drivers/mmc/host/atmel-mci.c +++ b/trunk/drivers/mmc/host/atmel-mci.c @@ -599,7 +599,6 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data) struct scatterlist *sg; unsigned int i; enum dma_data_direction direction; - unsigned int sglen; /* * We don't do DMA on "complex" transfers, i.e. with @@ -629,14 +628,11 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data) else direction = DMA_TO_DEVICE; - sglen = dma_map_sg(&host->pdev->dev, data->sg, data->sg_len, direction); - if (sglen != data->sg_len) - goto unmap_exit; desc = chan->device->device_prep_slave_sg(chan, data->sg, data->sg_len, direction, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) - goto unmap_exit; + return -ENOMEM; host->dma.data_desc = desc; desc->callback = atmci_dma_complete; @@ -647,9 +643,6 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data) chan->device->device_issue_pending(chan); return 0; -unmap_exit: - dma_unmap_sg(&host->pdev->dev, data->sg, sglen, direction); - return -ENOMEM; } #else /* CONFIG_MMC_ATMELMCI_DMA */ diff --git a/trunk/drivers/net/wireless/arlan-proc.c b/trunk/drivers/net/wireless/arlan-proc.c index a8b689635a3b..2ab1d59870f4 100644 --- a/trunk/drivers/net/wireless/arlan-proc.c +++ b/trunk/drivers/net/wireless/arlan-proc.c @@ -402,7 +402,7 @@ static int arlan_setup_card_by_book(struct net_device *dev) static char arlan_drive_info[ARLAN_STR_SIZE] = "A655\n\0"; -static int arlan_sysctl_info(ctl_table * ctl, int write, +static int arlan_sysctl_info(ctl_table * ctl, int write, struct file *filp, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -629,7 +629,7 @@ static int arlan_sysctl_info(ctl_table * ctl, int write, *lenp = pos; if (!write) - retv = proc_dostring(ctl, write, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); else { *lenp = 0; @@ -639,7 +639,7 @@ static int arlan_sysctl_info(ctl_table * ctl, int write, } -static int arlan_sysctl_info161719(ctl_table * ctl, int write, +static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -669,11 +669,11 @@ static int arlan_sysctl_info161719(ctl_table * ctl, int write, final: *lenp = pos; - 
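On the gruprocfs hunk above: proc_create() publishes a /proc entry with its file_operations already in place, whereas the restored create_proc_entry() two-step leaves a short window in which the entry is visible without proc_fops set. A kernel-context sketch contrasting the two styles (entry and fops names are hypothetical; not a standalone program):

/* proc_create(): fops are wired up before the entry becomes visible. */
entry = proc_create("stats", 0444, parent, &stats_fops);

/* create_proc_entry(): the entry is live before fops are assigned, so
 * an open() racing with the assignment can see a half-built file. */
entry = create_proc_entry("stats", 0444, parent);
if (entry)
	entry->proc_fops = &stats_fops;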
retv = proc_dostring(ctl, write, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); return retv; } -static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, +static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -698,11 +698,11 @@ static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, SARLBNpln(u_char, txBuffer, 0x800); final: *lenp = pos; - retv = proc_dostring(ctl, write, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); return retv; } -static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, +static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -726,11 +726,11 @@ static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, SARLBNpln(u_char, rxBuffer, 0x800); final: *lenp = pos; - retv = proc_dostring(ctl, write, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); return retv; } -static int arlan_sysctl_info18(ctl_table * ctl, int write, +static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -756,7 +756,7 @@ static int arlan_sysctl_info18(ctl_table * ctl, int write, final: *lenp = pos; - retv = proc_dostring(ctl, write, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); return retv; } @@ -766,7 +766,7 @@ static int arlan_sysctl_info18(ctl_table * ctl, int write, static char conf_reset_result[200]; -static int arlan_configure(ctl_table * ctl, int write, +static int arlan_configure(ctl_table * ctl, int write, struct file *filp, void __user *buffer, size_t * lenp, loff_t *ppos) { int pos = 0; @@ -788,10 +788,10 @@ static int arlan_configure(ctl_table * ctl, int write, return -1; *lenp = pos; - return proc_dostring(ctl, write, buffer, lenp, ppos); + return proc_dostring(ctl, write, filp, buffer, lenp, ppos); } -static int arlan_sysctl_reset(ctl_table * ctl, int write, +static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp, void __user *buffer, size_t * lenp, loff_t *ppos) { int pos = 0; @@ -811,7 +811,7 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write, } else return -1; *lenp = pos + 3; - return proc_dostring(ctl, write, buffer, lenp, ppos); + return proc_dostring(ctl, write, filp, buffer, lenp, ppos); } diff --git a/trunk/drivers/parport/procfs.c b/trunk/drivers/parport/procfs.c index 8eefe56f1cbe..554e11f9e1ce 100644 --- a/trunk/drivers/parport/procfs.c +++ b/trunk/drivers/parport/procfs.c @@ -31,7 +31,7 @@ #define PARPORT_MIN_SPINTIME_VALUE 1 #define PARPORT_MAX_SPINTIME_VALUE 1000 -static int do_active_device(ctl_table *table, int write, +static int do_active_device(ctl_table *table, int write, struct file *filp, void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; @@ -68,7 +68,7 @@ static int do_active_device(ctl_table *table, int write, } #ifdef CONFIG_PARPORT_1284 -static int do_autoprobe(ctl_table *table, int write, +static int do_autoprobe(ctl_table *table, int write, struct file *filp, void __user *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; @@ -111,7 +111,7 @@ static int do_autoprobe(ctl_table *table, int write, #endif /* IEEE1284.3 support. 
*/ static int do_hardware_base_addr (ctl_table *table, int write, - void __user *result, + struct file *filp, void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; @@ -139,7 +139,7 @@ static int do_hardware_base_addr (ctl_table *table, int write, } static int do_hardware_irq (ctl_table *table, int write, - void __user *result, + struct file *filp, void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; @@ -167,7 +167,7 @@ static int do_hardware_irq (ctl_table *table, int write, } static int do_hardware_dma (ctl_table *table, int write, - void __user *result, + struct file *filp, void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; @@ -195,7 +195,7 @@ static int do_hardware_dma (ctl_table *table, int write, } static int do_hardware_modes (ctl_table *table, int write, - void __user *result, + struct file *filp, void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; diff --git a/trunk/drivers/staging/go7007/Makefile b/trunk/drivers/staging/go7007/Makefile index 1301caa7495d..d14ea84a01f6 100644 --- a/trunk/drivers/staging/go7007/Makefile +++ b/trunk/drivers/staging/go7007/Makefile @@ -32,3 +32,8 @@ endif EXTRA_CFLAGS += -Idrivers/media/dvb/frontends EXTRA_CFLAGS += -Idrivers/media/dvb/dvb-core + +# Ubuntu 8.04 has CONFIG_SND undefined, so include lum sound/config.h too +ifeq ($(CONFIG_SND),) +EXTRA_CFLAGS += -include sound/config.h +endif diff --git a/trunk/drivers/usb/serial/sierra.c b/trunk/drivers/usb/serial/sierra.c index 8c075b2416bb..68fa0e43b781 100644 --- a/trunk/drivers/usb/serial/sierra.c +++ b/trunk/drivers/usb/serial/sierra.c @@ -912,7 +912,6 @@ static void sierra_release(struct usb_serial *serial) } } -#ifdef CONFIG_PM static void stop_read_write_urbs(struct usb_serial *serial) { int i, j; @@ -989,10 +988,6 @@ static int sierra_resume(struct usb_serial *serial) return ec ? 
-EIO : 0; } -#else -#define sierra_suspend NULL -#define sierra_resume NULL -#endif static struct usb_serial_driver sierra_device = { .driver = { diff --git a/trunk/drivers/vlynq/vlynq.c b/trunk/drivers/vlynq/vlynq.c index 9554ad5f9af7..ba3d71f5c7d0 100644 --- a/trunk/drivers/vlynq/vlynq.c +++ b/trunk/drivers/vlynq/vlynq.c @@ -702,7 +702,7 @@ static int vlynq_probe(struct platform_device *pdev) dev->mem_start = mem_res->start; dev->mem_end = mem_res->end; - len = resource_size(regs_res); + len = regs_res->end - regs_res->start; if (!request_mem_region(regs_res->start, len, dev_name(&dev->dev))) { printk(KERN_ERR "%s: Can't request vlynq registers\n", dev_name(&dev->dev)); diff --git a/trunk/fs/adfs/inode.c b/trunk/fs/adfs/inode.c index 3f57ce4bee5d..798cb071d132 100644 --- a/trunk/fs/adfs/inode.c +++ b/trunk/fs/adfs/inode.c @@ -19,6 +19,9 @@ static int adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, int create) { + if (block < 0) + goto abort_negative; + if (!create) { if (block >= inode->i_blocks) goto abort_toobig; @@ -31,6 +34,10 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, /* don't support allocation of blocks yet */ return -EIO; +abort_negative: + adfs_error(inode->i_sb, "block %d < 0", block); + return -EIO; + abort_toobig: return 0; } diff --git a/trunk/fs/binfmt_elf.c b/trunk/fs/binfmt_elf.c index b9b3bb51b1e4..442d94fe255c 100644 --- a/trunk/fs/binfmt_elf.c +++ b/trunk/fs/binfmt_elf.c @@ -1711,52 +1711,42 @@ struct elf_note_info { int numnote; }; -static int elf_note_info_init(struct elf_note_info *info) +static int fill_note_info(struct elfhdr *elf, int phdrs, + struct elf_note_info *info, + long signr, struct pt_regs *regs) { - memset(info, 0, sizeof(*info)); +#define NUM_NOTES 6 + struct list_head *t; + + info->notes = NULL; + info->prstatus = NULL; + info->psinfo = NULL; + info->fpu = NULL; +#ifdef ELF_CORE_COPY_XFPREGS + info->xfpu = NULL; +#endif INIT_LIST_HEAD(&info->thread_list); - /* Allocate space for six ELF notes */ - info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL); + info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), + GFP_KERNEL); if (!info->notes) return 0; info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); if (!info->psinfo) - goto notes_free; + return 0; info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); if (!info->prstatus) - goto psinfo_free; + return 0; info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); if (!info->fpu) - goto prstatus_free; + return 0; #ifdef ELF_CORE_COPY_XFPREGS info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); if (!info->xfpu) - goto fpu_free; -#endif - return 1; -#ifdef ELF_CORE_COPY_XFPREGS - fpu_free: - kfree(info->fpu); -#endif - prstatus_free: - kfree(info->prstatus); - psinfo_free: - kfree(info->psinfo); - notes_free: - kfree(info->notes); - return 0; -} - -static int fill_note_info(struct elfhdr *elf, int phdrs, - struct elf_note_info *info, - long signr, struct pt_regs *regs) -{ - struct list_head *t; - - if (!elf_note_info_init(info)) return 0; +#endif + info->thread_status_size = 0; if (signr) { struct core_thread *ct; struct elf_thread_status *ets; @@ -1816,6 +1806,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, #endif return 1; + +#undef NUM_NOTES } static size_t get_note_info_size(struct elf_note_info *info) diff --git a/trunk/fs/binfmt_elf_fdpic.c b/trunk/fs/binfmt_elf_fdpic.c index 38502c67987c..76285471073e 100644 --- a/trunk/fs/binfmt_elf_fdpic.c +++ b/trunk/fs/binfmt_elf_fdpic.c @@ -283,23 
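One detail worth flagging in the vlynq hunk above: mainline's resource_size() is not a plain end - start. Resources are inclusive [start, end] ranges, so the helper adds one, and the open-coded replacement requests one byte less than the helper did. The definition, from include/linux/ioport.h:

static inline resource_size_t resource_size(const struct resource *res)
{
	return res->end - res->start + 1;	/* [start, end] is inclusive */
}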
+283,20 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, } stack_size = exec_params.stack_size; + if (stack_size < interp_params.stack_size) + stack_size = interp_params.stack_size; + if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) executable_stack = EXSTACK_ENABLE_X; else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) executable_stack = EXSTACK_DISABLE_X; + else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) + executable_stack = EXSTACK_ENABLE_X; + else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) + executable_stack = EXSTACK_DISABLE_X; else executable_stack = EXSTACK_DEFAULT; - if (stack_size == 0) { - stack_size = interp_params.stack_size; - if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) - executable_stack = EXSTACK_ENABLE_X; - else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) - executable_stack = EXSTACK_DISABLE_X; - else - executable_stack = EXSTACK_DEFAULT; - } - retval = -ENOEXEC; if (stack_size == 0) goto error; diff --git a/trunk/fs/binfmt_flat.c b/trunk/fs/binfmt_flat.c index a2796651e756..e92f229e3c6e 100644 --- a/trunk/fs/binfmt_flat.c +++ b/trunk/fs/binfmt_flat.c @@ -278,6 +278,8 @@ static int decompress_exec( ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); if (ret <= 0) break; + if (ret >= (unsigned long) -4096) + break; len -= ret; strm.next_in = buf; @@ -333,7 +335,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) "(%d != %d)", (unsigned) r, curid, id); goto failed; } else if ( ! p->lib_list[id].loaded && - IS_ERR_VALUE(load_flat_shared_library(id, p))) { + load_flat_shared_library(id, p) > (unsigned long) -4096) { printk("BINFMT_FLAT: failed to load library %d", id); goto failed; } @@ -543,7 +545,7 @@ static int load_flat_file(struct linux_binprm * bprm, textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_EXECUTABLE, 0); up_write(&current->mm->mmap_sem); - if (!textpos || IS_ERR_VALUE(textpos)) { + if (!textpos || textpos >= (unsigned long) -4096) { if (!textpos) textpos = (unsigned long) -ENOMEM; printk("Unable to mmap process text, errno %d\n", (int)-textpos); @@ -558,7 +560,7 @@ static int load_flat_file(struct linux_binprm * bprm, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); up_write(&current->mm->mmap_sem); - if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { + if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { if (!realdatastart) realdatastart = (unsigned long) -ENOMEM; printk("Unable to allocate RAM for process data, errno %d\n", @@ -585,7 +587,7 @@ static int load_flat_file(struct linux_binprm * bprm, result = bprm->file->f_op->read(bprm->file, (char *) datapos, data_len + (relocs * sizeof(unsigned long)), &fpos); } - if (IS_ERR_VALUE(result)) { + if (result >= (unsigned long)-4096) { printk("Unable to read data+bss, errno %d\n", (int)-result); do_munmap(current->mm, textpos, text_len); do_munmap(current->mm, realdatastart, data_len + extra); @@ -605,7 +607,7 @@ static int load_flat_file(struct linux_binprm * bprm, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); up_write(&current->mm->mmap_sem); - if (!textpos || IS_ERR_VALUE(textpos)) { + if (!textpos || textpos >= (unsigned long) -4096) { if (!textpos) textpos = (unsigned long) -ENOMEM; printk("Unable to allocate RAM for process text/data, errno %d\n", @@ -639,7 +641,7 @@ static int load_flat_file(struct linux_binprm * bprm, fpos = 0; result = bprm->file->f_op->read(bprm->file, (char *) textpos, text_len, &fpos); - if (!IS_ERR_VALUE(result)) + if (result < (unsigned long)
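The recurring (unsigned long)-4096 comparisons these binfmt_flat hunks restore are the open-coded ancestor of IS_ERR_VALUE(): the kernel reserves the top page worth of unsigned long values for errno codes, so a pointer-sized return in that window is an error, not an address. The mainline macro, from include/linux/err.h (note it tests against -4095, one value narrower than the -4096 form):

#define MAX_ERRNO	4095

#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)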
-4096) result = decompress_exec(bprm, text_len, (char *) datapos, data_len + (relocs * sizeof(unsigned long)), 0); } @@ -649,13 +651,13 @@ static int load_flat_file(struct linux_binprm * bprm, fpos = 0; result = bprm->file->f_op->read(bprm->file, (char *) textpos, text_len, &fpos); - if (!IS_ERR_VALUE(result)) { + if (result < (unsigned long) -4096) { fpos = ntohl(hdr->data_start); result = bprm->file->f_op->read(bprm->file, (char *) datapos, data_len + (relocs * sizeof(unsigned long)), &fpos); } } - if (IS_ERR_VALUE(result)) { + if (result >= (unsigned long)-4096) { printk("Unable to read code+data+bss, errno %d\n",(int)-result); do_munmap(current->mm, textpos, text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long)); @@ -833,7 +835,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs) res = prepare_binprm(&bprm); - if (!IS_ERR_VALUE(res)) + if (res <= (unsigned long)-4096) res = load_flat_file(&bprm, libs, id, NULL); abort_creds(bprm.cred); @@ -878,7 +880,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ res = load_flat_file(bprm, &libinfo, 0, &stack_len); - if (IS_ERR_VALUE(res)) + if (res > (unsigned long)-4096) return res; /* Update data segment pointers for all libraries */ diff --git a/trunk/fs/btrfs/inode.c b/trunk/fs/btrfs/inode.c index d154a3f365d5..9096fd0ca3ca 100644 --- a/trunk/fs/btrfs/inode.c +++ b/trunk/fs/btrfs/inode.c @@ -5269,7 +5269,6 @@ static const struct address_space_operations btrfs_aops = { .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, .set_page_dirty = btrfs_set_page_dirty, - .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations btrfs_symlink_aops = { diff --git a/trunk/fs/char_dev.c b/trunk/fs/char_dev.c index d6db933df2b2..3cbc57f932d2 100644 --- a/trunk/fs/char_dev.c +++ b/trunk/fs/char_dev.c @@ -264,6 +264,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor, { struct char_device_struct *cd; struct cdev *cdev; + char *s; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); @@ -277,6 +278,8 @@ int __register_chrdev(unsigned int major, unsigned int baseminor, cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); + for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) + *s = '!'; err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) diff --git a/trunk/fs/coda/coda_int.h b/trunk/fs/coda/coda_int.h index d99860a33890..8ccd5ed81d9c 100644 --- a/trunk/fs/coda/coda_int.h +++ b/trunk/fs/coda/coda_int.h @@ -2,7 +2,6 @@ #define _CODA_INT_ struct dentry; -struct file; extern struct file_system_type coda_fs_type; extern unsigned long coda_timeout; diff --git a/trunk/fs/drop_caches.c b/trunk/fs/drop_caches.c index 31f4b0e6d72c..a2edb7913447 100644 --- a/trunk/fs/drop_caches.c +++ b/trunk/fs/drop_caches.c @@ -63,9 +63,9 @@ static void drop_slab(void) } int drop_caches_sysctl_handler(ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, buffer, length, ppos); + proc_dointvec_minmax(table, write, file, buffer, length, ppos); if (write) { if (sysctl_drop_caches & 1) drop_pagecache(); diff --git a/trunk/fs/exec.c b/trunk/fs/exec.c index d49be6bc1793..5c833c18d0d4 100644 --- a/trunk/fs/exec.c +++ b/trunk/fs/exec.c @@ -55,7 +55,6 @@ 
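For the drop_caches handler shown above, the sysctl itself is exercised from userspace: writing 1 drops clean page cache, 2 drops reclaimable slab objects, 3 both. A minimal userspace sketch (needs root; a debugging aid, not a tuning knob):

/* Userspace sketch: poke /proc/sys/vm/drop_caches. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/drop_caches", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	sync();			/* only clean pages can be dropped */
	if (write(fd, "3", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}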
#include #include #include -#include #include #include @@ -64,7 +63,6 @@ int core_uses_pid; char core_pattern[CORENAME_MAX_SIZE] = "core"; -unsigned int core_pipe_limit; int suid_dumpable = 0; /* The maximal length of core_pattern is also specified in sysctl.c */ @@ -1395,16 +1393,18 @@ int do_execve(char * filename, return retval; } -void set_binfmt(struct linux_binfmt *new) +int set_binfmt(struct linux_binfmt *new) { - struct mm_struct *mm = current->mm; - - if (mm->binfmt) - module_put(mm->binfmt->module); + struct linux_binfmt *old = current->binfmt; - mm->binfmt = new; - if (new) - __module_get(new->module); + if (new) { + if (!try_module_get(new->module)) + return -1; + } + current->binfmt = new; + if (old) + module_put(old->module); + return 0; } EXPORT_SYMBOL(set_binfmt); @@ -1728,29 +1728,6 @@ int get_dumpable(struct mm_struct *mm) return (ret >= 2) ? 2 : ret; } -static void wait_for_dump_helpers(struct file *file) -{ - struct pipe_inode_info *pipe; - - pipe = file->f_path.dentry->d_inode->i_pipe; - - pipe_lock(pipe); - pipe->readers++; - pipe->writers--; - - while ((pipe->readers > 1) && (!signal_pending(current))) { - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - pipe_wait(pipe); - } - - pipe->readers--; - pipe->writers++; - pipe_unlock(pipe); - -} - - void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; @@ -1767,12 +1744,11 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; char **helper_argv = NULL; int helper_argc = 0; - int dump_count = 0; - static atomic_t core_dump_count = ATOMIC_INIT(0); + char *delimit; audit_core_dumps(signr); - binfmt = mm->binfmt; + binfmt = current->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; @@ -1823,63 +1799,54 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) lock_kernel(); ispipe = format_corename(corename, signr); unlock_kernel(); - + /* + * Don't bother to check the RLIMIT_CORE value if core_pattern points + * to a pipe. Since we're not writing directly to the filesystem + * RLIMIT_CORE doesn't really apply, as no actual core file will be + * created unless the pipe reader choses to write out the core file + * at which point file size limits and permissions will be imposed + * as it does with any other process + */ if ((!ispipe) && (core_limit < binfmt->min_coredump)) goto fail_unlock; if (ispipe) { - if (core_limit == 0) { - /* - * Normally core limits are irrelevant to pipes, since - * we're not writing to the file system, but we use - * core_limit of 0 here as a speacial value. Any - * non-zero limit gets set to RLIM_INFINITY below, but - * a limit of 0 skips the dump. This is a consistent - * way to catch recursive crashes. We can still crash - * if the core_pattern binary sets RLIM_CORE = !0 - * but it runs as root, and can do lots of stupid things - * Note that we use task_tgid_vnr here to grab the pid - * of the process group leader. That way we get the - * right pid if a thread in a multi-threaded - * core_pattern process dies. 
- */ - printk(KERN_WARNING - "Process %d(%s) has RLIMIT_CORE set to 0\n", - task_tgid_vnr(current), current->comm); - printk(KERN_WARNING "Aborting core\n"); - goto fail_unlock; - } - - dump_count = atomic_inc_return(&core_dump_count); - if (core_pipe_limit && (core_pipe_limit < dump_count)) { - printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n", - task_tgid_vnr(current), current->comm); - printk(KERN_WARNING "Skipping core dump\n"); - goto fail_dropcount; - } - helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); if (!helper_argv) { printk(KERN_WARNING "%s failed to allocate memory\n", __func__); - goto fail_dropcount; + goto fail_unlock; + } + /* Terminate the string before the first option */ + delimit = strchr(corename, ' '); + if (delimit) + *delimit = '\0'; + delimit = strrchr(helper_argv[0], '/'); + if (delimit) + delimit++; + else + delimit = helper_argv[0]; + if (!strcmp(delimit, current->comm)) { + printk(KERN_NOTICE "Recursive core dump detected, " + "aborting\n"); + goto fail_unlock; } core_limit = RLIM_INFINITY; /* SIGPIPE can happen, but it's just never processed */ - if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, + if (call_usermodehelper_pipe(corename+1, helper_argv, NULL, &file)) { printk(KERN_INFO "Core dump to %s pipe failed\n", corename); - goto fail_dropcount; + goto fail_unlock; } } else file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); if (IS_ERR(file)) - goto fail_dropcount; + goto fail_unlock; inode = file->f_path.dentry->d_inode; if (inode->i_nlink > 1) goto close_fail; /* multiple links - don't dump */ @@ -1908,12 +1875,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) if (retval) current->signal->group_exit_code |= 0x80; close_fail: - if (ispipe && core_pipe_limit) - wait_for_dump_helpers(file); filp_close(file, NULL); -fail_dropcount: - if (dump_count) - atomic_dec(&core_dump_count); fail_unlock: if (helper_argv) argv_free(helper_argv); diff --git a/trunk/fs/ext2/inode.c b/trunk/fs/ext2/inode.c index ade634076d0a..1c1638f873a4 100644 --- a/trunk/fs/ext2/inode.c +++ b/trunk/fs/ext2/inode.c @@ -819,7 +819,6 @@ const struct address_space_operations ext2_aops = { .writepages = ext2_writepages, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; const struct address_space_operations ext2_aops_xip = { @@ -838,7 +837,6 @@ const struct address_space_operations ext2_nobh_aops = { .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, .migratepage = buffer_migrate_page, - .error_remove_page = generic_error_remove_page, }; /* diff --git a/trunk/fs/ext3/inode.c b/trunk/fs/ext3/inode.c index acf1b1423327..cd098a7b77fc 100644 --- a/trunk/fs/ext3/inode.c +++ b/trunk/fs/ext3/inode.c @@ -1830,7 +1830,6 @@ static const struct address_space_operations ext3_ordered_aops = { .direct_IO = ext3_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext3_writeback_aops = { @@ -1846,7 +1845,6 @@ static const struct address_space_operations ext3_writeback_aops = { .direct_IO = ext3_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext3_journalled_aops = { @@ -1861,7 +1859,6 @@ static const struct address_space_operations 
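Both the removed and the restored do_coredump() above treat a core_pattern beginning with '|' as a usermode helper command rather than a filename. A userspace sketch of configuring that (the helper path is hypothetical; %p and %e are real specifiers for the crashing PID and executable name; needs root):

/* Userspace sketch: route core dumps to a pipe helper. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/core_pattern", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("|/usr/local/bin/core-helper %p %e", f);	/* hypothetical path */
	return fclose(f) ? 1 : 0;
}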
ext3_journalled_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; void ext3_set_aops(struct inode *inode) diff --git a/trunk/fs/ext4/inode.c b/trunk/fs/ext4/inode.c index 064746fad581..3a798737e305 100644 --- a/trunk/fs/ext4/inode.c +++ b/trunk/fs/ext4/inode.c @@ -3386,7 +3386,6 @@ static const struct address_space_operations ext4_ordered_aops = { .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext4_writeback_aops = { @@ -3402,7 +3401,6 @@ static const struct address_space_operations ext4_writeback_aops = { .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext4_journalled_aops = { @@ -3417,7 +3415,6 @@ static const struct address_space_operations ext4_journalled_aops = { .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext4_da_aops = { @@ -3434,7 +3431,6 @@ static const struct address_space_operations ext4_da_aops = { .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; void ext4_set_aops(struct inode *inode) diff --git a/trunk/fs/fcntl.c b/trunk/fs/fcntl.c index fc089f2f7f56..ae413086db97 100644 --- a/trunk/fs/fcntl.c +++ b/trunk/fs/fcntl.c @@ -263,79 +263,6 @@ pid_t f_getown(struct file *filp) return pid; } -static int f_setown_ex(struct file *filp, unsigned long arg) -{ - struct f_owner_ex * __user owner_p = (void * __user)arg; - struct f_owner_ex owner; - struct pid *pid; - int type; - int ret; - - ret = copy_from_user(&owner, owner_p, sizeof(owner)); - if (ret) - return ret; - - switch (owner.type) { - case F_OWNER_TID: - type = PIDTYPE_MAX; - break; - - case F_OWNER_PID: - type = PIDTYPE_PID; - break; - - case F_OWNER_GID: - type = PIDTYPE_PGID; - break; - - default: - return -EINVAL; - } - - rcu_read_lock(); - pid = find_vpid(owner.pid); - if (owner.pid && !pid) - ret = -ESRCH; - else - ret = __f_setown(filp, pid, type, 1); - rcu_read_unlock(); - - return ret; -} - -static int f_getown_ex(struct file *filp, unsigned long arg) -{ - struct f_owner_ex * __user owner_p = (void * __user)arg; - struct f_owner_ex owner; - int ret = 0; - - read_lock(&filp->f_owner.lock); - owner.pid = pid_vnr(filp->f_owner.pid); - switch (filp->f_owner.pid_type) { - case PIDTYPE_MAX: - owner.type = F_OWNER_TID; - break; - - case PIDTYPE_PID: - owner.type = F_OWNER_PID; - break; - - case PIDTYPE_PGID: - owner.type = F_OWNER_GID; - break; - - default: - WARN_ON(1); - ret = -EINVAL; - break; - } - read_unlock(&filp->f_owner.lock); - - if (!ret) - ret = copy_to_user(owner_p, &owner, sizeof(owner)); - return ret; -} - static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -386,12 +313,6 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_SETOWN: err = f_setown(filp, arg, 1); break; - case F_GETOWN_EX: - err = f_getown_ex(filp, arg); - break; - case F_SETOWN_EX: - err = f_setown_ex(filp, arg); - break; 
case F_GETSIG: err = filp->f_owner.signum; break; @@ -507,7 +428,8 @@ static inline int sigio_perm(struct task_struct *p, static void send_sigio_to_task(struct task_struct *p, struct fown_struct *fown, - int fd, int reason, int group) + int fd, + int reason) { /* * F_SETSIG can change ->signum lockless in parallel, make @@ -539,11 +461,11 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = band_table[reason - POLL_IN]; si.si_fd = fd; - if (!do_send_sig_info(signum, &si, p, group)) + if (!group_send_sig_info(signum, &si, p)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group); + group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); } } @@ -552,23 +474,16 @@ void send_sigio(struct fown_struct *fown, int fd, int band) struct task_struct *p; enum pid_type type; struct pid *pid; - int group = 1; read_lock(&fown->lock); - type = fown->pid_type; - if (type == PIDTYPE_MAX) { - group = 0; - type = PIDTYPE_PID; - } - pid = fown->pid; if (!pid) goto out_unlock_fown; read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { - send_sigio_to_task(p, fown, fd, band, group); + send_sigio_to_task(p, fown, fd, band); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); out_unlock_fown: @@ -576,10 +491,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band) } static void send_sigurg_to_task(struct task_struct *p, - struct fown_struct *fown, int group) + struct fown_struct *fown) { if (sigio_perm(p, fown, SIGURG)) - do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group); + group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); } int send_sigurg(struct fown_struct *fown) @@ -587,17 +502,10 @@ int send_sigurg(struct fown_struct *fown) struct task_struct *p; enum pid_type type; struct pid *pid; - int group = 1; int ret = 0; read_lock(&fown->lock); - type = fown->pid_type; - if (type == PIDTYPE_MAX) { - group = 0; - type = PIDTYPE_PID; - } - pid = fown->pid; if (!pid) goto out_unlock_fown; @@ -606,7 +514,7 @@ int send_sigurg(struct fown_struct *fown) read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { - send_sigurg_to_task(p, fown, group); + send_sigurg_to_task(p, fown); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); out_unlock_fown: diff --git a/trunk/fs/file_table.c b/trunk/fs/file_table.c index 8eb44042e009..334ce39881f8 100644 --- a/trunk/fs/file_table.c +++ b/trunk/fs/file_table.c @@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files); * Handle nr_files sysctl */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_dointvec(table, write, filp, buffer, lenp, ppos); } #else -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; diff --git a/trunk/fs/gfs2/aops.c b/trunk/fs/gfs2/aops.c index 694b5d48f036..7ebae9a4ecc0 100644 --- a/trunk/fs/gfs2/aops.c +++ b/trunk/fs/gfs2/aops.c @@ -1135,7 +1135,6 @@ static const struct address_space_operations gfs2_writeback_aops = { .direct_IO = gfs2_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations 
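For reference on the fcntl.c hunks above: F_SETOWN_EX/F_GETOWN_EX, removed here but present in later mainline kernels, extend F_SETOWN so SIGIO/SIGURG can target a single thread (PIDTYPE_MAX in the removed code) instead of the whole thread group. A userspace sketch, assuming a kernel and libc that provide the interface:

/* Userspace sketch: direct SIGIO at the calling thread only. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct f_owner_ex owner = {
		.type = F_OWNER_TID,
		.pid  = syscall(SYS_gettid),	/* this thread's kernel TID */
	};
	int fd = open("/dev/null", O_RDONLY);

	if (fd < 0 || fcntl(fd, F_SETOWN_EX, &owner) < 0) {
		perror("F_SETOWN_EX");
		return 1;
	}
	close(fd);
	return 0;
}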
gfs2_ordered_aops = { @@ -1152,7 +1151,6 @@ static const struct address_space_operations gfs2_ordered_aops = { .direct_IO = gfs2_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations gfs2_jdata_aops = { @@ -1168,7 +1166,6 @@ static const struct address_space_operations gfs2_jdata_aops = { .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; void gfs2_set_aops(struct inode *inode) diff --git a/trunk/fs/hugetlbfs/inode.c b/trunk/fs/hugetlbfs/inode.c index 133335479c24..eba6d552d9c9 100644 --- a/trunk/fs/hugetlbfs/inode.c +++ b/trunk/fs/hugetlbfs/inode.c @@ -936,9 +936,15 @@ static struct file_system_type hugetlbfs_fs_type = { static struct vfsmount *hugetlbfs_vfsmount; -static int can_do_hugetlb_shm(void) +static int can_do_hugetlb_shm(int creat_flags) { - return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); + if (creat_flags != HUGETLB_SHMFS_INODE) + return 0; + if (capable(CAP_IPC_LOCK)) + return 1; + if (in_group_p(sysctl_hugetlb_shm_group)) + return 1; + return 0; } struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, @@ -954,7 +960,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); - if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { + if (!can_do_hugetlb_shm(creat_flags)) { *user = current_user(); if (user_shm_lock(size, *user)) { WARN_ONCE(1, diff --git a/trunk/fs/nfs/file.c b/trunk/fs/nfs/file.c index 86d6b4db1096..5021b75d2d1e 100644 --- a/trunk/fs/nfs/file.c +++ b/trunk/fs/nfs/file.c @@ -525,7 +525,6 @@ const struct address_space_operations nfs_file_aops = { .direct_IO = nfs_direct_IO, .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, - .error_remove_page = generic_error_remove_page, }; /* diff --git a/trunk/fs/ntfs/aops.c b/trunk/fs/ntfs/aops.c index cfce53cb65d7..b38f944f0667 100644 --- a/trunk/fs/ntfs/aops.c +++ b/trunk/fs/ntfs/aops.c @@ -1550,7 +1550,6 @@ const struct address_space_operations ntfs_aops = { .migratepage = buffer_migrate_page, /* Move a page cache page from one physical page to an other. */ - .error_remove_page = generic_error_remove_page, }; /** @@ -1570,7 +1569,6 @@ const struct address_space_operations ntfs_mst_aops = { .migratepage = buffer_migrate_page, /* Move a page cache page from one physical page to an other. 
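The hugetlbfs hunk above gates SHM_HUGETLB segments on CAP_IPC_LOCK or membership in the hugetlb_shm_group sysctl group; from userspace the gated path looks like this (a sketch, assuming huge pages have been reserved via /proc/sys/vm/nr_hugepages and a 2 MiB huge page size):

/* Userspace sketch: create and immediately remove a huge-page SHM segment. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#define LENGTH	(2UL * 1024 * 1024)	/* one huge page; size is arch-dependent */

int main(void)
{
	int id = shmget(IPC_PRIVATE, LENGTH, IPC_CREAT | SHM_HUGETLB | 0600);

	if (id < 0) {
		perror("shmget(SHM_HUGETLB)");	/* EPERM: no capability/group */
		return 1;
	}
	shmctl(id, IPC_RMID, NULL);
	return 0;
}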
*/ - .error_remove_page = generic_error_remove_page, }; #ifdef NTFS_RW diff --git a/trunk/fs/ocfs2/aops.c b/trunk/fs/ocfs2/aops.c index deb2b132ae5e..72e76062a900 100644 --- a/trunk/fs/ocfs2/aops.c +++ b/trunk/fs/ocfs2/aops.c @@ -2022,5 +2022,4 @@ const struct address_space_operations ocfs2_aops = { .releasepage = ocfs2_releasepage, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; diff --git a/trunk/fs/proc/meminfo.c b/trunk/fs/proc/meminfo.c index c7bff4f603ff..171e052c07b3 100644 --- a/trunk/fs/proc/meminfo.c +++ b/trunk/fs/proc/meminfo.c @@ -97,11 +97,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Committed_AS: %8lu kB\n" "VmallocTotal: %8lu kB\n" "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n" -#ifdef CONFIG_MEMORY_FAILURE - "HardwareCorrupted: %8lu kB\n" -#endif - , + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), @@ -148,9 +144,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) (unsigned long)VMALLOC_TOTAL >> 10, vmi.used >> 10, vmi.largest_chunk >> 10 -#ifdef CONFIG_MEMORY_FAILURE - ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) -#endif ); hugetlb_report_meminfo(m); diff --git a/trunk/fs/proc/proc_sysctl.c b/trunk/fs/proc/proc_sysctl.c index f667e8aeabdf..9b1e4e9a16bf 100644 --- a/trunk/fs/proc/proc_sysctl.c +++ b/trunk/fs/proc/proc_sysctl.c @@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, /* careful: calling conventions are nasty here */ res = count; - error = table->proc_handler(table, write, buf, &res, ppos); + error = table->proc_handler(table, write, filp, buf, &res, ppos); if (!error) error = res; out: diff --git a/trunk/fs/romfs/super.c b/trunk/fs/romfs/super.c index c117fa80d1e9..47f132df0c3f 100644 --- a/trunk/fs/romfs/super.c +++ b/trunk/fs/romfs/super.c @@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; root = romfs_iget(sb, pos); - if (IS_ERR(root)) + if (!root) goto error; sb->s_root = d_alloc_root(root); diff --git a/trunk/fs/xfs/linux-2.6/xfs_aops.c b/trunk/fs/xfs/linux-2.6/xfs_aops.c index 381854461b28..d5e5559e31db 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_aops.c +++ b/trunk/fs/xfs/linux-2.6/xfs_aops.c @@ -1635,5 +1635,4 @@ const struct address_space_operations xfs_address_space_operations = { .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, }; diff --git a/trunk/fs/xfs/linux-2.6/xfs_sysctl.c b/trunk/fs/xfs/linux-2.6/xfs_sysctl.c index c5bc67c4e3bb..916c0ffb6083 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/trunk/fs/xfs/linux-2.6/xfs_sysctl.c @@ -26,6 +26,7 @@ STATIC int xfs_stats_clear_proc_handler( ctl_table *ctl, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -33,7 +34,7 @@ xfs_stats_clear_proc_handler( int c, ret, *valp = ctl->data; __uint32_t vn_active; - ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); if (!ret && write && *valp) { printk("XFS Clearing xfsstats\n"); diff --git a/trunk/include/asm-generic/fcntl.h b/trunk/include/asm-generic/fcntl.h index 0c3dd8603927..4d3e48373e74 100644 --- a/trunk/include/asm-generic/fcntl.h +++ b/trunk/include/asm-generic/fcntl.h @@ -73,19 +73,6 @@ #define F_SETSIG 10 /* for sockets. 
*/ #define F_GETSIG 11 /* for sockets. */ #endif -#ifndef F_SETOWN_EX -#define F_SETOWN_EX 12 -#define F_GETOWN_EX 13 -#endif - -#define F_OWNER_TID 0 -#define F_OWNER_PID 1 -#define F_OWNER_GID 2 - -struct f_owner_ex { - int type; - pid_t pid; -}; /* for F_[GET|SET]FL */ #define FD_CLOEXEC 1 /* actually anything with low bit set goes */ diff --git a/trunk/include/asm-generic/mman-common.h b/trunk/include/asm-generic/mman-common.h index 5ee13b2fd223..dd63bd38864b 100644 --- a/trunk/include/asm-generic/mman-common.h +++ b/trunk/include/asm-generic/mman-common.h @@ -34,7 +34,6 @@ #define MADV_REMOVE 9 /* remove these pages & resources */ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ -#define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ diff --git a/trunk/include/asm-generic/siginfo.h b/trunk/include/asm-generic/siginfo.h index 942d30b5aab1..c840719a8c59 100644 --- a/trunk/include/asm-generic/siginfo.h +++ b/trunk/include/asm-generic/siginfo.h @@ -82,7 +82,6 @@ typedef struct siginfo { #ifdef __ARCH_SI_TRAPNO int _trapno; /* TRAP # which caused the signal */ #endif - short _addr_lsb; /* LSB of the reported address */ } _sigfault; /* SIGPOLL */ @@ -113,7 +112,6 @@ typedef struct siginfo { #ifdef __ARCH_SI_TRAPNO #define si_trapno _sifields._sigfault._trapno #endif -#define si_addr_lsb _sifields._sigfault._addr_lsb #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd @@ -194,11 +192,7 @@ typedef struct siginfo { #define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */ #define BUS_ADRERR (__SI_FAULT|2) /* non-existant physical address */ #define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */ -/* hardware memory error consumed on a machine check: action required */ -#define BUS_MCEERR_AR (__SI_FAULT|4) -/* hardware memory error detected in process but not consumed: action optional*/ -#define BUS_MCEERR_AO (__SI_FAULT|5) -#define NSIGBUS 5 +#define NSIGBUS 3 /* * SIGTRAP si_codes diff --git a/trunk/include/linux/async_tx.h b/trunk/include/linux/async_tx.h index a1c486a88e88..5fc2ef8d97fa 100644 --- a/trunk/include/linux/async_tx.h +++ b/trunk/include/linux/async_tx.h @@ -58,60 +58,25 @@ struct dma_chan_ref { * array. * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a * dependency chain - * @ASYNC_TX_FENCE: specify that the next operation in the dependency - * chain uses this operation's result as an input + * @ASYNC_TX_DEP_ACK: ack the dependency descriptor. Useful for chaining. 
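The mman-common.h and siginfo.h hunks above remove the memory-failure test hooks: MADV_HWPOISON lets a CAP_SYS_ADMIN process software-poison one of its own pages, after which access delivers SIGBUS with the BUS_MCEERR_* codes also deleted here. A userspace sketch, assuming a kernel built with CONFIG_MEMORY_FAILURE:

/* Userspace sketch: software-poison one anonymous page (CAP_SYS_ADMIN). */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, psz);		/* fault the page in first */
	if (madvise(p, psz, MADV_HWPOISON) < 0)
		perror("madvise(MADV_HWPOISON)");
	/* touching p now raises SIGBUS on a MEMORY_FAILURE kernel */
	return 0;
}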
*/ enum async_tx_flags { ASYNC_TX_XOR_ZERO_DST = (1 << 0), ASYNC_TX_XOR_DROP_DST = (1 << 1), - ASYNC_TX_ACK = (1 << 2), - ASYNC_TX_FENCE = (1 << 3), -}; - -/** - * struct async_submit_ctl - async_tx submission/completion modifiers - * @flags: submission modifiers - * @depend_tx: parent dependency of the current operation being submitted - * @cb_fn: callback routine to run at operation completion - * @cb_param: parameter for the callback routine - * @scribble: caller provided space for dma/page address conversions - */ -struct async_submit_ctl { - enum async_tx_flags flags; - struct dma_async_tx_descriptor *depend_tx; - dma_async_tx_callback cb_fn; - void *cb_param; - void *scribble; + ASYNC_TX_ACK = (1 << 3), + ASYNC_TX_DEP_ACK = (1 << 4), }; #ifdef CONFIG_DMA_ENGINE #define async_tx_issue_pending_all dma_issue_pending_all - -/** - * async_tx_issue_pending - send pending descriptor to the hardware channel - * @tx: descriptor handle to retrieve hardware context - * - * Note: any dependent operations will have already been issued by - * async_tx_channel_switch, or (in the case of no channel switch) will - * be already pending on this channel. - */ -static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx) -{ - if (likely(tx)) { - struct dma_chan *chan = tx->chan; - struct dma_device *dma = chan->device; - - dma->device_issue_pending(chan); - } -} #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL #include #else #define async_tx_find_channel(dep, type, dst, dst_count, src, src_count, len) \ __async_tx_find_channel(dep, type) struct dma_chan * -__async_tx_find_channel(struct async_submit_ctl *submit, - enum dma_transaction_type tx_type); +__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, + enum dma_transaction_type tx_type); #endif /* CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL */ #else static inline void async_tx_issue_pending_all(void) @@ -119,16 +84,10 @@ static inline void async_tx_issue_pending_all(void) do { } while (0); } -static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx) -{ - do { } while (0); -} - static inline struct dma_chan * -async_tx_find_channel(struct async_submit_ctl *submit, - enum dma_transaction_type tx_type, struct page **dst, - int dst_count, struct page **src, int src_count, - size_t len) +async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, + enum dma_transaction_type tx_type, struct page **dst, int dst_count, + struct page **src, int src_count, size_t len) { return NULL; } @@ -140,70 +99,46 @@ async_tx_find_channel(struct async_submit_ctl *submit, * @cb_fn_param: parameter to pass to the callback routine */ static inline void -async_tx_sync_epilog(struct async_submit_ctl *submit) +async_tx_sync_epilog(dma_async_tx_callback cb_fn, void *cb_fn_param) { - if (submit->cb_fn) - submit->cb_fn(submit->cb_param); + if (cb_fn) + cb_fn(cb_fn_param); } -typedef union { - unsigned long addr; - struct page *page; - dma_addr_t dma; -} addr_conv_t; - -static inline void -init_async_submit(struct async_submit_ctl *args, enum async_tx_flags flags, - struct dma_async_tx_descriptor *tx, - dma_async_tx_callback cb_fn, void *cb_param, - addr_conv_t *scribble) -{ - args->flags = flags; - args->depend_tx = tx; - args->cb_fn = cb_fn; - args->cb_param = cb_param; - args->scribble = scribble; -} - -void async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, - struct async_submit_ctl *submit); +void +async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, + enum async_tx_flags flags, struct 
dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_fn_param); struct dma_async_tx_descriptor * async_xor(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, struct async_submit_ctl *submit); + int src_cnt, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_fn_param); struct dma_async_tx_descriptor * -async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, enum sum_check_flags *result, - struct async_submit_ctl *submit); +async_xor_zero_sum(struct page *dest, struct page **src_list, + unsigned int offset, int src_cnt, size_t len, + u32 *result, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_fn_param); struct dma_async_tx_descriptor * async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, - unsigned int src_offset, size_t len, - struct async_submit_ctl *submit); + unsigned int src_offset, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_fn_param); struct dma_async_tx_descriptor * async_memset(struct page *dest, int val, unsigned int offset, - size_t len, struct async_submit_ctl *submit); - -struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit); - -struct dma_async_tx_descriptor * -async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt, - size_t len, struct async_submit_ctl *submit); - -struct dma_async_tx_descriptor * -async_syndrome_val(struct page **blocks, unsigned int offset, int src_cnt, - size_t len, enum sum_check_flags *pqres, struct page *spare, - struct async_submit_ctl *submit); - -struct dma_async_tx_descriptor * -async_raid6_2data_recov(int src_num, size_t bytes, int faila, int failb, - struct page **ptrs, struct async_submit_ctl *submit); + size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_fn_param); struct dma_async_tx_descriptor * -async_raid6_datap_recov(int src_num, size_t bytes, int faila, - struct page **ptrs, struct async_submit_ctl *submit); +async_trigger_callback(enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_fn_param); void async_tx_quiesce(struct dma_async_tx_descriptor **tx); #endif /* _ASYNC_TX_H_ */ diff --git a/trunk/include/linux/binfmts.h b/trunk/include/linux/binfmts.h index aece486ac734..2046b5b8af48 100644 --- a/trunk/include/linux/binfmts.h +++ b/trunk/include/linux/binfmts.h @@ -120,7 +120,7 @@ extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); extern int prepare_bprm_creds(struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); extern void do_coredump(long signr, int exit_code, struct pt_regs *regs); -extern void set_binfmt(struct linux_binfmt *new); +extern int set_binfmt(struct linux_binfmt *new); extern void free_bprm(struct linux_binprm *); #endif /* __KERNEL__ */ diff --git a/trunk/include/linux/cgroup.h b/trunk/include/linux/cgroup.h index b62bb9294d0c..90bba9e62286 100644 --- a/trunk/include/linux/cgroup.h +++ b/trunk/include/linux/cgroup.h @@ -141,38 +141,6 @@ enum { CGRP_WAIT_ON_RMDIR, }; -/* which pidlist file are we talking about? 
*/ -enum cgroup_filetype { - CGROUP_FILE_PROCS, - CGROUP_FILE_TASKS, -}; - -/* - * A pidlist is a list of pids that virtually represents the contents of one - * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, - * a pair (one each for procs, tasks) for each pid namespace that's relevant - * to the cgroup. - */ -struct cgroup_pidlist { - /* - * used to find which pidlist is wanted. doesn't change as long as - * this particular list stays in the list. - */ - struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; - /* array of xids */ - pid_t *list; - /* how many elements the above list has */ - int length; - /* how many files are using the current array */ - int use_count; - /* each of these stored in a list by its cgroup */ - struct list_head links; - /* pointer to the cgroup we belong to, for list removal purposes */ - struct cgroup *owner; - /* protects the other fields */ - struct rw_semaphore mutex; -}; - struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ @@ -211,12 +179,11 @@ struct cgroup { */ struct list_head release_list; - /* - * list of pidlists, up to two for each namespace (one for procs, one - * for tasks); created on demand. - */ - struct list_head pidlists; - struct mutex pidlist_mutex; + /* pids_mutex protects pids_list and cached pid arrays. */ + struct rw_semaphore pids_mutex; + + /* Linked list of struct cgroup_pids */ + struct list_head pids_list; /* For RCU-protected deletion */ struct rcu_head rcu_head; @@ -260,9 +227,6 @@ struct css_set { * during subsystem registration (at boot time). */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; - - /* For RCU-protected deletion */ - struct rcu_head rcu_head; }; /* @@ -425,11 +389,10 @@ struct cgroup_subsys { struct cgroup *cgrp); int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); - int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup); + int (*can_attach)(struct cgroup_subsys *ss, + struct cgroup *cgrp, struct task_struct *tsk); void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *tsk, - bool threadgroup); + struct cgroup *old_cgrp, struct task_struct *tsk); void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); int (*populate)(struct cgroup_subsys *ss, diff --git a/trunk/include/linux/configfs.h b/trunk/include/linux/configfs.h index ddb7a97c78c2..7f627775c947 100644 --- a/trunk/include/linux/configfs.h +++ b/trunk/include/linux/configfs.h @@ -27,8 +27,8 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please read Documentation/filesystems/configfs/configfs.txt before using - * the configfs interface, ESPECIALLY the parts about reference counts and + * Please read Documentation/filesystems/configfs.txt before using the + * configfs interface, ESPECIALLY the parts about reference counts and * item destructors. 
*/ diff --git a/trunk/include/linux/dca.h b/trunk/include/linux/dca.h index d27a7a05718d..9c20c7e87d0a 100644 --- a/trunk/include/linux/dca.h +++ b/trunk/include/linux/dca.h @@ -20,9 +20,6 @@ */ #ifndef DCA_H #define DCA_H - -#include - /* DCA Provider API */ /* DCA Notifier Interface */ @@ -39,12 +36,6 @@ struct dca_provider { int id; }; -struct dca_domain { - struct list_head node; - struct list_head dca_providers; - struct pci_bus *pci_rc; -}; - struct dca_ops { int (*add_requester) (struct dca_provider *, struct device *); int (*remove_requester) (struct dca_provider *, struct device *); @@ -56,7 +47,7 @@ struct dca_ops { struct dca_provider *alloc_dca_provider(struct dca_ops *ops, int priv_size); void free_dca_provider(struct dca_provider *dca); int register_dca_provider(struct dca_provider *dca, struct device *dev); -void unregister_dca_provider(struct dca_provider *dca, struct device *dev); +void unregister_dca_provider(struct dca_provider *dca); static inline void *dca_priv(struct dca_provider *dca) { diff --git a/trunk/include/linux/debugfs.h b/trunk/include/linux/debugfs.h index fc1b930f246c..eb5c2ba2f81a 100644 --- a/trunk/include/linux/debugfs.h +++ b/trunk/include/linux/debugfs.h @@ -9,7 +9,7 @@ * 2 as published by the Free Software Foundation. * * debugfs is for people to use instead of /proc or /sys. - * See Documentation/DocBook/filesystems for more details. + * See Documentation/DocBook/kernel-api for more details. */ #ifndef _DEBUGFS_H_ diff --git a/trunk/include/linux/dmaengine.h b/trunk/include/linux/dmaengine.h index 2b9f2ac7ed60..ffefba81c818 100644 --- a/trunk/include/linux/dmaengine.h +++ b/trunk/include/linux/dmaengine.h @@ -48,20 +48,19 @@ enum dma_status { /** * enum dma_transaction_type - DMA transaction types/indexes - * - * Note: The DMA_ASYNC_TX capability is not to be set by drivers. It is - * automatically set as dma devices are registered. */ enum dma_transaction_type { DMA_MEMCPY, DMA_XOR, - DMA_PQ, - DMA_XOR_VAL, - DMA_PQ_VAL, + DMA_PQ_XOR, + DMA_DUAL_XOR, + DMA_PQ_UPDATE, + DMA_ZERO_SUM, + DMA_PQ_ZERO_SUM, DMA_MEMSET, + DMA_MEMCPY_CRC32C, DMA_INTERRUPT, DMA_PRIVATE, - DMA_ASYNC_TX, DMA_SLAVE, }; @@ -71,25 +70,18 @@ enum dma_transaction_type { /** * enum dma_ctrl_flags - DMA flags to augment operation preparation, - * control completion, and communicate status. + * control completion, and communicate status. * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of - * this transaction + * this transaction * @DMA_CTRL_ACK - the descriptor cannot be reused until the client - * acknowledges receipt, i.e. has a chance to establish any dependency - * chains + * acknowledges receipt, i.e.
has a chance to establish any + dependency chains * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s) * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s) * @DMA_COMPL_SRC_UNMAP_SINGLE - set to do the source dma-unmapping as single * (if not set, do the source dma-unmapping as page) * @DMA_COMPL_DEST_UNMAP_SINGLE - set to do the destination dma-unmapping as single * (if not set, do the destination dma-unmapping as page) - * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q - * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P - * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as - * sources that were the result of a previous operation, in the case of a PQ - * operation it continues the calculation with new sources - * @DMA_PREP_FENCE - tell the driver that subsequent operations depend - * on the result of this operation */ enum dma_ctrl_flags { DMA_PREP_INTERRUPT = (1 << 0), @@ -98,31 +90,8 @@ enum dma_ctrl_flags { DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3), DMA_COMPL_SRC_UNMAP_SINGLE = (1 << 4), DMA_COMPL_DEST_UNMAP_SINGLE = (1 << 5), - DMA_PREP_PQ_DISABLE_P = (1 << 6), - DMA_PREP_PQ_DISABLE_Q = (1 << 7), - DMA_PREP_CONTINUE = (1 << 8), - DMA_PREP_FENCE = (1 << 9), }; -/** - * enum sum_check_bits - bit position of pq_check_flags - */ -enum sum_check_bits { - SUM_CHECK_P = 0, - SUM_CHECK_Q = 1, -}; - -/** - * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations - * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise - * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise - */ -enum sum_check_flags { - SUM_CHECK_P_RESULT = (1 << SUM_CHECK_P), - SUM_CHECK_Q_RESULT = (1 << SUM_CHECK_Q), -}; - - /** * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t.
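A small sketch of how a client composes the dma_ctrl_flags documented above when preparing a descriptor; the prep hook used is the standard device_prep_dma_memcpy signature, and the unmap-skip flags would be or'ed in the same way. Assumes the channel was obtained elsewhere; demo_* is a hypothetical name.

#include <linux/dmaengine.h>

static struct dma_async_tx_descriptor *
demo_prep_copy(struct dma_chan *chan, dma_addr_t dst, dma_addr_t src,
               size_t len)
{
        /* interrupt on completion; ACK up front since we add no dependents */
        enum dma_ctrl_flags flags = DMA_PREP_INTERRUPT | DMA_CTRL_ACK;

        return chan->device->device_prep_dma_memcpy(chan, dst, src, len,
                                                    flags);
}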
* See linux/cpumask.h @@ -211,6 +180,8 @@ typedef void (*dma_async_tx_callback)(void *dma_async_param); * @flags: flags to augment operation preparation, control completion, and * communicate status * @phys: physical address of the descriptor + * @tx_list: driver common field for operations that require multiple + * descriptors * @chan: target channel for this operation * @tx_submit: set the prepared descriptor(s) to be executed by the engine * @callback: routine to call after this operation is complete @@ -224,6 +195,7 @@ struct dma_async_tx_descriptor { dma_cookie_t cookie; enum dma_ctrl_flags flags; /* not a 'long' to pack with cookie */ dma_addr_t phys; + struct list_head tx_list; struct dma_chan *chan; dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx); dma_async_tx_callback callback; @@ -241,11 +213,6 @@ struct dma_async_tx_descriptor { * @global_node: list_head for global dma_device_list * @cap_mask: one or more dma_capability flags * @max_xor: maximum number of xor sources, 0 if no capability - * @max_pq: maximum number of PQ sources and PQ-continue capability - * @copy_align: alignment shift for memcpy operations - * @xor_align: alignment shift for xor operations - * @pq_align: alignment shift for pq operations - * @fill_align: alignment shift for memset operations * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @device_alloc_chan_resources: allocate resources and return the @@ -253,9 +220,7 @@ struct dma_async_tx_descriptor { * @device_free_chan_resources: release DMA channel's resources * @device_prep_dma_memcpy: prepares a memcpy operation * @device_prep_dma_xor: prepares a xor operation - * @device_prep_dma_xor_val: prepares a xor validation operation - * @device_prep_dma_pq: prepares a pq operation - * @device_prep_dma_pq_val: prepares a pqzero_sum operation + * @device_prep_dma_zero_sum: prepares a zero_sum operation * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation @@ -270,13 +235,7 @@ struct dma_device { struct list_head channels; struct list_head global_node; dma_cap_mask_t cap_mask; - unsigned short max_xor; - unsigned short max_pq; - u8 copy_align; - u8 xor_align; - u8 pq_align; - u8 fill_align; - #define DMA_HAS_PQ_CONTINUE (1 << 15) + int max_xor; int dev_id; struct device *dev; @@ -290,17 +249,9 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_dma_xor)( struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, unsigned int src_cnt, size_t len, unsigned long flags); - struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)( + struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)( struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, - size_t len, enum sum_check_flags *result, unsigned long flags); - struct dma_async_tx_descriptor *(*device_prep_dma_pq)( - struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, - unsigned int src_cnt, const unsigned char *scf, - size_t len, unsigned long flags); - struct dma_async_tx_descriptor *(*device_prep_dma_pq_val)( - struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, - unsigned int src_cnt, const unsigned char *scf, size_t len, - enum sum_check_flags *pqres, unsigned long flags); + size_t len, u32 *result, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_memset)( struct dma_chan *chan, dma_addr_t dest, int value, size_t len, unsigned long flags); @@ -319,96 +270,6 @@ struct dma_device { 
void (*device_issue_pending)(struct dma_chan *chan); }; -static inline bool dmaengine_check_align(u8 align, size_t off1, size_t off2, size_t len) -{ - size_t mask; - - if (!align) - return true; - mask = (1 << align) - 1; - if (mask & (off1 | off2 | len)) - return false; - return true; -} - -static inline bool is_dma_copy_aligned(struct dma_device *dev, size_t off1, - size_t off2, size_t len) -{ - return dmaengine_check_align(dev->copy_align, off1, off2, len); -} - -static inline bool is_dma_xor_aligned(struct dma_device *dev, size_t off1, - size_t off2, size_t len) -{ - return dmaengine_check_align(dev->xor_align, off1, off2, len); -} - -static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1, - size_t off2, size_t len) -{ - return dmaengine_check_align(dev->pq_align, off1, off2, len); -} - -static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1, - size_t off2, size_t len) -{ - return dmaengine_check_align(dev->fill_align, off1, off2, len); -} - -static inline void -dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue) -{ - dma->max_pq = maxpq; - if (has_pq_continue) - dma->max_pq |= DMA_HAS_PQ_CONTINUE; -} - -static inline bool dmaf_continue(enum dma_ctrl_flags flags) -{ - return (flags & DMA_PREP_CONTINUE) == DMA_PREP_CONTINUE; -} - -static inline bool dmaf_p_disabled_continue(enum dma_ctrl_flags flags) -{ - enum dma_ctrl_flags mask = DMA_PREP_CONTINUE | DMA_PREP_PQ_DISABLE_P; - - return (flags & mask) == mask; -} - -static inline bool dma_dev_has_pq_continue(struct dma_device *dma) -{ - return (dma->max_pq & DMA_HAS_PQ_CONTINUE) == DMA_HAS_PQ_CONTINUE; -} - -static unsigned short dma_dev_to_maxpq(struct dma_device *dma) -{ - return dma->max_pq & ~DMA_HAS_PQ_CONTINUE; -} - -/* dma_maxpq - reduce maxpq in the face of continued operations - * @dma - dma device with PQ capability - * @flags - to check if DMA_PREP_CONTINUE and DMA_PREP_PQ_DISABLE_P are set - * - * When an engine does not support native continuation we need 3 extra - * source slots to reuse P and Q with the following coefficients: - * 1/ {00} * P : remove P from Q', but use it as a source for P' - * 2/ {01} * Q : use Q to continue Q' calculation - * 3/ {00} * Q : subtract Q from P' to cancel (2) - * - * In the case where P is disabled we only need 1 extra source: - * 1/ {01} * Q : use Q to continue Q' calculation - */ -static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags) -{ - if (dma_dev_has_pq_continue(dma) || !dmaf_continue(flags)) - return dma_dev_to_maxpq(dma); - else if (dmaf_p_disabled_continue(flags)) - return dma_dev_to_maxpq(dma) - 1; - else if (dmaf_continue(flags)) - return dma_dev_to_maxpq(dma) - 3; - BUG(); -} - /* --- public DMA engine API --- */ #ifdef CONFIG_DMA_ENGINE @@ -438,11 +299,7 @@ static inline void net_dmaengine_put(void) #ifdef CONFIG_ASYNC_TX_DMA #define async_dmaengine_get() dmaengine_get() #define async_dmaengine_put() dmaengine_put() -#ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH -#define async_dma_find_channel(type) dma_find_channel(DMA_ASYNC_TX) -#else #define async_dma_find_channel(type) dma_find_channel(type) -#endif /* CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH */ #else static inline void async_dmaengine_get(void) { @@ -455,7 +312,7 @@ async_dma_find_channel(enum dma_transaction_type type) { return NULL; } -#endif /* CONFIG_ASYNC_TX_DMA */ +#endif dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h index 
78e95b8b66d4..51803528b095 100644 --- a/trunk/include/linux/fs.h +++ b/trunk/include/linux/fs.h @@ -595,7 +595,6 @@ struct address_space_operations { int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); - int (*error_remove_page)(struct address_space *, struct page *); }; /* @@ -2468,7 +2467,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_files(struct ctl_table *table, int write, +int proc_nr_files(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); diff --git a/trunk/include/linux/ftrace.h b/trunk/include/linux/ftrace.h index cd3d2abaf30a..3c0924a18daf 100644 --- a/trunk/include/linux/ftrace.h +++ b/trunk/include/linux/ftrace.h @@ -19,7 +19,7 @@ extern int ftrace_enabled; extern int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); @@ -94,7 +94,7 @@ static inline void ftrace_start(void) { } extern int stack_tracer_enabled; int stack_trace_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos); #endif diff --git a/trunk/include/linux/futex.h b/trunk/include/linux/futex.h index 8ec17997d94f..34956c8fdebf 100644 --- a/trunk/include/linux/futex.h +++ b/trunk/include/linux/futex.h @@ -4,6 +4,11 @@ #include #include +struct inode; +struct mm_struct; +struct task_struct; +union ktime; + /* Second argument to futex syscall */ @@ -124,11 +129,6 @@ struct robust_list_head { #define FUTEX_BITSET_MATCH_ANY 0xffffffff #ifdef __KERNEL__ -struct inode; -struct mm_struct; -struct task_struct; -union ktime; - long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout, u32 __user *uaddr2, u32 val2, u32 val3); diff --git a/trunk/include/linux/hugetlb.h b/trunk/include/linux/hugetlb.h index 11ab19ac6b3d..176e7ee73eff 100644 --- a/trunk/include/linux/hugetlb.h +++ b/trunk/include/linux/hugetlb.h @@ -20,9 +20,9 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) } void reset_vma_resv_huge_pages(struct vm_area_struct *vma); -int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, diff --git a/trunk/include/linux/memcontrol.h b/trunk/include/linux/memcontrol.h index bf9213b2db8f..e46a0734ab6e 100644 --- a/trunk/include/linux/memcontrol.h +++ b/trunk/include/linux/memcontrol.h @@ -118,9 +118,6 @@ static inline bool mem_cgroup_disabled(void) extern bool mem_cgroup_oom_called(struct task_struct *task); void 
mem_cgroup_update_mapped_file_stat(struct page *page, int val); -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, int nid, - int zid); #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -279,13 +276,6 @@ static inline void mem_cgroup_update_mapped_file_stat(struct page *page, { } -static inline -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, int nid, int zid) -{ - return 0; -} - #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/trunk/include/linux/mm.h b/trunk/include/linux/mm.h index 6953a5a53e44..b6eae5e3144b 100644 --- a/trunk/include/linux/mm.h +++ b/trunk/include/linux/mm.h @@ -695,12 +695,11 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_SIGBUS 0x0002 #define VM_FAULT_MAJOR 0x0004 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ -#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */ #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON) +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS) /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. @@ -795,11 +794,6 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern int vmtruncate(struct inode * inode, loff_t offset); extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); -int truncate_inode_page(struct address_space *mapping, struct page *page); -int generic_error_remove_page(struct address_space *mapping, struct page *page); - -int invalidate_inode_page(struct page *page); - #ifdef CONFIG_MMU extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); @@ -1285,7 +1279,7 @@ int in_gate_area_no_task(unsigned long addr); #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ -int drop_caches_sysctl_handler(struct ctl_table *, int, +int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages); @@ -1314,12 +1308,5 @@ void vmemmap_populate_print_last(void); extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, size_t size); extern void refund_locked_memory(struct mm_struct *mm, size_t size); - -extern void memory_failure(unsigned long pfn, int trapno); -extern int __memory_failure(unsigned long pfn, int trapno, int ref); -extern int sysctl_memory_failure_early_kill; -extern int sysctl_memory_failure_recovery; -extern atomic_long_t mce_bad_pages; - #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/trunk/include/linux/mm_types.h b/trunk/include/linux/mm_types.h index 21d6aa45206a..0042090a4d70 100644 --- a/trunk/include/linux/mm_types.h +++ b/trunk/include/linux/mm_types.h @@ -240,8 +240,6 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - struct linux_binfmt *binfmt; - cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ @@ -261,10 +259,11 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ struct core_state *core_state; /* coredumping support */ -#ifdef CONFIG_AIO + + /* aio bits */ spinlock_t ioctx_lock; struct hlist_head ioctx_list; -#endif + #ifdef 
CONFIG_MM_OWNER /* * "owner" points to a task that is regarded as the canonical diff --git a/trunk/include/linux/mmzone.h b/trunk/include/linux/mmzone.h index 6f7561730d88..652ef01be582 100644 --- a/trunk/include/linux/mmzone.h +++ b/trunk/include/linux/mmzone.h @@ -755,20 +755,21 @@ static inline int is_dma(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, +struct file; +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + struct file *, void __user *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + struct file *, void __user *, size_t *, loff_t *); extern int numa_zonelist_order_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + struct file *, void __user *, size_t *, loff_t *); extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ diff --git a/trunk/include/linux/page-flags.h b/trunk/include/linux/page-flags.h index 6b202b173955..13de789f0a5c 100644 --- a/trunk/include/linux/page-flags.h +++ b/trunk/include/linux/page-flags.h @@ -51,9 +51,6 @@ * PG_buddy is set to indicate that the page is free and in the buddy system * (see mm/page_alloc.c). * - * PG_hwpoison indicates that a page got corrupted in hardware and contains - * data with incorrect ECC bits that triggered a machine check. Accessing is - * not safe since it may cause another machine check. Don't touch! */ /* @@ -104,9 +101,6 @@ enum pageflags { #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED PG_uncached, /* Page has been mapped as uncached */ -#endif -#ifdef CONFIG_MEMORY_FAILURE - PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif __NR_PAGEFLAGS, @@ -275,15 +269,6 @@ PAGEFLAG(Uncached, uncached) PAGEFLAG_FALSE(Uncached) #endif -#ifdef CONFIG_MEMORY_FAILURE -PAGEFLAG(HWPoison, hwpoison) -TESTSETFLAG(HWPoison, hwpoison) -#define __PG_HWPOISON (1UL << PG_hwpoison) -#else -PAGEFLAG_FALSE(HWPoison) -#define __PG_HWPOISON 0 -#endif - static inline int PageUptodate(struct page *page) { int ret = test_bit(PG_uptodate, &(page)->flags); @@ -408,7 +393,7 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) + 1 << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. diff --git a/trunk/include/linux/page_cgroup.h b/trunk/include/linux/page_cgroup.h index 4b938d4f3ac2..ada779f24178 100644 --- a/trunk/include/linux/page_cgroup.h +++ b/trunk/include/linux/page_cgroup.h @@ -38,7 +38,6 @@ enum { PCG_LOCK, /* page cgroup is locked */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. 
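Stepping back to the sysctl handlers reverted in mmzone.h above: the signature being restored is the six-argument form that threads a struct file * through to the generic helpers. A hedged sketch of a custom handler in that style (demo_* names and the recompute step are hypothetical):

#include <linux/sysctl.h>

static int demo_sysctl_handler(struct ctl_table *table, int write,
                               struct file *filp, void __user *buffer,
                               size_t *lenp, loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);

        if (!ret && write) {
                /* on a successful write, recompute whatever derives
                 * from the tunable, e.g. cached watermarks */
        }
        return ret;
}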
*/ - PCG_ACCT_LRU, /* page has been accounted for */ }; #define TESTPCGFLAG(uname, lname) \ @@ -53,23 +52,11 @@ static inline void SetPageCgroup##uname(struct page_cgroup *pc)\ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ { clear_bit(PCG_##lname, &pc->flags); } -#define TESTCLEARPCGFLAG(uname, lname) \ -static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ - { return test_and_clear_bit(PCG_##lname, &pc->flags); } - /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) -CLEARPCGFLAG(Cache, CACHE) -SETPCGFLAG(Cache, CACHE) TESTPCGFLAG(Used, USED) CLEARPCGFLAG(Used, USED) -SETPCGFLAG(Used, USED) - -SETPCGFLAG(AcctLRU, ACCT_LRU) -CLEARPCGFLAG(AcctLRU, ACCT_LRU) -TESTPCGFLAG(AcctLRU, ACCT_LRU) -TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) static inline int page_cgroup_nid(struct page_cgroup *pc) { diff --git a/trunk/include/linux/pci_ids.h b/trunk/include/linux/pci_ids.h index da1fda8623e0..7803565aa877 100644 --- a/trunk/include/linux/pci_ids.h +++ b/trunk/include/linux/pci_ids.h @@ -2527,16 +2527,6 @@ #define PCI_DEVICE_ID_INTEL_E7525_MCH 0x359e #define PCI_DEVICE_ID_INTEL_IOAT_CNB 0x360b #define PCI_DEVICE_ID_INTEL_FBD_CNB 0x360c -#define PCI_DEVICE_ID_INTEL_IOAT_JSF0 0x3710 -#define PCI_DEVICE_ID_INTEL_IOAT_JSF1 0x3711 -#define PCI_DEVICE_ID_INTEL_IOAT_JSF2 0x3712 -#define PCI_DEVICE_ID_INTEL_IOAT_JSF3 0x3713 -#define PCI_DEVICE_ID_INTEL_IOAT_JSF4 0x3714 -#define PCI_DEVICE_ID_INTEL_IOAT_JSF5 0x3715 -#define PCI_DEVICE_ID_INTEL_IOAT_JSF6 0x3716 -#define PCI_DEVICE_ID_INTEL_IOAT_JSF7 0x3717 -#define PCI_DEVICE_ID_INTEL_IOAT_JSF8 0x3718 -#define PCI_DEVICE_ID_INTEL_IOAT_JSF9 0x3719 #define PCI_DEVICE_ID_INTEL_ICH10_0 0x3a14 #define PCI_DEVICE_ID_INTEL_ICH10_1 0x3a16 #define PCI_DEVICE_ID_INTEL_ICH10_2 0x3a18 diff --git a/trunk/include/linux/prctl.h b/trunk/include/linux/prctl.h index 931150566ade..07bff666e65b 100644 --- a/trunk/include/linux/prctl.h +++ b/trunk/include/linux/prctl.h @@ -88,6 +88,4 @@ #define PR_TASK_PERF_EVENTS_DISABLE 31 #define PR_TASK_PERF_EVENTS_ENABLE 32 -#define PR_MCE_KILL 33 - #endif /* _LINUX_PRCTL_H */ diff --git a/trunk/include/linux/relay.h b/trunk/include/linux/relay.h index 14a86bc7102b..953fc055e875 100644 --- a/trunk/include/linux/relay.h +++ b/trunk/include/linux/relay.h @@ -140,7 +140,7 @@ struct rchan_callbacks * cause relay_open() to create a single global buffer rather * than the default set of per-cpu buffers. * - * See Documentation/filesystems/relay.txt for more info. + * See Documentation/filesystems/relayfs.txt for more info. 
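Returning to the page_cgroup flag macros above: the TESTPCGFLAG/SETPCGFLAG/CLEARPCGFLAG family just stamps out bit accessors over pc->flags. Judging from the setter and clearer bodies shown in the hunk, TESTPCGFLAG(Used, USED) expands to roughly:

#include <linux/page_cgroup.h>

static inline int PageCgroupUsed(struct page_cgroup *pc)
{
        /* test the PCG_USED bit in the per-page-cgroup flag word */
        return test_bit(PCG_USED, &pc->flags);
}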
*/ struct dentry *(*create_buf_file)(const char *filename, struct dentry *parent, diff --git a/trunk/include/linux/res_counter.h b/trunk/include/linux/res_counter.h index 731af71cddc9..511f42fc6816 100644 --- a/trunk/include/linux/res_counter.h +++ b/trunk/include/linux/res_counter.h @@ -34,10 +34,6 @@ struct res_counter { * the limit that usage cannot exceed */ unsigned long long limit; - /* - * the limit that usage can exceed - */ - unsigned long long soft_limit; /* * the number of unsuccessful attempts to consume the resource */ @@ -91,7 +87,6 @@ enum { RES_MAX_USAGE, RES_LIMIT, RES_FAILCNT, - RES_SOFT_LIMIT, }; /* @@ -114,8 +109,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent); int __must_check res_counter_charge_locked(struct res_counter *counter, unsigned long val); int __must_check res_counter_charge(struct res_counter *counter, - unsigned long val, struct res_counter **limit_fail_at, - struct res_counter **soft_limit_at); + unsigned long val, struct res_counter **limit_fail_at); /* * uncharge - tell that some portion of the resource is released @@ -128,8 +122,7 @@ int __must_check res_counter_charge(struct res_counter *counter, */ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); -void res_counter_uncharge(struct res_counter *counter, unsigned long val, - bool *was_soft_limit_excess); +void res_counter_uncharge(struct res_counter *counter, unsigned long val); static inline bool res_counter_limit_check_locked(struct res_counter *cnt) { @@ -139,36 +132,6 @@ static inline bool res_counter_limit_check_locked(struct res_counter *cnt) return false; } -static inline bool res_counter_soft_limit_check_locked(struct res_counter *cnt) -{ - if (cnt->usage < cnt->soft_limit) - return true; - - return false; -} - -/** - * Get the difference between the usage and the soft limit - * @cnt: The counter - * - * Returns 0 if usage is less than or equal to soft limit - * The difference between usage and soft limit, otherwise. - */ -static inline unsigned long long -res_counter_soft_limit_excess(struct res_counter *cnt) -{ - unsigned long long excess; - unsigned long flags; - - spin_lock_irqsave(&cnt->lock, flags); - if (cnt->usage <= cnt->soft_limit) - excess = 0; - else - excess = cnt->usage - cnt->soft_limit; - spin_unlock_irqrestore(&cnt->lock, flags); - return excess; -} - /* * Helper function to detect if the cgroup is within its limit or * not.
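With the soft-limit plumbing removed, charge and uncharge return to the simpler forms shown above. A minimal usage sketch against those signatures (demo_* names are hypothetical):

#include <linux/res_counter.h>

static int demo_try_charge(struct res_counter *cnt, unsigned long nr)
{
        struct res_counter *fail_at;
        int ret = res_counter_charge(cnt, nr, &fail_at);

        if (ret)
                /* fail_at names the counter in the chain that hit its limit */
                return ret;
        return 0;
}

static void demo_undo_charge(struct res_counter *cnt, unsigned long nr)
{
        res_counter_uncharge(cnt, nr);
}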
It's currently called from cgroup_rss_prepare() @@ -184,17 +147,6 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt) return ret; } -static inline bool res_counter_check_under_soft_limit(struct res_counter *cnt) -{ - bool ret; - unsigned long flags; - - spin_lock_irqsave(&cnt->lock, flags); - ret = res_counter_soft_limit_check_locked(cnt); - spin_unlock_irqrestore(&cnt->lock, flags); - return ret; -} - static inline void res_counter_reset_max(struct res_counter *cnt) { unsigned long flags; @@ -228,16 +180,4 @@ static inline int res_counter_set_limit(struct res_counter *cnt, return ret; } -static inline int -res_counter_set_soft_limit(struct res_counter *cnt, - unsigned long long soft_limit) -{ - unsigned long flags; - - spin_lock_irqsave(&cnt->lock, flags); - cnt->soft_limit = soft_limit; - spin_unlock_irqrestore(&cnt->lock, flags); - return 0; -} - #endif diff --git a/trunk/include/linux/rmap.h b/trunk/include/linux/rmap.h index cb0ba7032609..477841d29fce 100644 --- a/trunk/include/linux/rmap.h +++ b/trunk/include/linux/rmap.h @@ -81,19 +81,7 @@ static inline void page_dup_rmap(struct page *page) */ int page_referenced(struct page *, int is_locked, struct mem_cgroup *cnt, unsigned long *vm_flags); -enum ttu_flags { - TTU_UNMAP = 0, /* unmap mode */ - TTU_MIGRATION = 1, /* migration mode */ - TTU_MUNLOCK = 2, /* munlock mode */ - TTU_ACTION_MASK = 0xff, - - TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ - TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ - TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ -}; -#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) - -int try_to_unmap(struct page *, enum ttu_flags flags); +int try_to_unmap(struct page *, int ignore_refs); /* * Called from mm/filemap_xip.c to unmap empty zero page @@ -120,13 +108,6 @@ int page_mkclean(struct page *); */ int try_to_munlock(struct page *); -/* - * Called by memory-failure.c to kill processes. 
- */ -struct anon_vma *page_lock_anon_vma(struct page *page); -void page_unlock_anon_vma(struct anon_vma *anon_vma); -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); - #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index 75e6e60bf583..848d1f20086e 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -309,7 +309,7 @@ extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern int softlockup_thresh; @@ -331,7 +331,7 @@ extern unsigned long sysctl_hung_task_check_count; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_warnings; extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -1271,6 +1271,7 @@ struct task_struct { struct mm_struct *mm, *active_mm; /* task state */ + struct linux_binfmt *binfmt; int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ @@ -1734,7 +1735,6 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ -#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ #define PF_SIGNALED 0x00000400 /* killed by a signal */ @@ -1754,7 +1754,6 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ -#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ @@ -1907,7 +1906,7 @@ extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; int sched_nr_latency_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos); #endif #ifdef CONFIG_SCHED_DEBUG @@ -1925,7 +1924,7 @@ extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int sysctl_sched_compat_yield; @@ -2060,7 +2059,6 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern int kill_proc_info(int, struct siginfo *, pid_t); extern int do_notify_parent(struct task_struct *, int); -extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int, struct task_struct *); extern void force_sig_specific(int, struct task_struct *); 
extern int send_sig(int, struct task_struct *, int); @@ -2338,10 +2336,7 @@ static inline int signal_pending(struct task_struct *p) return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } -static inline int __fatal_signal_pending(struct task_struct *p) -{ - return unlikely(sigismember(&p->pending.signal, SIGKILL)); -} +extern int __fatal_signal_pending(struct task_struct *p); static inline int fatal_signal_pending(struct task_struct *p) { diff --git a/trunk/include/linux/security.h b/trunk/include/linux/security.h index 239e40d0450b..d050b66ab9ef 100644 --- a/trunk/include/linux/security.h +++ b/trunk/include/linux/security.h @@ -133,7 +133,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return PAGE_ALIGN(mmap_min_addr); return hint; } -extern int mmap_min_addr_handler(struct ctl_table *table, int write, +extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_SECURITY diff --git a/trunk/include/linux/signal.h b/trunk/include/linux/signal.h index ab9272cc270c..c7552836bd95 100644 --- a/trunk/include/linux/signal.h +++ b/trunk/include/linux/signal.h @@ -233,8 +233,6 @@ static inline int valid_signal(unsigned long sig) } extern int next_signal(struct sigpending *pending, sigset_t *mask); -extern int do_send_sig_info(int sig, struct siginfo *info, - struct task_struct *p, bool group); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, diff --git a/trunk/include/linux/swap.h b/trunk/include/linux/swap.h index 4ec90019c1a4..6c990e658f4e 100644 --- a/trunk/include/linux/swap.h +++ b/trunk/include/linux/swap.h @@ -34,37 +34,15 @@ static inline int current_is_kswapd(void) * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 - -/* - * Use some of the swap files numbers for other purposes. This - * is a convenient way to hook into the VM to trigger special - * actions on faults. - */ - -/* - * NUMA node memory migration support - */ -#ifdef CONFIG_MIGRATION -#define SWP_MIGRATION_NUM 2 -#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) -#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) +#ifndef CONFIG_MIGRATION +#define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) #else -#define SWP_MIGRATION_NUM 0 +/* Use last two entries for page migration swap entries */ +#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) +#define SWP_MIGRATION_READ MAX_SWAPFILES +#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) #endif -/* - * Handling of hardware poisoned pages with memory corruption. - */ -#ifdef CONFIG_MEMORY_FAILURE -#define SWP_HWPOISON_NUM 1 -#define SWP_HWPOISON MAX_SWAPFILES -#else -#define SWP_HWPOISON_NUM 0 -#endif - -#define MAX_SWAPFILES \ - ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) - /* * Magic header for a swap area. 
The first part of the union is * what the swap magic looks like for the old (limited to 128MB) @@ -239,11 +217,6 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, unsigned int swappiness); -extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - unsigned int swappiness, - struct zone *zone, - int nid); extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; @@ -267,7 +240,7 @@ extern int page_evictable(struct page *page, struct vm_area_struct *vma); extern void scan_mapping_unevictable_pages(struct address_space *); extern unsigned long scan_unevictable_pages; -extern int scan_unevictable_handler(struct ctl_table *, int, +extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); diff --git a/trunk/include/linux/swapops.h b/trunk/include/linux/swapops.h index cd42e30b7c6e..6ec39ab27b4b 100644 --- a/trunk/include/linux/swapops.h +++ b/trunk/include/linux/swapops.h @@ -131,41 +131,3 @@ static inline int is_write_migration_entry(swp_entry_t entry) #endif -#ifdef CONFIG_MEMORY_FAILURE -/* - * Support for hardware poisoned pages - */ -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - BUG_ON(!PageLocked(page)); - return swp_entry(SWP_HWPOISON, page_to_pfn(page)); -} - -static inline int is_hwpoison_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_HWPOISON; -} -#else - -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - return swp_entry(0, 0); -} - -static inline int is_hwpoison_entry(swp_entry_t swp) -{ - return 0; -} -#endif - -#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) -static inline int non_swap_entry(swp_entry_t entry) -{ - return swp_type(entry) >= MAX_SWAPFILES; -} -#else -static inline int non_swap_entry(swp_entry_t entry) -{ - return 0; -} -#endif diff --git a/trunk/include/linux/sysctl.h b/trunk/include/linux/sysctl.h index 1e4743ee6831..e76d3b22a466 100644 --- a/trunk/include/linux/sysctl.h +++ b/trunk/include/linux/sysctl.h @@ -29,6 +29,7 @@ #include #include +struct file; struct completion; #define CTL_MAXNAME 10 /* how many path components do we allow in a @@ -976,25 +977,25 @@ typedef int ctl_handler (struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen); -typedef int proc_handler (struct ctl_table *ctl, int write, +typedef int proc_handler (struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos); -extern int proc_dostring(struct ctl_table *, int, +extern int proc_dostring(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -extern int proc_dointvec(struct ctl_table *, int, +extern int proc_dointvec(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -extern int proc_dointvec_minmax(struct ctl_table *, int, +extern int proc_dointvec_minmax(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -extern int proc_dointvec_jiffies(struct ctl_table *, int, +extern int proc_dointvec_jiffies(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -extern int 
proc_dointvec_userhz_jiffies(struct ctl_table *, int, +extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, +extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -extern int proc_doulongvec_minmax(struct ctl_table *, int, +extern int proc_doulongvec_minmax(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, - void __user *, size_t *, loff_t *); + struct file *, void __user *, size_t *, loff_t *); extern int do_sysctl (int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, diff --git a/trunk/include/linux/time.h b/trunk/include/linux/time.h index fe04e5ef6a59..56787c093345 100644 --- a/trunk/include/linux/time.h +++ b/trunk/include/linux/time.h @@ -155,34 +155,6 @@ extern void timekeeping_leap_insert(int leapsecond); struct tms; extern void do_sys_times(struct tms *); -/* - * Similar to the struct tm in userspace , but it needs to be here so - * that the kernel source is self contained. - */ -struct tm { - /* - * the number of seconds after the minute, normally in the range - * 0 to 59, but can be up to 60 to allow for leap seconds - */ - int tm_sec; - /* the number of minutes after the hour, in the range 0 to 59*/ - int tm_min; - /* the number of hours past midnight, in the range 0 to 23 */ - int tm_hour; - /* the day of the month, in the range 1 to 31 */ - int tm_mday; - /* the number of months since January, in the range 0 to 11 */ - int tm_mon; - /* the number of years since 1900 */ - long tm_year; - /* the number of days since Sunday, in the range 0 to 6 */ - int tm_wday; - /* the number of days since January 1, in the range 0 to 365 */ - int tm_yday; -}; - -void time_to_tm(time_t totalsecs, int offset, struct tm *result); - /** * timespec_to_ns - Convert timespec to nanoseconds * @ts: pointer to the timespec variable to be converted diff --git a/trunk/include/linux/tracehook.h b/trunk/include/linux/tracehook.h index 1eb44a924e56..17ba82efa483 100644 --- a/trunk/include/linux/tracehook.h +++ b/trunk/include/linux/tracehook.h @@ -1,7 +1,7 @@ /* * Tracing hooks * - * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. + * Copyright (C) 2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -463,38 +463,22 @@ static inline int tracehook_get_signal(struct task_struct *task, /** * tracehook_notify_jctl - report about job control stop/continue - * @notify: zero, %CLD_STOPPED or %CLD_CONTINUED + * @notify: nonzero if this is the last thread in the group to stop * @why: %CLD_STOPPED or %CLD_CONTINUED * * This is called when we might call do_notify_parent_cldstop(). + * It's called when about to stop for job control; we are already in + * %TASK_STOPPED state, about to call schedule(). It's also called when + * a delayed %CLD_STOPPED or %CLD_CONTINUED report is ready to be made. * - * @notify is zero if we would not ordinarily send a %SIGCHLD, - * or is the %CLD_STOPPED or %CLD_CONTINUED .si_code for %SIGCHLD. + * Return nonzero to generate a %SIGCHLD with @why, which is + * normal if @notify is nonzero. * - * @why is %CLD_STOPPED when about to stop for job control; - * we are already in %TASK_STOPPED state, about to call schedule(). 
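Under the tracehook contract being restored above, the caller turns a nonzero return into the actual SIGCHLD notification. A sketch of the shape of that call site; do_notify_parent_cldstop() is internal to kernel/signal.c, so this is illustrative only and would not link outside that file:

#include <linux/tracehook.h>
#include <linux/signal.h>

static void demo_report_stop(int notify)
{
        /* reverted contract: nonzero return means send SIGCHLD with @why */
        if (tracehook_notify_jctl(notify, CLD_STOPPED))
                do_notify_parent_cldstop(current, CLD_STOPPED);
}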
- * It might also be that we have just exited (check %PF_EXITING), - * but need to report that a group-wide stop is complete. - * - * @why is %CLD_CONTINUED when waking up after job control stop and - * ready to make a delayed @notify report. - * - * Return the %CLD_* value for %SIGCHLD, or zero to generate no signal. - * - * Called with the siglock held. + * Called with no locks held. */ static inline int tracehook_notify_jctl(int notify, int why) { - return notify ?: (current->ptrace & PT_PTRACED) ? why : 0; -} - -/** - * tracehook_finish_jctl - report about return from job control stop - * - * This is called by do_signal_stop() after wakeup. - */ -static inline void tracehook_finish_jctl(void) -{ + return notify || (current->ptrace & PT_PTRACED); } #define DEATH_REAP -1 diff --git a/trunk/include/linux/tracepoint.h b/trunk/include/linux/tracepoint.h index 660a9de96f81..63a3f7a80580 100644 --- a/trunk/include/linux/tracepoint.h +++ b/trunk/include/linux/tracepoint.h @@ -4,7 +4,7 @@ /* * Kernel Tracepoint API. * - * See Documentation/trace/tracepoints.txt. + * See Documentation/tracepoint.txt. * * (C) Copyright 2008 Mathieu Desnoyers * diff --git a/trunk/include/linux/unaligned/be_byteshift.h b/trunk/include/linux/unaligned/be_byteshift.h index 9356b24223ac..46dd12c5709e 100644 --- a/trunk/include/linux/unaligned/be_byteshift.h +++ b/trunk/include/linux/unaligned/be_byteshift.h @@ -1,7 +1,7 @@ #ifndef _LINUX_UNALIGNED_BE_BYTESHIFT_H #define _LINUX_UNALIGNED_BE_BYTESHIFT_H -#include +#include static inline u16 __get_unaligned_be16(const u8 *p) { diff --git a/trunk/include/linux/unaligned/le_byteshift.h b/trunk/include/linux/unaligned/le_byteshift.h index be376fb79b64..59777e951baf 100644 --- a/trunk/include/linux/unaligned/le_byteshift.h +++ b/trunk/include/linux/unaligned/le_byteshift.h @@ -1,7 +1,7 @@ #ifndef _LINUX_UNALIGNED_LE_BYTESHIFT_H #define _LINUX_UNALIGNED_LE_BYTESHIFT_H -#include +#include static inline u16 __get_unaligned_le16(const u8 *p) { diff --git a/trunk/include/linux/writeback.h b/trunk/include/linux/writeback.h index 66ebddcff664..75cf58666ff9 100644 --- a/trunk/include/linux/writeback.h +++ b/trunk/include/linux/writeback.h @@ -110,20 +110,21 @@ extern int laptop_mode; extern unsigned long determine_dirtyable_memory(void); extern int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); extern int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); extern int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); extern int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); struct ctl_table; -int dirty_writeback_centisecs_handler(struct ctl_table *, int, +struct file; +int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, diff --git a/trunk/include/net/ip.h b/trunk/include/net/ip.h index 5b26a0bd178e..72c36926c26d 100644 --- a/trunk/include/net/ip.h +++ b/trunk/include/net/ip.h @@ -399,7 +399,7 @@ extern void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, * 
fed into the routing cache should use these handlers. */ int ipv4_doint_and_flush(ctl_table *ctl, int write, - void __user *buffer, + struct file* filp, void __user *buffer, size_t *lenp, loff_t *ppos); int ipv4_doint_and_flush_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, diff --git a/trunk/include/net/ndisc.h b/trunk/include/net/ndisc.h index f76f22d05721..1459ed3e2697 100644 --- a/trunk/include/net/ndisc.h +++ b/trunk/include/net/ndisc.h @@ -55,6 +55,7 @@ enum { #include struct ctl_table; +struct file; struct inet6_dev; struct net_device; struct net_proto_family; @@ -138,6 +139,7 @@ extern int igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, + struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/trunk/ipc/ipc_sysctl.c b/trunk/ipc/ipc_sysctl.c index 7d3704750efc..40eab7314aeb 100644 --- a/trunk/ipc/ipc_sysctl.c +++ b/trunk/ipc/ipc_sysctl.c @@ -27,18 +27,18 @@ static void *get_ipc(ctl_table *table) } #ifdef CONFIG_PROC_SYSCTL -static int proc_ipc_dointvec(ctl_table *table, int write, +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = get_ipc(table); - return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); + return proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos); } static int proc_ipc_callback_dointvec(ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; size_t lenp_bef = *lenp; @@ -47,7 +47,7 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write, memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = get_ipc(table); - rc = proc_dointvec(&ipc_table, write, buffer, lenp, ppos); + rc = proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos); if (write && !rc && lenp_bef == *lenp) /* @@ -61,13 +61,13 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write, } static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = get_ipc(table); - return proc_doulongvec_minmax(&ipc_table, write, buffer, + return proc_doulongvec_minmax(&ipc_table, write, filp, buffer, lenp, ppos); } @@ -95,7 +95,7 @@ static void ipc_auto_callback(int val) } static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; size_t lenp_bef = *lenp; @@ -106,7 +106,7 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, ipc_table.data = get_ipc(table); oldval = *((int *)(ipc_table.data)); - rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); + rc = proc_dointvec_minmax(&ipc_table, write, filp, buffer, lenp, ppos); if (write && !rc && lenp_bef == *lenp) { int newval = *((int *)(ipc_table.data)); diff --git a/trunk/ipc/mq_sysctl.c b/trunk/ipc/mq_sysctl.c index 8a058711fc10..24ae46dfe45d 100644 --- a/trunk/ipc/mq_sysctl.c +++ b/trunk/ipc/mq_sysctl.c @@ -31,24 +31,24 @@ static void *get_mq(ctl_table *table) return which; } -static int 
proc_mq_dointvec(ctl_table *table, int write, +static int proc_mq_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); mq_table.data = get_mq(table); - return proc_dointvec(&mq_table, write, buffer, lenp, ppos); + return proc_dointvec(&mq_table, write, filp, buffer, lenp, ppos); } static int proc_mq_dointvec_minmax(ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); mq_table.data = get_mq(table); - return proc_dointvec_minmax(&mq_table, write, buffer, + return proc_dointvec_minmax(&mq_table, write, filp, buffer, lenp, ppos); } #else diff --git a/trunk/kernel/Makefile b/trunk/kernel/Makefile index b8d4cd8ac0b9..187c89b4783d 100644 --- a/trunk/kernel/Makefile +++ b/trunk/kernel/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o +obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o diff --git a/trunk/kernel/audit_watch.c b/trunk/kernel/audit_watch.c index 0e96dbc60ea9..cc7e87936cbc 100644 --- a/trunk/kernel/audit_watch.c +++ b/trunk/kernel/audit_watch.c @@ -45,8 +45,8 @@ struct audit_watch { atomic_t count; /* reference count */ - char *path; /* insertion path */ dev_t dev; /* associated superblock device */ + char *path; /* insertion path */ unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ diff --git a/trunk/kernel/cgroup.c b/trunk/kernel/cgroup.c index 7ccba4bc5e3b..cd83d9933b6b 100644 --- a/trunk/kernel/cgroup.c +++ b/trunk/kernel/cgroup.c @@ -23,7 +23,6 @@ */ #include -#include #include #include #include @@ -49,8 +48,6 @@ #include #include #include -#include -#include /* TODO: replace with more sophisticated array */ #include @@ -63,8 +60,6 @@ static struct cgroup_subsys *subsys[] = { #include }; -#define MAX_CGROUP_ROOT_NAMELEN 64 - /* * A cgroupfs_root represents the root of a cgroup hierarchy, * and may be associated with a superblock to form an active @@ -79,9 +74,6 @@ struct cgroupfs_root { */ unsigned long subsys_bits; - /* Unique id for this hierarchy. */ - int hierarchy_id; - /* The bitmask of subsystems currently attached to this hierarchy */ unsigned long actual_subsys_bits; @@ -102,9 +94,6 @@ struct cgroupfs_root { /* The path to use for release notifications. 
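The ipc/mq wrappers above all follow one pattern: copy the ctl_table, repoint .data at the value for the current namespace, and defer to the generic helper. Distilled below; demo_get_ns_value() is a hypothetical stand-in for get_mq()/get_ipc():

#include <linux/sysctl.h>
#include <linux/string.h>

static void *demo_get_ns_value(struct ctl_table *table);   /* hypothetical */

static int demo_ns_dointvec(struct ctl_table *table, int write,
                            struct file *filp, void __user *buffer,
                            size_t *lenp, loff_t *ppos)
{
        struct ctl_table tmp;

        /* shallow copy, then redirect .data at the per-namespace value */
        memcpy(&tmp, table, sizeof(tmp));
        tmp.data = demo_get_ns_value(table);
        return proc_dointvec(&tmp, write, filp, buffer, lenp, ppos);
}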
*/ char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* @@ -152,10 +141,6 @@ struct css_id { static LIST_HEAD(roots); static int root_count; -static DEFINE_IDA(hierarchy_ida); -static int next_hierarchy_id; -static DEFINE_SPINLOCK(hierarchy_id_lock); - /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -216,7 +201,6 @@ struct cg_cgroup_link { * cgroup, anchored on cgroup->css_sets */ struct list_head cgrp_link_list; - struct cgroup *cgrp; /* * List running through cg_cgroup_links pointing at a * single css_set object, anchored on css_set->cg_links @@ -243,11 +227,8 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); static DEFINE_RWLOCK(css_set_lock); static int css_set_count; -/* - * hash table for cgroup groups. This improves the performance to find - * an existing css_set. This hash doesn't (currently) take into - * account cgroups in empty hierarchies. - */ +/* hash table for cgroup groups. This improves the performance to + * find an existing css_set */ #define CSS_SET_HASH_BITS 7 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; @@ -267,22 +248,48 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } -static void free_css_set_rcu(struct rcu_head *obj) -{ - struct css_set *cg = container_of(obj, struct css_set, rcu_head); - kfree(cg); -} - /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ static int use_task_css_set_links __read_mostly; -static void __put_css_set(struct css_set *cg, int taskexit) +/* When we create or destroy a css_set, the operation simply + * takes/releases a reference count on all the cgroups referenced + * by subsystems in this css_set. This can end up multiple-counting + * some cgroups, but that's OK - the ref-count is just a + * busy/not-busy indicator; ensuring that we only count each cgroup + * once would require taking a global lock to ensure that no + * subsystems moved between hierarchies while we were doing so. + * + * Possible TODO: decide at boot time based on the number of + * registered subsystems and the number of CPUs or NUMA nodes whether + * it's better for performance to ref-count every subsystem, or to + * take a global lock and only add one ref count to each hierarchy. + */ + +/* + * unlink a css_set from the list and free it + */ +static void unlink_css_set(struct css_set *cg) { struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; + + hlist_del(&cg->hlist); + css_set_count--; + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); + kfree(link); + } +} + +static void __put_css_set(struct css_set *cg, int taskexit) +{ + int i; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -295,28 +302,21 @@ static void __put_css_set(struct css_set *cg, int taskexit) write_unlock(&css_set_lock); return; } + unlink_css_set(cg); + write_unlock(&css_set_lock); - /* This css_set is dead. 
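The __put_css_set() comment above alludes to an atomic_dec_and_lock()-style dance on the css_set refcount. The general shape, hedged: the fast path here is inferred from that comparison rather than quoted from the hunk, and the sketch assumes the cgroup.c-internal names (refcount field, css_set_lock, unlink_css_set):

static void demo_put(struct css_set *cg)
{
        /* fast path: drop a reference as long as it cannot reach zero */
        if (atomic_add_unless(&cg->refcount, -1, 1))
                return;

        /* slow path: take the writer lock so no reader sees a zero count */
        write_lock(&css_set_lock);
        if (!atomic_dec_and_test(&cg->refcount)) {
                write_unlock(&css_set_lock);
                return;
        }

        /* zero reached under the lock: safe to unlink and free */
        unlink_css_set(cg);
        write_unlock(&css_set_lock);
        kfree(cg);
}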
unlink it and release cgroup refcounts */ - hlist_del(&cg->hlist); - css_set_count--; - - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { - struct cgroup *cgrp = link->cgrp; - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); + rcu_read_lock(); + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - - kfree(link); } - - write_unlock(&css_set_lock); - call_rcu(&cg->rcu_head, free_css_set_rcu); + rcu_read_unlock(); + kfree(cg); } /* @@ -337,78 +337,6 @@ static inline void put_css_set_taskexit(struct css_set *cg) __put_css_set(cg, 1); } -/* - * compare_css_sets - helper function for find_existing_css_set(). - * @cg: candidate css_set being tested - * @old_cg: existing css_set for a task - * @new_cgrp: cgroup that's being entered by the task - * @template: desired set of css pointers in css_set (pre-calculated) - * - * Returns true if "cg" matches "old_cg" except for the hierarchy - * which "new_cgrp" belongs to, for which it should match "new_cgrp". - */ -static bool compare_css_sets(struct css_set *cg, - struct css_set *old_cg, - struct cgroup *new_cgrp, - struct cgroup_subsys_state *template[]) -{ - struct list_head *l1, *l2; - - if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { - /* Not all subsystems matched */ - return false; - } - - /* - * Compare cgroup pointers in order to distinguish between - * different cgroups in hierarchies with no subsystems. We - * could get by with just this check alone (and skip the - * memcmp above) but on most setups the memcmp check will - * avoid the need for this more expensive check on almost all - * candidates. - */ - - l1 = &cg->cg_links; - l2 = &old_cg->cg_links; - while (1) { - struct cg_cgroup_link *cgl1, *cgl2; - struct cgroup *cg1, *cg2; - - l1 = l1->next; - l2 = l2->next; - /* See if we reached the end - both lists are equal length. */ - if (l1 == &cg->cg_links) { - BUG_ON(l2 != &old_cg->cg_links); - break; - } else { - BUG_ON(l2 == &old_cg->cg_links); - } - /* Locate the cgroups associated with these links. */ - cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); - cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); - cg1 = cgl1->cgrp; - cg2 = cgl2->cgrp; - /* Hierarchies should be linked in the same order. */ - BUG_ON(cg1->root != cg2->root); - - /* - * If this hierarchy is the hierarchy of the cgroup - * that's changing, then we need to check that this - * css_set points to the new cgroup; if it's any other - * hierarchy, then this css_set should point to the - * same cgroup as the old css_set.
- */ - if (cg1->root == new_cgrp->root) { - if (cg1 != new_cgrp) - return false; - } else { - if (cg1 != cg2) - return false; - } - } - return true; -} - /* * find_existing_css_set() is a helper for * find_css_set(), and checks to see whether an existing @@ -450,11 +378,10 @@ static struct css_set *find_existing_css_set( hhead = css_set_hash(template); hlist_for_each_entry(cg, node, hhead, hlist) { - if (!compare_css_sets(cg, oldcg, cgrp, template)) - continue; - - /* This css_set matches what we need */ - return cg; + if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { + /* All subsystems matched */ + return cg; + } } /* No existing cgroup group matched */ @@ -508,14 +435,8 @@ static void link_css_set(struct list_head *tmp_cg_links, link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, cgrp_link_list); link->cg = cg; - link->cgrp = cgrp; - atomic_inc(&cgrp->count); list_move(&link->cgrp_link_list, &cgrp->css_sets); - /* - * Always add links to the tail of the list so that the list - * is sorted by order of hierarchy creation - */ - list_add_tail(&link->cg_link_list, &cg->cg_links); + list_add(&link->cg_link_list, &cg->cg_links); } /* @@ -530,11 +451,11 @@ static struct css_set *find_css_set( { struct css_set *res; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + int i; struct list_head tmp_cg_links; struct hlist_head *hhead; - struct cg_cgroup_link *link; /* First see if we already have a cgroup group that matches * the desired set */ @@ -568,12 +489,20 @@ static struct css_set *find_css_set( write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { - struct cgroup *c = link->cgrp; - if (c->root == cgrp->root) - c = cgrp; - link_css_set(&tmp_cg_links, res, c); + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup *cgrp = res->subsys[i]->cgroup; + struct cgroup_subsys *ss = subsys[i]; + atomic_inc(&cgrp->count); + /* + * We want to add a link once per cgroup, so we + * only do it for the first subsystem in each + * hierarchy + */ + if (ss->root->subsys_list.next == &ss->sibling) + link_css_set(&tmp_cg_links, res, cgrp); } + if (list_empty(&rootnode.subsys_list)) + link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -588,41 +517,6 @@ static struct css_set *find_css_set( return res; } -/* - * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex held. - */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroupfs_root *root) -{ - struct css_set *css; - struct cgroup *res = NULL; - - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - read_lock(&css_set_lock); - /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. - */ - css = task->cgroups; - if (css == &init_css_set) { - res = &root->top_cgroup; - } else { - struct cg_cgroup_link *link; - list_for_each_entry(link, &css->cg_links, cg_link_list) { - struct cgroup *c = link->cgrp; - if (c->root == root) { - res = c; - break; - } - } - } - read_unlock(&css_set_lock); - BUG_ON(!res); - return res; -} - /* * There is one global cgroup mutex. We also require taking * task_lock() when dereferencing a task's cgroup subsys pointers. 
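
For illustration: the css_set lookup that find_existing_css_set() performs above comes down to two steps, hash the array of subsystem-state pointers into a bucket, then memcmp() each candidate's pointer array against the template. The stand-alone sketch below mimics that pattern in user space; the types, the hash mix, and SUBSYS_COUNT are invented for the example and are not the kernel's definitions.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define SUBSYS_COUNT 4			/* illustrative, not CGROUP_SUBSYS_COUNT */
#define TABLE_SIZE   (1 << 7)		/* like CSS_SET_TABLE_SIZE */

struct state;				/* stand-in for cgroup_subsys_state */

struct set {				/* stand-in for css_set */
	struct state *subsys[SUBSYS_COUNT];
	struct set *next;		/* hash-chain link */
};

static struct set *table[TABLE_SIZE];

/* Fold the pointer values into a bucket index, as css_set_hash() does. */
static unsigned int bucket(struct state **css)
{
	uintptr_t index = 0;
	int i;

	for (i = 0; i < SUBSYS_COUNT; i++)
		index = index * 9 + (uintptr_t)css[i];	/* invented mix */
	return (unsigned int)(index & (TABLE_SIZE - 1));
}

/* Return an existing set whose subsystem pointers all match, or NULL. */
static struct set *find_existing(struct state **template)
{
	struct set *s;

	for (s = table[bucket(template)]; s != NULL; s = s->next)
		if (!memcmp(template, s->subsys, sizeof(s->subsys)))
			return s;	/* all subsystems matched */
	return NULL;
}

int main(void)
{
	static struct state a, b;
	static struct set s1 = { { &a, &b }, NULL };
	struct state *tmpl[SUBSYS_COUNT] = { &a, &b };
	unsigned int h = bucket(s1.subsys);

	s1.next = table[h];		/* hlist_add_head() analogue */
	table[h] = &s1;
	printf("match: %s\n", find_existing(tmpl) ? "yes" : "no");
	return 0;
}

The memcmp() shortcut works because two css_sets are interchangeable exactly when every per-subsystem pointer is equal, so no deeper comparison is needed once the pointer arrays match.
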
@@ -783,12 +677,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ deactivate_super(cgrp->root->sb); - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); - call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); @@ -953,8 +841,6 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); - if (strlen(root->name)) - seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); return 0; } @@ -963,12 +849,6 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; - char *name; - /* User explicitly requested empty subsystem */ - bool none; - - struct cgroupfs_root *new_root; - }; /* Convert a hierarchy specifier into a bitmask of subsystems and @@ -983,7 +863,9 @@ static int parse_cgroupfs_options(char *data, mask = ~(1UL << cpuset_subsys_id); #endif - memset(opts, 0, sizeof(*opts)); + opts->subsys_bits = 0; + opts->flags = 0; + opts->release_agent = NULL; while ((token = strsep(&o, ",")) != NULL) { if (!*token) @@ -997,42 +879,17 @@ static int parse_cgroupfs_options(char *data, if (!ss->disabled) opts->subsys_bits |= 1ul << i; } - } else if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX, GFP_KERNEL); + opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - } else if (!strncmp(token, "name=", 5)) { - int i; - const char *name = token + 5; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; + strncpy(opts->release_agent, token + 14, PATH_MAX - 1); + opts->release_agent[PATH_MAX - 1] = 0; } else { struct cgroup_subsys *ss; int i; @@ -1049,8 +906,6 @@ static int parse_cgroupfs_options(char *data, } } - /* Consistency checks */ - /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just @@ -1060,16 +915,8 @@ static int parse_cgroupfs_options(char *data, (opts->subsys_bits & mask)) return -EINVAL; - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_bits && opts->none) - return -EINVAL; - - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). 
- */ - if (!opts->subsys_bits && !opts->name) + /* We can't have an empty hierarchy */ + if (!opts->subsys_bits) return -EINVAL; return 0; @@ -1097,12 +944,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } - /* Don't allow name to change at remount */ - if (opts.name && strcmp(opts.name, root->name)) { - ret = -EINVAL; - goto out_unlock; - } - ret = rebind_subsystems(root, opts.subsys_bits); if (ret) goto out_unlock; @@ -1114,7 +955,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); out_unlock: kfree(opts.release_agent); - kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); unlock_kernel(); @@ -1134,10 +974,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); - INIT_LIST_HEAD(&cgrp->pidlists); - mutex_init(&cgrp->pidlist_mutex); + INIT_LIST_HEAD(&cgrp->pids_list); + init_rwsem(&cgrp->pids_mutex); } - static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -1149,106 +988,33 @@ static void init_cgroup_root(struct cgroupfs_root *root) init_cgroup_housekeeping(cgrp); } -static bool init_root_id(struct cgroupfs_root *root) -{ - int ret = 0; - - do { - if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) - return false; - spin_lock(&hierarchy_id_lock); - /* Try to allocate the next unused ID */ - ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, - &root->hierarchy_id); - if (ret == -ENOSPC) - /* Try again starting from 0 */ - ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); - if (!ret) { - next_hierarchy_id = root->hierarchy_id + 1; - } else if (ret != -EAGAIN) { - /* Can only get here if the 31-bit IDR is full ... 
*/ - BUG_ON(ret); - } - spin_unlock(&hierarchy_id_lock); - } while (ret); - return true; -} - static int cgroup_test_super(struct super_block *sb, void *data) { - struct cgroup_sb_opts *opts = data; + struct cgroupfs_root *new = data; struct cgroupfs_root *root = sb->s_fs_info; - /* If we asked for a name then it must match */ - if (opts->name && strcmp(opts->name, root->name)) - return 0; + /* First check subsystems */ + if (new->subsys_bits != root->subsys_bits) + return 0; - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match - */ - if ((opts->subsys_bits || opts->none) - && (opts->subsys_bits != root->subsys_bits)) + /* Next check flags */ + if (new->flags != root->flags) return 0; return 1; } -static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) -{ - struct cgroupfs_root *root; - - if (!opts->subsys_bits && !opts->none) - return NULL; - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) - return ERR_PTR(-ENOMEM); - - if (!init_root_id(root)) { - kfree(root); - return ERR_PTR(-ENOMEM); - } - init_cgroup_root(root); - - root->subsys_bits = opts->subsys_bits; - root->flags = opts->flags; - if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); - if (opts->name) - strcpy(root->name, opts->name); - return root; -} - -static void cgroup_drop_root(struct cgroupfs_root *root) -{ - if (!root) - return; - - BUG_ON(!root->hierarchy_id); - spin_lock(&hierarchy_id_lock); - ida_remove(&hierarchy_ida, root->hierarchy_id); - spin_unlock(&hierarchy_id_lock); - kfree(root); -} - static int cgroup_set_super(struct super_block *sb, void *data) { int ret; - struct cgroup_sb_opts *opts = data; - - /* If we don't have a new root, we can't set up a new sb */ - if (!opts->new_root) - return -EINVAL; - - BUG_ON(!opts->subsys_bits && !opts->none); + struct cgroupfs_root *root = data; ret = set_anon_super(sb, NULL); if (ret) return ret; - sb->s_fs_info = opts->new_root; - opts->new_root->sb = sb; + sb->s_fs_info = root; + root->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; @@ -1285,43 +1051,48 @@ static int cgroup_get_sb(struct file_system_type *fs_type, void *data, struct vfsmount *mnt) { struct cgroup_sb_opts opts; - struct cgroupfs_root *root; int ret = 0; struct super_block *sb; - struct cgroupfs_root *new_root; + struct cgroupfs_root *root; + struct list_head tmp_cg_links; /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_err; + if (ret) { + kfree(opts.release_agent); + return ret; + } - /* - * Allocate a new cgroup root. We may not need it if we're - * reusing an existing hierarchy. 
- */ - new_root = cgroup_root_from_opts(&opts); - if (IS_ERR(new_root)) { - ret = PTR_ERR(new_root); - goto out_err; + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + kfree(opts.release_agent); + return -ENOMEM; } - opts.new_root = new_root; - /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); + init_cgroup_root(root); + root->subsys_bits = opts.subsys_bits; + root->flags = opts.flags; + if (opts.release_agent) { + strcpy(root->release_agent_path, opts.release_agent); + kfree(opts.release_agent); + } + + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); + if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - cgroup_drop_root(opts.new_root); - goto out_err; + kfree(root); + return PTR_ERR(sb); } - root = sb->s_fs_info; - BUG_ON(!root); - if (root == opts.new_root) { - /* We used the new root structure, so this is a new hierarchy */ - struct list_head tmp_cg_links; + if (sb->s_fs_info != root) { + /* Reusing an existing superblock */ + BUG_ON(sb->s_root == NULL); + kfree(root); + root = NULL; + } else { + /* New superblock */ struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; - struct cgroupfs_root *existing_root; int i; BUG_ON(sb->s_root != NULL); @@ -1334,18 +1105,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - if (strlen(root->name)) { - /* Check for name clashes with existing mounts */ - for_each_active_root(existing_root) { - if (!strcmp(existing_root->name, root->name)) { - ret = -EBUSY; - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - goto drop_new_super; - } - } - } - /* * We're accessing css_set_count without locking * css_set_lock here, but that's OK - it can only be @@ -1364,8 +1123,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - free_cg_links(&tmp_cg_links); - goto drop_new_super; + goto free_cg_links; } /* EBUSY should be the only error here */ @@ -1397,27 +1155,17 @@ static int cgroup_get_sb(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cgroup_populate_dir(root_cgrp); - mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - } else { - /* - * We re-used an existing hierarchy - the new root (if - * any) is not needed - */ - cgroup_drop_root(opts.new_root); + mutex_unlock(&cgroup_mutex); } simple_set_mnt(mnt, sb); - kfree(opts.release_agent); - kfree(opts.name); return 0; + free_cg_links: + free_cg_links(&tmp_cg_links); drop_new_super: deactivate_locked_super(sb); - out_err: - kfree(opts.release_agent); - kfree(opts.name); - return ret; } @@ -1463,7 +1211,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_mutex); kill_litter_super(sb); - cgroup_drop_root(root); + kfree(root); } static struct file_system_type cgroup_fs_type = { @@ -1528,6 +1276,27 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } +/* + * Return the first subsystem attached to a cgroup's hierarchy, and + * its subsystem id. 
+ */ + +static void get_first_subsys(const struct cgroup *cgrp, + struct cgroup_subsys_state **css, int *subsys_id) +{ + const struct cgroupfs_root *root = cgrp->root; + const struct cgroup_subsys *test_ss; + BUG_ON(list_empty(&root->subsys_list)); + test_ss = list_entry(root->subsys_list.next, + struct cgroup_subsys, sibling); + if (css) { + *css = cgrp->subsys[test_ss->subsys_id]; + BUG_ON(!*css); + } + if (subsys_id) + *subsys_id = test_ss->subsys_id; +} + /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1544,15 +1313,18 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; + int subsys_id; + + get_first_subsys(cgrp, NULL, &subsys_id); /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); + oldcgrp = task_cgroup(tsk, subsys_id); if (cgrp == oldcgrp) return 0; for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk, false); + retval = ss->can_attach(ss, cgrp, tsk); if (retval) return retval; } @@ -1590,7 +1362,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk, false); + ss->attach(ss, cgrp, oldcgrp, tsk); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1651,6 +1423,15 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) return ret; } +/* The various types of files and directories in a cgroup file system */ +enum cgroup_filetype { + FILE_ROOT, + FILE_DIR, + FILE_TASKLIST, + FILE_NOTIFY_ON_RELEASE, + FILE_RELEASE_AGENT, +}; + /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -2095,7 +1876,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * the start of a css_set */ static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) + struct cgroup_iter *it) { struct list_head *l = it->cg_link; struct cg_cgroup_link *link; @@ -2348,7 +2129,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) } /* - * Stuff for reading the 'tasks'/'procs' files. + * Stuff for reading the 'tasks' file. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), @@ -2358,196 +2139,27 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ /* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 
- * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); - else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); -} -static void pidlist_free(void *p) -{ - if (is_vmalloc_addr(p)) - vfree(p); - else - kfree(p); -} -static void *pidlist_resize(void *p, int newcount) -{ - void *newlist; - /* note: if new alloc fails, old p will still be valid either way */ - if (is_vmalloc_addr(p)) { - newlist = vmalloc(newcount * sizeof(pid_t)); - if (!newlist) - return NULL; - memcpy(newlist, p, newcount * sizeof(pid_t)); - vfree(p); - } else { - newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); - } - return newlist; -} - -/* - * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * If the new stripped list is sufficiently smaller and there's enough memory - * to allocate a new buffer, will let go of the unneeded memory. Returns the - * number of unique elements. - */ -/* is the size difference enough that we should re-allocate the array? */ -#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) -static int pidlist_uniq(pid_t **p, int length) -{ - int src, dest = 1; - pid_t *list = *p; - pid_t *newlist; - - /* - * we presume the 0th element is unique, so i starts at 1. trivial - * edge cases first; no work needs to be done for either - */ - if (length == 0 || length == 1) - return length; - /* src and dest walk down the list; dest counts unique elements */ - for (src = 1; src < length; src++) { - /* find next unique element */ - while (list[src] == list[src-1]) { - src++; - if (src == length) - goto after; - } - /* dest always points to where the next unique element goes */ - list[dest] = list[src]; - dest++; - } -after: - /* - * if the length difference is large enough, we want to allocate a - * smaller buffer to save memory. if this fails due to out of memory, - * we'll just stay with what we've got. - */ - if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { - newlist = pidlist_resize(list, dest); - if (newlist) - *p = newlist; - } - return dest; -} - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -/* - * find the appropriate pidlist for our purpose (given procs vs tasks) - * returns with the lock on that pidlist already held, and takes care - * of the use count, or returns NULL with no locks held if we're out of - * memory. + * Load into 'pidarray' up to 'npids' of the tasks using cgroup + * 'cgrp'. Return actual number of pids loaded. No need to + * task_lock(p) when reading out p->cgroup, since we're in an RCU + * read section, so the css_set can't go away, and is + * immutable after creation. */ -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) +static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) { - struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); - /* - * We can't drop the pidlist_mutex before taking the l->mutex in case - * the last ref-holder is trying to remove l from the list at the same - * time. Holding the pidlist_mutex precludes somebody taking whichever - * list we find out from under us - compare release_pid_array(). 
-	 */
-	mutex_lock(&cgrp->pidlist_mutex);
-	list_for_each_entry(l, &cgrp->pidlists, links) {
-		if (l->key.type == type && l->key.ns == ns) {
-			/* found a matching list - drop the extra refcount */
-			put_pid_ns(ns);
-			/* make sure l doesn't vanish out from under us */
-			down_write(&l->mutex);
-			mutex_unlock(&cgrp->pidlist_mutex);
-			l->use_count++;
-			return l;
-		}
-	}
-	/* entry not found; create a new one */
-	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
-	if (!l) {
-		mutex_unlock(&cgrp->pidlist_mutex);
-		put_pid_ns(ns);
-		return l;
-	}
-	init_rwsem(&l->mutex);
-	down_write(&l->mutex);
-	l->key.type = type;
-	l->key.ns = ns;
-	l->use_count = 0; /* don't increment here */
-	l->list = NULL;
-	l->owner = cgrp;
-	list_add(&l->links, &cgrp->pidlists);
-	mutex_unlock(&cgrp->pidlist_mutex);
-	return l;
-}
-
-/*
- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
- */
-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
-			      struct cgroup_pidlist **lp)
-{
-	pid_t *array;
-	int length;
-	int pid, n = 0; /* used for populating the array */
+	int n = 0, pid;
 	struct cgroup_iter it;
 	struct task_struct *tsk;
-	struct cgroup_pidlist *l;
-
-	/*
-	 * If cgroup gets more users after we read count, we won't have
-	 * enough space - tough. This race is indistinguishable to the
-	 * caller from the case that the additional cgroup users didn't
-	 * show up until sometime later on.
-	 */
-	length = cgroup_task_count(cgrp);
-	array = pidlist_allocate(length);
-	if (!array)
-		return -ENOMEM;
-	/* now, populate the array */
 	cgroup_iter_start(cgrp, &it);
 	while ((tsk = cgroup_iter_next(cgrp, &it))) {
-		if (unlikely(n == length))
+		if (unlikely(n == npids))
 			break;
-		/* get tgid or pid for procs or tasks file respectively */
-		if (type == CGROUP_FILE_PROCS)
-			pid = task_tgid_vnr(tsk);
-		else
-			pid = task_pid_vnr(tsk);
-		if (pid > 0) /* make sure to only use valid results */
-			array[n++] = pid;
+		pid = task_pid_vnr(tsk);
+		if (pid > 0)
+			pidarray[n++] = pid;
 	}
 	cgroup_iter_end(cgrp, &it);
-	length = n;
-	/* now sort & (if procs) strip out duplicates */
-	sort(array, length, sizeof(pid_t), cmppid, NULL);
-	if (type == CGROUP_FILE_PROCS)
-		length = pidlist_uniq(&array, length);
-	l = cgroup_pidlist_find(cgrp, type);
-	if (!l) {
-		pidlist_free(array);
-		return -ENOMEM;
-	}
-	/* store array, freeing old if necessary - lock already held */
-	pidlist_free(l->list);
-	l->list = array;
-	l->length = length;
-	l->use_count++;
-	up_write(&l->mutex);
-	*lp = l;
-	return 0;
+	return n;
 }
 
 /**
@@ -2604,14 +2216,37 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 	return ret;
 }
 
+/*
+ * Cache pids for all threads in the same pid namespace that are
+ * opening the same "tasks" file.
+ */
+struct cgroup_pids {
+	/* The node in cgrp->pids_list */
+	struct list_head list;
+	/* The cgroup those pids belong to */
+	struct cgroup *cgrp;
+	/* The namespace those pids belong to */
+	struct pid_namespace *ns;
+	/* Array of process ids in the cgroup */
+	pid_t *tasks_pids;
+	/* How many files are using this tasks_pids array */
+	int use_count;
+	/* Length of the current tasks_pids array */
+	int length;
+};
+
+static int cmppid(const void *a, const void *b)
+{
+	return *(pid_t *)a - *(pid_t *)b;
+}
 
 /*
- * seq_file methods for the tasks/procs files. The seq_file position is the
+ * seq_file methods for the "tasks" file. The seq_file position is the
  * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->l->list array.
+ * in the cgroup->tasks_pids array. */ -static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) +static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to @@ -2619,45 +2254,48 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pidlist *l = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; int index = 0, pid = *pos; int *iter; - down_read(&l->mutex); + down_read(&cgrp->pids_mutex); if (pid) { - int end = l->length; + int end = cp->length; while (index < end) { int mid = (index + end) / 2; - if (l->list[mid] == pid) { + if (cp->tasks_pids[mid] == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (cp->tasks_pids[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= l->length) + if (index >= cp->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = l->list + index; + iter = cp->tasks_pids + index; *pos = *iter; return iter; } -static void cgroup_pidlist_stop(struct seq_file *s, void *v) +static void cgroup_tasks_stop(struct seq_file *s, void *v) { - struct cgroup_pidlist *l = s->private; - up_read(&l->mutex); + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; + up_read(&cgrp->pids_mutex); } -static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) +static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pidlist *l = s->private; - pid_t *p = v; - pid_t *end = l->list + l->length; + struct cgroup_pids *cp = s->private; + int *p = v; + int *end = cp->tasks_pids + cp->length; + /* * Advance to the next pid in the array. If this goes off the * end, we're done @@ -2671,107 +2309,124 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) } } -static int cgroup_pidlist_show(struct seq_file *s, void *v) +static int cgroup_tasks_show(struct seq_file *s, void *v) { return seq_printf(s, "%d\n", *(int *)v); } -/* - * seq_operations functions for iterating on pidlists through seq_file - - * independent of whether it's tasks or procs - */ -static const struct seq_operations cgroup_pidlist_seq_operations = { - .start = cgroup_pidlist_start, - .stop = cgroup_pidlist_stop, - .next = cgroup_pidlist_next, - .show = cgroup_pidlist_show, +static const struct seq_operations cgroup_tasks_seq_operations = { + .start = cgroup_tasks_start, + .stop = cgroup_tasks_stop, + .next = cgroup_tasks_next, + .show = cgroup_tasks_show, }; -static void cgroup_release_pid_array(struct cgroup_pidlist *l) +static void release_cgroup_pid_array(struct cgroup_pids *cp) { - /* - * the case where we're the last user of this particular pidlist will - * have us remove it from the cgroup's list, which entails taking the - * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> - * pidlist_mutex, we have to take pidlist_mutex first. 
- */ - mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->mutex); - BUG_ON(!l->use_count); - if (!--l->use_count) { - /* we're the last user if refcount is 0; remove and free */ - list_del(&l->links); - mutex_unlock(&l->owner->pidlist_mutex); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - up_write(&l->mutex); - kfree(l); - return; + struct cgroup *cgrp = cp->cgrp; + + down_write(&cgrp->pids_mutex); + BUG_ON(!cp->use_count); + if (!--cp->use_count) { + list_del(&cp->list); + put_pid_ns(cp->ns); + kfree(cp->tasks_pids); + kfree(cp); } - mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->mutex); + up_write(&cgrp->pids_mutex); } -static int cgroup_pidlist_release(struct inode *inode, struct file *file) +static int cgroup_tasks_release(struct inode *inode, struct file *file) { - struct cgroup_pidlist *l; + struct seq_file *seq; + struct cgroup_pids *cp; + if (!(file->f_mode & FMODE_READ)) return 0; - /* - * the seq_file will only be initialized if the file was opened for - * reading; hence we check if it's not null only in that case. - */ - l = ((struct seq_file *)file->private_data)->private; - cgroup_release_pid_array(l); + + seq = file->private_data; + cp = seq->private; + + release_cgroup_pid_array(cp); return seq_release(inode, file); } -static const struct file_operations cgroup_pidlist_operations = { +static struct file_operations cgroup_tasks_operations = { .read = seq_read, .llseek = seq_lseek, .write = cgroup_file_write, - .release = cgroup_pidlist_release, + .release = cgroup_tasks_release, }; /* - * The following functions handle opens on a file that displays a pidlist - * (tasks or procs). Prepare an array of the process/thread IDs of whoever's - * in the cgroup. + * Handle an open on 'tasks' file. Prepare an array containing the + * process id's of tasks currently attached to the cgroup being opened. */ -/* helper function for the two below it */ -static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) + +static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct cgroup_pidlist *l; + struct pid_namespace *ns = current->nsproxy->pid_ns; + struct cgroup_pids *cp; + pid_t *pidarray; + int npids; int retval; /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - /* have the array populated */ - retval = pidlist_array_load(cgrp, type, &l); - if (retval) - return retval; - /* configure file information */ - file->f_op = &cgroup_pidlist_operations; + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. 
+ */ + npids = cgroup_task_count(cgrp); + pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); + if (!pidarray) + return -ENOMEM; + npids = pid_array_load(pidarray, npids, cgrp); + sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); + + /* + * Store the array in the cgroup, freeing the old + * array if necessary + */ + down_write(&cgrp->pids_mutex); + + list_for_each_entry(cp, &cgrp->pids_list, list) { + if (ns == cp->ns) + goto found; + } + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) { + up_write(&cgrp->pids_mutex); + kfree(pidarray); + return -ENOMEM; + } + cp->cgrp = cgrp; + cp->ns = ns; + get_pid_ns(ns); + list_add(&cp->list, &cgrp->pids_list); +found: + kfree(cp->tasks_pids); + cp->tasks_pids = pidarray; + cp->length = npids; + cp->use_count++; + up_write(&cgrp->pids_mutex); + + file->f_op = &cgroup_tasks_operations; - retval = seq_open(file, &cgroup_pidlist_seq_operations); + retval = seq_open(file, &cgroup_tasks_seq_operations); if (retval) { - cgroup_release_pid_array(l); + release_cgroup_pid_array(cp); return retval; } - ((struct seq_file *)file->private_data)->private = l; + ((struct seq_file *)file->private_data)->private = cp; return 0; } -static int cgroup_tasks_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); -} -static int cgroup_procs_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); -} static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, struct cftype *cft) @@ -2794,27 +2449,21 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, /* * for the common functions, 'private' gives the type of file */ -/* for hysterical raisins, we can't put this on the older files */ -#define CGROUP_FILE_GENERIC_PREFIX "cgroup." 
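
For illustration: the read path that pairs with cgroup_tasks_open() above depends only on the pid array being sorted, since cgroup_tasks_start() binary-searches for the first pid >= the saved position so a reader can resume after a seek. A minimal user-space sketch of the same arithmetic follows, with qsort() standing in for the kernel's sort() and made-up pid values; none of these names are kernel API.

#include <stdio.h>
#include <stdlib.h>

/* pids are modelled as plain ints here */
static int cmppid(const void *a, const void *b)
{
	return *(const int *)a - *(const int *)b;
}

/*
 * Mirror of the search in cgroup_tasks_start(): in a sorted array,
 * find the first entry >= pos, or NULL when pos is past the end.
 */
static int *resume_at(int *pids, int length, int pos)
{
	int index = 0, end = length;

	while (index < end) {
		int mid = (index + end) / 2;

		if (pids[mid] == pos)
			return &pids[mid];
		else if (pids[mid] < pos)
			index = mid + 1;
		else
			end = mid;
	}
	return index < length ? &pids[index] : NULL;
}

int main(void)
{
	int pids[] = { 31, 7, 290, 42, 106 };	/* unsorted snapshot */
	int n = sizeof(pids) / sizeof(pids[0]);
	int *p;

	qsort(pids, n, sizeof(pids[0]), cmppid);
	for (p = resume_at(pids, n, 42); p && p < pids + n; p++)
		printf("%d\n", *p);		/* prints 42, 106, 290 */
	return 0;
}

Sorting once at open() is what makes the restartable read cheap: every later read() or seek costs only a binary search, not another walk of the task list.
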
static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, + .release = cgroup_tasks_release, + .private = FILE_TASKLIST, .mode = S_IRUGO | S_IWUSR, }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "procs", - .open = cgroup_procs_open, - /* .write_u64 = cgroup_procs_write, TODO */ - .release = cgroup_pidlist_release, - .mode = S_IRUGO, - }, + { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, + .private = FILE_NOTIFY_ON_RELEASE, }, }; @@ -2823,6 +2472,7 @@ static struct cftype cft_release_agent = { .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, + .private = FILE_RELEASE_AGENT, }; static int cgroup_populate_dir(struct cgroup *cgrp) @@ -3229,7 +2879,6 @@ int __init cgroup_init_early(void) init_task.cgroups = &init_css_set; init_css_set_link.cg = &init_css_set; - init_css_set_link.cgrp = dummytop; list_add(&init_css_set_link.cgrp_link_list, &rootnode.top_cgroup.css_sets); list_add(&init_css_set_link.cg_link_list, @@ -3284,7 +2933,7 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); - BUG_ON(!init_root_id(&rootnode)); + err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; @@ -3337,16 +2986,15 @@ static int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; + int subsys_id; int count = 0; - seq_printf(m, "%d:", root->hierarchy_id); + seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); - if (strlen(root->name)) - seq_printf(m, "%sname=%s", count ? 
"," : "", - root->name); seq_putc(m, ':'); - cgrp = task_cgroup_from_root(tsk, root); + get_first_subsys(&root->top_cgroup, NULL, &subsys_id); + cgrp = task_cgroup(tsk, subsys_id); retval = cgroup_path(cgrp, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; @@ -3385,8 +3033,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->name, ss->root->hierarchy_id, + seq_printf(m, "%s\t%lu\t%d\t%d\n", + ss->name, ss->root->subsys_bits, ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); @@ -3672,11 +3320,13 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; + int subsys_id; if (cgrp == dummytop) return 1; - target = task_cgroup_from_root(task, cgrp->root); + get_first_subsys(cgrp, NULL, &subsys_id); + target = task_cgroup(task, subsys_id); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); @@ -4043,154 +3693,3 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - -static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - return cgroup_task_count(cont); -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(¤t->cgroups->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct cgroup *cont, - struct cftype *cft, - struct seq_file *seq) -{ - struct cg_cgroup_link *link; - struct css_set *cg; - - read_lock(&css_set_lock); - rcu_read_lock(); - cg = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cg->cg_links, cg_link_list) { - struct cgroup *c = link->cgrp; - const char *name; - - if (c->dentry) - name = c->dentry->d_name.name; - else - name = "?"; - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name); - } - rcu_read_unlock(); - read_unlock(&css_set_lock); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cont, - struct cftype *cft, - struct seq_file *seq) -{ - struct cg_cgroup_link *link; - - read_lock(&css_set_lock); - list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { - struct css_set *cg = link->cg; - struct task_struct *task; - int count = 0; - seq_printf(seq, "css_set %p\n", cg); - list_for_each_entry(task, &cg->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) { - seq_puts(seq, " ...\n"); - break; - } else { - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } - } - } - read_unlock(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - -static struct cftype debug_files[] = { - { - .name = "cgroup_refcount", - 
.read_u64 = cgroup_refcount_read, - }, - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .read_seq_string = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .read_seq_string = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, -}; - -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, debug_files, - ARRAY_SIZE(debug_files)); -} - -struct cgroup_subsys debug_subsys = { - .name = "debug", - .create = debug_create, - .destroy = debug_destroy, - .populate = debug_populate, - .subsys_id = debug_subsys_id, -}; -#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/trunk/kernel/cgroup_debug.c b/trunk/kernel/cgroup_debug.c new file mode 100644 index 000000000000..0c92d797baa6 --- /dev/null +++ b/trunk/kernel/cgroup_debug.c @@ -0,0 +1,105 @@ +/* + * kernel/cgroup_debug.c - Example cgroup subsystem that + * exposes debug info + * + * Copyright (C) Google Inc, 2007 + * + * Developed by Paul Menage (menage@google.com) + * + */ + +#include +#include +#include +#include + +#include + +static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) +{ + u64 count; + + count = cgroup_task_count(cont); + return count; +} + +static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +{ + return (u64)(long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(¤t->cgroups->refcount); + rcu_read_unlock(); + return count; +} + +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + +static struct cftype files[] = { + { + .name = "cgroup_refcount", + .read_u64 = cgroup_refcount_read, + }, + { + .name = "taskcount", + .read_u64 = taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, +}; + +static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; diff --git a/trunk/kernel/cgroup_freezer.c b/trunk/kernel/cgroup_freezer.c index 59e9ef6aab40..fb249e2bcada 100644 --- a/trunk/kernel/cgroup_freezer.c +++ b/trunk/kernel/cgroup_freezer.c @@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task) */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) + struct 
task_struct *task) { struct freezer *freezer; @@ -177,19 +177,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state == CGROUP_FROZEN) return -EBUSY; - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (is_task_frozen_enough(c)) { - rcu_read_unlock(); - return -EBUSY; - } - } - rcu_read_unlock(); - } - return 0; } diff --git a/trunk/kernel/cpuset.c b/trunk/kernel/cpuset.c index b5cb469d2545..7e75a41bd508 100644 --- a/trunk/kernel/cpuset.c +++ b/trunk/kernel/cpuset.c @@ -1324,10 +1324,9 @@ static int fmeter_getrate(struct fmeter *fmp) static cpumask_var_t cpus_attach; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct task_struct *tsk, bool threadgroup) +static int cpuset_can_attach(struct cgroup_subsys *ss, + struct cgroup *cont, struct task_struct *tsk) { - int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1344,51 +1343,18 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk, 0, NULL); - if (ret) - return ret; - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c, 0, NULL); - if (ret) { - rcu_read_unlock(); - return ret; - } - } - rcu_read_unlock(); - } - return 0; -} - -static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, - struct cpuset *cs) -{ - int err; - /* - * can_attach beforehand should guarantee that this doesn't fail. 
- * TODO: have a better way to handle failure here - */ - err = set_cpus_allowed_ptr(tsk, cpus_attach); - WARN_ON_ONCE(err); - - task_lock(tsk); - cpuset_change_task_nodemask(tsk, to); - task_unlock(tsk); - cpuset_update_task_spread_flag(cs, tsk); - + return security_task_setscheduler(tsk, 0, NULL); } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *oldcont, struct task_struct *tsk, - bool threadgroup) +static void cpuset_attach(struct cgroup_subsys *ss, + struct cgroup *cont, struct cgroup *oldcont, + struct task_struct *tsk) { nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + int err; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1397,19 +1363,15 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &to); } + err = set_cpus_allowed_ptr(tsk, cpus_attach); + if (err) + return; - /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, &to, cs); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, &to, cs); - } - rcu_read_unlock(); - } + task_lock(tsk); + cpuset_change_task_nodemask(tsk, &to); + task_unlock(tsk); + cpuset_update_task_spread_flag(cs, tsk); - /* change mm; only needs to be done once even if threadgroup */ from = oldcs->mems_allowed; to = cs->mems_allowed; mm = get_task_mm(tsk); diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index 5859f598c951..60d6fdcc9265 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -976,6 +976,8 @@ NORET_TYPE void do_exit(long code) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); + if (tsk->binfmt) + module_put(tsk->binfmt->module); proc_exit_connector(tsk); @@ -1095,28 +1097,28 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; - wait_queue_t child_wait; int notask_error; }; -static inline -struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; -} - -static int eligible_pid(struct wait_opts *wo, struct task_struct *p) -{ - return wo->wo_type == PIDTYPE_MAX || - task_pid_type(p, wo->wo_type) == wo->wo_pid; + struct pid *pid = NULL; + if (type == PIDTYPE_PID) + pid = task->pids[type].pid; + else if (type < PIDTYPE_MAX) + pid = task->group_leader->pids[type].pid; + return pid; } static int eligible_child(struct wait_opts *wo, struct task_struct *p) { - if (!eligible_pid(wo, p)) - return 0; + int err; + + if (wo->wo_type < PIDTYPE_MAX) { + if (task_pid_type(p, wo->wo_type) != wo->wo_pid) + return 0; + } + /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. 
(Note:
@@ -1126,6 +1128,10 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
 	    && !(wo->wo_flags & __WALL))
 		return 0;
 
+	err = security_task_wait(p);
+	if (err)
+		return err;
+
 	return 1;
 }
 
@@ -1138,20 +1144,18 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
 	put_task_struct(p);
 
 	infop = wo->wo_info;
-	if (infop) {
-		if (!retval)
-			retval = put_user(SIGCHLD, &infop->si_signo);
-		if (!retval)
-			retval = put_user(0, &infop->si_errno);
-		if (!retval)
-			retval = put_user((short)why, &infop->si_code);
-		if (!retval)
-			retval = put_user(pid, &infop->si_pid);
-		if (!retval)
-			retval = put_user(uid, &infop->si_uid);
-		if (!retval)
-			retval = put_user(status, &infop->si_status);
-	}
+	if (!retval)
+		retval = put_user(SIGCHLD, &infop->si_signo);
+	if (!retval)
+		retval = put_user(0, &infop->si_errno);
+	if (!retval)
+		retval = put_user((short)why, &infop->si_code);
+	if (!retval)
+		retval = put_user(pid, &infop->si_pid);
+	if (!retval)
+		retval = put_user(uid, &infop->si_uid);
+	if (!retval)
+		retval = put_user(status, &infop->si_status);
 	if (!retval)
 		retval = pid;
 	return retval;
@@ -1481,14 +1485,13 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
  * then ->notask_error is 0 if @p is an eligible child,
  * or another error from security_task_wait(), or still -ECHILD.
  */
-static int wait_consider_task(struct wait_opts *wo, int ptrace,
-				struct task_struct *p)
+static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
+				int ptrace, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
 	if (!ret)
 		return ret;
 
-	ret = security_task_wait(p);
 	if (unlikely(ret < 0)) {
 		/*
 		 * If we have not yet seen any eligible child,
@@ -1550,7 +1553,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 		 * Do not consider detached threads.
 		 */
 		if (!task_detached(p)) {
-			int ret = wait_consider_task(wo, 0, p);
+			int ret = wait_consider_task(wo, tsk, 0, p);
 			if (ret)
 				return ret;
 		}
@@ -1564,7 +1567,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
-		int ret = wait_consider_task(wo, 1, p);
+		int ret = wait_consider_task(wo, tsk, 1, p);
 		if (ret)
 			return ret;
 	}
@@ -1572,38 +1575,15 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	return 0;
 }
 
-static int child_wait_callback(wait_queue_t *wait, unsigned mode,
-				int sync, void *key)
-{
-	struct wait_opts *wo = container_of(wait, struct wait_opts,
-						child_wait);
-	struct task_struct *p = key;
-
-	if (!eligible_pid(wo, p))
-		return 0;
-
-	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
-		return 0;
-
-	return default_wake_function(wait, mode, sync, key);
-}
-
-void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
-{
-	__wake_up_sync_key(&parent->signal->wait_chldexit,
-				TASK_INTERRUPTIBLE, 1, p);
-}
-
 static long do_wait(struct wait_opts *wo)
 {
+	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
 	int retval;
 
 	trace_sched_process_wait(wo->wo_pid);
 
-	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
-	wo->child_wait.private = current;
-	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
 	/*
 	 * If there is nothing that can match our critiera just get out.
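
For illustration: the clone-child test in eligible_child() above packs its rule into a single XOR expression. Unpacked into a stand-alone predicate it reads as below; the flag and signal values are invented for the sketch rather than taken from the kernel headers, and the security_task_wait() step is left out.

#include <stdio.h>

/* Illustrative values only; the real constants live in kernel headers. */
#define SKETCH_SIGCHLD 17
#define SKETCH_WCLONE  0x80000000u
#define SKETCH_WALL    0x40000000u

/*
 * A "clone" child is one whose exit signal is not SIGCHLD.  __WALL
 * accepts both kinds; otherwise __WCLONE selects clone children only,
 * and its absence selects non-clone children only.
 */
static int wait_flags_accept(unsigned int flags, int exit_signal)
{
	int is_clone = (exit_signal != SKETCH_SIGCHLD);

	if (flags & SKETCH_WALL)
		return 1;
	return is_clone == !!(flags & SKETCH_WCLONE);
}

int main(void)
{
	printf("%d\n", wait_flags_accept(0, SKETCH_SIGCHLD));	/* 1 */
	printf("%d\n", wait_flags_accept(0, 9));		/* 0 */
	printf("%d\n", wait_flags_accept(SKETCH_WCLONE, 9));	/* 1 */
	printf("%d\n", wait_flags_accept(SKETCH_WALL, 9));	/* 1 */
	return 0;
}
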
@@ -1644,7 +1624,32 @@ static long do_wait(struct wait_opts *wo)
 	}
 end:
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+	remove_wait_queue(&current->signal->wait_chldexit,&wait);
+	if (wo->wo_info) {
+		struct siginfo __user *infop = wo->wo_info;
+
+		if (retval > 0)
+			retval = 0;
+		else {
+			/*
+			 * For a WNOHANG return, clear out all the fields
+			 * we would set so the user can easily tell the
+			 * difference.
+			 */
+			if (!retval)
+				retval = put_user(0, &infop->si_signo);
+			if (!retval)
+				retval = put_user(0, &infop->si_errno);
+			if (!retval)
+				retval = put_user(0, &infop->si_code);
+			if (!retval)
+				retval = put_user(0, &infop->si_pid);
+			if (!retval)
+				retval = put_user(0, &infop->si_uid);
+			if (!retval)
+				retval = put_user(0, &infop->si_status);
+		}
+	}
 	return retval;
 }
 
@@ -1689,29 +1694,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	wo.wo_stat	= NULL;
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
-
-	if (ret > 0) {
-		ret = 0;
-	} else if (infop) {
-		/*
-		 * For a WNOHANG return, clear out all the fields
-		 * we would set so the user can easily tell the
-		 * difference.
-		 */
-		if (!ret)
-			ret = put_user(0, &infop->si_signo);
-		if (!ret)
-			ret = put_user(0, &infop->si_errno);
-		if (!ret)
-			ret = put_user(0, &infop->si_code);
-		if (!ret)
-			ret = put_user(0, &infop->si_pid);
-		if (!ret)
-			ret = put_user(0, &infop->si_uid);
-		if (!ret)
-			ret = put_user(0, &infop->si_status);
-	}
-
 	put_pid(pid);
 
 	/* avoid REGPARM breakage on x86: */
diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c
index 266c6af6ef1b..51ad0b0b7266 100644
--- a/trunk/kernel/fork.c
+++ b/trunk/kernel/fork.c
@@ -434,14 +434,6 @@ __setup("coredump_filter=", coredump_filter_setup);
 
 #include 
 
-static void mm_init_aio(struct mm_struct *mm)
-{
-#ifdef CONFIG_AIO
-	spin_lock_init(&mm->ioctx_lock);
-	INIT_HLIST_HEAD(&mm->ioctx_list);
-#endif
-}
-
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -455,9 +447,10 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
+	spin_lock_init(&mm->ioctx_lock);
+	INIT_HLIST_HEAD(&mm->ioctx_list);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
-	mm_init_aio(mm);
 	mm_init_owner(mm, p);
 
 	if (likely(!mm_alloc_pgd(mm))) {
@@ -518,8 +511,6 @@ void mmput(struct mm_struct *mm)
 			spin_unlock(&mmlist_lock);
 		}
 		put_swap_token(mm);
-		if (mm->binfmt)
-			module_put(mm->binfmt->module);
 		mmdrop(mm);
 	}
 }
@@ -645,14 +636,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	mm->hiwater_rss = get_mm_rss(mm);
 	mm->hiwater_vm = mm->total_vm;
 
-	if (mm->binfmt && !try_module_get(mm->binfmt->module))
-		goto free_pt;
-
 	return mm;
 
free_pt:
-	/* don't put binfmt in mmput, we haven't got module yet */
-	mm->binfmt = NULL;
 	mmput(mm);
 
fail_nomem:
@@ -993,16 +979,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
 		return ERR_PTR(-EINVAL);
 
-	/*
-	 * Siblings of global init remain as zombies on exit since they are
-	 * not reaped by their parent (swapper). To solve this and to avoid
-	 * multi-rooted process trees, prevent global and container-inits
-	 * from creating siblings.
- */ - if ((clone_flags & CLONE_PARENT) && - current->signal->flags & SIGNAL_UNKILLABLE) - return ERR_PTR(-EINVAL); - retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -1044,6 +1020,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; + if (p->binfmt && !try_module_get(p->binfmt->module)) + goto bad_fork_cleanup_put_domain; + p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); @@ -1331,6 +1310,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); + if (p->binfmt) + module_put(p->binfmt->module); +bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/trunk/kernel/hung_task.c b/trunk/kernel/hung_task.c index d4e841747400..022a4927b785 100644 --- a/trunk/kernel/hung_task.c +++ b/trunk/kernel/hung_task.c @@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout) * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret || !write) goto out; diff --git a/trunk/kernel/ns_cgroup.c b/trunk/kernel/ns_cgroup.c index 2a5dfec8efe0..5aa854f9e5ae 100644 --- a/trunk/kernel/ns_cgroup.c +++ b/trunk/kernel/ns_cgroup.c @@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) * (hence either you are in the same cgroup as task, or in an * ancestor cgroup thereof) */ -static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) +static int ns_can_attach(struct cgroup_subsys *ss, + struct cgroup *new_cgroup, struct task_struct *task) { if (current != task) { if (!capable(CAP_SYS_ADMIN)) @@ -56,18 +56,6 @@ static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (!cgroup_is_descendant(new_cgroup, c)) { - rcu_read_unlock(); - return -EPERM; - } - } - rcu_read_unlock(); - } - return 0; } diff --git a/trunk/kernel/pid_namespace.c b/trunk/kernel/pid_namespace.c index 86b3796b0436..821722ae58a7 100644 --- a/trunk/kernel/pid_namespace.c +++ b/trunk/kernel/pid_namespace.c @@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) + if (flags & CLONE_THREAD) return ERR_PTR(-EINVAL); return create_pid_namespace(old_ns); } diff --git a/trunk/kernel/ptrace.c b/trunk/kernel/ptrace.c index 23bd09cd042e..307c285af59e 100644 --- a/trunk/kernel/ptrace.c +++ b/trunk/kernel/ptrace.c @@ -266,10 +266,9 @@ static int ignoring_children(struct sighand_struct *sigh) * or self-reaping. Do notification now if it would have happened earlier. * If it should reap itself, return true. * - * If it's our own child, there is no notification to do. 
But if our normal - * children self-reap, then this child was prevented by ptrace and we must - * reap it now, in that case we must also wake up sub-threads sleeping in - * do_wait(). + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { @@ -279,10 +278,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { - __wake_up_parent(p, tracer); + else if (ignoring_children(tracer->sighand)) p->exit_signal = -1; - } } if (task_detached(p)) { /* Mark it as in the process of being reaped. */ diff --git a/trunk/kernel/res_counter.c b/trunk/kernel/res_counter.c index 88faec23e833..e1338f074314 100644 --- a/trunk/kernel/res_counter.c +++ b/trunk/kernel/res_counter.c @@ -19,7 +19,6 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = RESOURCE_MAX; - counter->soft_limit = RESOURCE_MAX; counter->parent = parent; } @@ -37,27 +36,17 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) } int res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at, - struct res_counter **soft_limit_fail_at) + struct res_counter **limit_fail_at) { int ret; unsigned long flags; struct res_counter *c, *u; *limit_fail_at = NULL; - if (soft_limit_fail_at) - *soft_limit_fail_at = NULL; local_irq_save(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); ret = res_counter_charge_locked(c, val); - /* - * With soft limits, we return the highest ancestor - * that exceeds its soft limit - */ - if (soft_limit_fail_at && - !res_counter_soft_limit_check_locked(c)) - *soft_limit_fail_at = c; spin_unlock(&c->lock); if (ret < 0) { *limit_fail_at = c; @@ -85,8 +74,7 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) counter->usage -= val; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val, - bool *was_soft_limit_excess) +void res_counter_uncharge(struct res_counter *counter, unsigned long val) { unsigned long flags; struct res_counter *c; @@ -94,9 +82,6 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val, local_irq_save(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); - if (was_soft_limit_excess) - *was_soft_limit_excess = - !res_counter_soft_limit_check_locked(c); res_counter_uncharge_locked(c, val); spin_unlock(&c->lock); } @@ -116,8 +101,6 @@ res_counter_member(struct res_counter *counter, int member) return &counter->limit; case RES_FAILCNT: return &counter->failcnt; - case RES_SOFT_LIMIT: - return &counter->soft_limit; }; BUG(); diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c index ee61f454a98b..2f76e06bea58 100644 --- a/trunk/kernel/sched.c +++ b/trunk/kernel/sched.c @@ -10312,7 +10312,7 @@ static int sched_rt_global_constraints(void) #endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -10323,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write, old_period = 
sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_constraints(); @@ -10377,7 +10377,8 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) } static int -cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) @@ -10387,45 +10388,15 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) if (tsk->sched_class != &fair_sched_class) return -EINVAL; #endif - return 0; -} -static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup) -{ - int retval = cpu_cgroup_can_attach_task(cgrp, tsk); - if (retval) - return retval; - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - retval = cpu_cgroup_can_attach_task(cgrp, c); - if (retval) { - rcu_read_unlock(); - return retval; - } - } - rcu_read_unlock(); - } return 0; } static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk, - bool threadgroup) + struct cgroup *old_cont, struct task_struct *tsk) { sched_move_task(tsk); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - sched_move_task(c); - } - rcu_read_unlock(); - } } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/trunk/kernel/sched_fair.c b/trunk/kernel/sched_fair.c index 4e777b47eeda..ecc637a0d591 100644 --- a/trunk/kernel/sched_fair.c +++ b/trunk/kernel/sched_fair.c @@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) #ifdef CONFIG_SCHED_DEBUG int sched_nr_latency_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret || !write) return ret; diff --git a/trunk/kernel/signal.c b/trunk/kernel/signal.c index 6705320784fd..64c5deeaca5d 100644 --- a/trunk/kernel/signal.c +++ b/trunk/kernel/signal.c @@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) if (why) { /* - * The first thread which returns from do_signal_stop() + * The first thread which returns from finish_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ @@ -971,20 +971,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) return send_signal(sig, info, t, 0); } -int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, - bool group) -{ - unsigned long flags; - int ret = -ESRCH; - - if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, group); - unlock_task_sighand(p, &flags); - } - - return ret; -} - /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. 
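 */

The signal.c hunks in this area revert to taking ->siglock explicitly around
specific_send_sig_info() rather than going through the removed
do_send_sig_info() helper, and they keep the split between process-directed
delivery (group_send_sig_info()) and thread-directed delivery (the
specific_send_sig_info() path). A minimal userspace illustration of those two
delivery scopes, using signal 0, which, as the comment in do_send_specific()
further down notes, acts only as a permission and existence probe with
nothing actually delivered:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t tgid = getpid();
            pid_t tid = (pid_t)syscall(SYS_gettid);

            /* Process-directed probe: a real signal could be handled by
             * any thread in the group (kernel: group_send_sig_info()). */
            if (kill(tgid, 0) == 0)
                    printf("process %d exists and is signalable\n", tgid);

            /* Thread-directed probe: aimed at one specific thread
             * (kernel: the specific_send_sig_info() path). */
            if (syscall(SYS_tgkill, tgid, tid, 0) == 0)
                    printf("thread %d in %d is signalable\n", tid, tgid);

            return 0;
    }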
@@ -1050,6 +1036,12 @@ void zap_other_threads(struct task_struct *p) } } +int __fatal_signal_pending(struct task_struct *tsk) +{ + return sigismember(&tsk->pending.signal, SIGKILL); +} +EXPORT_SYMBOL(__fatal_signal_pending); + struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; @@ -1076,10 +1068,18 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret = check_kill_permission(sig, info, p); + unsigned long flags; + int ret; - if (!ret && sig) - ret = do_send_sig_info(sig, info, p, true); + ret = check_kill_permission(sig, info, p); + + if (!ret && sig) { + ret = -ESRCH; + if (lock_task_sighand(p, &flags)) { + ret = __group_send_sig_info(sig, info, p); + unlock_task_sighand(p, &flags); + } + } return ret; } @@ -1224,9 +1224,15 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ +/* + * The caller must ensure the task can't exit. + */ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { + int ret; + unsigned long flags; + /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). @@ -1234,7 +1240,10 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - return do_send_sig_info(sig, info, p, false); + spin_lock_irqsave(&p->sighand->siglock, flags); + ret = specific_send_sig_info(sig, info, p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + return ret; } #define __si_special(priv) \ @@ -1373,6 +1382,15 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) return ret; } +/* + * Wake up any threads in the parent blocked in wait* syscalls. + */ +static inline void __wake_up_parent(struct task_struct *p, + struct task_struct *parent) +{ + wake_up_interruptible_sync(&parent->signal->wait_chldexit); +} + /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. @@ -1655,6 +1673,29 @@ void ptrace_notify(int exit_code) spin_unlock_irq(¤t->sighand->siglock); } +static void +finish_stop(int stop_count) +{ + /* + * If there are no other threads in the group, or if there is + * a group stop in progress and we are the last to stop, + * report to the parent. When ptraced, every thread reports itself. + */ + if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, CLD_STOPPED); + read_unlock(&tasklist_lock); + } + + do { + schedule(); + } while (try_to_freeze()); + /* + * Now we don't run again until continued. + */ + current->exit_code = 0; +} + /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. @@ -1664,9 +1705,15 @@ void ptrace_notify(int exit_code) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int notify; + int stop_count; - if (!sig->group_stop_count) { + if (sig->group_stop_count > 0) { + /* + * There is a group stop in progress. We don't need to + * start another one. 
+ */ + stop_count = --sig->group_stop_count; + } else { struct task_struct *t; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || @@ -1678,7 +1725,7 @@ static int do_signal_stop(int signr) */ sig->group_exit_code = signr; - sig->group_stop_count = 1; + stop_count = 0; for (t = next_thread(current); t != current; t = next_thread(t)) /* * Setting state to TASK_STOPPED for a group @@ -1687,44 +1734,19 @@ static int do_signal_stop(int signr) */ if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { - sig->group_stop_count++; + stop_count++; signal_wake_up(t, 0); } + sig->group_stop_count = stop_count; } - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, report - * to the parent. When ptraced, every thread reports itself. - */ - notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; - notify = tracehook_notify_jctl(notify, CLD_STOPPED); - /* - * tracehook_notify_jctl() can drop and reacquire siglock, so - * we keep ->group_stop_count != 0 before the call. If SIGCONT - * or SIGKILL comes in between ->group_stop_count == 0. - */ - if (sig->group_stop_count) { - if (!--sig->group_stop_count) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); - } - spin_unlock_irq(¤t->sighand->siglock); - if (notify) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, notify); - read_unlock(&tasklist_lock); - } - - /* Now we don't run again until woken by SIGCONT or SIGKILL */ - do { - schedule(); - } while (try_to_freeze()); - - tracehook_finish_jctl(); - current->exit_code = 0; + if (stop_count == 0) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + spin_unlock_irq(¤t->sighand->siglock); + finish_stop(stop_count); return 1; } @@ -1793,15 +1815,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, int why = (signal->flags & SIGNAL_STOP_CONTINUED) ? CLD_CONTINUED : CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; - - why = tracehook_notify_jctl(why, CLD_CONTINUED); spin_unlock_irq(&sighand->siglock); - if (why) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); - } + if (unlikely(!tracehook_notify_jctl(1, why))) + goto relock; + + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, why); + read_unlock(&tasklist_lock); goto relock; } @@ -1966,14 +1987,14 @@ void exit_signals(struct task_struct *tsk) if (unlikely(tsk->signal->group_stop_count) && !--tsk->signal->group_stop_count) { tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); + group_stop = 1; } out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop)) { + if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, group_stop); + do_notify_parent_cldstop(tsk, CLD_STOPPED); read_unlock(&tasklist_lock); } } @@ -2269,6 +2290,7 @@ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { struct task_struct *p; + unsigned long flags; int error = -ESRCH; rcu_read_lock(); @@ -2278,16 +2300,14 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. + * + * If lock_task_sighand() fails we pretend the task dies + * after receiving the signal. 
The window is tiny, and the + * signal is private anyway. */ - if (!error && sig) { - error = do_send_sig_info(sig, info, p, false); - /* - * If lock_task_sighand() failed we pretend the task - * dies after receiving the signal. The window is tiny, - * and the signal is private anyway. - */ - if (unlikely(error == -ESRCH)) - error = 0; + if (!error && sig && lock_task_sighand(p, &flags)) { + error = specific_send_sig_info(sig, info, p); + unlock_task_sighand(p, &flags); } } rcu_read_unlock(); diff --git a/trunk/kernel/slow-work.c b/trunk/kernel/slow-work.c index 0d31135efbf4..09d7519557d3 100644 --- a/trunk/kernel/slow-work.c +++ b/trunk/kernel/slow-work.c @@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); #ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, +static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -static int slow_work_max_threads_sysctl(struct ctl_table *, int , +static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, void __user *, size_t *, loff_t *); #endif @@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data) * Handle adjustment of the minimum number of threads */ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); int n; if (ret == 0) { @@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, * Handle adjustment of the maximum number of threads */ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); int n; if (ret == 0) { diff --git a/trunk/kernel/softlockup.c b/trunk/kernel/softlockup.c index 81324d12eb35..88796c330838 100644 --- a/trunk/kernel/softlockup.c +++ b/trunk/kernel/softlockup.c @@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void) EXPORT_SYMBOL(touch_all_softlockup_watchdogs); int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); + return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); } /* diff --git a/trunk/kernel/sys.c b/trunk/kernel/sys.c index 255475d163e0..ebcb15611728 100644 --- a/trunk/kernel/sys.c +++ b/trunk/kernel/sys.c @@ -1542,28 +1542,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->timer_slack_ns = arg2; error = 0; break; - case PR_MCE_KILL: - if (arg4 | arg5) - return -EINVAL; - switch (arg2) { - case 0: - if (arg3 != 0) - return -EINVAL; - current->flags &= ~PF_MCE_PROCESS; - break; - case 1: - current->flags |= PF_MCE_PROCESS; - if (arg3 != 0) - current->flags |= PF_MCE_EARLY; - else - current->flags &= ~PF_MCE_EARLY; - break; - default: - return -EINVAL; - } - error = 0; - break; - default: error = -EINVAL; break; diff --git a/trunk/kernel/sysctl.c b/trunk/kernel/sysctl.c index 0d949c517412..7f4f57bea4ce 100644 
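Just above, the PR_MCE_KILL case is dropped from sys_prctl(). For reference,
the interface removed there was driven from userspace roughly as below; the
sketch uses the raw arg2/arg3 values visible in the deleted switch, and the
PR_MCE_KILL constant itself is an assumption taken from later mainline
headers:

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_MCE_KILL
    #define PR_MCE_KILL 33          /* assumed value, from later headers */
    #endif

    int main(void)
    {
            /* arg2 == 1: enable a per-process policy (PF_MCE_PROCESS);
             * arg3 != 0 additionally selects early kill (PF_MCE_EARLY). */
            if (prctl(PR_MCE_KILL, 1, 1, 0, 0) != 0)
                    perror("prctl(PR_MCE_KILL, 1, 1)");

            /* arg2 == 0 (arg3 must be 0): clear PF_MCE_PROCESS and fall
             * back to the system-wide default policy. */
            if (prctl(PR_MCE_KILL, 0, 0, 0, 0) != 0)
                    perror("prctl(PR_MCE_KILL, 0, 0)");

            return 0;
    }

After this revert both calls fail with EINVAL, since prctl() now falls
through to its default case.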
--- a/trunk/kernel/sysctl.c +++ b/trunk/kernel/sysctl.c @@ -76,7 +76,6 @@ extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; -extern unsigned int core_pipe_limit; extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -163,9 +162,9 @@ extern int max_lock_depth; #endif #ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, +static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -424,14 +423,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -1398,31 +1389,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, -#ifdef CONFIG_MEMORY_FAILURE - { - .ctl_name = CTL_UNNUMBERED, - .procname = "memory_failure_early_kill", - .data = &sysctl_memory_failure_early_kill, - .maxlen = sizeof(sysctl_memory_failure_early_kill), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "memory_failure_recovery", - .data = &sysctl_memory_failure_recovery, - .maxlen = sizeof(sysctl_memory_failure_recovery), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one, - }, -#endif - /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -2251,7 +2217,7 @@ void sysctl_head_put(struct ctl_table_header *head) #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(void* data, int maxlen, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -2312,6 +2278,7 @@ static int _proc_do_string(void* data, int maxlen, int write, * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2325,10 +2292,10 @@ static int _proc_do_string(void* data, int maxlen, int write, * * Returns 0 on success. 
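The dominant pattern in kernel/sysctl.c below is mechanical: every proc_*
handler regains the struct file * argument between the write flag and the
user buffer, and every internal caller threads it through. A hedged sketch of
what a client of the post-revert API looks like; the demo_* names are
invented for illustration:

    #include <linux/sysctl.h>

    static int demo_value;

    /* custom handler with the post-revert signature */
    static int demo_handler(struct ctl_table *table, int write,
                            struct file *filp, void __user *buffer,
                            size_t *lenp, loff_t *ppos)
    {
            int ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);

            if (ret || !write)
                    return ret;
            /* react to the newly written value here */
            return 0;
    }

    static struct ctl_table demo_table[] = {
            {
                    .ctl_name       = CTL_UNNUMBERED,
                    .procname       = "demo_value",
                    .data           = &demo_value,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = &demo_handler,
            },
            { .ctl_name = 0 }
    };

Userspace reaches the handler through /proc/sys exactly as before, e.g.
# echo 42 > /proc/sys/kernel/demo_value (path hypothetical).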
*/ -int proc_dostring(struct ctl_table *table, int write, +int proc_dostring(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, + return _proc_do_string(table->data, table->maxlen, write, filp, buffer, lenp, ppos); } @@ -2353,7 +2320,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -2460,13 +2427,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, #undef TMPBUFLEN } -static int do_proc_dointvec(struct ctl_table *table, int write, +static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { - return __do_proc_dointvec(table->data, table, write, + return __do_proc_dointvec(table->data, table, write, filp, buffer, lenp, ppos, conv, data); } @@ -2474,6 +2441,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2483,10 +2451,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, +int proc_dointvec(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, NULL,NULL); } @@ -2494,7 +2462,7 @@ int proc_dointvec(struct ctl_table *table, int write, * Taint values can only be increased * This means we can safely use a temporary. */ -static int proc_taint(struct ctl_table *table, int write, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -2506,7 +2474,7 @@ static int proc_taint(struct ctl_table *table, int write, t = *table; t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); if (err < 0) return err; @@ -2558,6 +2526,7 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2570,18 +2539,19 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. 
*/ -int proc_dointvec_minmax(struct ctl_table *table, int write, +int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; - return do_proc_dointvec(table, write, buffer, lenp, ppos, + return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, ¶m); } static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, @@ -2686,19 +2656,21 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, - buffer, lenp, ppos, convmul, convdiv); + filp, buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2711,16 +2683,17 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, +int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); + return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2735,10 +2708,11 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, buffer, + return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, HZ, 1000l); } @@ -2814,6 +2788,7 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2825,10 +2800,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. 
*/ -int proc_dointvec_jiffies(struct ctl_table *table, int write, +int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } @@ -2836,6 +2811,7 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position @@ -2847,10 +2823,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); } @@ -2858,6 +2834,7 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2870,14 +2847,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, + return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, +static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct pid *new_pid; @@ -2886,7 +2863,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, tmp = pid_vnr(cad_pid); - r = __do_proc_dointvec(&tmp, table, write, buffer, + r = __do_proc_dointvec(&tmp, table, write, filp, buffer, lenp, ppos, NULL, NULL); if (r || !write) return r; @@ -2901,49 +2878,50 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, #else /* CONFIG_PROC_FS */ -int proc_dostring(struct ctl_table *table, int write, +int proc_dostring(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, +int proc_dointvec(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, +int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, +int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_userhz_jiffies(struct ctl_table 
*table, int write, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_doulongvec_minmax(struct ctl_table *table, int write, +int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/trunk/kernel/time/Makefile b/trunk/kernel/time/Makefile index ee266620b06c..0b0a6366c9d4 100644 --- a/trunk/kernel/time/Makefile +++ b/trunk/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/trunk/kernel/time/timeconv.c b/trunk/kernel/time/timeconv.c deleted file mode 100644 index 86628e755f38..000000000000 --- a/trunk/kernel/time/timeconv.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. - * This file is part of the GNU C Library. - * Contributed by Paul Eggert (eggert@twinsun.com). - * - * The GNU C Library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * The GNU C Library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the GNU C Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* - * Converts the calendar time to broken-down time representation - * Based on code from glibc-2.6 - * - * 2009-7-14: - * Moved from glibc-2.6 to kernel by Zhaolei - */ - -#include -#include - -/* - * Nonzero if YEAR is a leap year (every 4 years, - * except every 100th isn't, and every 400th is). - */ -static int __isleap(long year) -{ - return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); -} - -/* do a mathdiv for long type */ -static long math_div(long a, long b) -{ - return a / b - (a % b < 0); -} - -/* How many leap years between y1 and y2, y1 must less or equal to y2 */ -static long leaps_between(long y1, long y2) -{ - long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) - + math_div(y1 - 1, 400); - long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) - + math_div(y2 - 1, 400); - return leaps2 - leaps1; -} - -/* How many days come before each month (0-12). */ -static const unsigned short __mon_yday[2][13] = { - /* Normal years. */ - {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, - /* Leap years. 
*/ - {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} -}; - -#define SECS_PER_HOUR (60 * 60) -#define SECS_PER_DAY (SECS_PER_HOUR * 24) - -/** - * time_to_tm - converts the calendar time to local broken-down time - * - * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, - * Coordinated Universal Time (UTC). - * @offset offset seconds adding to totalsecs. - * @result pointer to struct tm variable to receive broken-down time - */ -void time_to_tm(time_t totalsecs, int offset, struct tm *result) -{ - long days, rem, y; - const unsigned short *ip; - - days = totalsecs / SECS_PER_DAY; - rem = totalsecs % SECS_PER_DAY; - rem += offset; - while (rem < 0) { - rem += SECS_PER_DAY; - --days; - } - while (rem >= SECS_PER_DAY) { - rem -= SECS_PER_DAY; - ++days; - } - - result->tm_hour = rem / SECS_PER_HOUR; - rem %= SECS_PER_HOUR; - result->tm_min = rem / 60; - result->tm_sec = rem % 60; - - /* January 1, 1970 was a Thursday. */ - result->tm_wday = (4 + days) % 7; - if (result->tm_wday < 0) - result->tm_wday += 7; - - y = 1970; - - while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { - /* Guess a corrected year, assuming 365 days per year. */ - long yg = y + math_div(days, 365); - - /* Adjust DAYS and Y to match the guessed year. */ - days -= (yg - y) * 365 + leaps_between(y, yg); - y = yg; - } - - result->tm_year = y - 1900; - - result->tm_yday = days; - - ip = __mon_yday[__isleap(y)]; - for (y = 11; days < ip[y]; y--) - continue; - days -= ip[y]; - - result->tm_mon = y; - result->tm_mday = days + 1; -} -EXPORT_SYMBOL(time_to_tm); diff --git a/trunk/kernel/trace/ftrace.c b/trunk/kernel/trace/ftrace.c index a142579765bf..23df7771c937 100644 --- a/trunk/kernel/trace/ftrace.c +++ b/trunk/kernel/trace/ftrace.c @@ -3015,7 +3015,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3025,7 +3025,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; diff --git a/trunk/kernel/trace/trace_stack.c b/trunk/kernel/trace/trace_stack.c index 8504ac71e4e8..0f6facb050a1 100644 --- a/trunk/kernel/trace/trace_stack.c +++ b/trunk/kernel/trace/trace_stack.c @@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = { int stack_trace_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == !!stack_tracer_enabled)) diff --git a/trunk/kernel/utsname_sysctl.c b/trunk/kernel/utsname_sysctl.c index 69eae358a726..92359cc747a7 100644 --- a/trunk/kernel/utsname_sysctl.c +++ b/trunk/kernel/utsname_sysctl.c @@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? 
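The deleted kernel/time/timeconv.c above carried a time_to_tm() helper
derived from glibc's calendar-time conversion. The same epoch-to-broken-down
time math (the offset == 0 case) can be sanity-checked against glibc itself
from userspace:

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            time_t t = time(NULL);
            struct tm out;

            /* gmtime_r() is the glibc routine the removed time_to_tm()
             * was based on */
            gmtime_r(&t, &out);
            printf("%04d-%02d-%02d %02d:%02d:%02d UTC (yday %d, wday %d)\n",
                   out.tm_year + 1900, out.tm_mon + 1, out.tm_mday,
                   out.tm_hour, out.tm_min, out.tm_sec,
                   out.tm_yday, out.tm_wday);
            return 0;
    }

The removed code's "(4 + days) % 7" weekday step encodes the same anchor its
comment states: January 1, 1970 was a Thursday.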
*/ -static int proc_do_uts_string(ctl_table *table, int write, +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,buffer,lenp, ppos); + r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); put_uts(table, write, uts_table.data); return r; } diff --git a/trunk/lib/decompress_inflate.c b/trunk/lib/decompress_inflate.c index fc686c7a0a0d..68dfce59c1b8 100644 --- a/trunk/lib/decompress_inflate.c +++ b/trunk/lib/decompress_inflate.c @@ -27,11 +27,6 @@ #define GZIP_IOBUF_SIZE (16*1024) -static int nofill(void *buffer, unsigned int len) -{ - return -1; -} - /* Included from initramfs et al code */ STATIC int INIT gunzip(unsigned char *buf, int len, int(*fill)(void*, unsigned int), @@ -81,9 +76,6 @@ STATIC int INIT gunzip(unsigned char *buf, int len, goto gunzip_nomem4; } - if (!fill) - fill = nofill; - if (len == 0) len = fill(zbuf, GZIP_IOBUF_SIZE); diff --git a/trunk/lib/decompress_unlzma.c b/trunk/lib/decompress_unlzma.c index ca82fde81c8f..0b954e04bd30 100644 --- a/trunk/lib/decompress_unlzma.c +++ b/trunk/lib/decompress_unlzma.c @@ -82,11 +82,6 @@ struct rc { #define RC_MODEL_TOTAL_BITS 11 -static int nofill(void *buffer, unsigned int len) -{ - return -1; -} - /* Called twice: once at startup and once in rc_normalize() */ static void INIT rc_read(struct rc *rc) { @@ -102,10 +97,7 @@ static inline void INIT rc_init(struct rc *rc, int (*fill)(void*, unsigned int), char *buffer, int buffer_size) { - if (fill) - rc->fill = fill; - else - rc->fill = nofill; + rc->fill = fill; rc->buffer = (uint8_t *)buffer; rc->buffer_size = buffer_size; rc->buffer_end = rc->buffer + rc->buffer_size; diff --git a/trunk/mm/Kconfig b/trunk/mm/Kconfig index 247760729593..71eb0b4cce8d 100644 --- a/trunk/mm/Kconfig +++ b/trunk/mm/Kconfig @@ -245,20 +245,6 @@ config DEFAULT_MMAP_MIN_ADDR /proc/sys/vm/mmap_min_addr tunable. -config MEMORY_FAILURE - depends on MMU - depends on X86_MCE - bool "Enable recovery from hardware memory errors" - help - Enables code to recover from some memory failures on systems - with MCA recovery. This allows a system to continue running - even when some of its memory has uncorrected errors. This requires - special hardware support and typically ECC memory. 
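The MEMORY_FAILURE support being un-plumbed here (the Kconfig entries above
and below, plus the hwpoison-inject module and the madvise hook removed
further down) was exercised from userspace either through debugfs
(hwpoison/corrupt-pfn) or through madvise(MADV_HWPOISON). A sketch of the
madvise route; the MADV_HWPOISON value is an assumption taken from later
mainline headers:

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_HWPOISON
    #define MADV_HWPOISON 100       /* assumed value, from later headers */
    #endif

    int main(void)
    {
            long pagesz = sysconf(_SC_PAGESIZE);
            char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            p[0] = 1;       /* fault the page in first */

            /* Needs CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE; after this
             * revert the request is simply rejected. */
            if (madvise(p, pagesz, MADV_HWPOISON) != 0)
                    perror("madvise(MADV_HWPOISON)");

            return 0;
    }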
- -config HWPOISON_INJECT - tristate "Poison pages injector" - depends on MEMORY_FAILURE && DEBUG_KERNEL - config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" depends on !MMU diff --git a/trunk/mm/Makefile b/trunk/mm/Makefile index 515fd793c17f..88193d73cd1a 100644 --- a/trunk/mm/Makefile +++ b/trunk/mm/Makefile @@ -41,7 +41,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o -obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o -obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/trunk/mm/filemap.c b/trunk/mm/filemap.c index c1fc205a92c6..bcc7372aebbc 100644 --- a/trunk/mm/filemap.c +++ b/trunk/mm/filemap.c @@ -104,10 +104,6 @@ * * ->task->proc_lock * ->dcache_lock (proc_pid_lookup) - * - * (code doesn't rely on that order, so you could switch it around) - * ->tasklist_lock (memory_failure, collect_procs_ao) - * ->i_mmap_lock */ /* diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c index 6f048fcc749c..815dbd4a6dcb 100644 --- a/trunk/mm/hugetlb.c +++ b/trunk/mm/hugetlb.c @@ -1537,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1548,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); if (write) h->max_huge_pages = set_max_huge_pages(h, tmp); @@ -1557,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, } int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - void __user *buffer, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); + proc_dointvec(table, write, file, buffer, length, ppos); if (hugepages_treat_as_movable) htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; else @@ -1569,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, } int hugetlb_overcommit_handler(struct ctl_table *table, int write, - void __user *buffer, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1580,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); if (write) { spin_lock(&hugetlb_lock); diff --git a/trunk/mm/hwpoison-inject.c b/trunk/mm/hwpoison-inject.c deleted file mode 100644 index e1d85137f086..000000000000 --- a/trunk/mm/hwpoison-inject.c +++ /dev/null @@ -1,41 +0,0 @@ -/* Inject a hwpoison memory failure on a arbitary pfn */ -#include -#include -#include -#include - -static struct dentry *hwpoison_dir, *corrupt_pfn; - -static int hwpoison_inject(void *data, u64 val) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); - return __memory_failure(val, 18, 0); -} - -DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); - -static void 
pfn_inject_exit(void) -{ - if (hwpoison_dir) - debugfs_remove_recursive(hwpoison_dir); -} - -static int pfn_inject_init(void) -{ - hwpoison_dir = debugfs_create_dir("hwpoison", NULL); - if (hwpoison_dir == NULL) - return -ENOMEM; - corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, - NULL, &hwpoison_fops); - if (corrupt_pfn == NULL) { - pfn_inject_exit(); - return -ENOMEM; - } - return 0; -} - -module_init(pfn_inject_init); -module_exit(pfn_inject_exit); -MODULE_LICENSE("GPL"); diff --git a/trunk/mm/ksm.c b/trunk/mm/ksm.c index f7edac356f46..37cc37325094 100644 --- a/trunk/mm/ksm.c +++ b/trunk/mm/ksm.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -163,10 +162,10 @@ static unsigned long ksm_pages_unshared; static unsigned long ksm_rmap_items; /* Limit on the number of unswappable pages used */ -static unsigned long ksm_max_kernel_pages; +static unsigned long ksm_max_kernel_pages = 2000; /* Number of pages ksmd should scan in one batch */ -static unsigned int ksm_thread_pages_to_scan = 100; +static unsigned int ksm_thread_pages_to_scan = 200; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; @@ -174,7 +173,7 @@ static unsigned int ksm_thread_sleep_millisecs = 20; #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 -static unsigned int ksm_run = KSM_RUN_STOP; +static unsigned int ksm_run = KSM_RUN_MERGE; static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DEFINE_MUTEX(ksm_thread_mutex); @@ -184,11 +183,6 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock); sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) -static void __init ksm_init_max_kernel_pages(void) -{ - ksm_max_kernel_pages = nr_free_buffer_pages() / 4; -} - static int __init ksm_slab_init(void) { rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); @@ -1673,8 +1667,6 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; - ksm_init_max_kernel_pages(); - err = ksm_slab_init(); if (err) goto out; diff --git a/trunk/mm/madvise.c b/trunk/mm/madvise.c index 35b1479b7c9d..d9ae2067952e 100644 --- a/trunk/mm/madvise.c +++ b/trunk/mm/madvise.c @@ -218,32 +218,6 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } -#ifdef CONFIG_MEMORY_FAILURE -/* - * Error injection support for memory error handling. 
- */ -static int madvise_hwpoison(unsigned long start, unsigned long end) -{ - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - for (; start < end; start += PAGE_SIZE) { - struct page *p; - int ret = get_user_pages(current, current->mm, start, 1, - 0, 0, &p, NULL); - if (ret != 1) - return ret; - printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", - page_to_pfn(p), start); - /* Ignore return value for now */ - __memory_failure(page_to_pfn(p), 0, 1); - put_page(p); - } - return ret; -} -#endif - static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) @@ -334,10 +308,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; -#ifdef CONFIG_MEMORY_FAILURE - if (behavior == MADV_HWPOISON) - return madvise_hwpoison(start, start+len_in); -#endif if (!madvise_behavior_valid(behavior)) return error; diff --git a/trunk/mm/memcontrol.c b/trunk/mm/memcontrol.c index e2b98a6875c0..9b10d8753784 100644 --- a/trunk/mm/memcontrol.c +++ b/trunk/mm/memcontrol.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -44,7 +43,6 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 -struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ @@ -55,7 +53,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ #endif static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ -#define SOFTLIMIT_EVENTS_THRESH (1000) /* * Statistics for memory cgroup. @@ -69,8 +66,6 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ - MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ - MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, }; @@ -83,20 +78,6 @@ struct mem_cgroup_stat { struct mem_cgroup_stat_cpu cpustat[0]; }; -static inline void -__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, - enum mem_cgroup_stat_index idx) -{ - stat->count[idx] = 0; -} - -static inline s64 -__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, - enum mem_cgroup_stat_index idx) -{ - return stat->count[idx]; -} - /* * For accounting under irq disable, no need for increment preempt count. 
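Throughout the memcontrol.c changes the soft-limit plumbing is stripped back
out of the res_counter calls: res_counter_charge() again takes a single
failure out-parameter and res_counter_uncharge() takes none, matching the
kernel/res_counter.c hunk earlier in this patch. A kernel-style sketch of the
resulting calling convention (the helper name is invented):

    #include <linux/res_counter.h>

    /* charge one page against a counter hierarchy, post-revert style */
    static int demo_charge_one_page(struct res_counter *cnt)
    {
            struct res_counter *fail_at;
            int ret;

            ret = res_counter_charge(cnt, PAGE_SIZE, &fail_at);
            if (ret < 0)
                    return ret;     /* fail_at hit its hard limit */

            /* ... account the page ... */

            res_counter_uncharge(cnt, PAGE_SIZE);
            return 0;
    }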
*/ @@ -136,12 +117,6 @@ struct mem_cgroup_per_zone { unsigned long count[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; - struct rb_node tree_node; /* RB tree node */ - unsigned long long usage_in_excess;/* Set to the value by which */ - /* the soft limit is exceeded*/ - bool on_tree; - struct mem_cgroup *mem; /* Back pointer, we cannot */ - /* use container_of */ }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -154,26 +129,6 @@ struct mem_cgroup_lru_info { struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; }; -/* - * Cgroups above their limits are maintained in a RB-Tree, independent of - * their hierarchy representation - */ - -struct mem_cgroup_tree_per_zone { - struct rb_root rb_root; - spinlock_t lock; -}; - -struct mem_cgroup_tree_per_node { - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; -}; - -struct mem_cgroup_tree { - struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; -}; - -static struct mem_cgroup_tree soft_limit_tree __read_mostly; - /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -231,13 +186,6 @@ struct mem_cgroup { struct mem_cgroup_stat stat; }; -/* - * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft - * limit reclaim to prevent infinite loops, if they ever occur. - */ -#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) -#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) - enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, @@ -252,8 +200,13 @@ enum charge_type { #define PCGF_CACHE (1UL << PCG_CACHE) #define PCGF_USED (1UL << PCG_USED) #define PCGF_LOCK (1UL << PCG_LOCK) -/* Not used, but added here for completeness */ -#define PCGF_ACCT (1UL << PCG_ACCT) +static const unsigned long +pcg_default_flags[NR_CHARGE_TYPE] = { + PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ + PCGF_USED | PCGF_LOCK, /* Anon */ + PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ + 0, /* FORCE */ +}; /* for encoding cft->private value on file */ #define _MEM (0) @@ -262,241 +215,15 @@ enum charge_type { #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -/* - * Reclaim flags for mem_cgroup_hierarchical_reclaim - */ -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 -#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) -#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 -#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) -#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 -#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) - static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); -static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) -{ - return &mem->info.nodeinfo[nid]->zoneinfo[zid]; -} - -static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct page_cgroup *pc) -{ - struct mem_cgroup *mem = pc->mem_cgroup; - int nid = page_cgroup_nid(pc); - int zid = page_cgroup_zid(pc); - - if (!mem) - return NULL; - - return mem_cgroup_zoneinfo(mem, nid, zid); -} - -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_node_zone(int nid, int zid) -{ - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; -} - -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_from_page(struct page *page) -{ - int nid = 
page_to_nid(page); - int zid = page_zonenum(page); - - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; -} - -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - struct rb_node **p = &mctz->rb_root.rb_node; - struct rb_node *parent = NULL; - struct mem_cgroup_per_zone *mz_node; - - if (mz->on_tree) - return; - - mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res); - while (*p) { - parent = *p; - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, - tree_node); - if (mz->usage_in_excess < mz_node->usage_in_excess) - p = &(*p)->rb_left; - /* - * We can't avoid mem cgroups that are over their soft - * limit by the same amount - */ - else if (mz->usage_in_excess >= mz_node->usage_in_excess) - p = &(*p)->rb_right; - } - rb_link_node(&mz->tree_node, parent, p); - rb_insert_color(&mz->tree_node, &mctz->rb_root); - mz->on_tree = true; -} - -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - if (!mz->on_tree) - return; - rb_erase(&mz->tree_node, &mctz->rb_root); - mz->on_tree = false; -} - -static void -mem_cgroup_insert_exceeded(struct mem_cgroup *mem, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - spin_lock(&mctz->lock); - __mem_cgroup_insert_exceeded(mem, mz, mctz); - spin_unlock(&mctz->lock); -} - -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *mem, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(mem, mz, mctz); - spin_unlock(&mctz->lock); -} - -static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) -{ - bool ret = false; - int cpu; - s64 val; - struct mem_cgroup_stat_cpu *cpustat; - - cpu = get_cpu(); - cpustat = &mem->stat.cpustat[cpu]; - val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); - if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { - __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); - ret = true; - } - put_cpu(); - return ret; -} - -static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) -{ - unsigned long long prev_usage_in_excess, new_usage_in_excess; - bool updated_tree = false; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; - - mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page)); - mctz = soft_limit_tree_from_page(page); - - /* - * We do updates in lazy mode, mem's are removed - * lazily from the per-zone, per-node rb tree - */ - prev_usage_in_excess = mz->usage_in_excess; - - new_usage_in_excess = res_counter_soft_limit_excess(&mem->res); - if (prev_usage_in_excess) { - mem_cgroup_remove_exceeded(mem, mz, mctz); - updated_tree = true; - } - if (!new_usage_in_excess) - goto done; - mem_cgroup_insert_exceeded(mem, mz, mctz); - -done: - if (updated_tree) { - spin_lock(&mctz->lock); - mz->usage_in_excess = new_usage_in_excess; - spin_unlock(&mctz->lock); - } -} - -static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) -{ - int node, zone; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; - - for_each_node_state(node, N_POSSIBLE) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(mem, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(mem, mz, mctz); - } - } -} - -static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup 
*mem) -{ - return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; -} - -static struct mem_cgroup_per_zone * -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) -{ - struct rb_node *rightmost = NULL; - struct mem_cgroup_per_zone *mz = NULL; - -retry: - rightmost = rb_last(&mctz->rb_root); - if (!rightmost) - goto done; /* Nothing to reclaim from */ - - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); - /* - * Remove the node now but someone else can add it back, - * we will to add it back at the end of reclaim to its correct - * position in the tree. - */ - __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); - if (!res_counter_soft_limit_excess(&mz->mem->res) || - !css_tryget(&mz->mem->css)) - goto retry; -done: - return mz; -} - -static struct mem_cgroup_per_zone * -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) -{ - struct mem_cgroup_per_zone *mz; - - spin_lock(&mctz->lock); - mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); - return mz; -} - -static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, - bool charge) -{ - int val = (charge) ? 1 : -1; - struct mem_cgroup_stat *stat = &mem->stat; - struct mem_cgroup_stat_cpu *cpustat; - int cpu = get_cpu(); - - cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); - put_cpu(); -} - static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, struct page_cgroup *pc, bool charge) { - int val = (charge) ? 1 : -1; + int val = (charge)? 1 : -1; struct mem_cgroup_stat *stat = &mem->stat; struct mem_cgroup_stat_cpu *cpustat; int cpu = get_cpu(); @@ -513,10 +240,28 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, else __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); put_cpu(); } +static struct mem_cgroup_per_zone * +mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) +{ + return &mem->info.nodeinfo[nid]->zoneinfo[zid]; +} + +static struct mem_cgroup_per_zone * +page_cgroup_zoneinfo(struct page_cgroup *pc) +{ + struct mem_cgroup *mem = pc->mem_cgroup; + int nid = page_cgroup_nid(pc); + int zid = page_cgroup_zid(pc); + + if (!mem) + return NULL; + + return mem_cgroup_zoneinfo(mem, nid, zid); +} + static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { @@ -609,11 +354,6 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, return ret; } -static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) -{ - return (mem == root_mem_cgroup); -} - /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -631,24 +371,22 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) { struct page_cgroup *pc; + struct mem_cgroup *mem; struct mem_cgroup_per_zone *mz; if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); /* can happen while we handle swapcache. */ - if (!TestClearPageCgroupAcctLRU(pc)) + if (list_empty(&pc->lru) || !pc->mem_cgroup) return; - VM_BUG_ON(!pc->mem_cgroup); /* * We don't check PCG_USED bit. It's cleared when the "page" is finally * removed from global LRU. 
*/ mz = page_cgroup_zoneinfo(pc); + mem = pc->mem_cgroup; MEM_CGROUP_ZSTAT(mz, lru) -= 1; - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); return; } @@ -672,8 +410,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) * For making pc->mem_cgroup visible, insert smp_rmb() here. */ smp_rmb(); - /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) + /* unused page is not rotated. */ + if (!PageCgroupUsed(pc)) return; mz = page_cgroup_zoneinfo(pc); list_move(&pc->lru, &mz->lists[lru]); @@ -687,7 +425,6 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); - VM_BUG_ON(PageCgroupAcctLRU(pc)); /* * Used bit is set without atomic ops but after smp_wmb(). * For making pc->mem_cgroup visible, insert smp_rmb() here. @@ -698,9 +435,6 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) mz = page_cgroup_zoneinfo(pc); MEM_CGROUP_ZSTAT(mz, lru) += 1; - SetPageCgroupAcctLRU(pc); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; list_add(&pc->lru, &mz->lists[lru]); } @@ -735,7 +469,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && !PageCgroupAcctLRU(pc)) + if (PageLRU(page) && list_empty(&pc->lru)) mem_cgroup_add_lru_list(page, page_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -1121,62 +855,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * If shrink==true, for avoiding to free too much, this returns immedieately. */ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, - struct zone *zone, - gfp_t gfp_mask, - unsigned long reclaim_options) + gfp_t gfp_mask, bool noswap, bool shrink) { struct mem_cgroup *victim; int ret, total = 0; int loop = 0; - bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; - bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; - bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; - unsigned long excess = mem_cgroup_get_excess(root_mem); /* If memsw_is_minimum==1, swap-out is of-no-use. */ if (root_mem->memsw_is_minimum) noswap = true; - while (1) { + while (loop < 2) { victim = mem_cgroup_select_victim(root_mem); - if (victim == root_mem) { + if (victim == root_mem) loop++; - if (loop >= 2) { - /* - * If we have not been able to reclaim - * anything, it might because there are - * no reclaimable pages under this hierarchy - */ - if (!check_soft || !total) { - css_put(&victim->css); - break; - } - /* - * We want to do more targetted reclaim. 
- * excess >> 2 is not to excessive so as to - * reclaim too much, nor too less that we keep - * coming back to reclaim from this cgroup - */ - if (total >= (excess >> 2) || - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { - css_put(&victim->css); - break; - } - } - } if (!mem_cgroup_local_usage(&victim->stat)) { /* this cgroup's local usage == 0 */ css_put(&victim->css); continue; } /* we use swappiness of local cgroup */ - if (check_soft) - ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone, - zone->zone_pgdat->node_id); - else - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap, get_swappiness(victim)); + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, + get_swappiness(victim)); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -1186,10 +886,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (shrink) return ret; total += ret; - if (check_soft) { - if (res_counter_check_under_soft_limit(&root_mem->res)) - return total; - } else if (mem_cgroup_check_under_limit(root_mem)) + if (mem_cgroup_check_under_limit(root_mem)) return 1 + total; } return total; @@ -1268,11 +965,11 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcg, - bool oom, struct page *page) + bool oom) { - struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit; + struct mem_cgroup *mem, *mem_over_limit; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct res_counter *fail_res, *soft_fail_res = NULL; + struct res_counter *fail_res; if (unlikely(test_thread_flag(TIF_MEMDIE))) { /* Don't account this! */ @@ -1299,23 +996,20 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, VM_BUG_ON(css_is_removed(&mem->css)); while (1) { - int ret = 0; - unsigned long flags = 0; + int ret; + bool noswap = false; - if (mem_cgroup_is_root(mem)) - goto done; - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, - &soft_fail_res); + ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); if (likely(!ret)) { if (!do_swap_account) break; ret = res_counter_charge(&mem->memsw, PAGE_SIZE, - &fail_res, NULL); + &fail_res); if (likely(!ret)) break; /* mem+swap counter fails */ - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); - flags |= MEM_CGROUP_RECLAIM_NOSWAP; + res_counter_uncharge(&mem->res, PAGE_SIZE); + noswap = true; mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); } else @@ -1326,8 +1020,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (!(gfp_mask & __GFP_WAIT)) goto nomem; - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, + noswap, false); if (ret) continue; @@ -1352,24 +1046,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, goto nomem; } } - /* - * Insert just the ancestor, we should trickle down to the correct - * cgroup for reclaim, since the other nodes will be below their - * soft limit - */ - if (soft_fail_res) { - mem_over_soft_limit = - mem_cgroup_from_res_counter(soft_fail_res, res); - if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) - mem_cgroup_update_tree(mem_over_soft_limit, page); - } -done: return 0; nomem: css_put(&mem->css); return -ENOMEM; } + /* * A helper function to get mem_cgroup from ID. must be called under * rcu_read_lock(). 
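/*
 * The mem_cgroup_hierarchical_reclaim() hunk above replaces one packed
 * reclaim_options word with two explicit bools. The decode being deleted
 * is ordinary bit-flag testing; this standalone sketch reuses the flag
 * names from the hunk, while reclaim() itself is a hypothetical stand-in.
 */
#include <stdbool.h>
#include <stdio.h>

#define MEM_CGROUP_RECLAIM_NOSWAP (1 << 0)
#define MEM_CGROUP_RECLAIM_SHRINK (1 << 1)
#define MEM_CGROUP_RECLAIM_SOFT   (1 << 2)

/* Callers OR options together; the callee decodes them with bit tests. */
static void reclaim(unsigned long reclaim_options)
{
        bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
        bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
        bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;

        printf("noswap=%d shrink=%d check_soft=%d\n", noswap, shrink, check_soft);
}

int main(void)
{
        reclaim(MEM_CGROUP_RECLAIM_NOSWAP | MEM_CGROUP_RECLAIM_SHRINK);
        return 0;
}
/*
 * The trade-off the patch makes: a flags word keeps prototypes stable as
 * options are added, while separate bool parameters document each call site.
 */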
The caller must check css_is_removed() or some if @@ -1436,38 +1119,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE, - NULL); - } + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); css_put(&mem->css); return; } - pc->mem_cgroup = mem; - /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. - */ smp_wmb(); - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_CACHE: - case MEM_CGROUP_CHARGE_TYPE_SHMEM: - SetPageCgroupCache(pc); - SetPageCgroupUsed(pc); - break; - case MEM_CGROUP_CHARGE_TYPE_MAPPED: - ClearPageCgroupCache(pc); - SetPageCgroupUsed(pc); - break; - default: - break; - } + pc->flags = pcg_default_flags[ctype]; mem_cgroup_charge_statistics(mem, pc, true); @@ -1518,8 +1178,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, if (pc->mem_cgroup != from) goto out; - if (!mem_cgroup_is_root(from)) - res_counter_uncharge(&from->res, PAGE_SIZE, NULL); + res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); page = pc->page; @@ -1538,8 +1197,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, 1); } - if (do_swap_account && !mem_cgroup_is_root(from)) - res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); + if (do_swap_account) + res_counter_uncharge(&from->memsw, PAGE_SIZE); css_put(&from->css); css_get(&to->css); @@ -1579,7 +1238,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); if (ret || !parent) return ret; @@ -1609,11 +1268,9 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, /* drop extra refcnt by try_charge() */ css_put(&parent->css); /* uncharge if move fails */ - if (!mem_cgroup_is_root(parent)) { - res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); - if (do_swap_account) - res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); - } + res_counter_uncharge(&parent->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&parent->memsw, PAGE_SIZE); return ret; } @@ -1638,7 +1295,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, prefetchw(pc); mem = memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); if (ret || !mem) return ret; @@ -1757,14 +1414,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); /* drop extra refcnt from tryget */ css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true, page); + return __mem_cgroup_try_charge(mm, mask, ptr, true); } static void @@ -1802,10 +1459,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * This recorded memcg can be obsolete one. 
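/*
 * The __mem_cgroup_try_charge() loop above charges mem->res first and
 * mem->memsw second, and uncharges res again when the memsw charge fails,
 * so the two counters never disagree. A user-space sketch of that
 * charge-with-rollback pattern, assuming a counter is just a usage/limit
 * pair (the kernel's res_counter also takes a spinlock and reports which
 * counter failed).
 */
#include <stdio.h>

struct res_counter { long usage, limit; };

static int res_counter_charge(struct res_counter *c, long sz)
{
        if (c->usage + sz > c->limit)
                return -1;              /* reject: would exceed the limit */
        c->usage += sz;
        return 0;
}

static void res_counter_uncharge(struct res_counter *c, long sz)
{
        c->usage -= sz;
}

/* Charge memory, then mem+swap; roll back the first on second failure. */
static int try_charge(struct res_counter *res, struct res_counter *memsw, long sz)
{
        if (res_counter_charge(res, sz))
                return -1;              /* over the memory limit */
        if (res_counter_charge(memsw, sz)) {
                res_counter_uncharge(res, sz);
                return -2;              /* over the mem+swap limit */
        }
        return 0;
}

int main(void)
{
        struct res_counter res = { 0, 8 }, memsw = { 0, 4 };

        printf("first charge:  %d\n", try_charge(&res, &memsw, 4));  /* 0 */
        printf("second charge: %d\n", try_charge(&res, &memsw, 4));  /* -2 */
        printf("res usage after rollback: %ld\n", res.usage);        /* 4 */
        return 0;
}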
So, avoid * calling css_tryget */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, - NULL); - mem_cgroup_swap_statistics(memcg, false); + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } rcu_read_unlock(); @@ -1830,11 +1484,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); - } + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); css_put(&mem->css); } @@ -1848,7 +1500,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) struct page_cgroup *pc; struct mem_cgroup *mem = NULL; struct mem_cgroup_per_zone *mz; - bool soft_limit_excess = false; if (mem_cgroup_disabled()) return NULL; @@ -1887,14 +1538,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); - if (do_swap_account && - (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); - } - if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - mem_cgroup_swap_statistics(mem, true); + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); mem_cgroup_charge_statistics(mem, pc, false); ClearPageCgroupUsed(pc); @@ -1908,8 +1554,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mz = page_cgroup_zoneinfo(pc); unlock_page_cgroup(pc); - if (soft_limit_excess && mem_cgroup_soft_limit_check(mem)) - mem_cgroup_update_tree(mem, page); /* at swapout, this memcg will be accessed to record to swap */ if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) css_put(&mem->css); @@ -1985,9 +1629,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * We uncharge this because swap is freed. * This memcg can be obsolete one. We avoid calling css_tryget */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); - mem_cgroup_swap_statistics(memcg, false); + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } rcu_read_unlock(); @@ -2016,8 +1658,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) unlock_page_cgroup(pc); if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, - page); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); css_put(&mem->css); } *ptr = mem; @@ -2157,9 +1798,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, - GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK); + progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, + false, true); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -2211,9 +1851,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK); + mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? 
*/ if (curusage >= oldusage) @@ -2224,97 +1862,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, int nid, - int zid) -{ - unsigned long nr_reclaimed = 0; - struct mem_cgroup_per_zone *mz, *next_mz = NULL; - unsigned long reclaimed; - int loop = 0; - struct mem_cgroup_tree_per_zone *mctz; - - if (order > 0) - return 0; - - mctz = soft_limit_tree_node_zone(nid, zid); - /* - * This loop can run a while, especially if mem_cgroups continuously - * keep exceeding their soft limit and putting the system under - * pressure - */ - do { - if (next_mz) - mz = next_mz; - else - mz = mem_cgroup_largest_soft_limit_node(mctz); - if (!mz) - break; - - reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, - gfp_mask, - MEM_CGROUP_RECLAIM_SOFT); - nr_reclaimed += reclaimed; - spin_lock(&mctz->lock); - - /* - * If we failed to reclaim anything from this memory cgroup - * it is time to move on to the next cgroup - */ - next_mz = NULL; - if (!reclaimed) { - do { - /* - * Loop until we find yet another one. - * - * By the time we get the soft_limit lock - * again, someone might have added the - * group back on the RB tree. Iterate to - * make sure we get a different mem. - * mem_cgroup_largest_soft_limit_node returns - * NULL if no other cgroup is present on - * the tree - */ - next_mz = - __mem_cgroup_largest_soft_limit_node(mctz); - if (next_mz == mz) { - css_put(&next_mz->mem->css); - next_mz = NULL; - } else /* next_mz == NULL or other memcg */ - break; - } while (1); - } - mz->usage_in_excess = - res_counter_soft_limit_excess(&mz->mem->res); - __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); - /* - * One school of thought says that we should not add - * back the node to the tree if reclaim returns 0. - * But our reclaim could return 0, simply because due - * to priority we are exposing a smaller subset of - * memory to reclaim from. Consider this as a longer - * term TODO. - */ - if (mz->usage_in_excess) - __mem_cgroup_insert_exceeded(mz->mem, mz, mctz); - spin_unlock(&mctz->lock); - css_put(&mz->mem->css); - loop++; - /* - * Could not reclaim anything and there are no more - * mem cgroups to try or we seem to be looping without - * reclaiming anything. - */ - if (!nr_reclaimed && - (next_mz == NULL || - loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) - break; - } while (!nr_reclaimed); - if (next_mz) - css_put(&next_mz->mem->css); - return nr_reclaimed; -} - /* * This routine traverses page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 
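/*
 * The mem_cgroup_soft_limit_reclaim() deleted above is a bounded greedy
 * loop: take the group with the largest soft-limit excess, reclaim from it,
 * and re-queue it only while it is still in excess. This standalone sketch
 * substitutes a linear scan for the per-zone rb-tree and a canned
 * reclaim_from() for real reclaim; every name here is hypothetical.
 */
#include <stdio.h>

#define MAX_LOOPS 4     /* stands in for MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS */

struct group { const char *name; long excess; };

/* In the kernel the tree is ordered by excess, so the victim is simply the
 * rightmost node; with an array we scan for the maximum instead. */
static struct group *largest_excess(struct group *g, int n)
{
        struct group *best = NULL;

        for (int i = 0; i < n; i++)
                if (g[i].excess > 0 && (!best || g[i].excess > best->excess))
                        best = &g[i];
        return best;
}

static long reclaim_from(struct group *g)
{
        long freed = g->excess < 3 ? g->excess : 3;     /* pretend reclaim */

        g->excess -= freed;
        return freed;
}

int main(void)
{
        struct group groups[] = { { "a", 5 }, { "b", 2 } };
        long total = 0;
        int loop = 0;

        /* Like the deleted code: keep going until something was reclaimed,
         * with a loop bound in case reclaim keeps coming back empty. */
        do {
                struct group *mz = largest_excess(groups, 2);

                if (!mz)
                        break;          /* no group is over its soft limit */
                total += reclaim_from(mz);
                printf("reclaimed from %s, excess now %ld\n", mz->name, mz->excess);
        } while (!total && ++loop < MAX_LOOPS);

        printf("total reclaimed: %ld\n", total);
        return 0;
}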
@@ -2499,64 +2046,20 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, return retval; } -struct mem_cgroup_idx_data { - s64 val; - enum mem_cgroup_stat_index idx; -}; - -static int -mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) -{ - struct mem_cgroup_idx_data *d = data; - d->val += mem_cgroup_read_stat(&mem->stat, d->idx); - return 0; -} - -static void -mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx, s64 *val) -{ - struct mem_cgroup_idx_data d; - d.idx = idx; - d.val = 0; - mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); - *val = d.val; -} - static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - u64 idx_val, val; + u64 val = 0; int type, name; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - if (name == RES_USAGE && mem_cgroup_is_root(mem)) { - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_CACHE, &idx_val); - val = idx_val; - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_RSS, &idx_val); - val += idx_val; - val <<= PAGE_SHIFT; - } else - val = res_counter_read_u64(&mem->res, name); + val = res_counter_read_u64(&mem->res, name); break; case _MEMSWAP: - if (name == RES_USAGE && mem_cgroup_is_root(mem)) { - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_CACHE, &idx_val); - val = idx_val; - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_RSS, &idx_val); - val += idx_val; - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_SWAPOUT, &idx_val); - val <<= PAGE_SHIFT; - } else - val = res_counter_read_u64(&mem->memsw, name); + val = res_counter_read_u64(&mem->memsw, name); break; default: BUG(); @@ -2580,10 +2083,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, name = MEMFILE_ATTR(cft->private); switch (name) { case RES_LIMIT: - if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ - ret = -EINVAL; - break; - } /* This function does all necessary parse...reuse it */ ret = res_counter_memparse_write_strategy(buffer, &val); if (ret) @@ -2593,20 +2092,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, else ret = mem_cgroup_resize_memsw_limit(memcg, val); break; - case RES_SOFT_LIMIT: - ret = res_counter_memparse_write_strategy(buffer, &val); - if (ret) - break; - /* - * For memsw, soft limits are hard to implement in terms - * of semantics, for now, we support soft limits for - * control without swap - */ - if (type == _MEM) - ret = res_counter_set_soft_limit(&memcg->res, val); - else - ret = -EINVAL; - break; default: ret = -EINVAL; /* should be BUG() ? 
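/*
 * mem_cgroup_read() above recovers two values from the single cft->private
 * word with MEMFILE_TYPE()/MEMFILE_ATTR(). The scheme is plain bit-packing;
 * the 16-bit split below is how I recall memcontrol.c defining it, so treat
 * the exact widths as an assumption.
 */
#include <stdio.h>

enum { _MEM, _MEMSWAP };                        /* which res_counter */
enum { RES_USAGE, RES_LIMIT, RES_FAILCNT };     /* which attribute of it */

/* Counter type in the high half, attribute in the low half, so one int
 * can route both through a single .private field. */
#define MEMFILE_PRIVATE(type, attr)     (((type) << 16) | (attr))
#define MEMFILE_TYPE(val)               (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)               ((val) & 0xffff)

int main(void)
{
        int private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);

        printf("type=%d attr=%d\n", MEMFILE_TYPE(private), MEMFILE_ATTR(private));
        return 0;
}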
*/ break; @@ -2664,7 +2149,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) res_counter_reset_failcnt(&mem->memsw); break; } - return 0; } @@ -2676,7 +2160,6 @@ enum { MCS_MAPPED_FILE, MCS_PGPGIN, MCS_PGPGOUT, - MCS_SWAP, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -2698,7 +2181,6 @@ struct { {"mapped_file", "total_mapped_file"}, {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, - {"swap", "total_swap"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -2723,10 +2205,6 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s->stat[MCS_PGPGIN] += val; val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); s->stat[MCS_PGPGOUT] += val; - if (do_swap_account) { - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); - s->stat[MCS_SWAP] += val * PAGE_SIZE; - } /* per zone stat */ val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); @@ -2758,11 +2236,8 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_local_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) { - if (i == MCS_SWAP && !do_swap_account) - continue; + for (i = 0; i < NR_MCS_STAT; i++) cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); - } /* Hierarchical information */ { @@ -2775,11 +2250,9 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_total_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) { - if (i == MCS_SWAP && !do_swap_account) - continue; + for (i = 0; i < NR_MCS_STAT; i++) cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); - } + #ifdef CONFIG_DEBUG_VM cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); @@ -2871,12 +2344,6 @@ static struct cftype mem_cgroup_files[] = { .write_string = mem_cgroup_write, .read_u64 = mem_cgroup_read, }, - { - .name = "soft_limit_in_bytes", - .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), - .write_string = mem_cgroup_write, - .read_u64 = mem_cgroup_read, - }, { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), @@ -2971,9 +2438,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) mz = &pn->zoneinfo[zone]; for_each_lru(l) INIT_LIST_HEAD(&mz->lists[l]); - mz->usage_in_excess = 0; - mz->on_tree = false; - mz->mem = mem; } return 0; } @@ -3019,7 +2483,6 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) { int node; - mem_cgroup_remove_from_trees(mem); free_css_id(&mem_cgroup_subsys, &mem->css); for_each_node_state(node, N_POSSIBLE) @@ -3068,31 +2531,6 @@ static void __init enable_swap_cgroup(void) } #endif -static int mem_cgroup_soft_limit_tree_init(void) -{ - struct mem_cgroup_tree_per_node *rtpn; - struct mem_cgroup_tree_per_zone *rtpz; - int tmp, node, zone; - - for_each_node_state(node, N_POSSIBLE) { - tmp = node; - if (!node_state(node, N_NORMAL_MEMORY)) - tmp = -1; - rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); - if (!rtpn) - return 1; - - soft_limit_tree.rb_tree_per_node[node] = rtpn; - - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - rtpz = &rtpn->rb_tree_per_zone[zone]; - rtpz->rb_root = RB_ROOT; - spin_lock_init(&rtpz->lock); - } - } - return 0; -} - static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { @@ -3107,15 +2545,10 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup 
*cont) for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; - /* root ? */ if (cont->parent == NULL) { enable_swap_cgroup(); parent = NULL; - root_mem_cgroup = mem; - if (mem_cgroup_soft_limit_tree_init()) - goto free_out; - } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; @@ -3144,7 +2577,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &mem->css; free_out: __mem_cgroup_free(mem); - root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -3180,8 +2612,7 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { mutex_lock(&memcg_tasklist); /* diff --git a/trunk/mm/memory-failure.c b/trunk/mm/memory-failure.c deleted file mode 100644 index 729d4b15b645..000000000000 --- a/trunk/mm/memory-failure.c +++ /dev/null @@ -1,832 +0,0 @@ -/* - * Copyright (C) 2008, 2009 Intel Corporation - * Authors: Andi Kleen, Fengguang Wu - * - * This software may be redistributed and/or modified under the terms of - * the GNU General Public License ("GPL") version 2 only as published by the - * Free Software Foundation. - * - * High level machine check handler. Handles pages reported by the - * hardware as being corrupted usually due to a 2bit ECC memory or cache - * failure. - * - * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronous to other VM - * users, because memory failures could happen anytime and anywhere, - * possibly violating some of their assumptions. This is why this code - * has to be extremely careful. Generally it tries to use normal locking - * rules, as in get the standard locks, even if that means the - * error handling takes potentially a long time. - * - * The operation to map back from RMAP chains to processes has to walk - * the complete process list and has non linear complexity with the number - * mappings. In short it can be quite slow. But since memory corruptions - * are rare we hope to get away with this. - */ - -/* - * Notebook: - * - hugetlb needs more code - * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages - * - pass bad pages to kdump next kernel - */ -#define DEBUG 1 /* remove me in 2.6.34 */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "internal.h" - -int sysctl_memory_failure_early_kill __read_mostly = 0; - -int sysctl_memory_failure_recovery __read_mostly = 1; - -atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); - -/* - * Send all the processes who have the page mapped an ``action optional'' - * signal. - */ -static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, - unsigned long pfn) -{ - struct siginfo si; - int ret; - - printk(KERN_ERR - "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", - pfn, t->comm, t->pid); - si.si_signo = SIGBUS; - si.si_errno = 0; - si.si_code = BUS_MCEERR_AO; - si.si_addr = (void *)addr; -#ifdef __ARCH_SI_TRAPNO - si.si_trapno = trapno; -#endif - si.si_addr_lsb = PAGE_SHIFT; - /* - * Don't use force here, it's convenient if the signal - * can be temporarily blocked. - * This could cause a loop when the user sets SIGBUS - * to SIG_IGN, but hopefully noone will do that? - */ - ret = send_sig_info(SIGBUS, &si, t); /* synchronous? 
*/ - if (ret < 0) - printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", - t->comm, t->pid, ret); - return ret; -} - -/* - * Kill all processes that have a poisoned page mapped and then isolate - * the page. - * - * General strategy: - * Find all processes having the page mapped and kill them. - * But we keep a page reference around so that the page is not - * actually freed yet. - * Then stash the page away - * - * There's no convenient way to get back to mapped processes - * from the VMAs. So do a brute-force search over all - * running processes. - * - * Remember that machine checks are not common (or rather - * if they are common you have other problems), so this shouldn't - * be a performance issue. - * - * Also there are some races possible while we get from the - * error detection to actually handle it. - */ - -struct to_kill { - struct list_head nd; - struct task_struct *tsk; - unsigned long addr; - unsigned addr_valid:1; -}; - -/* - * Failure handling: if we can't find or can't kill a process there's - * not much we can do. We just print a message and ignore otherwise. - */ - -/* - * Schedule a process for later kill. - * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. - * TBD would GFP_NOIO be enough? - */ -static void add_to_kill(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, - struct list_head *to_kill, - struct to_kill **tkc) -{ - struct to_kill *tk; - - if (*tkc) { - tk = *tkc; - *tkc = NULL; - } else { - tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); - if (!tk) { - printk(KERN_ERR - "MCE: Out of memory while machine check handling\n"); - return; - } - } - tk->addr = page_address_in_vma(p, vma); - tk->addr_valid = 1; - - /* - * In theory we don't have to kill when the page was - * munmaped. But it could be also a mremap. Since that's - * likely very rare kill anyways just out of paranoia, but use - * a SIGKILL because the error is not contained anymore. - */ - if (tk->addr == -EFAULT) { - pr_debug("MCE: Unable to find user space address %lx in %s\n", - page_to_pfn(p), tsk->comm); - tk->addr_valid = 0; - } - get_task_struct(tsk); - tk->tsk = tsk; - list_add_tail(&tk->nd, to_kill); -} - -/* - * Kill the processes that have been collected earlier. - * - * Only do anything when DOIT is set, otherwise just free the list - * (this is used for clean pages which do not need killing) - * Also when FAIL is set do a force kill because something went - * wrong earlier. - */ -static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, - int fail, unsigned long pfn) -{ - struct to_kill *tk, *next; - - list_for_each_entry_safe (tk, next, to_kill, nd) { - if (doit) { - /* - * In case something went wrong with munmaping - * make sure the process doesn't catch the - * signal and then access the memory. Just kill it. - * the signal handlers - */ - if (fail || tk->addr_valid == 0) { - printk(KERN_ERR - "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", - pfn, tk->tsk->comm, tk->tsk->pid); - force_sig(SIGKILL, tk->tsk); - } - - /* - * In theory the process could have mapped - * something else on the address in-between. We could - * check for that, but we need to tell the - * process anyways. 
- */ - else if (kill_proc_ao(tk->tsk, tk->addr, trapno, - pfn) < 0) - printk(KERN_ERR - "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", - pfn, tk->tsk->comm, tk->tsk->pid); - } - put_task_struct(tk->tsk); - kfree(tk); - } -} - -static int task_early_kill(struct task_struct *tsk) -{ - if (!tsk->mm) - return 0; - if (tsk->flags & PF_MCE_PROCESS) - return !!(tsk->flags & PF_MCE_EARLY); - return sysctl_memory_failure_early_kill; -} - -/* - * Collect processes when the error hit an anonymous page. - */ -static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) -{ - struct vm_area_struct *vma; - struct task_struct *tsk; - struct anon_vma *av; - - read_lock(&tasklist_lock); - av = page_lock_anon_vma(page); - if (av == NULL) /* Not actually mapped anymore */ - goto out; - for_each_process (tsk) { - if (!task_early_kill(tsk)) - continue; - list_for_each_entry (vma, &av->head, anon_vma_node) { - if (!page_mapped_in_vma(page, vma)) - continue; - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); - } - } - page_unlock_anon_vma(av); -out: - read_unlock(&tasklist_lock); -} - -/* - * Collect processes when the error hit a file mapped page. - */ -static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) -{ - struct vm_area_struct *vma; - struct task_struct *tsk; - struct prio_tree_iter iter; - struct address_space *mapping = page->mapping; - - /* - * A note on the locking order between the two locks. - * We don't rely on this particular order. - * If you have some other code that needs a different order - * feel free to switch them around. Or add a reverse link - * from mm_struct to task_struct, then this could be all - * done without taking tasklist_lock and looping over all tasks. - */ - - read_lock(&tasklist_lock); - spin_lock(&mapping->i_mmap_lock); - for_each_process(tsk) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - if (!task_early_kill(tsk)) - continue; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, - pgoff) { - /* - * Send early kill signal to tasks where a vma covers - * the page but the corrupted page is not necessarily - * mapped it in its pte. - * Assume applications who requested early kill want - * to be informed of all such data corruptions. - */ - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); - } - } - spin_unlock(&mapping->i_mmap_lock); - read_unlock(&tasklist_lock); -} - -/* - * Collect the processes who have the corrupted page mapped to kill. - * This is done in two steps for locking reasons. - * First preallocate one tokill structure outside the spin locks, - * so that we can kill at least one process reasonably reliable. - */ -static void collect_procs(struct page *page, struct list_head *tokill) -{ - struct to_kill *tk; - - if (!page->mapping) - return; - - tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); - if (!tk) - return; - if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk); - else - collect_procs_file(page, tokill, &tk); - kfree(tk); -} - -/* - * Error handlers for various types of pages. - */ - -enum outcome { - FAILED, /* Error handling failed */ - DELAYED, /* Will be handled later */ - IGNORED, /* Error safely ignored */ - RECOVERED, /* Successfully recovered */ -}; - -static const char *action_name[] = { - [FAILED] = "Failed", - [DELAYED] = "Delayed", - [IGNORED] = "Ignored", - [RECOVERED] = "Recovered", -}; - -/* - * Error hit kernel page. 
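/*
 * collect_procs() above reserves one to_kill node with a sleeping
 * allocation before any locks are held, and add_to_kill() consumes that
 * reserve first, falling back to an atomic allocation that may fail. A
 * user-space sketch of the reserve-one-node pattern; malloc() stands in
 * for both kmalloc() variants.
 */
#include <stdio.h>
#include <stdlib.h>

struct to_kill {
        struct to_kill *next;
        int pid;
};

static void add_to_kill(int pid, struct to_kill **list, struct to_kill **tkc)
{
        struct to_kill *tk;

        if (*tkc) {                     /* use the preallocated node first */
                tk = *tkc;
                *tkc = NULL;
        } else {
                tk = malloc(sizeof(*tk));       /* GFP_ATOMIC in the kernel */
                if (!tk) {
                        fprintf(stderr, "no memory, victim %d skipped\n", pid);
                        return;
                }
        }
        tk->pid = pid;
        tk->next = *list;
        *list = tk;
}

int main(void)
{
        /* The reserve guarantees at least one victim can be recorded even
         * if allocation fails once the "locks" are taken. */
        struct to_kill *prealloc = malloc(sizeof(*prealloc));
        struct to_kill *list = NULL;

        add_to_kill(101, &list, &prealloc);     /* consumes the reserve */
        add_to_kill(102, &list, &prealloc);     /* allocates on demand */

        while (list) {
                struct to_kill *next = list->next;

                printf("would signal pid %d\n", list->pid);
                free(list);
                list = next;
        }
        free(prealloc);         /* NULL here; free(NULL) is a no-op */
        return 0;
}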
- * Do nothing, try to be lucky and not touch this instead. For a few cases we - * could be more sophisticated. - */ -static int me_kernel(struct page *p, unsigned long pfn) -{ - return DELAYED; -} - -/* - * Already poisoned page. - */ -static int me_ignore(struct page *p, unsigned long pfn) -{ - return IGNORED; -} - -/* - * Page in unknown state. Do nothing. - */ -static int me_unknown(struct page *p, unsigned long pfn) -{ - printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); - return FAILED; -} - -/* - * Free memory - */ -static int me_free(struct page *p, unsigned long pfn) -{ - return DELAYED; -} - -/* - * Clean (or cleaned) page cache page. - */ -static int me_pagecache_clean(struct page *p, unsigned long pfn) -{ - int err; - int ret = FAILED; - struct address_space *mapping; - - if (!isolate_lru_page(p)) - page_cache_release(p); - - /* - * For anonymous pages we're done the only reference left - * should be the one m_f() holds. - */ - if (PageAnon(p)) - return RECOVERED; - - /* - * Now truncate the page in the page cache. This is really - * more like a "temporary hole punch" - * Don't do this for block devices when someone else - * has a reference, because it could be file system metadata - * and that's not safe to truncate. - */ - mapping = page_mapping(p); - if (!mapping) { - /* - * Page has been teared down in the meanwhile - */ - return FAILED; - } - - /* - * Truncation is a bit tricky. Enable it per file system for now. - * - * Open: to take i_mutex or not for this? Right now we don't. - */ - if (mapping->a_ops->error_remove_page) { - err = mapping->a_ops->error_remove_page(mapping, p); - if (err != 0) { - printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", - pfn, err); - } else if (page_has_private(p) && - !try_to_release_page(p, GFP_NOIO)) { - pr_debug("MCE %#lx: failed to release buffers\n", pfn); - } else { - ret = RECOVERED; - } - } else { - /* - * If the file system doesn't support it just invalidate - * This fails on dirty or anything with private pages - */ - if (invalidate_inode_page(p)) - ret = RECOVERED; - else - printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", - pfn); - } - return ret; -} - -/* - * Dirty cache page page - * Issues: when the error hit a hole page the error is not properly - * propagated. - */ -static int me_pagecache_dirty(struct page *p, unsigned long pfn) -{ - struct address_space *mapping = page_mapping(p); - - SetPageError(p); - /* TBD: print more information about the file. */ - if (mapping) { - /* - * IO error will be reported by write(), fsync(), etc. - * who check the mapping. - * This way the application knows that something went - * wrong with its dirty file data. - * - * There's one open issue: - * - * The EIO will be only reported on the next IO - * operation and then cleared through the IO map. - * Normally Linux has two mechanisms to pass IO error - * first through the AS_EIO flag in the address space - * and then through the PageError flag in the page. - * Since we drop pages on memory failure handling the - * only mechanism open to use is through AS_AIO. - * - * This has the disadvantage that it gets cleared on - * the first operation that returns an error, while - * the PageError bit is more sticky and only cleared - * when the page is reread or dropped. If an - * application assumes it will always get error on - * fsync, but does other operations on the fd before - * and the page is dropped inbetween then the error - * will not be properly reported. 
- * - * This can already happen even without hwpoisoned - * pages: first on metadata IO errors (which only - * report through AS_EIO) or when the page is dropped - * at the wrong time. - * - * So right now we assume that the application DTRT on - * the first EIO, but we're not worse than other parts - * of the kernel. - */ - mapping_set_error(mapping, EIO); - } - - return me_pagecache_clean(p, pfn); -} - -/* - * Clean and dirty swap cache. - * - * Dirty swap cache page is tricky to handle. The page could live both in page - * cache and swap cache (i.e. page is freshly swapped in). So it could be - * referenced concurrently by 2 types of PTEs: - * normal PTEs and swap PTEs. We try to handle them consistently by calling - * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, - * and then - * - clear dirty bit to prevent IO - * - remove from LRU - * - but keep in the swap cache, so that when we return to it on - * a later page fault, we know the application is accessing - * corrupted data and shall be killed (we installed simple - * interception code in do_swap_page to catch it). - * - * Clean swap cache pages can be directly isolated. A later page fault will - * bring in the known good data from disk. - */ -static int me_swapcache_dirty(struct page *p, unsigned long pfn) -{ - int ret = FAILED; - - ClearPageDirty(p); - /* Trigger EIO in shmem: */ - ClearPageUptodate(p); - - if (!isolate_lru_page(p)) { - page_cache_release(p); - ret = DELAYED; - } - - return ret; -} - -static int me_swapcache_clean(struct page *p, unsigned long pfn) -{ - int ret = FAILED; - - if (!isolate_lru_page(p)) { - page_cache_release(p); - ret = RECOVERED; - } - delete_from_swap_cache(p); - return ret; -} - -/* - * Huge pages. Needs work. - * Issues: - * No rmap support so we cannot find the original mapper. In theory could walk - * all MMs and look for the mappings, but that would be non-atomic and racy. - * Need rmap for hugepages for this. Alternatively we could employ a heuristic, - * like just walking the current process and hoping it has it mapped (that - * should be usually true for the common "shared database cache" case) - * Should handle free huge pages and dequeue them too, but this needs to - * handle huge page accounting correctly. - */ -static int me_huge_page(struct page *p, unsigned long pfn) -{ - return FAILED; -} - -/* - * Various page states we can handle. - * - * A page state is defined by its current page->flags bits. - * The table matches them in order and calls the right handler. - * - * This is quite tricky because we can access page at any time - * in its life cycle, so all accesses have to be extremely careful. - * - * This is not complete. More states could be added. - * For any missing state don't attempt recovery. 
- */ - -#define dirty (1UL << PG_dirty) -#define sc (1UL << PG_swapcache) -#define unevict (1UL << PG_unevictable) -#define mlock (1UL << PG_mlocked) -#define writeback (1UL << PG_writeback) -#define lru (1UL << PG_lru) -#define swapbacked (1UL << PG_swapbacked) -#define head (1UL << PG_head) -#define tail (1UL << PG_tail) -#define compound (1UL << PG_compound) -#define slab (1UL << PG_slab) -#define buddy (1UL << PG_buddy) -#define reserved (1UL << PG_reserved) - -static struct page_state { - unsigned long mask; - unsigned long res; - char *msg; - int (*action)(struct page *p, unsigned long pfn); -} error_states[] = { - { reserved, reserved, "reserved kernel", me_ignore }, - { buddy, buddy, "free kernel", me_free }, - - /* - * Could in theory check if slab page is free or if we can drop - * currently unused objects without touching them. But just - * treat it as standard kernel for now. - */ - { slab, slab, "kernel slab", me_kernel }, - -#ifdef CONFIG_PAGEFLAGS_EXTENDED - { head, head, "huge", me_huge_page }, - { tail, tail, "huge", me_huge_page }, -#else - { compound, compound, "huge", me_huge_page }, -#endif - - { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, - { sc|dirty, sc, "swapcache", me_swapcache_clean }, - - { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, - { unevict, unevict, "unevictable LRU", me_pagecache_clean}, - -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT - { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, - { mlock, mlock, "mlocked LRU", me_pagecache_clean }, -#endif - - { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, - { lru|dirty, lru, "clean LRU", me_pagecache_clean }, - { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, - - /* - * Catchall entry: must be at end. - */ - { 0, 0, "unknown page state", me_unknown }, -}; - -#undef lru - -static void action_result(unsigned long pfn, char *msg, int result) -{ - struct page *page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", - pfn, - page && PageDirty(page) ? "dirty " : "", - msg, action_name[result]); -} - -static int page_action(struct page_state *ps, struct page *p, - unsigned long pfn, int ref) -{ - int result; - - result = ps->action(p, pfn); - action_result(pfn, ps->msg, result); - if (page_count(p) != 1 + ref) - printk(KERN_ERR - "MCE %#lx: %s page still referenced by %d users\n", - pfn, ps->msg, page_count(p) - 1); - - /* Could do more checks here if page looks ok */ - /* - * Could adjust zone counters here to correct for the missing page. - */ - - return result == RECOVERED ? 0 : -EBUSY; -} - -#define N_UNMAP_TRIES 5 - -/* - * Do all that is necessary to remove user space mappings. Unmap - * the pages and send SIGBUS to the processes if the data was dirty. - */ -static void hwpoison_user_mappings(struct page *p, unsigned long pfn, - int trapno) -{ - enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; - struct address_space *mapping; - LIST_HEAD(tokill); - int ret; - int i; - int kill = 1; - - if (PageReserved(p) || PageCompound(p) || PageSlab(p)) - return; - - if (!PageLRU(p)) - lru_add_drain_all(); - - /* - * This check implies we don't kill processes if their pages - * are in the swap cache early. Those are always late kills. 
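/*
 * The error_states[] table above is first-match dispatch on page flag
 * bits: an entry fires when (flags & mask) == res, and the {0, 0} catchall
 * at the end matches anything. A compact standalone rendering of that
 * lookup, with made-up flag bits.
 */
#include <stdio.h>

#define DIRTY           (1UL << 0)
#define SWAPCACHE       (1UL << 1)
#define LRU             (1UL << 2)

struct page_state {
        unsigned long mask;     /* which bits to look at */
        unsigned long res;      /* the value they must have */
        const char *msg;
};

/* Order matters: specific entries first, catchall last. */
static const struct page_state states[] = {
        { SWAPCACHE | DIRTY, SWAPCACHE | DIRTY, "dirty swapcache" },
        { SWAPCACHE | DIRTY, SWAPCACHE,         "clean swapcache" },
        { LRU | DIRTY,       LRU | DIRTY,       "dirty LRU" },
        { LRU | DIRTY,       LRU,               "clean LRU" },
        { 0,                 0,                 "unknown page state" },
};

static const char *classify(unsigned long flags)
{
        const struct page_state *ps;

        /* No bound check needed: the catchall always matches. */
        for (ps = states; ; ps++)
                if ((flags & ps->mask) == ps->res)
                        return ps->msg;
}

int main(void)
{
        printf("%s\n", classify(LRU | DIRTY));  /* dirty LRU */
        printf("%s\n", classify(SWAPCACHE));    /* clean swapcache */
        printf("%s\n", classify(1UL << 5));     /* unknown page state */
        return 0;
}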
- */ - if (!page_mapped(p)) - return; - - if (PageSwapCache(p)) { - printk(KERN_ERR - "MCE %#lx: keeping poisoned page in swap cache\n", pfn); - ttu |= TTU_IGNORE_HWPOISON; - } - - /* - * Propagate the dirty bit from PTEs to struct page first, because we - * need this to decide if we should kill or just drop the page. - */ - mapping = page_mapping(p); - if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { - if (page_mkclean(p)) { - SetPageDirty(p); - } else { - kill = 0; - ttu |= TTU_IGNORE_HWPOISON; - printk(KERN_INFO - "MCE %#lx: corrupted page was clean: dropped without side effects\n", - pfn); - } - } - - /* - * First collect all the processes that have the page - * mapped in dirty form. This has to be done before try_to_unmap, - * because ttu takes the rmap data structures down. - * - * Error handling: We ignore errors here because - * there's nothing that can be done. - */ - if (kill) - collect_procs(p, &tokill); - - /* - * try_to_unmap can fail temporarily due to races. - * Try a few times (RED-PEN better strategy?) - */ - for (i = 0; i < N_UNMAP_TRIES; i++) { - ret = try_to_unmap(p, ttu); - if (ret == SWAP_SUCCESS) - break; - pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); - } - - if (ret != SWAP_SUCCESS) - printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(p)); - - /* - * Now that the dirty bit has been propagated to the - * struct page and all unmaps done we can decide if - * killing is needed or not. Only kill when the page - * was dirty, otherwise the tokill list is merely - * freed. When there was a problem unmapping earlier - * use a more force-full uncatchable kill to prevent - * any accesses to the poisoned memory. - */ - kill_procs_ao(&tokill, !!PageDirty(p), trapno, - ret != SWAP_SUCCESS, pfn); -} - -int __memory_failure(unsigned long pfn, int trapno, int ref) -{ - struct page_state *ps; - struct page *p; - int res; - - if (!sysctl_memory_failure_recovery) - panic("Memory failure from trap %d on page %lx", trapno, pfn); - - if (!pfn_valid(pfn)) { - action_result(pfn, "memory outside kernel control", IGNORED); - return -EIO; - } - - p = pfn_to_page(pfn); - if (TestSetPageHWPoison(p)) { - action_result(pfn, "already hardware poisoned", IGNORED); - return 0; - } - - atomic_long_add(1, &mce_bad_pages); - - /* - * We need/can do nothing about count=0 pages. - * 1) it's a free page, and therefore in safe hand: - * prep_new_page() will be the gate keeper. - * 2) it's part of a non-compound high order page. - * Implies some kernel user: cannot stop them from - * R/W the page; let's pray that the page has been - * used and will be freed some time later. - * In fact it's dangerous to directly bump up page count from 0, - * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. - */ - if (!get_page_unless_zero(compound_head(p))) { - action_result(pfn, "free or high order kernel", IGNORED); - return PageBuddy(compound_head(p)) ? 0 : -EBUSY; - } - - /* - * Lock the page and wait for writeback to finish. - * It's very difficult to mess with pages currently under IO - * and in many cases impossible, so we just avoid it here. - */ - lock_page_nosync(p); - wait_on_page_writeback(p); - - /* - * Now take care of user space mappings. - */ - hwpoison_user_mappings(p, pfn, trapno); - - /* - * Torn down by someone else? 
- */ - if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { - action_result(pfn, "already truncated LRU", IGNORED); - res = 0; - goto out; - } - - res = -EBUSY; - for (ps = error_states;; ps++) { - if ((p->flags & ps->mask) == ps->res) { - res = page_action(ps, p, pfn, ref); - break; - } - } -out: - unlock_page(p); - return res; -} -EXPORT_SYMBOL_GPL(__memory_failure); - -/** - * memory_failure - Handle memory failure of a page. - * @pfn: Page Number of the corrupted page - * @trapno: Trap number reported in the signal to user space. - * - * This function is called by the low level machine check code - * of an architecture when it detects hardware memory corruption - * of a page. It tries its best to recover, which includes - * dropping pages, killing processes etc. - * - * The function is primarily of use for corruptions that - * happen outside the current execution context (e.g. when - * detected by a background scrubber) - * - * Must run in process context (e.g. a work queue) with interrupts - * enabled and no spinlocks hold. - */ -void memory_failure(unsigned long pfn, int trapno) -{ - __memory_failure(pfn, trapno, 0); -} diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c index 987389a809e7..b1443ac07c00 100644 --- a/trunk/mm/memory.c +++ b/trunk/mm/memory.c @@ -1325,8 +1325,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - if (ret & - (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) + else if (ret & VM_FAULT_SIGBUS) return i ? i : -EFAULT; BUG(); } @@ -2560,15 +2559,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); - if (unlikely(non_swap_entry(entry))) { - if (is_migration_entry(entry)) { - migration_entry_wait(mm, pmd, address); - } else if (is_hwpoison_entry(entry)) { - ret = VM_FAULT_HWPOISON; - } else { - print_bad_pte(vma, address, orig_pte, NULL); - ret = VM_FAULT_OOM; - } + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); @@ -2592,10 +2584,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - } else if (PageHWPoison(page)) { - ret = VM_FAULT_HWPOISON; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - goto out; } lock_page(page); @@ -2772,12 +2760,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; - if (unlikely(PageHWPoison(vmf.page))) { - if (ret & VM_FAULT_LOCKED) - unlock_page(vmf.page); - return VM_FAULT_HWPOISON; - } - /* * For consistency in subsequent calls, make the faulted page always * locked. 
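/*
 * __memory_failure() above refuses to touch a page whose refcount is zero,
 * because bumping a count from 0 would resurrect a page the allocator
 * already owns. The primitive it relies on, get_page_unless_zero(), is an
 * increment-unless-zero compare-and-swap loop; here is a C11 model of it.
 */
#include <stdatomic.h>
#include <stdio.h>

/* Take a reference only if the object is not already dead. */
static int get_unless_zero(atomic_int *refcount)
{
        int old = atomic_load(refcount);

        while (old != 0) {
                /* On failure the CAS reloads 'old' and we re-test for zero. */
                if (atomic_compare_exchange_weak(refcount, &old, old + 1))
                        return 1;       /* got a reference */
        }
        return 0;                       /* count was zero: hands off */
}

int main(void)
{
        atomic_int live = 2, dead = 0;

        printf("live: %d (count now %d)\n",
               get_unless_zero(&live), atomic_load(&live));
        printf("dead: %d (count now %d)\n",
               get_unless_zero(&dead), atomic_load(&dead));
        return 0;
}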
diff --git a/trunk/mm/migrate.c b/trunk/mm/migrate.c index 1a4bf4813780..16052e80aaac 100644 --- a/trunk/mm/migrate.c +++ b/trunk/mm/migrate.c @@ -675,7 +675,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, } /* Establish migration ptes or remove ptes */ - try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + try_to_unmap(page, 1); skip_unmap: if (!page_mapped(page)) diff --git a/trunk/mm/page-writeback.c b/trunk/mm/page-writeback.c index d99664e8607e..5f378dd58802 100644 --- a/trunk/mm/page-writeback.c +++ b/trunk/mm/page-writeback.c @@ -155,37 +155,37 @@ static void update_completion_period(void) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { update_completion_period(); vm_dirty_bytes = 0; @@ -195,13 +195,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; - ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { update_completion_period(); vm_dirty_ratio = 0; @@ -686,9 +686,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); + proc_dointvec(table, write, file, buffer, length, ppos); return 0; } @@ -1149,13 +1149,6 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) EXPORT_SYMBOL(redirty_page_for_writepage); /* - * Dirty a page. - * - * For pages with a mapping this should be done under the page lock - * for the benefit of asynchronous memory errors who prefer a consistent - * dirty state. This rule can be broken in some special cases, - * but should be better not to. - * * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. 
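/*
 * Every handler in the page-writeback hunk above changes in lockstep,
 * regaining the struct file * argument between 'write' and 'buffer'. That
 * is forced by the shared handler signature: the function pointer type
 * used by ctl_table pins one prototype for all sysctl handlers. A
 * simplified user-space model follows (types and the loff_t argument
 * trimmed; not the kernel's actual declarations).
 */
#include <stdio.h>
#include <stddef.h>

struct file;            /* opaque, as it is to most handlers */
struct ctl_table;

/* One typedef, many handlers: change it and every handler and every
 * proc_*() helper call must change with it. */
typedef int proc_handler(struct ctl_table *table, int write,
                         struct file *filp, void *buffer, size_t *lenp);

static int proc_dointvec(struct ctl_table *table, int write,
                         struct file *filp, void *buffer, size_t *lenp)
{
        (void)table; (void)write; (void)filp; (void)buffer; (void)lenp;
        return 0;       /* pretend the integer parse succeeded */
}

/* In the style of dirty_ratio_handler(): delegate the parse, then react
 * to a successful write. */
static int dirty_ratio_handler(struct ctl_table *table, int write,
                               struct file *filp, void *buffer, size_t *lenp)
{
        int ret = proc_dointvec(table, write, filp, buffer, lenp);

        if (ret == 0 && write)
                printf("ratio written: recompute derived limits\n");
        return ret;
}

int main(void)
{
        proc_handler *h = dirty_ratio_handler;  /* must match the typedef */
        size_t len = 0;

        return h(NULL, 1, NULL, NULL, &len);
}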
*/ diff --git a/trunk/mm/page_alloc.c b/trunk/mm/page_alloc.c index bf720550b44d..5717f27a0704 100644 --- a/trunk/mm/page_alloc.c +++ b/trunk/mm/page_alloc.c @@ -234,12 +234,6 @@ static void bad_page(struct page *page) static unsigned long nr_shown; static unsigned long nr_unshown; - /* Don't complain about poisoned pages */ - if (PageHWPoison(page)) { - __ClearPageBuddy(page); - return; - } - /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -672,7 +666,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static inline int check_new_page(struct page *page) +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -681,18 +675,6 @@ static inline int check_new_page(struct page *page) bad_page(page); return 1; } - return 0; -} - -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) -{ - int i; - - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; - if (unlikely(check_new_page(p))) - return 1; - } set_page_private(page, 0); set_page_refcounted(page); @@ -2391,7 +2373,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(ctl_table *table, int write, - void __user *buffer, size_t *length, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { char saved_string[NUMA_ZONELIST_ORDER_LEN]; @@ -2400,7 +2382,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, if (write) strncpy(saved_string, (char*)table->data, NUMA_ZONELIST_ORDER_LEN); - ret = proc_dostring(table, write, buffer, length, ppos); + ret = proc_dostring(table, write, file, buffer, length, ppos); if (ret) return ret; if (write) { @@ -4724,9 +4706,9 @@ module_init(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); + proc_dointvec(table, write, file, buffer, length, ppos); if (write) setup_per_zone_wmarks(); return 0; @@ -4734,12 +4716,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); if (rc) return rc; @@ -4750,12 +4732,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, } int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); if (rc) return rc; @@ -4776,9 +4758,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * if in function of the boot time zone sizes. 
*/ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, buffer, length, ppos); + proc_dointvec_minmax(table, write, file, buffer, length, ppos); setup_per_zone_lowmem_reserve(); return 0; } @@ -4790,13 +4772,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, */ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; unsigned int cpu; int ret; - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { diff --git a/trunk/mm/rmap.c b/trunk/mm/rmap.c index 28aafe2b5306..720fc03a7bc4 100644 --- a/trunk/mm/rmap.c +++ b/trunk/mm/rmap.c @@ -36,11 +36,6 @@ * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within inode_lock in __sync_single_inode) - * - * (code doesn't rely on that order so it could be switched around) - * ->tasklist_lock - * anon_vma->lock (memory_failure, collect_procs_anon) - * pte map lock */ #include @@ -196,7 +191,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -struct anon_vma *page_lock_anon_vma(struct page *page) +static struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -216,7 +211,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page) return NULL; } -void page_unlock_anon_vma(struct anon_vma *anon_vma) +static void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -316,7 +311,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, * if the page is not mapped into the page tables of this VMA. Only * valid for normal file or anonymous VMAs. */ -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { unsigned long address; pte_t *pte; @@ -761,7 +756,7 @@ void page_remove_rmap(struct page *page) * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - enum ttu_flags flags) + int migration) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -783,13 +778,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. 
*/ - if (!(flags & TTU_IGNORE_MLOCK)) { + if (!migration) { if (vma->vm_flags & VM_LOCKED) { ret = SWAP_MLOCK; goto out_unmap; } - } - if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pte)) { ret = SWAP_FAIL; goto out_unmap; @@ -807,14 +800,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { - if (PageAnon(page)) - dec_mm_counter(mm, anon_rss); - else - dec_mm_counter(mm, file_rss); - set_pte_at(mm, address, pte, - swp_entry_to_pte(make_hwpoison_entry(page))); - } else if (PageAnon(page)) { + if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; if (PageSwapCache(page)) { @@ -836,12 +822,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); + BUG_ON(!migration); entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { + } else if (PAGE_MIGRATION && migration) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -1010,13 +996,12 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) +static int try_to_unmap_anon(struct page *page, int unlock, int migration) { struct anon_vma *anon_vma; struct vm_area_struct *vma; unsigned int mlocked = 0; int ret = SWAP_AGAIN; - int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1032,7 +1017,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) continue; /* must visit all unlocked vmas */ ret = SWAP_MLOCK; /* saw at least one mlocked vma */ } else { - ret = try_to_unmap_one(page, vma, flags); + ret = try_to_unmap_one(page, vma, migration); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -1056,7 +1041,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) /** * try_to_unmap_file - unmap/unlock file page using the object-based rmap method * @page: the page to unmap/unlock - * @flags: action and flags + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. @@ -1068,7 +1054,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. 
*/ -static int try_to_unmap_file(struct page *page, enum ttu_flags flags) +static int try_to_unmap_file(struct page *page, int unlock, int migration) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1080,7 +1066,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; unsigned int mlocked = 0; - int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1093,7 +1078,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { - ret = try_to_unmap_one(page, vma, flags); + ret = try_to_unmap_one(page, vma, migration); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -1118,8 +1103,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) ret = SWAP_MLOCK; /* leave mlocked == 0 */ goto out; /* no need to look further */ } - if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && - (vma->vm_flags & VM_LOCKED)) + if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -1153,7 +1137,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && + if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; @@ -1193,7 +1177,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped - * @flags: action and flags + * @migration: migration flag * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. @@ -1204,16 +1188,16 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked. 
*/ -int try_to_unmap(struct page *page, enum ttu_flags flags) +int try_to_unmap(struct page *page, int migration) { int ret; BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, flags); + ret = try_to_unmap_anon(page, 0, migration); else - ret = try_to_unmap_file(page, flags); + ret = try_to_unmap_file(page, 0, migration); if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1238,8 +1222,8 @@ int try_to_munlock(struct page *page) VM_BUG_ON(!PageLocked(page) || PageLRU(page)); if (PageAnon(page)) - return try_to_unmap_anon(page, TTU_MUNLOCK); + return try_to_unmap_anon(page, 1, 0); else - return try_to_unmap_file(page, TTU_MUNLOCK); + return try_to_unmap_file(page, 1, 0); } diff --git a/trunk/mm/shmem.c b/trunk/mm/shmem.c index 98631c26c200..b206a7a32e2a 100644 --- a/trunk/mm/shmem.c +++ b/trunk/mm/shmem.c @@ -1633,8 +1633,8 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - set_page_dirty(page); unlock_page(page); + set_page_dirty(page); page_cache_release(page); return copied; @@ -1971,13 +1971,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s iput(inode); return error; } + unlock_page(page); inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); kunmap_atomic(kaddr, KM_USER0); set_page_dirty(page); - unlock_page(page); page_cache_release(page); } if (dir->i_mode & S_ISGID) @@ -2420,7 +2420,6 @@ static const struct address_space_operations shmem_aops = { .write_end = shmem_write_end, #endif .migratepage = migrate_page, - .error_remove_page = generic_error_remove_page, }; static const struct file_operations shmem_file_operations = { diff --git a/trunk/mm/swapfile.c b/trunk/mm/swapfile.c index 4de7f02f820b..f1bf19daadc6 100644 --- a/trunk/mm/swapfile.c +++ b/trunk/mm/swapfile.c @@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry) struct swap_info_struct *p; struct page *page = NULL; - if (non_swap_entry(entry)) + if (is_migration_entry(entry)) return 1; p = swap_info_get(entry); @@ -2085,7 +2085,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache) int count; bool has_cache; - if (non_swap_entry(entry)) + if (is_migration_entry(entry)) return -EINVAL; type = swp_type(entry); diff --git a/trunk/mm/truncate.c b/trunk/mm/truncate.c index a17b3977cfdf..ccc3ecf7cb98 100644 --- a/trunk/mm/truncate.c +++ b/trunk/mm/truncate.c @@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page); * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 
*/ -static int +static void truncate_complete_page(struct address_space *mapping, struct page *page) { if (page->mapping != mapping) - return -EIO; + return; if (page_has_private(page)) do_invalidatepage(page, 0); @@ -108,7 +106,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) remove_from_page_cache(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ - return 0; } /* @@ -136,51 +135,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return ret; } -int truncate_inode_page(struct address_space *mapping, struct page *page) -{ - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } - return truncate_complete_page(mapping, page); -} - -/* - * Used to get rid of pages on hardware memory corruption. - */ -int generic_error_remove_page(struct address_space *mapping, struct page *page) -{ - if (!mapping) - return -EINVAL; - /* - * Only punch for normal data pages for now. - * Handling other types like directories would need more auditing. - */ - if (!S_ISREG(mapping->host->i_mode)) - return -EIO; - return truncate_inode_page(mapping, page); -} -EXPORT_SYMBOL(generic_error_remove_page); - -/* - * Safely invalidate one page from its pagecache mapping. - * It only drops clean, unused pages. The page must be locked. - * - * Returns 1 if the page is successfully invalidated, otherwise 0. - */ -int invalidate_inode_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - if (!mapping) - return 0; - if (PageDirty(page) || PageWriteback(page)) - return 0; - if (page_mapped(page)) - return 0; - return invalidate_complete_page(mapping, page); -} - /** * truncate_inode_pages - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate @@ -242,7 +196,12 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - truncate_inode_page(mapping, page); + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page_index<<PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } if (page->index > next) next = page->index; next++; + truncate_complete_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -347,8 +311,12 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (lock_failed) continue; - ret += invalidate_inode_page(page); - + if (PageDirty(page) || PageWriteback(page)) + goto unlock; + if (page_mapped(page)) + goto unlock; + ret += invalidate_complete_page(mapping, page); +unlock: unlock_page(page); if (next > end) break; diff --git a/trunk/mm/vmscan.c b/trunk/mm/vmscan.c index 1219ceb8a9b2..613e89f471d9 100644 --- a/trunk/mm/vmscan.c +++ b/trunk/mm/vmscan.c @@ -663,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here.
*/ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, TTU_UNMAP)) { + switch (try_to_unmap(page, 0)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1836,45 +1836,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - unsigned int swappiness, - struct zone *zone, int nid) -{ - struct scan_control sc = { - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, - .swappiness = swappiness, - .order = 0, - .mem_cgroup = mem, - .isolate_pages = mem_cgroup_isolate_pages, - }; - nodemask_t nm = nodemask_of_node(nid); - - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - sc.nodemask = &nm; - sc.nr_reclaimed = 0; - sc.nr_scanned = 0; - /* - * NOTE: Although we can get the priority field, using it - * here is not a good idea, since it limits the pages we can scan. - * if we don't reclaim here, the shrink_zone from balance_pgdat - * will pick up pages from other mem cgroup's as well. We hack - * the priority and make it zero. - */ - shrink_zone(0, zone, &sc); - return sc.nr_reclaimed; -} - unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, bool noswap, unsigned int swappiness) { - struct zonelist *zonelist; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, @@ -1886,6 +1852,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .isolate_pages = mem_cgroup_isolate_pages, .nodemask = NULL, /* we don't care the placement */ }; + struct zonelist *zonelist; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2007,7 +1974,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; - int nid, zid; if (!populated_zone(zone)) continue; @@ -2022,15 +1988,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) temp_priority[i] = priority; sc.nr_scanned = 0; note_zone_scanning_priority(zone, priority); - - nid = pgdat->node_id; - zid = zone_idx(zone); - /* - * Call soft limit reclaim before calling shrink_zone. - * For now we ignore the return value - */ - mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, - nid, zid); /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. 
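The restored try_to_unmap(page, migration) interface above returns the SWAP_* codes documented in mm/rmap.c, and shrink_page_list() simply switches on them. A minimal sketch of a caller under that convention (reclaim_one() is a hypothetical wrapper, not a function in this tree):

static int reclaim_one(struct page *page)
{
	BUG_ON(!PageLocked(page));	/* try_to_unmap() requires the page lock */

	if (!page_mapped(page))
		return SWAP_SUCCESS;

	switch (try_to_unmap(page, 0)) {	/* 0: ordinary reclaim, not migration */
	case SWAP_SUCCESS:
		return SWAP_SUCCESS;	/* all ptes removed; page can be freed */
	case SWAP_AGAIN:
		return SWAP_AGAIN;	/* transient failure; retry on a later pass */
	case SWAP_MLOCK:
		return SWAP_MLOCK;	/* hit a VM_LOCKED vma; cull to the unevictable list */
	default:
		return SWAP_FAIL;	/* unswappable; keep the page active */
	}
}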
@@ -2844,10 +2801,10 @@ static void scan_all_zones_unevictable_pages(void) unsigned long scan_unevictable_pages; int scan_unevictable_handler(struct ctl_table *table, int write, - void __user *buffer, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_doulongvec_minmax(table, write, buffer, length, ppos); + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); if (write && *(unsigned long *)table->data) scan_all_zones_unevictable_pages(); diff --git a/trunk/net/bridge/br_netfilter.c b/trunk/net/bridge/br_netfilter.c index a16a2342f6bf..907a82e9023d 100644 --- a/trunk/net/bridge/br_netfilter.c +++ b/trunk/net/bridge/br_netfilter.c @@ -965,12 +965,12 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = { #ifdef CONFIG_SYSCTL static -int brnf_sysctl_call_tables(ctl_table * ctl, int write, +int brnf_sysctl_call_tables(ctl_table * ctl, int write, struct file *filp, void __user * buffer, size_t * lenp, loff_t * ppos) { int ret; - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write && *(int *)(ctl->data)) *(int *)(ctl->data) = 1; diff --git a/trunk/net/decnet/dn_dev.c b/trunk/net/decnet/dn_dev.c index 6e1f085db06a..1c6a5bb6f0c8 100644 --- a/trunk/net/decnet/dn_dev.c +++ b/trunk/net/decnet/dn_dev.c @@ -164,7 +164,7 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(ctl_table *, int, +static int dn_forwarding_proc(ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); static int dn_forwarding_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, @@ -274,6 +274,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } static int dn_forwarding_proc(ctl_table *table, int write, + struct file *filep, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -289,7 +290,7 @@ static int dn_forwarding_proc(ctl_table *table, int write, dn_db = dev->dn_ptr; old = dn_db->parms.forwarding; - err = proc_dointvec(table, write, buffer, lenp, ppos); + err = proc_dointvec(table, write, filep, buffer, lenp, ppos); if ((err >= 0) && write) { if (dn_db->parms.forwarding < 0) diff --git a/trunk/net/decnet/sysctl_net_decnet.c b/trunk/net/decnet/sysctl_net_decnet.c index 26b0ab1e9f56..5bcd592ae6dd 100644 --- a/trunk/net/decnet/sysctl_net_decnet.c +++ b/trunk/net/decnet/sysctl_net_decnet.c @@ -165,6 +165,7 @@ static int dn_node_address_strategy(ctl_table *table, } static int dn_node_address_handler(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -275,6 +276,7 @@ static int dn_def_dev_strategy(ctl_table *table, static int dn_def_dev_handler(ctl_table *table, int write, + struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/trunk/net/ipv4/devinet.c b/trunk/net/ipv4/devinet.c index e92f1fd28aa5..07336c6201f0 100644 --- a/trunk/net/ipv4/devinet.c +++ b/trunk/net/ipv4/devinet.c @@ -1270,10 +1270,10 @@ static void inet_forward_change(struct net *net) } static int devinet_conf_proc(ctl_table *ctl, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write) { struct ipv4_devconf *cnf = ctl->extra1; @@ -1342,12 +1342,12 @@ static int devinet_conf_sysctl(ctl_table 
*table, } static int devinet_sysctl_forward(ctl_table *ctl, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; - int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write && *valp != val) { struct net *net = ctl->extra2; @@ -1372,12 +1372,12 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, } int ipv4_doint_and_flush(ctl_table *ctl, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; - int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) diff --git a/trunk/net/ipv4/route.c b/trunk/net/ipv4/route.c index bb4199252026..df9347314538 100644 --- a/trunk/net/ipv4/route.c +++ b/trunk/net/ipv4/route.c @@ -3036,7 +3036,7 @@ void ip_rt_multicast_event(struct in_device *in_dev) #ifdef CONFIG_SYSCTL static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, - void __user *buffer, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { @@ -3046,7 +3046,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, memcpy(&ctl, __ctl, sizeof(ctl)); ctl.data = &flush_delay; - proc_dointvec(&ctl, write, buffer, lenp, ppos); + proc_dointvec(&ctl, write, filp, buffer, lenp, ppos); net = (struct net *)__ctl->extra1; rt_cache_flush(net, flush_delay); @@ -3106,11 +3106,12 @@ static void rt_secret_reschedule(int old) } static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int old = ip_rt_secret_interval; - int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); + int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos); rt_secret_reschedule(old); diff --git a/trunk/net/ipv4/sysctl_net_ipv4.c b/trunk/net/ipv4/sysctl_net_ipv4.c index 2dcf04d9b005..4710d219f06a 100644 --- a/trunk/net/ipv4/sysctl_net_ipv4.c +++ b/trunk/net/ipv4/sysctl_net_ipv4.c @@ -36,7 +36,7 @@ static void set_local_port_range(int range[2]) } /* Validate changes from /proc interface. 
*/ -static int ipv4_local_port_range(ctl_table *table, int write, +static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -51,7 +51,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, }; inet_get_local_port_range(range, range + 1); - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); if (write && ret == 0) { if (range[1] < range[0]) @@ -91,7 +91,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, } -static int proc_tcp_congestion_control(ctl_table *ctl, int write, +static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { char val[TCP_CA_NAME_MAX]; @@ -103,7 +103,7 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, tcp_get_default_congestion_control(val); - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_default_congestion_control(val); return ret; @@ -129,7 +129,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table, } static int proc_tcp_available_congestion_control(ctl_table *ctl, - int write, + int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -140,13 +140,13 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl, if (!tbl.data) return -ENOMEM; tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX); - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_allowed_congestion_control(ctl_table *ctl, - int write, + int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -158,7 +158,7 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return -ENOMEM; tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen); - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_allowed_congestion_control(tbl.data); kfree(tbl.data); diff --git a/trunk/net/ipv6/addrconf.c b/trunk/net/ipv6/addrconf.c index 1fd0a3d775d2..55f486d89c88 100644 --- a/trunk/net/ipv6/addrconf.c +++ b/trunk/net/ipv6/addrconf.c @@ -3986,14 +3986,14 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL static -int addrconf_sysctl_forward(ctl_table *ctl, int write, +int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret; - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write) ret = addrconf_fixup_forwarding(ctl, valp, val); @@ -4090,14 +4090,14 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old) } static -int addrconf_sysctl_disable(ctl_table *ctl, int write, +int addrconf_sysctl_disable(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret; - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write) ret = addrconf_disable_ipv6(ctl, valp, val); diff --git a/trunk/net/ipv6/ndisc.c b/trunk/net/ipv6/ndisc.c index 498b9b0b0fad..7015478797f6 100644 --- 
a/trunk/net/ipv6/ndisc.c +++ b/trunk/net/ipv6/ndisc.c @@ -1735,7 +1735,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; @@ -1746,16 +1746,16 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *bu ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); if (strcmp(ctl->procname, "retrans_time") == 0) - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = proc_dointvec_jiffies(ctl, write, - buffer, lenp, ppos); + filp, buffer, lenp, ppos); else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) ret = proc_dointvec_ms_jiffies(ctl, write, - buffer, lenp, ppos); + filp, buffer, lenp, ppos); else ret = -1; diff --git a/trunk/net/ipv6/route.c b/trunk/net/ipv6/route.c index d6fe7646a8ff..77aecbe8ff6c 100644 --- a/trunk/net/ipv6/route.c +++ b/trunk/net/ipv6/route.c @@ -2524,13 +2524,13 @@ static const struct file_operations rt6_stats_seq_fops = { #ifdef CONFIG_SYSCTL static -int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, +int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; int delay = net->ipv6.sysctl.flush_delay; if (write) { - proc_dointvec(ctl, write, buffer, lenp, ppos); + proc_dointvec(ctl, write, filp, buffer, lenp, ppos); fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net); return 0; } else diff --git a/trunk/net/irda/irsysctl.c b/trunk/net/irda/irsysctl.c index 5c86567e5a78..57f8817c3979 100644 --- a/trunk/net/irda/irsysctl.c +++ b/trunk/net/irda/irsysctl.c @@ -73,12 +73,12 @@ static int min_lap_keepalive_time = 100; /* 100us */ /* For other sysctl, I've no idea of the range. 
Maybe Dag could help * us on that - Jean II */ -static int do_devname(ctl_table *table, int write, +static int do_devname(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dostring(table, write, buffer, lenp, ppos); + ret = proc_dostring(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write) { struct ias_value *val; @@ -90,12 +90,12 @@ static int do_devname(ctl_table *table, int write, } -static int do_discovery(ctl_table *table, int write, +static int do_discovery(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); if (ret) return ret; diff --git a/trunk/net/netfilter/ipvs/ip_vs_ctl.c b/trunk/net/netfilter/ipvs/ip_vs_ctl.c index 446e9bd4b4bc..fba2892b99e1 100644 --- a/trunk/net/netfilter/ipvs/ip_vs_ctl.c +++ b/trunk/net/netfilter/ipvs/ip_vs_ctl.c @@ -1496,14 +1496,14 @@ static int ip_vs_zero_all(void) static int -proc_do_defense_mode(ctl_table *table, int write, +proc_do_defense_mode(ctl_table *table, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; int rc; - rc = proc_dointvec(table, write, buffer, lenp, ppos); + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); if (write && (*valp != val)) { if ((*valp < 0) || (*valp > 3)) { /* Restore the correct value */ @@ -1517,7 +1517,7 @@ proc_do_defense_mode(ctl_table *table, int write, static int -proc_do_sync_threshold(ctl_table *table, int write, +proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1527,7 +1527,7 @@ proc_do_sync_threshold(ctl_table *table, int write, /* backup the value first */ memcpy(val, valp, sizeof(val)); - rc = proc_dointvec(table, write, buffer, lenp, ppos); + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { /* Restore the correct value */ memcpy(valp, val, sizeof(val)); diff --git a/trunk/net/netfilter/nf_log.c b/trunk/net/netfilter/nf_log.c index c93494fef8ef..4e620305f28c 100644 --- a/trunk/net/netfilter/nf_log.c +++ b/trunk/net/netfilter/nf_log.c @@ -226,7 +226,7 @@ static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; static struct ctl_table_header *nf_log_dir_header; -static int nf_log_proc_dostring(ctl_table *table, int write, +static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; @@ -260,7 +260,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, table->data = "NONE"; else table->data = logger->name; - r = proc_dostring(table, write, buffer, lenp, ppos); + r = proc_dostring(table, write, filp, buffer, lenp, ppos); mutex_unlock(&nf_log_mutex); } diff --git a/trunk/net/phonet/sysctl.c b/trunk/net/phonet/sysctl.c index 2220f3322326..7b5749ee2765 100644 --- a/trunk/net/phonet/sysctl.c +++ b/trunk/net/phonet/sysctl.c @@ -56,7 +56,7 @@ void phonet_get_local_port_range(int *min, int *max) } while (read_seqretry(&local_port_range_lock, seq)); } -static int proc_local_port_range(ctl_table *table, int write, +static int proc_local_port_range(ctl_table *table, int write, struct file *filp, void __user *buffer, 
size_t *lenp, loff_t *ppos) { @@ -70,7 +70,7 @@ static int proc_local_port_range(ctl_table *table, int write, .extra2 = &local_port_range_max, }; - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); if (write && ret == 0) { if (range[1] < range[0]) diff --git a/trunk/net/sunrpc/sysctl.c b/trunk/net/sunrpc/sysctl.c index 42f9748ae093..5231f7aaac0e 100644 --- a/trunk/net/sunrpc/sysctl.c +++ b/trunk/net/sunrpc/sysctl.c @@ -56,7 +56,7 @@ rpc_unregister_sysctl(void) } } -static int proc_do_xprt(ctl_table *table, int write, +static int proc_do_xprt(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; @@ -71,7 +71,7 @@ static int proc_do_xprt(ctl_table *table, int write, } static int -proc_dodebug(ctl_table *table, int write, +proc_dodebug(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[20], c, *s; diff --git a/trunk/net/sunrpc/xprtrdma/svc_rdma.c b/trunk/net/sunrpc/xprtrdma/svc_rdma.c index 35fb68b9c8ec..87101177825b 100644 --- a/trunk/net/sunrpc/xprtrdma/svc_rdma.c +++ b/trunk/net/sunrpc/xprtrdma/svc_rdma.c @@ -80,7 +80,7 @@ struct kmem_cache *svc_rdma_ctxt_cachep; * current value. */ static int read_reset_stat(ctl_table *table, int write, - void __user *buffer, size_t *lenp, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { atomic_t *stat = (atomic_t *)table->data; diff --git a/trunk/security/device_cgroup.c b/trunk/security/device_cgroup.c index 6cf8fd2b79e8..b8186bac8b7e 100644 --- a/trunk/security/device_cgroup.c +++ b/trunk/security/device_cgroup.c @@ -61,8 +61,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) struct cgroup_subsys devices_subsys; static int devcgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task, - bool threadgroup) + struct cgroup *new_cgroup, struct task_struct *task) { if (current != task && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/trunk/security/min_addr.c b/trunk/security/min_addr.c index c844eed7915d..14cc7b3b8d03 100644 --- a/trunk/security/min_addr.c +++ b/trunk/security/min_addr.c @@ -28,12 +28,12 @@ static void update_mmap_min_addr(void) * sysctl handler which just sets dac_mmap_min_addr = the new value and then * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly */ -int mmap_min_addr_handler(struct ctl_table *table, int write, +int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); update_mmap_min_addr(); diff --git a/trunk/security/selinux/hooks.c b/trunk/security/selinux/hooks.c index bb230d5d7085..417f7c994522 100644 --- a/trunk/security/selinux/hooks.c +++ b/trunk/security/selinux/hooks.c @@ -2411,7 +2411,7 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) /* Wake up the parent if it is waiting so that it can recheck * wait permission to the new task SID. */ read_lock(&tasklist_lock); - __wake_up_parent(current, current->real_parent); + wake_up_interruptible(&current->real_parent->signal->wait_chldexit); read_unlock(&tasklist_lock); }
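Every sysctl handler touched by this series follows the same restored six-argument shape: accept the struct file * and forward it to the proc_do* helper, then apply whatever side effect the value change requires. A minimal sketch of that pattern, with my_value and my_sysctl_handler as hypothetical names rather than symbols from this tree:

static int my_value;

static int my_sysctl_handler(ctl_table *table, int write, struct file *filp,
			     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
	if (write && ret == 0) {
		/* react to the new value in table->data, e.g. kick a worker */
	}
	return ret;
}

The handler is wired up through a ctl_table entry whose .data points at my_value and whose .proc_handler points at my_sysctl_handler, the same wiring the devinet and nf_log handlers above rely on.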