diff --git a/[refs] b/[refs] index e3efa24b6c47..db3e9e42b222 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 5f0b2976cb2b62668a076f54419c24b8ab677167 +refs/heads/master: 5f78e4d33945b291d12765cdd7e4304f437b9361 diff --git a/trunk/Documentation/DMA-API.txt b/trunk/Documentation/DMA-API.txt index b939ebb62871..80d150458c80 100644 --- a/trunk/Documentation/DMA-API.txt +++ b/trunk/Documentation/DMA-API.txt @@ -145,7 +145,7 @@ Part Ic - DMA addressing limitations int dma_supported(struct device *dev, u64 mask) int -pci_dma_supported(struct device *dev, u64 mask) +pci_dma_supported(struct pci_dev *hwdev, u64 mask) Checks to see if the device can support DMA to the memory described by mask. @@ -189,7 +189,7 @@ dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) dma_addr_t -pci_map_single(struct device *dev, void *cpu_addr, size_t size, +pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size, int direction) Maps a piece of processor virtual memory so it can be accessed by the @@ -395,6 +395,71 @@ Notes: You must do this: See also dma_map_single(). +dma_addr_t +dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) + +void +dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) + +int +dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) + +void +dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) + +The four functions above are just like the counterpart functions +without the _attrs suffixes, except that they pass an optional +struct dma_attrs*. + +struct dma_attrs encapsulates a set of "dma attributes". For the +definition of struct dma_attrs see linux/dma-attrs.h. + +The interpretation of dma attributes is architecture-specific, and +each attribute should be documented in Documentation/DMA-attributes.txt. + +If struct dma_attrs* is NULL, the semantics of each of these +functions is identical to those of the corresponding function +without the _attrs suffix. As a result dma_map_single_attrs() +can generally replace dma_map_single(), etc. + +As an example of the use of the *_attrs functions, here's how +you could pass an attribute DMA_ATTR_FOO when mapping memory +for DMA: + +#include +/* DMA_ATTR_FOO should be defined in linux/dma-attrs.h and + * documented in Documentation/DMA-attributes.txt */ +... + + DEFINE_DMA_ATTRS(attrs); + dma_set_attr(DMA_ATTR_FOO, &attrs); + .... + n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, &attr); + .... + +Architectures that care about DMA_ATTR_FOO would check for its +presence in their implementations of the mapping and unmapping +routines, e.g.: + +void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + .... + int foo = dma_get_attr(DMA_ATTR_FOO, attrs); + .... + if (foo) + /* twizzle the frobnozzle */ + .... + Part II - Advanced dma_ usage ----------------------------- diff --git a/trunk/Documentation/DMA-attributes.txt b/trunk/Documentation/DMA-attributes.txt new file mode 100644 index 000000000000..6d772f84b477 --- /dev/null +++ b/trunk/Documentation/DMA-attributes.txt @@ -0,0 +1,24 @@ + DMA attributes + ============== + +This document describes the semantics of the DMA attributes that are +defined in linux/dma-attrs.h. + +DMA_ATTR_WRITE_BARRIER +---------------------- + +DMA_ATTR_WRITE_BARRIER is a (write) barrier attribute for DMA. DMA +to a memory region with the DMA_ATTR_WRITE_BARRIER attribute forces +all pending DMA writes to complete, and thus provides a mechanism to +strictly order DMA from a device across all intervening busses and +bridges. This barrier is not specific to a particular type of +interconnect, it applies to the system as a whole, and so its +implementation must account for the idiosyncracies of the system all +the way from the DMA device to memory. + +As an example of a situation where DMA_ATTR_WRITE_BARRIER would be +useful, suppose that a device does a DMA write to indicate that data is +ready and available in memory. The DMA of the "completion indication" +could race with data DMA. Mapping the memory used for completion +indications with DMA_ATTR_WRITE_BARRIER would prevent the race. + diff --git a/trunk/Documentation/DocBook/kernel-api.tmpl b/trunk/Documentation/DocBook/kernel-api.tmpl index 488dd4a4945b..b7b1482f6e04 100644 --- a/trunk/Documentation/DocBook/kernel-api.tmpl +++ b/trunk/Documentation/DocBook/kernel-api.tmpl @@ -119,7 +119,7 @@ X!Ilib/string.c !Elib/string.c Bit Operations -!Iinclude/asm-x86/bitops_32.h +!Iinclude/asm-x86/bitops.h @@ -645,4 +645,58 @@ X!Idrivers/video/console/fonts.c !Edrivers/i2c/i2c-core.c + + Clock Framework + + + The clock framework defines programming interfaces to support + software management of the system clock tree. + This framework is widely used with System-On-Chip (SOC) platforms + to support power management and various devices which may need + custom clock rates. + Note that these "clocks" don't relate to timekeeping or real + time clocks (RTCs), each of which have separate frameworks. + These struct clk instances may be used + to manage for example a 96 MHz signal that is used to shift bits + into and out of peripherals or busses, or otherwise trigger + synchronous state machine transitions in system hardware. + + + + Power management is supported by explicit software clock gating: + unused clocks are disabled, so the system doesn't waste power + changing the state of transistors that aren't in active use. + On some systems this may be backed by hardware clock gating, + where clocks are gated without being disabled in software. + Sections of chips that are powered but not clocked may be able + to retain their last state. + This low power state is often called a retention + mode. + This mode still incurs leakage currents, especially with finer + circuit geometries, but for CMOS circuits power is mostly used + by clocked state changes. + + + + Power-aware drivers only enable their clocks when the device + they manage is in active use. Also, system sleep states often + differ according to which clock domains are active: while a + "standby" state may allow wakeup from several active domains, a + "mem" (suspend-to-RAM) state may require a more wholesale shutdown + of clocks derived from higher speed PLLs and oscillators, limiting + the number of possible wakeup event sources. A driver's suspend + method may need to be aware of system-specific clock constraints + on the target sleep state. + + + + Some platforms support programmable clock generators. These + can be used by external chips of various kinds, such as other + CPUs, multimedia codecs, and devices with strict requirements + for interface clocking. + + +!Iinclude/linux/clk.h + + diff --git a/trunk/Documentation/cgroups.txt b/trunk/Documentation/cgroups.txt index 31d12e21ff8a..c298a6690e0d 100644 --- a/trunk/Documentation/cgroups.txt +++ b/trunk/Documentation/cgroups.txt @@ -500,8 +500,7 @@ post-attachment activity that requires memory allocations or blocking. void fork(struct cgroup_subsy *ss, struct task_struct *task) -Called when a task is forked into a cgroup. Also called during -registration for all existing tasks. +Called when a task is forked into a cgroup. void exit(struct cgroup_subsys *ss, struct task_struct *task) diff --git a/trunk/Documentation/controllers/devices.txt b/trunk/Documentation/controllers/devices.txt new file mode 100644 index 000000000000..4dcea42432c2 --- /dev/null +++ b/trunk/Documentation/controllers/devices.txt @@ -0,0 +1,48 @@ +Device Whitelist Controller + +1. Description: + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied its parent. However +when a device access is removed from a parent it will not also be +removed from the child(ren). + +2. User Interface + +An entry is added using devices.allow, and removed using +devices.deny. For instance + + echo 'c 1:3 mr' > /cgroups/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing + + echo a > /cgroups/1/devices.deny + +will remove the default 'a *:* mrw' entry. + +3. Security + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendent of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. diff --git a/trunk/Documentation/controllers/resource_counter.txt b/trunk/Documentation/controllers/resource_counter.txt new file mode 100644 index 000000000000..f196ac1d7d25 --- /dev/null +++ b/trunk/Documentation/controllers/resource_counter.txt @@ -0,0 +1,181 @@ + + The Resource Counter + +The resource counter, declared at include/linux/res_counter.h, +is supposed to facilitate the resource management by controllers +by providing common stuff for accounting. + +This "stuff" includes the res_counter structure and routines +to work with it. + + + +1. Crucial parts of the res_counter structure + + a. unsigned long long usage + + The usage value shows the amount of a resource that is consumed + by a group at a given time. The units of measurement should be + determined by the controller that uses this counter. E.g. it can + be bytes, items or any other unit the controller operates on. + + b. unsigned long long max_usage + + The maximal value of the usage over time. + + This value is useful when gathering statistical information about + the particular group, as it shows the actual resource requirements + for a particular group, not just some usage snapshot. + + c. unsigned long long limit + + The maximal allowed amount of resource to consume by the group. In + case the group requests for more resources, so that the usage value + would exceed the limit, the resource allocation is rejected (see + the next section). + + d. unsigned long long failcnt + + The failcnt stands for "failures counter". This is the number of + resource allocation attempts that failed. + + c. spinlock_t lock + + Protects changes of the above values. + + + +2. Basic accounting routines + + a. void res_counter_init(struct res_counter *rc) + + Initializes the resource counter. As usual, should be the first + routine called for a new counter. + + b. int res_counter_charge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is about to be allocated it has to be accounted + with the appropriate resource counter (controller should determine + which one to use on its own). This operation is called "charging". + + This is not very important which operation - resource allocation + or charging - is performed first, but + * if the allocation is performed first, this may create a + temporary resource over-usage by the time resource counter is + charged; + * if the charging is performed first, then it should be uncharged + on error path (if the one is called). + + c. void res_counter_uncharge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is released (freed) it should be de-accounted + from the resource counter it was accounted to. This is called + "uncharging". + + The _locked routines imply that the res_counter->lock is taken. + + + 2.1 Other accounting routines + + There are more routines that may help you with common needs, like + checking whether the limit is reached or resetting the max_usage + value. They are all declared in include/linux/res_counter.h. + + + +3. Analyzing the resource counter registrations + + a. If the failcnt value constantly grows, this means that the counter's + limit is too tight. Either the group is misbehaving and consumes too + many resources, or the configuration is not suitable for the group + and the limit should be increased. + + b. The max_usage value can be used to quickly tune the group. One may + set the limits to maximal values and either load the container with + a common pattern or leave one for a while. After this the max_usage + value shows the amount of memory the container would require during + its common activity. + + Setting the limit a bit above this value gives a pretty good + configuration that works in most of the cases. + + c. If the max_usage is much less than the limit, but the failcnt value + is growing, then the group tries to allocate a big chunk of resource + at once. + + d. If the max_usage is much less than the limit, but the failcnt value + is 0, then this group is given too high limit, that it does not + require. It is better to lower the limit a bit leaving more resource + for other groups. + + + +4. Communication with the control groups subsystem (cgroups) + +All the resource controllers that are using cgroups and resource counters +should provide files (in the cgroup filesystem) to work with the resource +counter fields. They are recommended to adhere to the following rules: + + a. File names + + Field name File name + --------------------------------------------------- + usage usage_in_ + max_usage max_usage_in_ + limit limit_in_ + failcnt failcnt + lock no file :) + + b. Reading from file should show the corresponding field value in the + appropriate format. + + c. Writing to file + + Field Expected behavior + ---------------------------------- + usage prohibited + max_usage reset to usage + limit set the limit + failcnt reset to zero + + + +5. Usage example + + a. Declare a task group (take a look at cgroups subsystem for this) and + fold a res_counter into it + + struct my_group { + struct res_counter res; + + + } + + b. Put hooks in resource allocation/release paths + + int alloc_something(...) + { + if (res_counter_charge(res_counter_ptr, amount) < 0) + return -ENOMEM; + + + } + + void release_something(...) + { + res_counter_uncharge(res_counter_ptr, amount); + + + } + + In order to keep the usage value self-consistent, both the + "res_counter_ptr" and the "amount" in release_something() should be + the same as they were in the alloc_something() when the releasing + resource was allocated. + + c. Provide the way to read res_counter values and set them (the cgroups + still can help with it). + + c. Compile and run :) diff --git a/trunk/Documentation/cpu-freq/user-guide.txt b/trunk/Documentation/cpu-freq/user-guide.txt index af3b925ece08..6c442d8426b5 100644 --- a/trunk/Documentation/cpu-freq/user-guide.txt +++ b/trunk/Documentation/cpu-freq/user-guide.txt @@ -154,6 +154,11 @@ scaling_governor, and by "echoing" the name of another that some governors won't load - they only work on some specific architectures or processors. + +cpuinfo_cur_freq : Current speed of the CPU, in KHz. + +scaling_available_frequencies : List of available frequencies, in KHz. + scaling_min_freq and scaling_max_freq show the current "policy limits" (in kHz). By echoing new values into these @@ -162,6 +167,15 @@ scaling_max_freq show the current "policy limits" (in first set scaling_max_freq, then scaling_min_freq. +affected_cpus : List of CPUs that require software coordination + of frequency. + +related_cpus : List of CPUs that need some sort of frequency + coordination, whether software or hardware. + +scaling_driver : Hardware driver for cpufreq. + +scaling_cur_freq : Current frequency of the CPU, in KHz. If you have selected the "userspace" governor which allows you to set the CPU operating frequency to a specific value, you can read out diff --git a/trunk/Documentation/cpusets.txt b/trunk/Documentation/cpusets.txt index aa854b9b18cd..fb7b361e6eea 100644 --- a/trunk/Documentation/cpusets.txt +++ b/trunk/Documentation/cpusets.txt @@ -171,6 +171,7 @@ files describing that cpuset: - memory_migrate flag: if set, move pages to cpusets nodes - cpu_exclusive flag: is cpu placement exclusive? - mem_exclusive flag: is memory placement exclusive? + - mem_hardwall flag: is memory allocation hardwalled - memory_pressure: measure of how much paging pressure in cpuset In addition, the root cpuset only has the following file: @@ -222,17 +223,18 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct ancestor or descendent, may share any of the same CPUs or Memory Nodes. -A cpuset that is mem_exclusive restricts kernel allocations for -page, buffer and other data commonly shared by the kernel across -multiple users. All cpusets, whether mem_exclusive or not, restrict -allocations of memory for user space. This enables configuring a -system so that several independent jobs can share common kernel data, -such as file system pages, while isolating each jobs user allocation in -its own cpuset. To do this, construct a large mem_exclusive cpuset to -hold all the jobs, and construct child, non-mem_exclusive cpusets for -each individual job. Only a small amount of typical kernel memory, -such as requests from interrupt handlers, is allowed to be taken -outside even a mem_exclusive cpuset. +A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. 1.5 What is memory_pressure ? @@ -707,7 +709,7 @@ Now you want to do something with this cpuset. In this directory you can find several files: # ls -cpus cpu_exclusive mems mem_exclusive tasks +cpus cpu_exclusive mems mem_exclusive mem_hardwall tasks Reading them will give you information about the state of this cpuset: the CPUs and Memory Nodes it can use, the processes that are using diff --git a/trunk/Documentation/dontdiff b/trunk/Documentation/dontdiff index 354aec047c0e..881e6dd03aea 100644 --- a/trunk/Documentation/dontdiff +++ b/trunk/Documentation/dontdiff @@ -141,6 +141,7 @@ mkprep mktables mktree modpost +modules.order modversions.h* offset.h offsets.h @@ -171,6 +172,7 @@ sm_tbl* split-include tags tftpboot.img +timeconst.h times.h* tkparse trix_boot.h diff --git a/trunk/Documentation/fb/gxfb.txt b/trunk/Documentation/fb/gxfb.txt new file mode 100644 index 000000000000..2f640903bbb2 --- /dev/null +++ b/trunk/Documentation/fb/gxfb.txt @@ -0,0 +1,52 @@ +[This file is cloned from VesaFB/aty128fb] + +What is gxfb? +================= + +This is a graphics framebuffer driver for AMD Geode GX2 based processors. + +Advantages: + + * No need to use AMD's VSA code (or other VESA emulation layer) in the + BIOS. + * It provides a nice large console (128 cols + 48 lines with 1024x768) + without using tiny, unreadable fonts. + * You can run XF68_FBDev on top of /dev/fb0 + * Most important: boot logo :-) + +Disadvantages: + + * graphic mode is slower than text mode... + + +How to use it? +============== + +Switching modes is done using gxfb.mode_option=... boot +parameter or using `fbset' program. + +See Documentation/fb/modedb.txt for more information on modedb +resolutions. + + +X11 +=== + +XF68_FBDev should generally work fine, but it is non-accelerated. + + +Configuration +============= + +You can pass kernel command line options to gxfb with gxfb.