writeback: dirty ratelimit - think time compensation
Compensate the task's think time when computing the final pause time, so
that ->dirty_ratelimit can be enforced accurately.

        think time := time spent outside of balance_dirty_pages()

In the rare case that the task slept longer than the 200ms period time
(resulting in a negative pause time), the sleep time will also be
compensated in the following periods, provided it is less than 1 second.

Accumulated errors are carefully avoided as long as the max pause area
is not hit.

Pseudo code:

        period = HZ * pages_dirtied / task_ratelimit;
        think = jiffies - dirty_paused_when;
        pause = period - think;

1) normal case: period > think

        pause = period - think
        dirty_paused_when = jiffies + pause
        nr_dirtied = 0

                             period time
              |===============================>|
                  think time      pause time
              |===============>|==============>|
        ------|----------------|---------------|------------------------
        dirty_paused_when   jiffies

2) no pause case: period <= think

        don't pause; reduce future pause time by:
        dirty_paused_when += period
        nr_dirtied = 0

                           period time
              |===============================>|
                                  think time
              |===================================================>|
        ------|--------------------------------+-------------------|----
        dirty_paused_when                                       jiffies
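
The two cases collapse into the condensed C sketch below. This is an
illustration only: it folds the logic together and leaves out the freerun,
max_pause and tracepoint handling of the real balance_dirty_pages() loop;
pages_dirtied and task_ratelimit stand for the values computed earlier in
that loop.

        unsigned long now = jiffies;
        long period = HZ * pages_dirtied / task_ratelimit;
        long pause = period;

        if (current->dirty_paused_when)
                pause -= now - current->dirty_paused_when; /* subtract think time */

        if (pause > 0) {                        /* 1) normal case */
                __set_current_state(TASK_KILLABLE);
                io_schedule_timeout(pause);
                current->dirty_paused_when = now + pause;
        } else if (pause >= -HZ) {              /* 2) no pause: carry the debt forward */
                current->dirty_paused_when += period;
        } else {                                /* slept longer than 1s: just reset */
                current->dirty_paused_when = now;
        }
        current->nr_dirtied = 0;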

Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Wu Fengguang committed Dec 18, 2011
1 parent 32c7f20 commit 8371235
Showing 4 changed files with 45 additions and 7 deletions.
1 change: 1 addition & 0 deletions include/linux/sched.h
@@ -1527,6 +1527,7 @@ struct task_struct {
*/
int nr_dirtied;
int nr_dirtied_pause;
unsigned long dirty_paused_when; /* start of a write-and-pause period */

#ifdef CONFIG_LATENCYTOP
int latency_record_count;
14 changes: 11 additions & 3 deletions include/trace/events/writeback.h
@@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages,
unsigned long dirty_ratelimit,
unsigned long task_ratelimit,
unsigned long dirtied,
unsigned long period,
long pause,
unsigned long start_time),

TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
dirty_ratelimit, task_ratelimit,
dirtied, pause, start_time),
dirtied, period, pause, start_time),

TP_STRUCT__entry(
__array( char, bdi, 32)
@@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages,
__field(unsigned int, dirtied_pause)
__field(unsigned long, paused)
__field( long, pause)
__field(unsigned long, period)
__field( long, think)
),

TP_fast_assign(
@@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages,
__entry->task_ratelimit = KBps(task_ratelimit);
__entry->dirtied = dirtied;
__entry->dirtied_pause = current->nr_dirtied_pause;
__entry->think = current->dirty_paused_when == 0 ? 0 :
(long)(jiffies - current->dirty_paused_when) * 1000/HZ;
__entry->period = period * 1000 / HZ;
__entry->pause = pause * 1000 / HZ;
__entry->paused = (jiffies - start_time) * 1000 / HZ;
),
@@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages,
"bdi_setpoint=%lu bdi_dirty=%lu "
"dirty_ratelimit=%lu task_ratelimit=%lu "
"dirtied=%u dirtied_pause=%u "
"paused=%lu pause=%ld",
"paused=%lu pause=%ld period=%lu think=%ld",
__entry->bdi,
__entry->limit,
__entry->setpoint,
@@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages,
__entry->dirtied,
__entry->dirtied_pause,
__entry->paused, /* ms */
__entry->pause /* ms */
__entry->pause, /* ms */
__entry->period, /* ms */
__entry->think /* ms */
)
);

1 change: 1 addition & 0 deletions kernel/fork.c
@@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,

p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;

/*
* Ok, make it visible to the rest of the system.
36 changes: 32 additions & 4 deletions mm/page-writeback.c
@@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
long period;
long pause = 0;
long uninitialized_var(max_pause);
bool dirty_exceeded = false;
@@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long start_time = jiffies;

for (;;) {
unsigned long now = jiffies;

/*
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
@@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping,
*/
freerun = dirty_freerun_ceiling(dirty_thresh,
background_thresh);
if (nr_dirty <= freerun)
if (nr_dirty <= freerun) {
current->dirty_paused_when = now;
current->nr_dirtied = 0;
break;
}

if (unlikely(!writeback_in_progress(bdi)))
bdi_start_background_writeback(bdi);
@@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping,
task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
RATELIMIT_CALC_SHIFT;
if (unlikely(task_ratelimit == 0)) {
period = max_pause;
pause = max_pause;
goto pause;
}
pause = HZ * pages_dirtied / task_ratelimit;
period = HZ * pages_dirtied / task_ratelimit;
pause = period;
if (current->dirty_paused_when)
pause -= now - current->dirty_paused_when;
/*
* For less than 1s think time (ext3/4 may block the dirtier
* for up to 800ms from time to time on 1-HDD; so does xfs,
* however at much less frequency), try to compensate it in
* future periods by updating the virtual time; otherwise just
* do a reset, as it may be a light dirtier.
*/
if (unlikely(pause <= 0)) {
trace_balance_dirty_pages(bdi,
dirty_thresh,
@@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
period,
pause,
start_time);
if (pause < -HZ) {
current->dirty_paused_when = now;
current->nr_dirtied = 0;
} else if (period) {
current->dirty_paused_when += period;
current->nr_dirtied = 0;
}
pause = 1; /* avoid resetting nr_dirtied_pause below */
break;
}
@@ -1135,11 +1160,15 @@ static void balance_dirty_pages(struct address_space *mapping,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
period,
pause,
start_time);
__set_current_state(TASK_KILLABLE);
io_schedule_timeout(pause);

current->dirty_paused_when = now + pause;
current->nr_dirtied = 0;

/*
* This is typically equal to (nr_dirty < dirty_thresh) and can
* also keep "1000+ dd on a slow USB stick" under control.
@@ -1167,11 +1196,10 @@ static void balance_dirty_pages(struct address_space *mapping,
if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;

current->nr_dirtied = 0;
if (pause == 0) { /* in freerun area */
current->nr_dirtied_pause =
dirty_poll_interval(nr_dirty, dirty_thresh);
} else if (pause <= max_pause / 4 &&
} else if (period <= max_pause / 4 &&
pages_dirtied >= current->nr_dirtied_pause) {
current->nr_dirtied_pause = clamp_val(
dirty_ratelimit * (max_pause / 2) / HZ,
