Skip to content

Commit

Permalink
Merge tag 'timers-core-2025-03-23' of git://git.kernel.org/pub/scm/li…
Browse files Browse the repository at this point in the history
…nux/kernel/git/tip/tip

Pull timer core updates from Thomas Gleixner:

 - Fix a memory ordering issue in posix-timers

   Posix-timer lookup is lockless and reevaluates the timer validity
   under the timer lock, but the update which validates the timer is not
   protected by the timer lock. That allows the store to be reordered
   against the initialization stores, so that the lookup side can
   observe a partially initialized timer. That's mostly a theoretical
   problem, but incorrect nevertheless.

 - Fix a long standing inconsistency of the coarse time getters

   The coarse time getters read the base time of the current update
   cycle without reading the actual hardware clock. NTP frequency
   adjustment can set the base time backwards. The fine grained
   interfaces compensate for this by reading the clock and applying the
   new conversion factor, but the coarse grained time getters use the
   base time directly. That allows the user to observe time going
   backwards.

   Cure it by always forwarding base time, when NTP changes the
   frequency with an immediate step.

 - Rework of posix-timer hashing

   The posix-timer hash is not scalable and, due to the CRIU timer
   restore mechanism, prone to massive contention on the global hash
   bucket lock.

   Replace the global hash lock with a fine grained per bucket locking
   scheme to address that.

 - Rework the proc/$PID/timers interface.

   /proc/$PID/timers is provided for CRIU to be able to restore a timer.
   The printout happens with sighand lock held and interrupts disabled.
   That's not required as this can be done with RCU protection as well.

 - Provide a sane mechanism for CRIU to restore a timer ID

   CRIU restores timers by creating and deleting them until the kernel
   internal per-process ID counter reaches the requested ID. That's
   horribly slow for sparse timer IDs.

   Provide a prctl() which allows CRIU to restore a timer with a given
   ID. When enabled, the ID pointer is used as an input pointer to read
   the requested ID from user space. When disabled, the normal
   allocation scheme (next ID) is active as before. This is backwards
   compatible for both kernel and user space.

 - Make hrtimer_update_function() less expensive.

   The sanity checks are valuable, but expensive for high frequency
   usage in io_uring. Make the debug checks conditional and enable them
   only when lockdep is enabled.

 - Small updates, cleanups and improvements

* tag 'timers-core-2025-03-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (27 commits)
  selftests/timers: Improve skew_consistency by testing with other clockids
  timekeeping: Fix possible inconsistencies in _COARSE clockids
  posix-timers: Drop redundant memset() invocation
  selftests/timers/posix-timers: Add a test for exact allocation mode
  posix-timers: Provide a mechanism to allocate a given timer ID
  posix-timers: Dont iterate /proc/$PID/timers with sighand:: Siglock held
  posix-timers: Make per process list RCU safe
  posix-timers: Avoid false cacheline sharing
  posix-timers: Switch to jhash32()
  posix-timers: Improve hash table performance
  posix-timers: Make signal_struct:: Next_posix_timer_id an atomic_t
  posix-timers: Make lock_timer() use guard()
  posix-timers: Rework timer removal
  posix-timers: Simplify lock/unlock_timer()
  posix-timers: Use guards in a few places
  posix-timers: Remove SLAB_PANIC from kmem cache
  posix-timers: Remove a few paranoid warnings
  posix-timers: Cleanup includes
  posix-timers: Add cond_resched() to posix_timer_add() search loop
  posix-timers: Initialise timer before adding it to the hash table
  ...
  • Loading branch information
Linus Torvalds committed Mar 25, 2025
2 parents 0ae2062 + e40d370 commit d5048d1
Show file tree
Hide file tree
Showing 16 changed files with 524 additions and 379 deletions.
48 changes: 20 additions & 28 deletions fs/proc/base.c
Original file line number Diff line number Diff line change
Expand Up @@ -2494,11 +2494,9 @@ static const struct file_operations proc_map_files_operations = {

#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
struct timers_private {
struct pid *pid;
struct task_struct *task;
struct sighand_struct *sighand;
struct pid_namespace *ns;
unsigned long flags;
struct pid *pid;
struct task_struct *task;
struct pid_namespace *ns;
};

static void *timers_start(struct seq_file *m, loff_t *pos)
Expand All @@ -2509,54 +2507,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
if (!tp->task)
return ERR_PTR(-ESRCH);

tp->sighand = lock_task_sighand(tp->task, &tp->flags);
if (!tp->sighand)
return ERR_PTR(-ESRCH);

return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
rcu_read_lock();
return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos);
}

static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);

return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos);
}

static void timers_stop(struct seq_file *m, void *v)
{
struct timers_private *tp = m->private;

if (tp->sighand) {
unlock_task_sighand(tp->task, &tp->flags);
tp->sighand = NULL;
}

if (tp->task) {
put_task_struct(tp->task);
tp->task = NULL;
rcu_read_unlock();
}
}

static int show_timer(struct seq_file *m, void *v)
{
struct k_itimer *timer;
struct timers_private *tp = m->private;
int notify;
static const char * const nstr[] = {
[SIGEV_SIGNAL] = "signal",
[SIGEV_NONE] = "none",
[SIGEV_THREAD] = "thread",
[SIGEV_SIGNAL] = "signal",
[SIGEV_NONE] = "none",
[SIGEV_THREAD] = "thread",
};

timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
notify = timer->it_sigev_notify;
struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
struct timers_private *tp = m->private;
int notify = timer->it_sigev_notify;

guard(spinlock_irq)(&timer->it_lock);
if (!posixtimer_valid(timer))
return 0;

seq_printf(m, "ID: %d\n", timer->it_id);
seq_printf(m, "signal: %d/%px\n",
timer->sigq.info.si_signo,
seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo,
timer->sigq.info.si_value.sival_ptr);
seq_printf(m, "notify: %s/%s.%d\n",
nstr[notify & ~SIGEV_THREAD_ID],
seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
Expand Down
22 changes: 14 additions & 8 deletions include/linux/cleanup.h
Original file line number Diff line number Diff line change
Expand Up @@ -308,11 +308,21 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond

#define DEFINE_GUARD(_name, _type, _lock, _unlock) \
#define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \
static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
{ return (void *)(__force unsigned long)*(_exp); }

#define DEFINE_CLASS_IS_GUARD(_name) \
__DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
__DEFINE_GUARD_LOCK_PTR(_name, _T)

#define DEFINE_CLASS_IS_COND_GUARD(_name) \
__DEFINE_CLASS_IS_CONDITIONAL(_name, true); \
__DEFINE_GUARD_LOCK_PTR(_name, _T)

#define DEFINE_GUARD(_name, _type, _lock, _unlock) \
DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \
static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
{ return (void *)(__force unsigned long)*_T; }
DEFINE_CLASS_IS_GUARD(_name)

#define DEFINE_GUARD_COND(_name, _ext, _condlock) \
__DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \
Expand Down Expand Up @@ -392,11 +402,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \
if (_T->lock) { _unlock; } \
} \
\
static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
{ \
return (void *)(__force unsigned long)_T->lock; \
}

__DEFINE_GUARD_LOCK_PTR(_name, &_T->lock)

#define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \
static inline class_##_name##_t class_##_name##_constructor(_type *l) \
Expand Down
3 changes: 2 additions & 1 deletion include/linux/hrtimer.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,14 +333,15 @@ static inline int hrtimer_callback_running(struct hrtimer *timer)
static inline void hrtimer_update_function(struct hrtimer *timer,
enum hrtimer_restart (*function)(struct hrtimer *))
{
#ifdef CONFIG_PROVE_LOCKING
guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock);

if (WARN_ON_ONCE(hrtimer_is_queued(timer)))
return;

if (WARN_ON_ONCE(!function))
return;

#endif
timer->function = function;
}

Expand Down
30 changes: 21 additions & 9 deletions include/linux/posix-timers.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sigqueue *q);
void posixtimer_send_sigqueue(struct k_itimer *tmr);
bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
void posixtimer_free_timer(struct k_itimer *timer);
long posixtimer_create_prctl(unsigned long ctrl);

/* Init task static initializer */
#define INIT_CPU_TIMERBASE(b) { \
Expand All @@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
struct sigqueue *timer_sigq) { return false; }
static inline void posixtimer_free_timer(struct k_itimer *timer) { }
static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; }
#endif

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
Expand Down Expand Up @@ -177,23 +179,26 @@ static inline void posix_cputimers_init_work(void) { }
* @rcu: RCU head for freeing the timer.
*/
struct k_itimer {
struct hlist_node list;
struct hlist_node ignored_list;
/* 1st cacheline contains read-mostly fields */
struct hlist_node t_hash;
spinlock_t it_lock;
const struct k_clock *kclock;
clockid_t it_clock;
struct hlist_node list;
timer_t it_id;
clockid_t it_clock;
int it_sigev_notify;
enum pid_type it_pid_type;
struct signal_struct *it_signal;
const struct k_clock *kclock;

/* 2nd cacheline and above contain fields which are modified regularly */
spinlock_t it_lock;
int it_status;
bool it_sig_periodic;
s64 it_overrun;
s64 it_overrun_last;
unsigned int it_signal_seq;
unsigned int it_sigqueue_seq;
int it_sigev_notify;
enum pid_type it_pid_type;
ktime_t it_interval;
struct signal_struct *it_signal;
struct hlist_node ignored_list;
union {
struct pid *it_pid;
struct task_struct *it_process;
Expand All @@ -210,7 +215,7 @@ struct k_itimer {
} alarm;
} it;
struct rcu_head rcu;
};
} ____cacheline_aligned_in_smp;

void run_posix_cpu_timers(void);
void posix_cpu_timers_exit(struct task_struct *task);
Expand Down Expand Up @@ -240,6 +245,13 @@ static inline void posixtimer_sigqueue_putref(struct sigqueue *q)

posixtimer_putref(tmr);
}

/*
 * posixtimer_valid - test the low bit of @timer->it_signal
 *
 * Returns true when bit 0 of the it_signal pointer value is clear and
 * false when it is set.
 *
 * NOTE(review): presumably bit 0 is used as an "invalid" tag on the
 * pointer; the code which sets or clears it is not visible here - confirm.
 */
static inline bool posixtimer_valid(const struct k_itimer *timer)
{
unsigned long val = (unsigned long)timer->it_signal;

return !(val & 0x1UL);
}
#else /* CONFIG_POSIX_TIMERS */
static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { }
static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { }
Expand Down
3 changes: 2 additions & 1 deletion include/linux/sched/signal.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS

/* POSIX.1b Interval Timers */
unsigned int next_posix_timer_id;
unsigned int timer_create_restore_ids:1;
atomic_t next_posix_timer_id;
struct hlist_head posix_timers;
struct hlist_head ignored_posix_timers;

Expand Down
11 changes: 11 additions & 0 deletions include/uapi/linux/prctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -353,4 +353,15 @@ struct prctl_mm_map {
*/
#define PR_LOCK_SHADOW_STACK_STATUS 76

/*
* Controls the mode of timer_create() for CRIU restore operations.
* Enabling this allows CRIU to restore timers with explicit IDs.
*
* Don't use for normal operations as the result might be undefined.
*/
#define PR_TIMER_CREATE_RESTORE_IDS 77
# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
# define PR_TIMER_CREATE_RESTORE_IDS_GET 2

#endif /* _LINUX_PRCTL_H */
2 changes: 1 addition & 1 deletion kernel/signal.c
Original file line number Diff line number Diff line change
Expand Up @@ -2092,7 +2092,7 @@ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueu
* from a non-periodic timer, then just drop the reference
* count. Otherwise queue it on the ignored list.
*/
if (tmr->it_signal && tmr->it_sig_periodic)
if (posixtimer_valid(tmr) && tmr->it_sig_periodic)
hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers);
else
posixtimer_putref(tmr);
Expand Down
5 changes: 5 additions & 0 deletions kernel/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -2815,6 +2815,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = arch_lock_shadow_stack_status(me, arg2);
break;
case PR_TIMER_CREATE_RESTORE_IDS:
if (arg3 || arg4 || arg5)
return -EINVAL;
error = posixtimer_create_prctl(arg2);
break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
Expand Down
2 changes: 1 addition & 1 deletion kernel/time/clocksource.c
Original file line number Diff line number Diff line change
Expand Up @@ -1510,7 +1510,7 @@ static int __init boot_override_clocksource(char* str)
{
mutex_lock(&clocksource_mutex);
if (str)
strscpy(override_name, str, sizeof(override_name));
strscpy(override_name, str);
mutex_unlock(&clocksource_mutex);
return 1;
}
Expand Down
29 changes: 12 additions & 17 deletions kernel/time/hrtimer.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.csd = CSD_INIT(retrigger_next_event, NULL)
};

static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
/* Make sure we catch unsupported clockids */
[0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,

[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};

static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
{
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
Expand Down Expand Up @@ -1587,14 +1577,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)

static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
if (likely(clock_id < MAX_CLOCKS)) {
int base = hrtimer_clock_to_base_table[clock_id];

if (likely(base != HRTIMER_MAX_CLOCK_BASES))
return base;
switch (clock_id) {
case CLOCK_REALTIME:
return HRTIMER_BASE_REALTIME;
case CLOCK_MONOTONIC:
return HRTIMER_BASE_MONOTONIC;
case CLOCK_BOOTTIME:
return HRTIMER_BASE_BOOTTIME;
case CLOCK_TAI:
return HRTIMER_BASE_TAI;
default:
WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
return HRTIMER_BASE_MONOTONIC;
}
WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
return HRTIMER_BASE_MONOTONIC;
}

static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused)
Expand Down
24 changes: 1 addition & 23 deletions kernel/time/posix-clock.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp,
return err;
}

#ifdef CONFIG_COMPAT
static long posix_clock_compat_ioctl(struct file *fp,
unsigned int cmd, unsigned long arg)
{
struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -ENOTTY;

if (!clk)
return -ENODEV;

if (clk->ops.ioctl)
err = clk->ops.ioctl(pccontext, cmd, arg);

put_posix_clock(clk);

return err;
}
#endif

static int posix_clock_open(struct inode *inode, struct file *fp)
{
int err;
Expand Down Expand Up @@ -171,11 +151,9 @@ static const struct file_operations posix_clock_file_operations = {
.read = posix_clock_read,
.poll = posix_clock_poll,
.unlocked_ioctl = posix_clock_ioctl,
.compat_ioctl = posix_clock_ioctl,
.open = posix_clock_open,
.release = posix_clock_release,
#ifdef CONFIG_COMPAT
.compat_ioctl = posix_clock_compat_ioctl,
#endif
};

int posix_clock_register(struct posix_clock *clk, struct device *dev)
Expand Down
Loading

0 comments on commit d5048d1

Please sign in to comment.