Skip to content

Commit

Permalink
sgi-xpc: prevent false heartbeat failures
Browse files Browse the repository at this point in the history
The heartbeat timeout functionality in sgi-xpc is currently not synchronized
with the connection time.  If a connection is made while the code is in the
last polling window before a timeout would be declared, the next polling
window will see the heartbeat as unchanged and initiate a no-heartbeat
disconnect.

Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Dean Nelson <dcn@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  • Loading branch information
Robin Holt authored and Linus Torvalds committed Apr 13, 2009
1 parent a06bba4 commit a374c57
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 152 deletions.
100 changes: 38 additions & 62 deletions drivers/misc/sgi-xp/xpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,18 +90,21 @@ struct xpc_rsvd_page {
short max_npartitions; /* value of XPC_MAX_PARTITIONS */
u8 version;
u8 pad1[3]; /* align to next u64 in 1st 64-byte cacheline */
unsigned long ts_jiffies; /* timestamp when rsvd pg was setup by XPC */
union {
unsigned long vars_pa; /* phys address of struct xpc_vars */
unsigned long activate_gru_mq_desc_gpa; /* phys addr of */
/* activate mq's */
/* gru mq descriptor */
struct {
unsigned long vars_pa; /* phys addr */
} sn2;
struct {
unsigned long heartbeat_gpa; /* phys addr */
unsigned long activate_gru_mq_desc_gpa; /* phys addr */
} uv;
} sn;
unsigned long ts_jiffies; /* timestamp when rsvd pg was setup by XPC */
u64 pad2[10]; /* align to last u64 in 2nd 64-byte cacheline */
u64 pad2[9]; /* align to last u64 in 2nd 64-byte cacheline */
u64 SAL_nasids_size; /* SAL: size of each nasid mask in bytes */
};

#define XPC_RP_VERSION _XPC_VERSION(2, 0) /* version 2.0 of the reserved page */
#define XPC_RP_VERSION _XPC_VERSION(3, 0) /* version 3.0 of the reserved page */

/*
* Define the structures by which XPC variables can be exported to other
Expand Down Expand Up @@ -182,6 +185,17 @@ struct xpc_vars_part_sn2 {
(XPC_RP_MACH_NASIDS(_rp) + \
xpc_nasid_mask_nlongs))


/*
 * The following structure describes the partition's heartbeat info which
 * will be periodically read by other partitions to determine whether this
 * XPC is still 'alive'.
 *
 * NOTE(review): because other partitions read this structure directly, its
 * field order and sizes are presumably part of the cross-partition
 * interface -- confirm before changing the layout.
 */
struct xpc_heartbeat_uv {
/* heartbeat value; NOTE(review): presumably advanced periodically by the */
/* owning partition while it is online -- confirm against the uv code */
unsigned long value;
unsigned long offline; /* if 0, heartbeat should be changing */
};

/*
* Info pertinent to a GRU message queue using a watch list for irq generation.
*/
Expand All @@ -198,7 +212,7 @@ struct xpc_gru_mq_uv {

/*
* The activate_mq is used to send/receive GRU messages that affect XPC's
* heartbeat, partition active state, and channel state. This is UV only.
* partition active state and channel state. This is uv only.
*/
struct xpc_activate_mq_msghdr_uv {
unsigned int gru_msg_hdr; /* FOR GRU INTERNAL USE ONLY */
Expand All @@ -210,33 +224,26 @@ struct xpc_activate_mq_msghdr_uv {

/* activate_mq defined message types */
#define XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV 0
#define XPC_ACTIVATE_MQ_MSG_INC_HEARTBEAT_UV 1
#define XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV 2
#define XPC_ACTIVATE_MQ_MSG_ONLINE_HEARTBEAT_UV 3

#define XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV 4
#define XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV 5
#define XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV 1
#define XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV 2

#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV 6
#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV 7
#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV 8
#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV 9
#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV 3
#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV 4
#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV 5
#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV 6

#define XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV 10
#define XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV 11
#define XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV 7
#define XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV 8

/*
 * Generic activate_mq message: it consists of the common message header
 * only and carries no additional payload.
 */
struct xpc_activate_mq_msg_uv {
struct xpc_activate_mq_msghdr_uv hdr; /* common activate_mq message header */
};

/*
 * activate_mq message consisting of the common message header followed by a
 * 64-bit heartbeat value.
 * NOTE(review): presumably sent with the XPC_ACTIVATE_MQ_MSG_*_HEARTBEAT_UV
 * message types -- confirm against the uv send/receive code.
 */
struct xpc_activate_mq_msg_heartbeat_req_uv {
struct xpc_activate_mq_msghdr_uv hdr; /* common activate_mq message header */
u64 heartbeat; /* heartbeat value carried by the message */
};

/*
 * activate_mq message consisting of the common message header followed by
 * three addresses.
 * NOTE(review): presumably sent with XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV so
 * the receiver can locate the sender's reserved page, heartbeat and activate
 * mq descriptor -- confirm against the uv send/receive code.
 */
struct xpc_activate_mq_msg_activate_req_uv {
struct xpc_activate_mq_msghdr_uv hdr; /* common activate_mq message header */
unsigned long rp_gpa; /* presumably addr of sender's reserved page -- confirm */
unsigned long heartbeat_gpa; /* presumably addr of sender's heartbeat -- confirm */
unsigned long activate_gru_mq_desc_gpa; /* presumably addr of sender's */
/* activate mq's gru mq descriptor -- confirm */
};

Expand Down Expand Up @@ -687,6 +694,9 @@ struct xpc_partition_sn2 {
};

struct xpc_partition_uv {
unsigned long heartbeat_gpa; /* phys addr of partition's heartbeat */
struct xpc_heartbeat_uv cached_heartbeat; /* cached copy of */
/* partition's heartbeat */
unsigned long activate_gru_mq_desc_gpa; /* phys addr of partition's */
/* activate mq's gru mq */
/* descriptor */
Expand All @@ -698,14 +708,12 @@ struct xpc_partition_uv {
u8 remote_act_state; /* remote partition's act_state */
u8 act_state_req; /* act_state request from remote partition */
enum xp_retval reason; /* reason for deactivate act_state request */
u64 heartbeat; /* incremented by remote partition */
};

/* struct xpc_partition_uv flags */

#define XPC_P_HEARTBEAT_OFFLINE_UV 0x00000001
#define XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV 0x00000001
#define XPC_P_ENGAGED_UV 0x00000002
#define XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV 0x00000004

/* struct xpc_partition_uv act_state change requests */

Expand Down Expand Up @@ -807,7 +815,6 @@ extern int xpc_disengage_timedout;
extern int xpc_activate_IRQ_rcvd;
extern spinlock_t xpc_activate_IRQ_rcvd_lock;
extern wait_queue_head_t xpc_activate_IRQ_wq;
extern void *xpc_heartbeating_to_mask;
extern void *xpc_kzalloc_cacheline_aligned(size_t, gfp_t, void **);
extern void xpc_activate_partition(struct xpc_partition *);
extern void xpc_activate_kthreads(struct xpc_channel *, int);
Expand All @@ -825,6 +832,9 @@ extern void (*xpc_increment_heartbeat) (void);
extern void (*xpc_offline_heartbeat) (void);
extern void (*xpc_online_heartbeat) (void);
extern enum xp_retval (*xpc_get_remote_heartbeat) (struct xpc_partition *);
extern void (*xpc_allow_hb) (short);
extern void (*xpc_disallow_hb) (short);
extern void (*xpc_disallow_all_hbs) (void);
extern enum xp_retval (*xpc_make_first_contact) (struct xpc_partition *);
extern u64 (*xpc_get_chctl_all_flags) (struct xpc_partition *);
extern enum xp_retval (*xpc_setup_msg_structures) (struct xpc_channel *);
Expand Down Expand Up @@ -909,40 +919,6 @@ extern void xpc_disconnect_channel(const int, struct xpc_channel *,
extern void xpc_disconnect_callout(struct xpc_channel *, enum xp_retval);
extern void xpc_partition_going_down(struct xpc_partition *, enum xp_retval);

/*
 * Test whether the bit for partition 'partid' is set in the bitmap that
 * 'heartbeating_to_mask' points to.
 *
 * Returns the result of test_bit(): non-zero if the bit is set, 0 otherwise.
 */
static inline int
xpc_hb_allowed(short partid, void *heartbeating_to_mask)
{
	int hb_allowed = test_bit(partid, heartbeating_to_mask);

	return hb_allowed;
}

/*
 * Report whether any bit is set in the first xp_max_npartitions bits of
 * xpc_heartbeating_to_mask.
 *
 * Returns 1 if at least one bit is set, 0 if the bitmap is empty.
 */
static inline int
xpc_any_hbs_allowed(void)
{
	/* debug check: the mask pointer must be non-NULL before it is used */
	DBUG_ON(!xpc_heartbeating_to_mask);

	if (bitmap_empty(xpc_heartbeating_to_mask, xp_max_npartitions))
		return 0;

	return 1;
}

/*
 * Set the bit for partition 'partid' in xpc_heartbeating_to_mask.
 */
static inline void
xpc_allow_hb(short partid)
{
	/* debug check: the mask pointer must be non-NULL before it is used */
	DBUG_ON(!xpc_heartbeating_to_mask);

	set_bit(partid, xpc_heartbeating_to_mask);
}

/*
 * Clear the bit for partition 'partid' in xpc_heartbeating_to_mask.
 */
static inline void
xpc_disallow_hb(short partid)
{
	/* debug check: the mask pointer must be non-NULL before it is used */
	DBUG_ON(!xpc_heartbeating_to_mask);

	clear_bit(partid, xpc_heartbeating_to_mask);
}

/*
 * Clear the first xp_max_npartitions bits of xpc_heartbeating_to_mask, i.e.
 * clear the bit of every partition at once.
 */
static inline void
xpc_disallow_all_hbs(void)
{
	/* debug check: the mask pointer must be non-NULL before it is used */
	DBUG_ON(!xpc_heartbeating_to_mask);

	bitmap_zero(xpc_heartbeating_to_mask, xp_max_npartitions);
}

static inline void
xpc_wakeup_channel_mgr(struct xpc_partition *part)
{
Expand Down
8 changes: 5 additions & 3 deletions drivers/misc/sgi-xp/xpc_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
* Copyright (c) 2004-2009 Silicon Graphics, Inc. All Rights Reserved.
*/

/*
Expand Down Expand Up @@ -150,7 +150,6 @@ DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);

static unsigned long xpc_hb_check_timeout;
static struct timer_list xpc_hb_timer;
void *xpc_heartbeating_to_mask;

/* notification that the xpc_hb_checker thread has exited */
static DECLARE_COMPLETION(xpc_hb_checker_exited);
Expand All @@ -176,6 +175,10 @@ enum xp_retval (*xpc_get_partition_rsvd_page_pa) (void *buf, u64 *cookie,
unsigned long *rp_pa,
size_t *len);
int (*xpc_setup_rsvd_page_sn) (struct xpc_rsvd_page *rp);

void (*xpc_allow_hb) (short partid);
void (*xpc_disallow_hb) (short partid);
void (*xpc_disallow_all_hbs) (void);
void (*xpc_heartbeat_init) (void);
void (*xpc_heartbeat_exit) (void);
void (*xpc_increment_heartbeat) (void);
Expand Down Expand Up @@ -1087,7 +1090,6 @@ xpc_do_exit(enum xp_retval reason)
} while (1);

DBUG_ON(xpc_any_partition_engaged());
DBUG_ON(xpc_any_hbs_allowed() != 0);

xpc_teardown_rsvd_page();

Expand Down
44 changes: 37 additions & 7 deletions drivers/misc/sgi-xp/xpc_sn2.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
* Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
*/

/*
Expand Down Expand Up @@ -629,7 +629,7 @@ xpc_setup_rsvd_page_sn_sn2(struct xpc_rsvd_page *rp)

xpc_vars_sn2 = XPC_RP_VARS(rp);

rp->sn.vars_pa = xp_pa(xpc_vars_sn2);
rp->sn.sn2.vars_pa = xp_pa(xpc_vars_sn2);

/* vars_part array follows immediately after vars */
xpc_vars_part_sn2 = (struct xpc_vars_part_sn2 *)((u8 *)XPC_RP_VARS(rp) +
Expand Down Expand Up @@ -693,6 +693,33 @@ xpc_setup_rsvd_page_sn_sn2(struct xpc_rsvd_page *rp)
return 0;
}

/*
 * Test whether the bit for partition 'partid' is set in the bitmap that
 * 'heartbeating_to_mask' points to.
 *
 * Returns the result of test_bit(): non-zero if the bit is set, 0 otherwise.
 */
static int
xpc_hb_allowed_sn2(short partid, void *heartbeating_to_mask)
{
	int hb_allowed = test_bit(partid, heartbeating_to_mask);

	return hb_allowed;
}

/*
 * Set the bit for partition 'partid' in the heartbeating_to_mask held in
 * xpc_vars_sn2.
 */
static void
xpc_allow_hb_sn2(short partid)
{
	/* debug check: xpc_vars_sn2 must be non-NULL before it is dereferenced */
	DBUG_ON(!xpc_vars_sn2);

	set_bit(partid, &xpc_vars_sn2->heartbeating_to_mask[0]);
}

/*
 * Clear the bit for partition 'partid' in the heartbeating_to_mask held in
 * xpc_vars_sn2.
 */
static void
xpc_disallow_hb_sn2(short partid)
{
	/* debug check: xpc_vars_sn2 must be non-NULL before it is dereferenced */
	DBUG_ON(!xpc_vars_sn2);

	clear_bit(partid, &xpc_vars_sn2->heartbeating_to_mask[0]);
}

/*
 * Clear the first xp_max_npartitions bits of the heartbeating_to_mask held
 * in xpc_vars_sn2, i.e. clear the bit of every partition at once.
 */
static void
xpc_disallow_all_hbs_sn2(void)
{
	/* debug check: xpc_vars_sn2 must be non-NULL before it is dereferenced */
	DBUG_ON(!xpc_vars_sn2);

	bitmap_zero(&xpc_vars_sn2->heartbeating_to_mask[0], xp_max_npartitions);
}

static void
xpc_increment_heartbeat_sn2(void)
{
Expand All @@ -719,7 +746,6 @@ xpc_heartbeat_init_sn2(void)
DBUG_ON(xpc_vars_sn2 == NULL);

bitmap_zero(xpc_vars_sn2->heartbeating_to_mask, XP_MAX_NPARTITIONS_SN2);
xpc_heartbeating_to_mask = &xpc_vars_sn2->heartbeating_to_mask[0];
xpc_online_heartbeat_sn2();
}

Expand Down Expand Up @@ -751,9 +777,9 @@ xpc_get_remote_heartbeat_sn2(struct xpc_partition *part)
remote_vars->heartbeating_to_mask[0]);

if ((remote_vars->heartbeat == part->last_heartbeat &&
remote_vars->heartbeat_offline == 0) ||
!xpc_hb_allowed(sn_partition_id,
&remote_vars->heartbeating_to_mask)) {
!remote_vars->heartbeat_offline) ||
!xpc_hb_allowed_sn2(sn_partition_id,
remote_vars->heartbeating_to_mask)) {
ret = xpNoHeartbeat;
} else {
part->last_heartbeat = remote_vars->heartbeat;
Expand Down Expand Up @@ -972,7 +998,7 @@ xpc_identify_activate_IRQ_req_sn2(int nasid)
return;
}

remote_vars_pa = remote_rp->sn.vars_pa;
remote_vars_pa = remote_rp->sn.sn2.vars_pa;
remote_rp_version = remote_rp->version;
remote_rp_ts_jiffies = remote_rp->ts_jiffies;

Expand Down Expand Up @@ -2325,6 +2351,10 @@ xpc_init_sn2(void)
xpc_teardown_partitions_sn = xpc_teardown_partitions_sn_sn2;
xpc_get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_sn2;
xpc_setup_rsvd_page_sn = xpc_setup_rsvd_page_sn_sn2;

xpc_allow_hb = xpc_allow_hb_sn2;
xpc_disallow_hb = xpc_disallow_hb_sn2;
xpc_disallow_all_hbs = xpc_disallow_all_hbs_sn2;
xpc_increment_heartbeat = xpc_increment_heartbeat_sn2;
xpc_offline_heartbeat = xpc_offline_heartbeat_sn2;
xpc_online_heartbeat = xpc_online_heartbeat_sn2;
Expand Down
Loading

0 comments on commit a374c57

Please sign in to comment.