From c4282ca76c5b81ed73ef4c5eb5c07ee397e51642 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 8 Jun 2016 12:00:04 -0400 Subject: [PATCH 1/2] tipc: correct error in node fsm commit 88e8ac7000dc ("tipc: reduce transmission rate of reset messages when link is down") revealed a flaw in the node FSM, as defined in the log of commit 66996b6c47ed ("tipc: extend node FSM"). We see the following scenario: 1: Node B receives a RESET message from node A before its link endpoint is fully up, i.e., the node FSM is in state SELF_UP_PEER_COMING. This event will not change the node FSM state, but the (distinct) link FSM will move to state RESETTING. 2: As an effect of the previous event, the local endpoint on B will declare node A lost, and post the event SELF_DOWN to the its node FSM. This moves the FSM state to SELF_DOWN_PEER_LEAVING, meaning that no messages will be accepted from A until it receives another RESET message that confirms that A's endpoint has been reset. This is wasteful, since we know this as a fact already from the first received RESET, but worse is that the link instance's FSM has not wasted this information, but instead moved on to state ESTABLISHING, meaning that it repeatedly sends out ACTIVATE messages to the reset peer A. 3: Node A will receive one of the ACTIVATE messages, move its link FSM to state ESTABLISHED, and start repeatedly sending out STATE messages to node B. 4: Node B will consistently drop these messages, since it can only accept accept a RESET according to its node FSM. 5: After four lost STATE messages node A will reset its link and start repeatedly sending out RESET messages to B. 6: Because of the reduced send rate for RESET messages, it is very likely that A will receive an ACTIVATE (which is sent out at a much higher frequency) before it gets the chance to send a RESET, and A may hence quickly move back to state ESTABLISHED and continue sending out STATE messages, which will again be dropped by B. 7: GOTO 5. 8: After having repeated the cycle 5-7 a number of times, node A will by chance get in between with sending a RESET, and the situation is resolved. Unfortunately, we have seen that it may take a substantial amount of time before this vicious loop is broken, sometimes in the order of minutes. We correct this by making a small correction to the node FSM: When a node in state SELF_UP_PEER_COMING receives a SELF_DOWN event, it now moves directly back to state SELF_DOWN_PEER_DOWN, instead of as now SELF_DOWN_PEER_LEAVING. This is logically consistent, since we don't need to wait for RESET confirmation from of an endpoint that we alread know has been reset. It also means that node B in the scenario above will not be dropping incoming STATE messages, and the link can come up immediately. Finally, a symmetry comparison reveals that the FSM has a similar error when receiving the event PEER_DOWN in state PEER_UP_SELF_COMING. Instead of moving to PERR_DOWN_SELF_LEAVING, it should move directly to SELF_DOWN_PEER_DOWN. Although we have never seen any negative effect of this logical error, we choose fix this one, too. The node FSM looks as follows after those changes: +----------------------------------------+ | PEER_DOWN_EVT| | | +------------------------+----------------+ | |SELF_DOWN_EVT | | | | | | | | +-----------+ +-----------+ | | |NODE_ | |NODE_ | | | +----------|FAILINGOVER|<---------|SYNCHING |-----------+ | | |SELF_ +-----------+ FAILOVER_+-----------+ PEER_ | | | |DOWN_EVT | A BEGIN_EVT A | DOWN_EVT| | | | | | | | | | | | | | | | | | | | |FAILOVER_ |FAILOVER_ |SYNCH_ |SYNCH_ | | | | |END_EVT |BEGIN_EVT |BEGIN_EVT|END_EVT | | | | | | | | | | | | | | | | | | | | | +--------------+ | | | | | +-------->| SELF_UP_ |<-------+ | | | | +-----------------| PEER_UP |----------------+ | | | | |SELF_DOWN_EVT +--------------+ PEER_DOWN_EVT| | | | | | A A | | | | | | | | | | | | | | PEER_UP_EVT| |SELF_UP_EVT | | | | | | | | | | | V V V | | V V V +------------+ +-----------+ +-----------+ +------------+ |SELF_DOWN_ | |SELF_UP_ | |PEER_UP_ | |PEER_DOWN | |PEER_LEAVING| |PEER_COMING| |SELF_COMING| |SELF_LEAVING| +------------+ +-----------+ +-----------+ +------------+ | | A A | | | | | | | | | SELF_ | |SELF_ |PEER_ |PEER_ | | DOWN_EVT| |UP_EVT |UP_EVT |DOWN_EVT | | | | | | | | | | | | | | | +--------------+ | | |PEER_DOWN_EVT +--->| SELF_DOWN_ |<---+ SELF_DOWN_EVT| +------------------->| PEER_DOWN |<--------------------+ +--------------+ Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index e01e2c71b5a16..c7985b2cb7596 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -950,7 +950,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) state = SELF_UP_PEER_UP; break; case SELF_LOST_CONTACT_EVT: - state = SELF_DOWN_PEER_LEAVING; + state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: @@ -969,7 +969,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) state = SELF_UP_PEER_UP; break; case PEER_LOST_CONTACT_EVT: - state = SELF_LEAVING_PEER_DOWN; + state = SELF_DOWN_PEER_DOWN; break; case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: From 5ca509fc0b6bcfeccf03c8c4bb5e4d1a62720c03 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 8 Jun 2016 12:00:05 -0400 Subject: [PATCH 2/2] tipc: change node timer unit from jiffies to ms The node keepalive interval is recalculated at each timer expiration to catch any changes in the link tolerance, and stored in a field in struct tipc_node. We use jiffies as unit for the stored value. This is suboptimal, because it makes the calculation unnecessary complex, including two unit conversions. The conversions also lead to a rounding error that causes the link "abort limit" to be 3 in the normal case, instead of 4, as intended. This again leads to unnecessary link resets when the network is pushed close to its limit, e.g., in an environment with hundreds of nodes or namesapces. In this commit, we do instead let the keepalive value be calculated and stored in milliseconds, so that there is only one conversion and the rounding error is eliminated. We also remove a redundant "keepalive" field in struct tipc_link. This is remnant from the previous implementation. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 2 -- net/tipc/node.c | 18 ++++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index 7059c94f33c55..a904ccd5a93a1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -87,7 +87,6 @@ struct tipc_stats { * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link * @tolerance: minimum link continuity loss needed to reset link [in ms] - * @keepalive_intv: link keepalive timer interval * @abort_limit: # of unacknowledged continuity probes needed to reset link * @state: current state of link FSM * @peer_caps: bitmap describing capabilities of peer node @@ -131,7 +130,6 @@ struct tipc_link { u32 peer_bearer_id; u32 bearer_id; u32 tolerance; - unsigned long keepalive_intv; u32 abort_limit; u32 state; u16 peer_caps; diff --git a/net/tipc/node.c b/net/tipc/node.c index c7985b2cb7596..d6a490f991a4b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -378,14 +378,13 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; - unsigned long keepalive_intv = msecs_to_jiffies(intv); /* Link with lowest tolerance determines timer interval */ - if (keepalive_intv < n->keepalive_intv) - n->keepalive_intv = keepalive_intv; + if (intv < n->keepalive_intv) + n->keepalive_intv = intv; - /* Ensure link's abort limit corresponds to current interval */ - tipc_link_set_abort_limit(l, tol / jiffies_to_msecs(n->keepalive_intv)); + /* Ensure link's abort limit corresponds to current tolerance */ + tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete(struct tipc_node *node) @@ -526,7 +525,7 @@ static void tipc_node_timeout(unsigned long data) if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } - mod_timer(&n->timer, jiffies + n->keepalive_intv); + mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** @@ -735,6 +734,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, bool accept_addr = false; bool reset = true; char *if_name; + unsigned long intv; *dupl_addr = false; *respond = false; @@ -840,9 +840,11 @@ void tipc_node_check_dest(struct net *net, u32 onode, le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); - if (n->link_cnt == 1) - if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) + if (n->link_cnt == 1) { + intv = jiffies + msecs_to_jiffies(n->keepalive_intv); + if (!mod_timer(&n->timer, intv)) tipc_node_get(n); + } } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: