From c4282ca76c5b81ed73ef4c5eb5c07ee397e51642 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy <jon.maloy@ericsson.com>
Date: Wed, 8 Jun 2016 12:00:04 -0400
Subject: [PATCH 1/2] tipc: correct error in node fsm

commit 88e8ac7000dc ("tipc: reduce transmission rate of reset messages
when link is down") revealed a flaw in the node FSM, as defined in
the log of commit 66996b6c47ed ("tipc: extend node FSM").

We see the following scenario:
1: Node B receives a RESET message from node A before its link endpoint
   is fully up, i.e., the node FSM is in state SELF_UP_PEER_COMING. This
   event will not change the node FSM state, but the (distinct) link FSM
   will move to state RESETTING.
2: As an effect of the previous event, the local endpoint on B will
   declare node A lost, and post the event SELF_DOWN to the its node
   FSM. This moves the FSM state to SELF_DOWN_PEER_LEAVING, meaning
   that no messages will be accepted from A until it receives another
   RESET message that confirms that A's endpoint has been reset. This
   is  wasteful, since we know this as a fact already from the first
   received RESET, but worse is that the link instance's FSM has not
   wasted this information, but instead moved on to state ESTABLISHING,
   meaning that it repeatedly sends out ACTIVATE messages to the reset
   peer A.
3: Node A will receive one of the ACTIVATE messages, move its link FSM
   to state ESTABLISHED, and start repeatedly sending out STATE messages
   to node B.
4: Node B will consistently drop these messages, since it can only accept
   accept a RESET according to its node FSM.
5: After four lost STATE messages node A will reset its link and start
   repeatedly sending out RESET messages to B.
6: Because of the reduced send rate for RESET messages, it is very
   likely that A will receive an ACTIVATE (which is sent out at a much
   higher frequency) before it gets the chance to send a RESET, and A
   may hence quickly move back to state ESTABLISHED and continue sending
   out STATE messages, which will again be dropped by B.
7: GOTO 5.
8: After having repeated the cycle 5-7 a number of times, node A will
   by chance get in between with sending a RESET, and the situation is
   resolved.

Unfortunately, we have seen that it may take a substantial amount of
time before this vicious loop is broken, sometimes in the order of
minutes.

We correct this by making a small correction to the node FSM: When a
node in state SELF_UP_PEER_COMING receives a SELF_DOWN event, it now
moves directly back to state SELF_DOWN_PEER_DOWN, instead of as now
SELF_DOWN_PEER_LEAVING. This is logically consistent, since we don't
need to wait for RESET confirmation from of an endpoint that we alread
know has been reset. It also means that node B in the scenario above
will not be dropping incoming STATE messages, and the link can come up
immediately.

Finally, a symmetry comparison reveals that the  FSM has a similar
error when receiving the event PEER_DOWN in state PEER_UP_SELF_COMING.
Instead of moving to PERR_DOWN_SELF_LEAVING, it should move directly
to SELF_DOWN_PEER_DOWN. Although we have never seen any negative effect
of this logical error, we choose fix this one, too.

The node FSM looks as follows after those changes:

                           +----------------------------------------+
                           |                           PEER_DOWN_EVT|
                           |                                        |
  +------------------------+----------------+                       |
  |SELF_DOWN_EVT           |                |                       |
  |                        |                |                       |
  |              +-----------+          +-----------+               |
  |              |NODE_      |          |NODE_      |               |
  |   +----------|FAILINGOVER|<---------|SYNCHING   |-----------+   |
  |   |SELF_     +-----------+ FAILOVER_+-----------+   PEER_   |   |
  |   |DOWN_EVT   |          A BEGIN_EVT  A         |   DOWN_EVT|   |
  |   |           |          |            |         |           |   |
  |   |           |          |            |         |           |   |
  |   |           |FAILOVER_ |FAILOVER_   |SYNCH_   |SYNCH_     |   |
  |   |           |END_EVT   |BEGIN_EVT   |BEGIN_EVT|END_EVT    |   |
  |   |           |          |            |         |           |   |
  |   |           |          |            |         |           |   |
  |   |           |         +--------------+        |           |   |
  |   |           +-------->|   SELF_UP_   |<-------+           |   |
  |   |   +-----------------|   PEER_UP    |----------------+   |   |
  |   |   |SELF_DOWN_EVT    +--------------+   PEER_DOWN_EVT|   |   |
  |   |   |                    A        A                   |   |   |
  |   |   |                    |        |                   |   |   |
  |   |   |         PEER_UP_EVT|        |SELF_UP_EVT        |   |   |
  |   |   |                    |        |                   |   |   |
  V   V   V                    |        |                   V   V   V
+------------+       +-----------+    +-----------+       +------------+
|SELF_DOWN_  |       |SELF_UP_   |    |PEER_UP_   |       |PEER_DOWN   |
|PEER_LEAVING|       |PEER_COMING|    |SELF_COMING|       |SELF_LEAVING|
+------------+       +-----------+    +-----------+       +------------+
       |               |       A        A       |                |
       |               |       |        |       |                |
       |       SELF_   |       |SELF_   |PEER_  |PEER_           |
       |       DOWN_EVT|       |UP_EVT  |UP_EVT |DOWN_EVT        |
       |               |       |        |       |                |
       |               |       |        |       |                |
       |               |    +--------------+    |                |
       |PEER_DOWN_EVT  +--->|  SELF_DOWN_  |<---+   SELF_DOWN_EVT|
       +------------------->|  PEER_DOWN   |<--------------------+
                            +--------------+

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/node.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/tipc/node.c b/net/tipc/node.c
index e01e2c71b5a16..c7985b2cb7596 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -950,7 +950,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
 			state = SELF_UP_PEER_UP;
 			break;
 		case SELF_LOST_CONTACT_EVT:
-			state = SELF_DOWN_PEER_LEAVING;
+			state = SELF_DOWN_PEER_DOWN;
 			break;
 		case SELF_ESTABL_CONTACT_EVT:
 		case PEER_LOST_CONTACT_EVT:
@@ -969,7 +969,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
 			state = SELF_UP_PEER_UP;
 			break;
 		case PEER_LOST_CONTACT_EVT:
-			state = SELF_LEAVING_PEER_DOWN;
+			state = SELF_DOWN_PEER_DOWN;
 			break;
 		case SELF_LOST_CONTACT_EVT:
 		case PEER_ESTABL_CONTACT_EVT:

From 5ca509fc0b6bcfeccf03c8c4bb5e4d1a62720c03 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy <jon.maloy@ericsson.com>
Date: Wed, 8 Jun 2016 12:00:05 -0400
Subject: [PATCH 2/2] tipc: change node timer unit from jiffies to ms

The node keepalive interval is recalculated at each timer expiration
to catch any changes in the link tolerance, and stored in a field in
struct tipc_node. We use jiffies as unit for the stored value.

This is suboptimal, because it makes the calculation unnecessary
complex, including two unit conversions. The conversions also lead to
a rounding error that causes the link "abort limit" to be 3 in the
normal case, instead of 4, as intended. This again leads to unnecessary
link resets when the network is pushed close to its limit, e.g., in an
environment with hundreds of nodes or namesapces.

In this commit, we do instead let the keepalive value be calculated and
stored in milliseconds, so that there is only one conversion and the
rounding error is eliminated.

We also remove a redundant "keepalive" field in struct tipc_link. This
is remnant from the previous implementation.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c |  2 --
 net/tipc/node.c | 18 ++++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 7059c94f33c55..a904ccd5a93a1 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -87,7 +87,6 @@ struct tipc_stats {
  * @peer_bearer_id: bearer id used by link's peer endpoint
  * @bearer_id: local bearer id used by link
  * @tolerance: minimum link continuity loss needed to reset link [in ms]
- * @keepalive_intv: link keepalive timer interval
  * @abort_limit: # of unacknowledged continuity probes needed to reset link
  * @state: current state of link FSM
  * @peer_caps: bitmap describing capabilities of peer node
@@ -131,7 +130,6 @@ struct tipc_link {
 	u32 peer_bearer_id;
 	u32 bearer_id;
 	u32 tolerance;
-	unsigned long keepalive_intv;
 	u32 abort_limit;
 	u32 state;
 	u16 peer_caps;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index c7985b2cb7596..d6a490f991a4b 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -378,14 +378,13 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
 {
 	unsigned long tol = tipc_link_tolerance(l);
 	unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
-	unsigned long keepalive_intv = msecs_to_jiffies(intv);
 
 	/* Link with lowest tolerance determines timer interval */
-	if (keepalive_intv < n->keepalive_intv)
-		n->keepalive_intv = keepalive_intv;
+	if (intv < n->keepalive_intv)
+		n->keepalive_intv = intv;
 
-	/* Ensure link's abort limit corresponds to current interval */
-	tipc_link_set_abort_limit(l, tol / jiffies_to_msecs(n->keepalive_intv));
+	/* Ensure link's abort limit corresponds to current tolerance */
+	tipc_link_set_abort_limit(l, tol / n->keepalive_intv);
 }
 
 static void tipc_node_delete(struct tipc_node *node)
@@ -526,7 +525,7 @@ static void tipc_node_timeout(unsigned long data)
 		if (rc & TIPC_LINK_DOWN_EVT)
 			tipc_node_link_down(n, bearer_id, false);
 	}
-	mod_timer(&n->timer, jiffies + n->keepalive_intv);
+	mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv));
 }
 
 /**
@@ -735,6 +734,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
 	bool accept_addr = false;
 	bool reset = true;
 	char *if_name;
+	unsigned long intv;
 
 	*dupl_addr = false;
 	*respond = false;
@@ -840,9 +840,11 @@ void tipc_node_check_dest(struct net *net, u32 onode,
 		le->link = l;
 		n->link_cnt++;
 		tipc_node_calculate_timer(n, l);
-		if (n->link_cnt == 1)
-			if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
+		if (n->link_cnt == 1) {
+			intv = jiffies + msecs_to_jiffies(n->keepalive_intv);
+			if (!mod_timer(&n->timer, intv))
 				tipc_node_get(n);
+		}
 	}
 	memcpy(&le->maddr, maddr, sizeof(*maddr));
 exit: