Skip to content

Commit

Permalink
libceph: fix messenger retry
Browse files Browse the repository at this point in the history
In ancient times, the messenger could both initiate and accept connections.
An artifact if that was data structures to store/process an incoming
ceph_msg_connect request and send an outgoing ceph_msg_connect_reply.
Sadly, the negotiation code was referencing those structures and ignoring
important information (like the peer's connect_seq) from the correct ones.

Among other things, this fixes tight reconnect loops where the server sends
RETRY_SESSION and we (the client) retries with the same connect_seq as last
time.  This bug pretty easily triggered by injecting socket failures on the
MDS and running some fs workload like workunits/direct_io/test_sync_io.

Signed-off-by: Sage Weil <sage@inktank.com>
  • Loading branch information
Sage Weil committed Jul 30, 2012
1 parent cd43045 commit a16cb1f
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 16 deletions.
12 changes: 2 additions & 10 deletions include/linux/ceph/messenger.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,16 +172,8 @@ struct ceph_connection {

/* connection negotiation temps */
char in_banner[CEPH_BANNER_MAX_LEN];
union {
struct { /* outgoing connection */
struct ceph_msg_connect out_connect;
struct ceph_msg_connect_reply in_reply;
};
struct { /* incoming */
struct ceph_msg_connect in_connect;
struct ceph_msg_connect_reply out_reply;
};
};
struct ceph_msg_connect out_connect;
struct ceph_msg_connect_reply in_reply;
struct ceph_entity_addr actual_peer_addr;

/* message out temps */
Expand Down
12 changes: 6 additions & 6 deletions net/ceph/messenger.c
Original file line number Diff line number Diff line change
Expand Up @@ -1540,7 +1540,7 @@ static int process_connect(struct ceph_connection *con)
* dropped messages.
*/
dout("process_connect got RESET peer seq %u\n",
le32_to_cpu(con->in_connect.connect_seq));
le32_to_cpu(con->in_reply.connect_seq));
pr_err("%s%lld %s connection reset\n",
ENTITY_NAME(con->peer_name),
ceph_pr_addr(&con->peer_addr.in_addr));
Expand All @@ -1566,10 +1566,10 @@ static int process_connect(struct ceph_connection *con)
* If we sent a smaller connect_seq than the peer has, try
* again with a larger value.
*/
dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
le32_to_cpu(con->out_connect.connect_seq),
le32_to_cpu(con->in_connect.connect_seq));
con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
le32_to_cpu(con->in_reply.connect_seq));
con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
ret = prepare_write_connect(con);
if (ret < 0)
return ret;
Expand All @@ -1583,9 +1583,9 @@ static int process_connect(struct ceph_connection *con)
*/
dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
con->peer_global_seq,
le32_to_cpu(con->in_connect.global_seq));
le32_to_cpu(con->in_reply.global_seq));
get_global_seq(con->msgr,
le32_to_cpu(con->in_connect.global_seq));
le32_to_cpu(con->in_reply.global_seq));
ret = prepare_write_connect(con);
if (ret < 0)
return ret;
Expand Down

0 comments on commit a16cb1f

Please sign in to comment.