Browse code

tcp: fix fd passing bug

If connections are opened and closed very quickly when data is
sent on them, it is possible that a connection gets closed
(close() inside tcp_main) while a process waits for its fd, just
before tcp_main attempts to send the fd. In this case the fd
sending will fail (one cannot send a closed fd) and the process
that asked for it will remain waiting forever.
The fix always checks before attempting to send the fd if the fd is
still open and the connection is not in a "bad" state. If not,
a new error response is sent (no fd and connection == NULL).

Andrei Pelinescu-Onciul authored on 16/06/2010 19:03:06
Showing 1 changed files
... ...
@@ -2102,16 +2102,22 @@ static int tcpconn_send_put(struct tcp_connection* c, char* buf, unsigned len,
2102 2102
 				do_close_fd=0;
2103 2103
 				goto release_c;
2104 2104
 			}
2105
-			if (unlikely(c!=tmp)){
2106
-				LOG(L_CRIT, "BUG: tcp_send: get_fd: got different connection:"
2105
+			/* handle fd closed or bad connection/error
2106
+				(it's possible that this happened in the time between
2107
+				we found the intial connection and the time when we get
2108
+				the fd)
2109
+			 */
2110
+			if (unlikely(c!=tmp || fd==-1 || c->state==S_CONN_BAD)){
2111
+				if (unlikely(c!=tmp && tmp!=0))
2112
+					BUG("tcp_send: get_fd: got different connection:"
2107 2113
 						"  %p (id= %d, refcnt=%d state=%d) != "
2108 2114
 						"  %p (n=%d)\n",
2109 2115
 						  c,   c->id,   atomic_get(&c->refcnt),   c->state,
2110 2116
 						  tmp, n
2111
-				   );
2117
+						);
2112 2118
 				n=-1; /* fail */
2113 2119
 				/* don't cache fd & close it */
2114
-				do_close_fd = 1;
2120
+				do_close_fd = (fd==-1)?0:1;
2115 2121
 #ifdef TCP_FD_CACHE
2116 2122
 				use_fd_cache = 0;
2117 2123
 #endif /* TCP_FD_CACHE */
... ...
@@ -2943,6 +2949,12 @@ inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
2943 2943
 	struct send_fd_info* t;
2944 2944
 	
2945 2945
 	for (p=t=&q->data[0]; p<q->crt; p++){
2946
+		if (unlikely(p->tcp_conn->state == S_CONN_BAD ||
2947
+					 p->tcp_conn->flags & F_CONN_FD_CLOSED ||
2948
+					 p->tcp_conn->s ==-1)) {
2949
+			/* bad and/or already closed connection => remove */
2950
+			goto rm_con;
2951
+		}
2946 2952
 		if (unlikely(send_fd(p->unix_sock, &(p->tcp_conn),
2947 2953
 					sizeof(struct tcp_connection*), p->tcp_conn->s)<=0)){
2948 2954
 			if ( ((errno==EAGAIN)||(errno==EWOULDBLOCK)) && 
... ...
@@ -2958,7 +2970,11 @@ inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
2958 2958
 						   p->unix_sock, (long)(p-&q->data[0]), p->retries,
2959 2959
 						   p->tcp_conn, p->tcp_conn->s, errno,
2960 2960
 						   strerror(errno));
2961
+rm_con:
2961 2962
 #ifdef TCP_ASYNC
2963
+				/* if a connection is on the send_fd queue it means it's
2964
+				   not watched for read anymore => could be watched only for
2965
+				   write */
2962 2966
 				if (p->tcp_conn->flags & F_CONN_WRITE_W){
2963 2967
 					io_watch_del(&io_h, p->tcp_conn->s, -1, IO_FD_CLOSING);
2964 2968
 					p->tcp_conn->flags &=~F_CONN_WRITE_W;
... ...
@@ -3222,6 +3238,7 @@ error:
3222 3222
 inline static int handle_ser_child(struct process_table* p, int fd_i)
3223 3223
 {
3224 3224
 	struct tcp_connection* tcpconn;
3225
+	struct tcp_connection* tmp;
3225 3226
 	long response[2];
3226 3227
 	int cmd;
3227 3228
 	int bytes;
... ...
@@ -3317,9 +3334,26 @@ inline static int handle_ser_child(struct process_table* p, int fd_i)
3317 3317
 			/* send the requested FD  */
3318 3318
 			/* WARNING: take care of setting refcnt properly to
3319 3319
 			 * avoid race conditions */
3320
-			if (unlikely(send_fd(p->unix_sock, &tcpconn, sizeof(tcpconn),
3321
-								tcpconn->s)<=0)){
3322
-				LOG(L_ERR, "ERROR: handle_ser_child: send_fd failed\n");
3320
+			if (unlikely(tcpconn->state == S_CONN_BAD ||
3321
+						(tcpconn->flags & F_CONN_FD_CLOSED) ||
3322
+						tcpconn->s ==-1)) {
3323
+				/* connection is already marked as bad and/or has no
3324
+				   fd => don't try to send the fd (trying to send a
3325
+				   closed fd _will_ fail) */
3326
+				tmp = 0;
3327
+				if (unlikely(send_all(p->unix_sock, &tmp, sizeof(tmp)) <= 0))
3328
+					BUG("handle_ser_child: CONN_GET_FD: send_all failed\n");
3329
+				/* no need to attempt to destroy the connection, it should
3330
+				   be already in the process of being destroyed */
3331
+			} else if (unlikely(send_fd(p->unix_sock, &tcpconn,
3332
+										sizeof(tcpconn), tcpconn->s)<=0)){
3333
+				LOG(L_ERR, "handle_ser_child: CONN_GET_FD:"
3334
+							" send_fd failed\n");
3335
+				/* try sending error (better then not sending anything) */
3336
+				tmp = 0;
3337
+				if (unlikely(send_all(p->unix_sock, &tmp, sizeof(tmp)) <= 0))
3338
+					BUG("handle_ser_child: CONN_GET_FD:"
3339
+							" send_fd send_all fallback failed\n");
3323 3340
 			}
3324 3341
 			break;
3325 3342
 		case CONN_NEW: