Browse code

tcp: fix fd passing bug

If connections are opened and closed very quickly when data is
sent on them, it is possible that a connection gets closed
(close() inside tcp_main) while a process waits for its fd, just
before tcp_main attempts to send the fd. In this case the fd
sending will fail (one cannot send a closed fd) and the process
that asked for it will remain waiting forever.
The fix always checks before attempting to send the fd if the fd is
still open and the connection is not in a "bad" state. If not,
a new error response is sent (no fd and connection == NULL).

Andrei Pelinescu-Onciul authored on 16/06/2010 19:03:06
Showing 1 changed files
... ...
@@ -2102,16 +2102,22 @@ static int tcpconn_send_put(struct tcp_connection* c, char* buf, unsigned len,
2102 2102
 				do_close_fd=0;
2103 2103
 				goto release_c;
2104 2104
 			}
2105
-			if (unlikely(c!=tmp)){
2106
-				LOG(L_CRIT, "BUG: tcp_send: get_fd: got different connection:"
2105
+			/* handle fd closed or bad connection/error
2106
+				(it's possible that this happened in the time between
2107
+				we found the intial connection and the time when we get
2108
+				the fd)
2109
+			 */
2110
+			if (unlikely(c!=tmp || fd==-1 || c->state==S_CONN_BAD)){
2111
+				if (unlikely(c!=tmp && tmp!=0))
2112
+					BUG("tcp_send: get_fd: got different connection:"
2107 2113
 						"  %p (id= %d, refcnt=%d state=%d) != "
2108 2114
 						"  %p (n=%d)\n",
2109 2115
 						  c,   c->id,   atomic_get(&c->refcnt),   c->state,
2110 2116
 						  tmp, n
2111
-				   );
2117
+						);
2112 2118
 				n=-1; /* fail */
2113 2119
 				/* don't cache fd & close it */
2114
-				do_close_fd = 1;
2120
+				do_close_fd = (fd==-1)?0:1;
2115 2121
 #ifdef TCP_FD_CACHE
2116 2122
 				use_fd_cache = 0;
2117 2123
 #endif /* TCP_FD_CACHE */
... ...
@@ -2943,6 +2949,12 @@ inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
2943 2949
 	struct send_fd_info* t;
2944 2950
 	
2945 2951
 	for (p=t=&q->data[0]; p<q->crt; p++){
2952
+		if (unlikely(p->tcp_conn->state == S_CONN_BAD ||
2953
+					 p->tcp_conn->flags & F_CONN_FD_CLOSED ||
2954
+					 p->tcp_conn->s ==-1)) {
2955
+			/* bad and/or already closed connection => remove */
2956
+			goto rm_con;
2957
+		}
2946 2958
 		if (unlikely(send_fd(p->unix_sock, &(p->tcp_conn),
2947 2959
 					sizeof(struct tcp_connection*), p->tcp_conn->s)<=0)){
2948 2960
 			if ( ((errno==EAGAIN)||(errno==EWOULDBLOCK)) && 
... ...
@@ -2958,7 +2970,11 @@ inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
2958 2970
 						   p->unix_sock, (long)(p-&q->data[0]), p->retries,
2959 2971
 						   p->tcp_conn, p->tcp_conn->s, errno,
2960 2972
 						   strerror(errno));
2973
+rm_con:
2961 2974
 #ifdef TCP_ASYNC
2975
+				/* if a connection is on the send_fd queue it means it's
2976
+				   not watched for read anymore => could be watched only for
2977
+				   write */
2962 2978
 				if (p->tcp_conn->flags & F_CONN_WRITE_W){
2963 2979
 					io_watch_del(&io_h, p->tcp_conn->s, -1, IO_FD_CLOSING);
2964 2980
 					p->tcp_conn->flags &=~F_CONN_WRITE_W;
... ...
@@ -3222,6 +3238,7 @@ error:
3222 3238
 inline static int handle_ser_child(struct process_table* p, int fd_i)
3223 3239
 {
3224 3240
 	struct tcp_connection* tcpconn;
3241
+	struct tcp_connection* tmp;
3225 3242
 	long response[2];
3226 3243
 	int cmd;
3227 3244
 	int bytes;
... ...
@@ -3317,9 +3334,26 @@ inline static int handle_ser_child(struct process_table* p, int fd_i)
3317 3334
 			/* send the requested FD  */
3318 3335
 			/* WARNING: take care of setting refcnt properly to
3319 3336
 			 * avoid race conditions */
3320
-			if (unlikely(send_fd(p->unix_sock, &tcpconn, sizeof(tcpconn),
3321
-								tcpconn->s)<=0)){
3322
-				LOG(L_ERR, "ERROR: handle_ser_child: send_fd failed\n");
3337
+			if (unlikely(tcpconn->state == S_CONN_BAD ||
3338
+						(tcpconn->flags & F_CONN_FD_CLOSED) ||
3339
+						tcpconn->s ==-1)) {
3340
+				/* connection is already marked as bad and/or has no
3341
+				   fd => don't try to send the fd (trying to send a
3342
+				   closed fd _will_ fail) */
3343
+				tmp = 0;
3344
+				if (unlikely(send_all(p->unix_sock, &tmp, sizeof(tmp)) <= 0))
3345
+					BUG("handle_ser_child: CONN_GET_FD: send_all failed\n");
3346
+				/* no need to attempt to destroy the connection, it should
3347
+				   be already in the process of being destroyed */
3348
+			} else if (unlikely(send_fd(p->unix_sock, &tcpconn,
3349
+										sizeof(tcpconn), tcpconn->s)<=0)){
3350
+				LOG(L_ERR, "handle_ser_child: CONN_GET_FD:"
3351
+							" send_fd failed\n");
3352
+				/* try sending error (better then not sending anything) */
3353
+				tmp = 0;
3354
+				if (unlikely(send_all(p->unix_sock, &tmp, sizeof(tmp)) <= 0))
3355
+					BUG("handle_ser_child: CONN_GET_FD:"
3356
+							" send_fd send_all fallback failed\n");
3323 3357
 			}
3324 3358
 			break;
3325 3359
 		case CONN_NEW: