Browse code

- tcp support for queueing writes: if some data cannot be written immediately on the socket (socket buffers full or still connecting), the data will be queued and written at a later time (max. queue size per socket is controlled by tcp_conn_wq_max, timeout by tcp_send_timeout and total queued bytes / max. mem. used by tcp_wq_max). Disabled by default (experimental); to enable it, use tcp_buf_write=yes in ser.cfg. To compile without queueing support, use -DNO_TCP_BUF_WRITE.

Andrei Pelinescu-Onciul authored on 04/12/2007 20:25:29
Showing 11 changed files
... ...
@@ -77,7 +77,7 @@ MAIN_NAME=ser
77 77
 VERSION = 2
78 78
 PATCHLEVEL = 1
79 79
 SUBLEVEL =  0
80
-EXTRAVERSION = -dev13
80
+EXTRAVERSION = -dev14
81 81
 
82 82
 SER_VER = $(shell expr $(VERSION) \* 1000000 + $(PATCHLEVEL) \* 1000 + \
83 83
 			$(SUBLEVEL) )
... ...
@@ -120,6 +120,15 @@ new config variables:
120 120
      will be cached inside the process calling tcp_send (performance increase
121 121
      for sending over tcp at the cost of slightly slower connection closing and
122 122
      extra FDs kept open)
123
+  tcp_buf_write = yes | no (default no) - if enabled all the tcp  writes that 
124
+     would block / wait for connect to finish, will be queued and attempted
125
+     later (see also tcp_conn_wq_max and tcp_wq_max).
126
+  tcp_conn_wq_max = bytes (default 32 K) - maximum bytes queued for write 
127
+     allowed per connection. Attempting to queue more bytes would result
128
+     in an error and in the connection being closed (too slow). If 
129
+     tcp_buf_write is not enabled, it has no effect.
130
+  tcp_wq_max = bytes (default 10 Mb) - maximum bytes queued for write allowed
131
+     globally. It has no effect if tcp_buf_write is not enabled.
123 132
   tcp_defer_accept =  yes | no (default no) on freebsd  / number of seconds
124 133
         before timeout on linux (default disabled) - tcp accepts will be 
125 134
         delayed until some data is received (improves performance on proxies
... ...
@@ -293,6 +293,9 @@ TCP_MAX_CONNECTIONS	"tcp_max_connections"
293 293
 TCP_SOURCE_IPV4		"tcp_source_ipv4"
294 294
 TCP_SOURCE_IPV6		"tcp_source_ipv6"
295 295
 TCP_OPT_FD_CACHE	"tcp_fd_cache"
296
+TCP_OPT_BUF_WRITE	"tcp_buf_write"
297
+TCP_OPT_CONN_WQ_MAX	"tcp_conn_wq_max"
298
+TCP_OPT_WQ_MAX		"tcp_wq_max"
296 299
 TCP_OPT_DEFER_ACCEPT "tcp_defer_accept"
297 300
 TCP_OPT_DELAYED_ACK	"tcp_delayed_ack"
298 301
 TCP_OPT_SYNCNT		"tcp_syncnt"
... ...
@@ -561,6 +564,12 @@ EAT_ABLE	[\ \t\b\r]
561 564
 									return TCP_SOURCE_IPV6; }
562 565
 <INITIAL>{TCP_OPT_FD_CACHE}		{ count(); yylval.strval=yytext;
563 566
 									return TCP_OPT_FD_CACHE; }
567
+<INITIAL>{TCP_OPT_CONN_WQ_MAX}	{ count(); yylval.strval=yytext;
568
+									return TCP_OPT_CONN_WQ_MAX; }
569
+<INITIAL>{TCP_OPT_WQ_MAX}	{ count(); yylval.strval=yytext;
570
+									return TCP_OPT_WQ_MAX; }
571
+<INITIAL>{TCP_OPT_BUF_WRITE}	{ count(); yylval.strval=yytext;
572
+									return TCP_OPT_BUF_WRITE; }
564 573
 <INITIAL>{TCP_OPT_DEFER_ACCEPT}	{ count(); yylval.strval=yytext;
565 574
 									return TCP_OPT_DEFER_ACCEPT; }
566 575
 <INITIAL>{TCP_OPT_DELAYED_ACK}	{ count(); yylval.strval=yytext;
... ...
@@ -334,6 +334,9 @@ static struct socket_id* mk_listen_id(char*, int, int);
334 334
 %token TCP_SOURCE_IPV4
335 335
 %token TCP_SOURCE_IPV6
336 336
 %token TCP_OPT_FD_CACHE
337
+%token TCP_OPT_BUF_WRITE
338
+%token TCP_OPT_CONN_WQ_MAX
339
+%token TCP_OPT_WQ_MAX
337 340
 %token TCP_OPT_DEFER_ACCEPT
338 341
 %token TCP_OPT_DELAYED_ACK
339 342
 %token TCP_OPT_SYNCNT
... ...
@@ -803,6 +806,30 @@ assign_stm:
803 806
 		#endif
804 807
 	}
805 808
 	| TCP_OPT_FD_CACHE EQUAL error { yyerror("boolean value expected"); }
809
+	| TCP_OPT_BUF_WRITE EQUAL NUMBER {
810
+		#ifdef USE_TCP
811
+			tcp_options.tcp_buf_write=$3;
812
+		#else
813
+			warn("tcp support not compiled in");
814
+		#endif
815
+	}
816
+	| TCP_OPT_BUF_WRITE EQUAL error { yyerror("boolean value expected"); }
817
+	| TCP_OPT_CONN_WQ_MAX EQUAL NUMBER {
818
+		#ifdef USE_TCP
819
+			tcp_options.tcpconn_wq_max=$3;
820
+		#else
821
+			warn("tcp support not compiled in");
822
+		#endif
823
+	}
824
+	| TCP_OPT_CONN_WQ_MAX error { yyerror("boolean value expected"); }
825
+	| TCP_OPT_WQ_MAX EQUAL NUMBER {
826
+		#ifdef USE_TCP
827
+			tcp_options.tcp_wq_max=$3;
828
+		#else
829
+			warn("tcp support not compiled in");
830
+		#endif
831
+	}
832
+	| TCP_OPT_WQ_MAX error { yyerror("boolean value expected"); }
806 833
 	| TCP_OPT_DEFER_ACCEPT EQUAL NUMBER {
807 834
 		#ifdef USE_TCP
808 835
 			tcp_options.defer_accept=$3;
... ...
@@ -532,10 +532,11 @@ static void core_tcpinfo(rpc_t* rpc, void* c)
532 532
 	if (!tcp_disable){
533 533
 		tcp_get_info(&ti);
534 534
 		rpc->add(c, "{", &handle);
535
-		rpc->struct_add(handle, "ddd",
535
+		rpc->struct_add(handle, "dddd",
536 536
 			"readers", ti.tcp_readers,
537 537
 			"max_connections", ti.tcp_max_connections,
538
-			"opened_connections", ti.tcp_connections_no
538
+			"opened_connections", ti.tcp_connections_no,
539
+			"write_queued_bytes", ti.tcp_write_queued
539 540
 		);
540 541
 	}else{
541 542
 		rpc->fault(c, 500, "tcp support disabled");
... ...
@@ -561,8 +562,13 @@ static void core_tcp_options(rpc_t* rpc, void* c)
561 562
 	if (!tcp_disable){
562 563
 		tcp_options_get(&t);
563 564
 		rpc->add(c, "{", &handle);
564
-		rpc->struct_add(handle, "ddddddddd",
565
+		rpc->struct_add(handle, "ddddddddddddd",
565 566
 			"fd_cache",		t.fd_cache,
567
+			"tcp_buf_write",	t.tcp_buf_write,
568
+			"tcpconn_wq_max",	t.tcpconn_wq_max,
569
+			"tcp_wq_max",	t.tcp_wq_max,
570
+			"tcp_wq_timeout",	TICKS_TO_S(t.tcp_wq_timeout),
571
+			
566 572
 			"defer_accept",	t.defer_accept,
567 573
 			"delayed_ack",	t.delayed_ack,
568 574
 			"syncnt",		t.syncnt,
... ...
@@ -842,7 +842,6 @@ again_devpoll2:
842 842
 					h->poll_method);
843 843
 			goto error;
844 844
 	}
845
-	h->fd_no--;
846 845
 	return 0;
847 846
 error:
848 847
 	return -1;
... ...
@@ -34,6 +34,7 @@
34 34
  *  2007-07-26  improved tcp connection hash function; increased aliases
35 35
  *               hash size (andrei)
36 36
  *  2007-11-26  switched to local_timer (andrei)
37
+ *  2007-11-30  buffered write support (andrei)
37 38
  */
38 39
 
39 40
 
... ...
@@ -41,6 +42,8 @@
41 42
 #ifndef _tcp_conn_h
42 43
 #define _tcp_conn_h
43 44
 
45
+#include "tcp_options.h"
46
+
44 47
 #include "ip_addr.h"
45 48
 #include "locking.h"
46 49
 #include "atomic_ops.h"
... ...
@@ -67,6 +70,7 @@
67 70
 #define F_CONN_NON_BLOCKING 1
68 71
 #define F_CONN_REMOVED      2 /* no longer  in "main" listen fd list */
69 72
 #define F_CONN_READER       4 /* handled by a tcp reader */
73
+#define F_CONN_WRITE_W      8 /* watched for write (main) */
70 74
 
71 75
 
72 76
 enum tcp_req_errors {	TCP_REQ_INIT, TCP_REQ_OK, TCP_READ_ERROR,
... ...
@@ -86,7 +90,7 @@ enum tcp_conn_states { S_CONN_ERROR=-2, S_CONN_BAD=-1, S_CONN_OK=0,
86 90
 
87 91
 /* fd communication commands */
88 92
 enum conn_cmds { CONN_DESTROY=-3, CONN_ERROR=-2, CONN_EOF=-1, CONN_RELEASE, 
89
-					CONN_GET_FD, CONN_NEW };
93
+					CONN_GET_FD, CONN_NEW, CONN_QUEUED_WRITE };
90 94
 /* CONN_RELEASE, EOF, ERROR, DESTROY can be used by "reader" processes
91 95
  * CONN_GET_FD, NEW, ERROR only by writers */
92 96
 
... ...
@@ -121,6 +125,23 @@ struct tcp_conn_alias{
121 125
 };
122 126
 
123 127
 
128
+#ifdef TCP_BUF_WRITE
129
/* One chunk of data queued for writing. Chunks form a singly linked list;
 * buf is declared with a single byte, the chunk is allocated larger so that
 * buf can hold b_size bytes. */
struct tcp_wbuffer {
	struct tcp_wbuffer* next; /* next chunk in the queue, 0 for the last */
	unsigned int b_size;      /* usable size of buf, in bytes */
	char buf[1];              /* start of the chunk payload */
};

/* Per-connection queue of chunks waiting to be written. */
struct tcp_wbuffer_queue {
	struct tcp_wbuffer* first; /* chunk written next, 0 if nothing queued */
	struct tcp_wbuffer* last;  /* chunk new data is appended to */
	unsigned int queued;       /* total size */
	unsigned int offset;       /* offset in the first wbuffer where data
								  starts */
	unsigned int last_used;    /* how much of the last buffer is used */
};
143
+#endif
144
+
124 145
 
125 146
 struct tcp_connection{
126 147
 	int s; /*socket, used by "tcp main" */
... ...
@@ -137,7 +158,7 @@ struct tcp_connection{
137 158
 	enum tcp_conn_states state; /* connection state */
138 159
 	void* extra_data; /* extra data associated to the connection, 0 for tcp*/
139 160
 	struct timer_ln timer;
140
-	unsigned int timeout;/* connection timeout, after this it will be removed*/
161
+	ticks_t timeout;/* connection timeout, after this it will be removed*/
141 162
 	unsigned id_hash; /* hash index in the id_hash */
142 163
 	struct tcp_connection* id_next; /* next, prev in id hash table */
143 164
 	struct tcp_connection* id_prev;
... ...
@@ -145,6 +166,10 @@ struct tcp_connection{
145 166
 	struct tcp_connection* c_prev;
146 167
 	struct tcp_conn_alias con_aliases[TCP_CON_MAX_ALIASES];
147 168
 	int aliases; /* aliases number, at least 1 */
169
+#ifdef TCP_BUF_WRITE
170
+	ticks_t last_write; /* time when the last write took place */
171
+	struct tcp_wbuffer_queue wbuf_q;
172
+#endif
148 173
 };
149 174
 
150 175
 
... ...
@@ -35,6 +35,8 @@ struct tcp_gen_info{
35 35
 	int tcp_readers;
36 36
 	int tcp_max_connections;
37 37
 	int tcp_connections_no; /* crt. number */
38
+	int tcp_write_queued; /* total bytes queued for write, 0 if no
39
+							 write queued support is enabled */
38 40
 };
39 41
 
40 42
 
... ...
@@ -87,6 +87,7 @@
87 87
  *  2007-11-27  added send fd cache and reader fd reuse (andrei)
88 88
  *  2007-11-28  added support for TCP_DEFER_ACCEPT, KEEPALIVE, KEEPINTVL,
89 89
  *               KEEPCNT, QUICKACK, SYNCNT, LINGER2 (andrei)
90
+ *  2007-12-04  support for queueing write requests (andrei)
90 91
  */
91 92
 
92 93
 
... ...
@@ -145,6 +146,7 @@
145 146
 
146 147
 #include "tcp_info.h"
147 148
 #include "tcp_options.h"
149
+#include "ut.h"
148 150
 
149 151
 #define local_malloc pkg_malloc
150 152
 #define local_free   pkg_free
... ...
@@ -177,6 +179,12 @@
177 179
 #define TCPCONN_TIMEOUT_MIN_RUN 1  /* once per tick */
178 180
 #define TCPCONN_WAIT_TIMEOUT 1 /* 1 tick */
179 181
 
182
+#ifdef TCP_BUF_WRITE
183
+#define TCP_WBUF_SIZE	1024 /* FIXME: after debugging switch to 16-32k */
184
+static unsigned int* tcp_total_wq=0;
185
+#endif
186
+
187
+
180 188
 enum fd_types { F_NONE, F_SOCKINFO /* a tcp_listen fd */,
181 189
 				F_TCPCONN, F_TCPCHILD, F_PROC };
182 190
 
... ...
@@ -542,6 +550,173 @@ end:
542 550
 
543 551
 
544 552
 
553
+inline static int _tcpconn_write_nb(int fd, struct tcp_connection* c,
554
+									char* buf, int len);
555
+
556
+
557
+#ifdef TCP_BUF_WRITE
558
+
559
+
560
+inline static int wbufq_add(struct  tcp_connection* c, char* data, 
561
+							unsigned int size)
562
+{
563
+	struct tcp_wbuffer_queue* q;
564
+	struct tcp_wbuffer* wb;
565
+	unsigned int last_free;
566
+	unsigned int wb_size;
567
+	unsigned int crt_size;
568
+	ticks_t t;
569
+	
570
+	q=&c->wbuf_q;
571
+	t=get_ticks_raw();
572
+	if (unlikely(	((q->queued+size)>tcp_options.tcpconn_wq_max) ||
573
+					((*tcp_total_wq+size)>tcp_options.tcp_wq_max) ||
574
+					(q->first &&
575
+					TICKS_GT(t, c->last_write+tcp_options.tcp_wq_timeout)) )){
576
+		LOG(L_ERR, "ERROR: wbufq_add(%d bytes): write queue full or timeout "
577
+					" (%d, total %d, last write %d s ago)\n",
578
+					size, q->queued, *tcp_total_wq,
579
+					TICKS_TO_S(t-c->last_write));
580
+		goto error;
581
+	}
582
+	
583
+	if (unlikely(q->last==0)){
584
+		wb_size=MAX_unsigned(TCP_WBUF_SIZE, size);
585
+		wb=shm_malloc(sizeof(*wb)+wb_size-1);
586
+		if (unlikely(wb==0))
587
+			goto error;
588
+		wb->b_size=wb_size;
589
+		wb->next=0;
590
+		q->last=wb;
591
+		q->first=wb;
592
+		q->last_used=0;
593
+		q->offset=0;
594
+		c->last_write=get_ticks_raw(); /* start with the crt. time */
595
+	}else{
596
+		wb=q->last;
597
+	}
598
+	
599
+	while(size){
600
+		last_free=wb->b_size-q->last_used;
601
+		if (last_free==0){
602
+			wb_size=MAX_unsigned(TCP_WBUF_SIZE, size);
603
+			wb=shm_malloc(sizeof(*wb)+wb_size-1);
604
+			if (unlikely(wb==0))
605
+				goto error;
606
+			wb->b_size=wb_size;
607
+			wb->next=0;
608
+			q->last->next=wb;
609
+			q->last=wb;
610
+			q->last_used=0;
611
+			last_free=wb->b_size;
612
+		}
613
+		crt_size=MIN_unsigned(last_free, size);
614
+		memcpy(wb->buf, data, crt_size);
615
+		q->last_used+=crt_size;
616
+		size-=crt_size;
617
+		data+=crt_size;
618
+		q->queued+=crt_size;
619
+		atomic_add_int((int*)tcp_total_wq, crt_size);
620
+	}
621
+	return 0;
622
+error:
623
+	return -1;
624
+}
625
+
626
+
627
+
628
+inline static void wbufq_destroy( struct  tcp_wbuffer_queue* q)
629
+{
630
+	struct tcp_wbuffer* wb;
631
+	struct tcp_wbuffer* next_wb;
632
+	int unqueued;
633
+	
634
+	unqueued=0;
635
+	if (likely(q->first)){
636
+		wb=q->first;
637
+		do{
638
+			next_wb=wb->next;
639
+			unqueued+=(wb==q->last)?q->last_used:wb->b_size;
640
+			if (wb==q->first)
641
+				unqueued-=q->offset;
642
+			shm_free(wb);
643
+			wb=next_wb;
644
+		}while(wb);
645
+	}
646
+	memset(q, 0, sizeof(*q));
647
+	atomic_add_int((int*)tcp_total_wq, -unqueued);
648
+}
649
+
650
+
651
+
652
+/* tries to empty the queue
653
+ * returns -1 on error, bytes written on success (>=0) 
654
+ * if the whole queue is emptied => sets *empty*/
655
+inline static int wbufq_run(int fd, struct tcp_connection* c, int* empty)
656
+{
657
+	struct tcp_wbuffer_queue* q;
658
+	struct tcp_wbuffer* wb;
659
+	int n;
660
+	int ret;
661
+	int block_size;
662
+	ticks_t t;
663
+	char* buf;
664
+	
665
+	*empty=0;
666
+	ret=0;
667
+	t=get_ticks_raw();
668
+	lock_get(&c->write_lock);
669
+	q=&c->wbuf_q;
670
+	while(q->first){
671
+		block_size=((q->first==q->last)?q->last_used:q->first->b_size)-
672
+						q->offset;
673
+		buf=q->first->buf+q->offset;
674
+		n=_tcpconn_write_nb(fd, c, buf, block_size);
675
+		if (likely(n>0)){
676
+			ret+=n;
677
+			if (likely(n==block_size)){
678
+				wb=q->first;
679
+				q->first=q->first->next; 
680
+				shm_free(wb);
681
+				q->offset=0;
682
+				q->queued-=block_size;
683
+				atomic_add_int((int*)tcp_total_wq, -block_size);
684
+			}else{
685
+				q->offset+=n;
686
+				q->queued-=n;
687
+				atomic_add_int((int*)tcp_total_wq, -n);
688
+				break;
689
+			}
690
+			c->last_write=t;
691
+			c->state=S_CONN_OK;
692
+		}else{
693
+			if (n<0){
694
+				/* EINTR is handled inside _tcpconn_write_nb */
695
+				if (!(errno==EAGAIN || errno==EWOULDBLOCK)){
696
+					ret=-1;
697
+					LOG(L_ERR, "ERROR: wbuf_runq: %s [%d]\n",
698
+						strerror(errno), errno);
699
+				}
700
+			}
701
+			break;
702
+		}
703
+	}
704
+	if (likely(q->first==0)){
705
+		q->last=0;
706
+		q->last_used=0;
707
+		q->offset=0;
708
+		*empty=1;
709
+	}
710
+	if (unlikely(c->state==S_CONN_CONNECT && (ret>0)))
711
+			c->state=S_CONN_OK;
712
+	lock_release(&c->write_lock);
713
+	return ret;
714
+}
715
+
716
+#endif /* TCP_BUF_WRITE */
717
+
718
+
719
+
545 720
 #if 0
546 721
 /* blocking write even on non-blocking sockets 
547 722
  * if TCP_TIMEOUT will return with error */
... ...
@@ -687,6 +862,10 @@ struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
687 862
 	socklen_t my_name_len;
688 863
 	struct tcp_connection* con;
689 864
 	struct ip_addr ip;
865
+	enum tcp_conn_states state;
866
+#ifdef TCP_BUF_WRITE
867
+	int n;
868
+#endif /* TCP_BUF_WRITE */
690 869
 
691 870
 	s=-1;
692 871
 	
... ...
@@ -710,11 +889,30 @@ struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
710 889
 	if (from && bind(s, &from->s, sockaddru_len(*from)) != 0)
711 890
 		LOG(L_WARN, "WARNING: tcpconn_connect: binding to source address"
712 891
 					" failed: %s [%d]\n", strerror(errno), errno);
713
-
714
-	if (tcp_blocking_connect(s, &server->s, sockaddru_len(*server))<0){
715
-		LOG(L_ERR, "ERROR: tcpconn_connect: tcp_blocking_connect failed\n");
716
-		goto error;
892
+#ifdef TCP_BUF_WRITE
893
+	if (likely(tcp_options.tcp_buf_write)){
894
+again:
895
+		n=connect(s, &server->s, sockaddru_len(*server));
896
+		if (unlikely(n==-1)){
897
+			if (errno==EINTR) goto again;
898
+			if (errno!=EINPROGRESS && errno!=EALREADY){
899
+				LOG(L_ERR, "ERROR: tcpconn_connect: connect: (%d) %s\n",
900
+						errno, strerror(errno));
901
+				goto error;
902
+			}
903
+			state=S_CONN_CONNECT;
904
+		}
905
+	}else{
906
+#endif /* TCP_BUF_WRITE */
907
+		if (tcp_blocking_connect(s, &server->s, sockaddru_len(*server))<0){
908
+			LOG(L_ERR, "ERROR: tcpconn_connect: tcp_blocking_connect"
909
+						" failed\n");
910
+			goto error;
911
+		}
912
+		state=S_CONN_OK;
913
+#ifdef TCP_BUF_WRITE
717 914
 	}
915
+#endif /* TCP_BUF_WRITE */
718 916
 	if (from){
719 917
 		su2ip_addr(&ip, from);
720 918
 		if (!ip_addr_any(&ip))
... ...
@@ -746,7 +944,7 @@ skip:
746 944
 		else si=sendipv6_tcp;
747 945
 #endif
748 946
 	}
749
-	con=tcpconn_new(s, server, from, si,  type, S_CONN_CONNECT);
947
+	con=tcpconn_new(s, server, from, si,  type, state);
750 948
 	if (con==0){
751 949
 		LOG(L_ERR, "ERROR: tcp_connect: tcpconn_new failed, closing the "
752 950
 				 " socket\n");
... ...
@@ -818,6 +1016,10 @@ static inline void _tcpconn_detach(struct tcp_connection *c)
818 1016
 
819 1017
 static inline void _tcpconn_free(struct tcp_connection* c)
820 1018
 {
1019
+#ifdef TCP_BUF_WRITE
1020
+	if (unlikely(c->wbuf_q.first))
1021
+		wbufq_destroy(&c->wbuf_q);
1022
+#endif
821 1023
 	lock_destroy(&c->write_lock);
822 1024
 #ifdef USE_TLS
823 1025
 	if (unlikely(c->type==PROTO_TLS)) tls_tcpconn_clean(c);
... ...
@@ -1134,6 +1336,9 @@ int tcp_send(struct dest_info* dst, union sockaddr_union* from,
1134 1336
 	long response[2];
1135 1337
 	int n;
1136 1338
 	int do_close_fd;
1339
+#ifdef TCP_BUF_WRITE
1340
+	int enable_write_watch;
1341
+#endif /* TCP_BUF_WRITE */
1137 1342
 #ifdef TCP_FD_CACHE
1138 1343
 	struct fd_cache_entry* fd_cache_e;
1139 1344
 	
... ...
@@ -1204,6 +1409,24 @@ no_id:
1204 1409
 			goto send_it;
1205 1410
 		}
1206 1411
 get_fd:
1412
+#ifdef TCP_BUF_WRITE
1413
+		/* if data is already queued, we don't need the fd any more */
1414
+		if (unlikely(tcp_options.tcp_buf_write && c->wbuf_q.first)){
1415
+			lock_get(&c->write_lock);
1416
+				if (likely(c->wbuf_q.first)){
1417
+					do_close_fd=0;
1418
+					if (unlikely(wbufq_add(c, buf, len)<0)){
1419
+						lock_release(&c->write_lock);
1420
+						n=-1;
1421
+						goto error;
1422
+					}
1423
+					n=len;
1424
+					lock_release(&c->write_lock);
1425
+					goto release_c;
1426
+				}
1427
+			lock_release(&c->write_lock);
1428
+		}
1429
+#endif /* TCP_BUF_WRITE */
1207 1430
 		/* check if this is not the same reader process holding
1208 1431
 		 *  c  and if so send directly on c->fd */
1209 1432
 		if (c->reader_pid==my_pid()){
... ...
@@ -1237,6 +1460,7 @@ get_fd:
1237 1460
 				LOG(L_ERR, "BUG: tcp_send: failed to get fd(receive_fd):"
1238 1461
 							" %s (%d)\n", strerror(errno), errno);
1239 1462
 				n=-1;
1463
+				do_close_fd=0;
1240 1464
 				goto release_c;
1241 1465
 			}
1242 1466
 			if (unlikely(c!=tmp)){
... ...
@@ -1256,6 +1480,21 @@ get_fd:
1256 1480
 send_it:
1257 1481
 	DBG("tcp_send: sending...\n");
1258 1482
 	lock_get(&c->write_lock);
1483
+#ifdef TCP_BUF_WRITE
1484
+	if (likely(tcp_options.tcp_buf_write)){
1485
+		if (c->wbuf_q.first){
1486
+			if (unlikely(wbufq_add(c, buf, len)<0)){
1487
+				lock_release(&c->write_lock);
1488
+				n=-1;
1489
+				goto error;
1490
+			}
1491
+			lock_release(&c->write_lock);
1492
+			n=len;
1493
+			goto end;
1494
+		}
1495
+		n=_tcpconn_write_nb(fd, c, buf, len);
1496
+	}else{
1497
+#endif /* TCP_BUF_WRITE */
1259 1498
 #ifdef USE_TLS
1260 1499
 	if (c->type==PROTO_TLS)
1261 1500
 		n=tls_blocking_write(c, fd, buf, len);
... ...
@@ -1263,10 +1502,39 @@ send_it:
1263 1502
 #endif
1264 1503
 		/* n=tcp_blocking_write(c, fd, buf, len); */
1265 1504
 		n=tsend_stream(fd, buf, len, tcp_send_timeout*1000); 
1505
+#ifdef TCP_BUF_WRITE
1506
+	}
1507
+#endif /* TCP_BUF_WRITE */
1266 1508
 	lock_release(&c->write_lock);
1267 1509
 	DBG("tcp_send: after write: c= %p n=%d fd=%d\n",c, n, fd);
1268 1510
 	DBG("tcp_send: buf=\n%.*s\n", (int)len, buf);
1269 1511
 	if (unlikely(n<0)){
1512
+#ifdef TCP_BUF_WRITE
1513
+		if (tcp_options.tcp_buf_write && 
1514
+				(errno==EAGAIN || errno==EWOULDBLOCK)){
1515
+			lock_get(&c->write_lock);
1516
+			enable_write_watch=(c->wbuf_q.first==0);
1517
+			if (unlikely(wbufq_add(c, buf, len)<0)){
1518
+				lock_release(&c->write_lock);
1519
+				n=-1;
1520
+				goto error;
1521
+			}
1522
+			lock_release(&c->write_lock);
1523
+			n=len;
1524
+			if (enable_write_watch){
1525
+				response[0]=(long)c;
1526
+				response[1]=CONN_QUEUED_WRITE;
1527
+				if (send_all(unix_tcp_sock, response, sizeof(response))<=0){
1528
+					LOG(L_ERR, "BUG: tcp_send: error return failed "
1529
+							"(write):%s (%d)\n", strerror(errno), errno);
1530
+					n=-1;
1531
+					goto error;
1532
+				}
1533
+			}
1534
+			goto end;
1535
+		}
1536
+error:
1537
+#endif /* TCP_BUF_WRITE */
1270 1538
 		LOG(L_ERR, "ERROR: tcp_send: failed to send\n");
1271 1539
 		/* error on the connection , mark it as bad and set 0 timeout */
1272 1540
 		c->state=S_CONN_BAD;
... ...
@@ -1294,6 +1562,13 @@ send_it:
1294 1562
 		if (do_close_fd) close(fd);
1295 1563
 		return n; /* error return, no tcpconn_put */
1296 1564
 	}
1565
+#ifdef TCP_BUF_WRITE
1566
+	if (likely(tcp_options.tcp_buf_write)){
1567
+		if (unlikely(c->state==S_CONN_CONNECT))
1568
+			c->state=S_CONN_OK;
1569
+		c->last_write=get_ticks_raw();
1570
+	}
1571
+#endif /* TCP_BUF_WRITE */
1297 1572
 end:
1298 1573
 #ifdef TCP_FD_CACHE
1299 1574
 	if (unlikely((fd_cache_e==0) && tcp_options.fd_cache)){
... ...
@@ -1465,23 +1740,40 @@ static void tcpconn_destroy(struct tcp_connection* tcpconn)
1465 1740
 	 *  (if the timer is already removed, nothing happens) */
1466 1741
 	if (likely(!(tcpconn->flags & F_CONN_READER)))
1467 1742
 		local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
1743
+#ifdef TCP_BUF_WRITE
1744
+	if (unlikely((tcpconn->flags & F_CONN_WRITE_W) ||
1745
+				!(tcpconn->flags & F_CONN_REMOVED))){
1746
+		LOG(L_CRIT, "tcpconn_destroy: possible BUG: flags = %0x\n",
1747
+					tcpconn->flags);
1748
+	}
1749
+	if (unlikely(tcpconn->wbuf_q.first)){
1750
+		lock_get(&tcpconn->write_lock);
1751
+			/* check again, while holding the lock */
1752
+			if (likely(tcpconn->wbuf_q.first))
1753
+				wbufq_destroy(&tcpconn->wbuf_q);
1754
+		lock_release(&tcpconn->write_lock);
1755
+	}
1756
+#endif /* TCP_BUF_WRITE */
1468 1757
 	TCPCONN_LOCK; /*avoid races w/ tcp_send*/
1469 1758
 	if (likely(atomic_dec_and_test(&tcpconn->refcnt))){ 
1470 1759
 		_tcpconn_detach(tcpconn);
1471 1760
 		TCPCONN_UNLOCK;
1472
-		DBG("tcpconn_destroy: destroying connection %p, flags %04x\n",
1473
-				tcpconn, tcpconn->flags);
1761
+		DBG("tcpconn_destroy: destroying connection %p (%d, %d) flags %04x\n",
1762
+				tcpconn, tcpconn->id, tcpconn->s, tcpconn->flags);
1474 1763
 		fd=tcpconn->s;
1475 1764
 #ifdef USE_TLS
1476 1765
 		/*FIXME: lock ->writelock ? */
1477 1766
 		if (tcpconn->type==PROTO_TLS)
1478 1767
 			tls_close(tcpconn, fd);
1479 1768
 #endif
1480
-		_tcpconn_free(tcpconn);
1769
+		_tcpconn_free(tcpconn); /* destroys also the wbuf_q if still present*/
1481 1770
 #ifdef TCP_FD_CACHE
1482 1771
 		if (likely(tcp_options.fd_cache)) shutdown(fd, SHUT_RDWR);
1483 1772
 #endif /* TCP_FD_CACHE */
1484
-		close(fd);
1773
+		if (unlikely(close(fd)<0)){
1774
+			LOG(L_ERR, "ERROR: tcpconn_destroy; close() failed: %s (%d)\n",
1775
+					strerror(errno), errno);
1776
+		}
1485 1777
 		(*tcp_connections_no)--;
1486 1778
 	}else{
1487 1779
 		TCPCONN_UNLOCK;
... ...
@@ -1627,6 +1919,13 @@ inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
1627 1919
 						   p->unix_sock, (long)(p-&q->data[0]), p->retries,
1628 1920
 						   p->tcp_conn, p->tcp_conn->s, errno,
1629 1921
 						   strerror(errno));
1922
+#ifdef TCP_BUF_WRITE
1923
+				if (p->tcp_conn->flags & F_CONN_WRITE_W){
1924
+					io_watch_del(&io_h, p->tcp_conn->s, -1, IO_FD_CLOSING);
1925
+					p->tcp_conn->flags &=~F_CONN_WRITE_W;
1926
+				}
1927
+#endif
1928
+				p->tcp_conn->flags &= ~F_CONN_READER;
1630 1929
 				tcpconn_destroy(p->tcp_conn);
1631 1930
 			}
1632 1931
 		}
... ...
@@ -1638,6 +1937,36 @@ inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
1638 1937
 #endif
1639 1938
 
1640 1939
 
1940
/* Non-blocking write on a tcp connection, unsafe version: it should be
 * called while holding c->write_lock and fd should be non-blocking.
 * The write is retried for as long as it fails with EINTR. When compiled
 * with USE_TLS, PROTO_TLS connections go through tls_blocking_write();
 * everything else uses send(), with MSG_NOSIGNAL if HAVE_MSG_NOSIGNAL is
 * defined.
 *
 * params: fd  - descriptor to write on
 *         c   - connection the fd belongs to
 *         buf - data to write
 *         len - number of bytes in buf
 * returns: number of bytes written on success, -1 on error (and sets errno)
 */
inline static int _tcpconn_write_nb(int fd, struct tcp_connection* c,
									char* buf, int len)
{
	int sent;
	
	do{
#ifdef USE_TLS
		if (unlikely(c->type==PROTO_TLS))
			/* FIXME: tls_nonblocking_write !! */
			sent=tls_blocking_write(c, fd, buf, len);
		else
#endif /* USE_TLS */
#ifdef HAVE_MSG_NOSIGNAL
			sent=send(fd, buf, len, MSG_NOSIGNAL);
#else
			sent=send(fd, buf, len, 0);
#endif /* HAVE_MSG_NOSIGNAL */
	}while(sent<0 && errno==EINTR);
	return sent;
}
1968
+
1969
+
1641 1970
 
1642 1971
 /* handles io from a tcp child process
1643 1972
  * params: tcp_c - pointer in the tcp_children array, to the entry for
... ...
@@ -1654,6 +1983,7 @@ inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
1654 1983
 	long response[2];
1655 1984
 	int cmd;
1656 1985
 	int bytes;
1986
+	int n;
1657 1987
 	ticks_t t;
1658 1988
 	
1659 1989
 	if (unlikely(tcp_c->unix_sock<=0)){
... ...
@@ -1715,6 +2045,12 @@ inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
1715 2045
 		case CONN_RELEASE:
1716 2046
 			tcp_c->busy--;
1717 2047
 			if (unlikely(tcpconn->state==S_CONN_BAD)){ 
2048
+#ifdef TCP_BUF_WRITE
2049
+				if (unlikely(tcpconn->flags & F_CONN_WRITE_W)){
2050
+					io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2051
+					tcpconn->flags &= ~F_CONN_WRITE_W;
2052
+				}
2053
+#endif /* TCP_BUF_WRITE */
1718 2054
 				tcpconn_destroy(tcpconn);
1719 2055
 				break;
1720 2056
 			}
... ...
@@ -1729,12 +2065,22 @@ inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
1729 2065
 								tcp_con_lifetime, t);
1730 2066
 			/* must be after the de-ref*/
1731 2067
 			tcpconn->flags&=~(F_CONN_REMOVED|F_CONN_READER);
1732
-			if (unlikely(
1733
-					io_watch_add(&io_h, tcpconn->s, POLLIN,
1734
-												F_TCPCONN, tcpconn)<0)){
2068
+#ifdef TCP_BUF_WRITE
2069
+			if (unlikely(tcpconn->flags & F_CONN_WRITE_W))
2070
+				n=io_watch_chg(&io_h, tcpconn->s, POLLIN| POLLOUT, -1);
2071
+			else
2072
+#endif /* TCP_BUF_WRITE */
2073
+				n=io_watch_add(&io_h, tcpconn->s, POLLIN, F_TCPCONN, tcpconn);
2074
+			if (unlikely(n<0)){
1735 2075
 				LOG(L_CRIT, "ERROR: tcp_main: handle_tcp_child: failed to add"
1736 2076
 						" new socket to the fd list\n");
1737 2077
 				tcpconn->flags|=F_CONN_REMOVED;
2078
+#ifdef TCP_BUF_WRITE
2079
+				if (unlikely(tcpconn->flags & F_CONN_WRITE_W)){
2080
+					io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2081
+					tcpconn->flags&=~F_CONN_WRITE_W;
2082
+				}
2083
+#endif /* TCP_BUF_WRITE */
1738 2084
 				tcpconn_destroy(tcpconn); /* closes also the fd */
1739 2085
 			}
1740 2086
 			DBG("handle_tcp_child: CONN_RELEASE  %p refcnt= %d\n", 
... ...
@@ -1749,6 +2095,12 @@ inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
1749 2095
 				 if (tcpconn->s!=-1)
1750 2096
 					io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
1751 2097
 				*/
2098
+#ifdef TCP_BUF_WRITE
2099
+				if ((tcpconn->flags & F_CONN_WRITE_W) && (tcpconn->s!=-1)){
2100
+					io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2101
+					tcpconn->flags&=~F_CONN_WRITE_W;
2102
+				}
2103
+#endif /* TCP_BUF_WRITE */
1752 2104
 				tcpconn_destroy(tcpconn); /* closes also the fd */
1753 2105
 				break;
1754 2106
 		default:
... ...
@@ -1785,6 +2137,7 @@ inline static int handle_ser_child(struct process_table* p, int fd_i)
1785 2137
 	int bytes;
1786 2138
 	int ret;
1787 2139
 	int fd;
2140
+	int flags;
1788 2141
 	ticks_t t;
1789 2142
 	
1790 2143
 	ret=-1;
... ...
@@ -1844,10 +2197,15 @@ inline static int handle_ser_child(struct process_table* p, int fd_i)
1844 2197
 	}
1845 2198
 	switch(cmd){
1846 2199
 		case CONN_ERROR:
1847
-			if (!(tcpconn->flags & F_CONN_REMOVED) && (tcpconn->s!=-1)){
2200
+			if ( (!(tcpconn->flags & F_CONN_REMOVED) ||
2201
+					(tcpconn->flags & F_CONN_WRITE_W) ) && (tcpconn->s!=-1)){
1848 2202
 				io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
1849 2203
 				tcpconn->flags|=F_CONN_REMOVED;
2204
+				tcpconn->flags&=~F_CONN_WRITE_W;
1850 2205
 			}
2206
+			LOG(L_ERR, "handle_ser_child: ERROR: received CON_ERROR for %p"
2207
+					" (id %d), refcnt %d\n", 
2208
+					tcpconn, tcpconn->id, atomic_get(&tcpconn->refcnt));
1851 2209
 			tcpconn_destroy(tcpconn); /* will close also the fd */
1852 2210
 			break;
1853 2211
 		case CONN_GET_FD:
... ...
@@ -1879,15 +2237,53 @@ inline static int handle_ser_child(struct process_table* p, int fd_i)
1879 2237
 			local_timer_add(&tcp_main_ltimer, &tcpconn->timer, 
1880 2238
 								tcp_con_lifetime, t);
1881 2239
 			tcpconn->flags&=~F_CONN_REMOVED;
2240
+			flags=POLLIN 
2241
+#ifdef TCP_BUF_WRITE
2242
+					/* not used for now, the connection is sent to tcp_main
2243
+					 * before knowing if we can write on it or we should 
2244
+					 * wait */
2245
+					| (((int)!(tcpconn->flags & F_CONN_WRITE_W)-1) & POLLOUT)
2246
+#endif /* TCP_BUF_WRITE */
2247
+					;
1882 2248
 			if (unlikely(
1883
-					io_watch_add(&io_h, tcpconn->s, POLLIN,
2249
+					io_watch_add(&io_h, tcpconn->s, flags,
1884 2250
 												F_TCPCONN, tcpconn)<0)){
1885 2251
 				LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed to add"
1886 2252
 						" new socket to the fd list\n");
1887 2253
 				tcpconn->flags|=F_CONN_REMOVED;
2254
+				tcpconn->flags&=~F_CONN_WRITE_W;
1888 2255
 				tcpconn_destroy(tcpconn); /* closes also the fd */
1889 2256
 			}
1890 2257
 			break;
2258
+#ifdef TCP_BUF_WRITE
2259
+		case CONN_QUEUED_WRITE:
2260
+			if (!(tcpconn->flags & F_CONN_WRITE_W)){
2261
+				if (tcpconn->flags& F_CONN_REMOVED){
2262
+					if (unlikely(io_watch_add(&io_h, tcpconn->s, POLLOUT,
2263
+												F_TCPCONN, tcpconn)<0)){
2264
+						LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed"
2265
+								    " to enable write watch on socket\n");
2266
+						tcpconn_destroy(tcpconn);
2267
+						break;
2268
+					}
2269
+				}else{
2270
+					if (unlikely(io_watch_chg(&io_h, tcpconn->s,
2271
+												POLLIN|POLLOUT, -1)<0)){
2272
+						LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed"
2273
+								    " to change socket watch events\n");
2274
+						io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2275
+						tcpconn->flags|=F_CONN_REMOVED;
2276
+						tcpconn_destroy(tcpconn);
2277
+						break;
2278
+					}
2279
+				}
2280
+				tcpconn->flags|=F_CONN_WRITE_W;
2281
+			}else{
2282
+				LOG(L_WARN, "tcp_main: hanlder_ser_child: connection %p"
2283
+							" already watched for write\n", tcpconn);
2284
+			}
2285
+			break;
2286
+#endif /* TCP_BUF_WRITE */
1891 2287
 		default:
1892 2288
 			LOG(L_CRIT, "BUG: handle_ser_child: unknown cmd %d\n", cmd);
1893 2289
 	}
... ...
@@ -2056,6 +2452,7 @@ static inline int handle_new_connect(struct socket_info* si)
2056 2452
 		if(unlikely(send2child(tcpconn)<0)){
2057 2453
 			LOG(L_ERR,"ERROR: handle_new_connect: no children "
2058 2454
 					"available\n");
2455
+			tcpconn->flags&=~F_CONN_READER;
2059 2456
 			tcpconn_destroy(tcpconn);
2060 2457
 		}
2061 2458
 #endif
... ...
@@ -2075,13 +2472,17 @@ static inline int handle_new_connect(struct socket_info* si)
2075 2472
  * params: tcpconn - pointer to the tcp_connection for which we have an io ev.
2076 2473
  *         fd_i    - index in the fd_array table (needed for delete)
2077 2474
  * returns:  handle_* return convention, but on success it always returns 0
2078
- *           (because it's one-shot, after a succesfull execution the fd is
2475
+ *           (because it's one-shot, after a successful execution the fd is
2079 2476
  *            removed from tcp_main's watch fd list and passed to a child =>
2080 2477
  *            tcp_main is not interested in further io events that might be
2081 2478
  *            queued for this fd)
2082 2479
  */
2083
-inline static int handle_tcpconn_ev(struct tcp_connection* tcpconn, int fd_i)
2480
+inline static int handle_tcpconn_ev(struct tcp_connection* tcpconn, short ev, 
2481
+										int fd_i)
2084 2482
 {
2483
+#ifdef TCP_BUF_WRITE
2484
+	int empty_q;
2485
+#endif /* TCP_BUF_WRITE */
2085 2486
 	/*  is refcnt!=0 really necessary? 
2086 2487
 	 *  No, in fact it's a bug: I can have the following situation: a send only
2087 2488
 	 *   tcp connection used by n processes simultaneously => refcnt = n. In 
... ...
@@ -2101,17 +2502,55 @@ inline static int handle_tcpconn_ev(struct tcp_connection* tcpconn, int fd_i)
2101 2502
 #endif
2102 2503
 	/* pass it to child, so remove it from the io watch list  and the local
2103 2504
 	 *  timer */
2104
-	DBG("handle_tcpconn_ev: data available on %p %d\n", tcpconn, tcpconn->s);
2105
-	if (unlikely(io_watch_del(&io_h, tcpconn->s, fd_i, 0)==-1)) goto error;
2106
-	tcpconn->flags|=F_CONN_REMOVED|F_CONN_READER;
2107
-	local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2108
-	tcpconn_ref(tcpconn); /* refcnt ++ */
2109
-	if (unlikely(send2child(tcpconn)<0)){
2110
-		LOG(L_ERR,"ERROR: handle_tcpconn_ev: no children available\n");
2111
-		tcpconn_destroy(tcpconn);
2505
+	DBG("handle_tcpconn_ev: ev (%0x) on %p %d\n", ev, tcpconn, tcpconn->s);
2506
+#ifdef TCP_BUF_WRITE
2507
+	if (unlikely((ev & POLLOUT) && (tcpconn->flags & F_CONN_WRITE_W))){
2508
+		if (unlikely(wbufq_run(tcpconn->s, tcpconn, &empty_q)<0)){
2509
+			io_watch_del(&io_h, tcpconn->s, fd_i, 0);
2510
+			tcpconn->flags|=F_CONN_REMOVED;
2511
+			tcpconn->flags&=~F_CONN_WRITE_W;
2512
+			tcpconn_destroy(tcpconn);
2513
+			goto error;
2514
+		}
2515
+		if (empty_q){
2516
+			if (tcpconn->flags & F_CONN_REMOVED){
2517
+				if (unlikely(io_watch_del(&io_h, tcpconn->s, fd_i, 0)==-1))
2518
+					goto error;
2519
+			}else{
2520
+				if (unlikely(io_watch_chg(&io_h, tcpconn->s,
2521
+											POLLIN, fd_i)==-1))
2522
+					goto error;
2523
+			}
2524
+		}
2525
+	}
2526
+	if (likely((ev & POLLIN) && !(tcpconn->flags & F_CONN_REMOVED))){
2527
+		if (unlikely(tcpconn->flags & F_CONN_WRITE_W)){
2528
+			if (unlikely(io_watch_chg(&io_h, tcpconn->s, POLLOUT, fd_i)==-1))
2529
+				goto error;
2530
+		}else
2531
+#else
2532
+	{
2533
+#endif /* TCP_BUF_WRITE */
2534
+			if (unlikely(io_watch_del(&io_h, tcpconn->s, fd_i, 0)==-1))
2535
+				goto error;
2536
+		tcpconn->flags|=F_CONN_REMOVED|F_CONN_READER;
2537
+		local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2538
+		tcpconn_ref(tcpconn); /* refcnt ++ */
2539
+		if (unlikely(send2child(tcpconn)<0)){
2540
+			LOG(L_ERR,"ERROR: handle_tcpconn_ev: no children available\n");
2541
+			tcpconn->flags&=~F_CONN_READER;
2542
+#ifdef TCP_BUF_WRITE
2543
+			if (tcpconn->flags & F_CONN_WRITE_W){
2544
+				io_watch_del(&io_h, tcpconn->s, fd_i, 0);
2545
+				tcpconn->flags&=~F_CONN_WRITE_W;
2546
+			}
2547
+#endif /* TCP_BUF_WRITE */
2548
+			tcpconn_destroy(tcpconn);
2549
+		}
2112 2550
 	}
2113 2551
 	return 0; /* we are not interested in possibly queued io events, 
2114
-				 the fd was either passed to a child, or closed */
2552
+				 the fd was either passed to a child, closed, or for writes,
2553
+				 everything possible was already written */
2115 2554
 error:
2116 2555
 	return -1;
2117 2556
 }
... ...
@@ -2131,7 +2570,7 @@ error:
2131 2570
  *         >0 on successful read from the fd (when there might be more io
2132 2571
  *            queued -- the receive buffer might still be non-empty)
2133 2572
  */
2134
-inline static int handle_io(struct fd_map* fm, short events, int idx)
2573
+inline static int handle_io(struct fd_map* fm, short ev, int idx)
2135 2574
 {	
2136 2575
 	int ret;
2137 2576
 	
... ...
@@ -2140,7 +2579,7 @@ inline static int handle_io(struct fd_map* fm, short events, int idx)
2140 2579
 			ret=handle_new_connect((struct socket_info*)fm->data);
2141 2580
 			break;
2142 2581
 		case F_TCPCONN:
2143
-			ret=handle_tcpconn_ev((struct tcp_connection*)fm->data, idx);
2582
+			ret=handle_tcpconn_ev((struct tcp_connection*)fm->data, ev, idx);
2144 2583
 			break;
2145 2584
 		case F_TCPCHILD:
2146 2585
 			ret=handle_tcp_child((struct tcp_child*)fm->data, idx);
... ...
@@ -2185,9 +2624,16 @@ static ticks_t tcpconn_main_timeout(ticks_t t, struct timer_ln* tl, void* data)
2185 2624
 				TCPCONN_UNLOCK; /* unlock as soon as possible */
2186 2625
 				fd=c->s;
2187 2626
 				if (likely(fd>0)){
2188
-					if (likely(!(c->flags & F_CONN_REMOVED))){
2627
+					if (likely(!(c->flags & F_CONN_REMOVED)
2628
+#ifdef TCP_BUF_WRITE
2629
+								|| (c->flags & F_CONN_WRITE_W)
2630
+#endif /* TCP_BUF_WRITE */
2631
+								)){
2189 2632
 						io_watch_del(&io_h, fd, -1, IO_FD_CLOSING);
2190 2633
 						c->flags|=F_CONN_REMOVED;
2634
+#ifdef TCP_BUF_WRITE
2635
+						c->flags&=~F_CONN_WRITE_W;
2636
+#endif /* TCP_BUF_WRITE */
2191 2637
 					}
2192 2638
 #ifdef USE_TLS
2193 2639
 					if (unlikely(c->type==PROTO_TLS ))
... ...
@@ -2250,9 +2696,16 @@ static inline void tcpconn_destroy_all()
2250 2696
 						local_timer_del(&tcp_main_ltimer, &c->timer);
2251 2697
 					/* else still in some reader */
2252 2698
 					fd=c->s;
2253
-					if (fd>0 && !(c->flags & F_CONN_REMOVED)){
2699
+					if (fd>0 && (!(c->flags & F_CONN_REMOVED)
2700
+#ifdef TCP_BUF_WRITE
2701
+								|| (c->flags & F_CONN_WRITE_W)
2702
+#endif /* TCP_BUF_WRITE */
2703
+								)){
2254 2704
 						io_watch_del(&io_h, fd, -1, IO_FD_CLOSING);
2255 2705
 						c->flags|=F_CONN_REMOVED;
2706
+#ifdef TCP_BUF_WRITE
2707
+						c->flags&=~F_CONN_WRITE_W;
2708
+#endif /* TCP_BUF_WRITE */
2256 2709
 					}
2257 2710
 				}else{
2258 2711
 					fd=-1;
... ...
@@ -2456,6 +2909,12 @@ void destroy_tcp()
2456 2909
 			shm_free(tcp_connections_no);
2457 2910
 			tcp_connections_no=0;
2458 2911
 		}
2912
+#ifdef TCP_BUF_WRITE
2913
+		if (tcp_total_wq){
2914
+			shm_free(tcp_total_wq);
2915
+			tcp_total_wq=0;
2916
+		}
2917
+#endif /* TCP_BUF_WRITE */
2459 2918
 		if (connection_id){
2460 2919
 			shm_free(connection_id);
2461 2920
 			connection_id=0;
... ...
@@ -2508,6 +2967,13 @@ int init_tcp()
2508 2967
 		goto error;
2509 2968
 	}
2510 2969
 	*connection_id=1;
2970
+#ifdef TCP_BUF_WRITE
2971
+	tcp_total_wq=shm_malloc(sizeof(*tcp_total_wq));
2972
+	if (tcp_total_wq==0){
2973
+		LOG(L_CRIT, "ERROR: init_tcp: could not alloc globals\n");
2974
+		goto error;
2975
+	}
2976
+#endif /* TCP_BUF_WRITE */
2511 2977
 	/* alloc hashtables*/
2512 2978
 	tcpconn_aliases_hash=(struct tcp_conn_alias**)
2513 2979
 			shm_malloc(TCP_ALIAS_HASH_SIZE* sizeof(struct tcp_conn_alias*));
... ...
@@ -2675,6 +3141,11 @@ void tcp_get_info(struct tcp_gen_info *ti)
2675 3141
 	ti->tcp_readers=tcp_children_no;
2676 3142
 	ti->tcp_max_connections=tcp_max_connections;
2677 3143
 	ti->tcp_connections_no=*tcp_connections_no;
3144
+#ifdef TCP_BUF_WRITE
3145
+	ti->tcp_write_queued=*tcp_total_wq;
3146
+#else
3147
+	ti->tcp_write_queued=0;
3148
+#endif /* TCP_BUF_WRITE */
2678 3149
 }
2679 3150
 
2680 3151
 #endif
... ...
@@ -25,6 +25,8 @@
25 25
 
26 26
 #include "tcp_options.h"
27 27
 #include "dprint.h"
28
+#include "globals.h"
29
+#include "timer_ticks.h"
28 30
 
29 31
 
30 32
 struct tcp_cfg_options tcp_options;
... ...
@@ -33,7 +35,12 @@ struct tcp_cfg_options tcp_options;
33 35
 /* set defaults */
34 36
 void init_tcp_options()
35 37
 {
36
-
38
+#ifdef TCP_BUF_WRITE
39
+	tcp_options.tcp_buf_write=0;
40
+	tcp_options.tcpconn_wq_max=32*1024; /* 32 k */
41
+	tcp_options.tcp_wq_max=10*1024*1024; /* 10 MB */
42
+	tcp_options.tcp_wq_timeout=S_TO_TICKS(tcp_send_timeout);
43
+#endif
37 44
 #ifdef TCP_FD_CACHE
38 45
 	tcp_options.fd_cache=1;
39 46
 #endif
... ...
@@ -54,7 +61,7 @@ void init_tcp_options()
54 61
 
55 62
 #define W_OPT_NC(option) \
56 63
 	if (tcp_options.option){\
57
-		WARN("tcp_options: tcp_" ##option \
64
+		WARN("tcp_options: tcp_" #option \
58 65
 				"cannot be enabled (recompile needed)\n"); \
59 66
 		tcp_options.option=0; \
60 67
 	}
... ...
@@ -63,7 +70,7 @@ void init_tcp_options()
63 70
 
64 71
 #define W_OPT_NS(option) \
65 72
 	if (tcp_options.option){\
66
-		WARN("tcp_options: tcp_" ##option \
73
+		WARN("tcp_options: tcp_" #option \
67 74
 				"cannot be enabled (no OS support)\n"); \
68 75
 		tcp_options.option=0; \
69 76
 	}
... ...
@@ -76,6 +83,13 @@ void tcp_options_check()
76 83
 	W_OPT_NC(defer_accept);
77 84
 #endif
78 85
 
86
+#ifndef TCP_BUF_WRITE
87
+	W_OPT_NC(tcp_buf_write);
88
+	W_OPT_NC(tcpconn_wq_max);
89
+	W_OPT_NC(tcp_wq_max);
90
+	W_OPT_NC(tcp_wq_timeout);
91
+#endif /* TCP_BUF_WRITE */
92
+
79 93
 #if ! defined HAVE_TCP_DEFER_ACCEPT && ! defined HAVE_TCP_ACCEPT_FILTER
80 94
 	W_OPT_NS(defer_accept);
81 95
 #endif
... ...
@@ -26,6 +26,11 @@
26 26
 #ifndef tcp_options_h
27 27
 #define tcp_options_h
28 28
 
29
+
30
+#ifndef NO_TCP_BUF_WRITE
31
+#define TCP_BUF_WRITE /* enable buffered writing */
32
+#endif 
33
+
29 34
 #ifndef NO_TCP_FD_CACHE
30 35
 #define TCP_FD_CACHE /* enable fd caching */
31 36
 #endif
... ...
@@ -95,6 +100,12 @@
95 100
 struct tcp_cfg_options{
96 101
 	/* ser tcp options */
97 102
 	int fd_cache; /* on /off */
103
+	/* tcp buf. write options */
104
+	int tcp_buf_write; /* on / off */
105
+	unsigned int tcpconn_wq_max; /* maximum queue len per connection */
106
+	unsigned int tcp_wq_max; /* maximum overall queued bytes */
107
+	unsigned int tcp_wq_timeout;      /* timeout for queue writes */
108
+
98 109
 	/* tcp socket options */
99 110
 	int defer_accept; /* on / off */
100 111
 	int delayed_ack; /* delay ack on connect */