Browse code

- tcp support for queueing writes: if some data cannot be written immediately on the socket (socket buffers full or still connecting), the data will be queued and written at a later time (max. queue size per socket is controlled by tcp_conn_wq_max, timeout by tcp_send_timeout and total queued bytes / max. mem. used by tcp_wq_max). By default disabled (experimental), to enable it use tcp_buf_write=yes in ser.cfg. To compile without queueing support use -DNO_TCP_BUF_WRITE.

Andrei Pelinescu-Onciul authored on 04/12/2007 20:25:29
Showing 11 changed files
... ...
@@ -77,7 +77,7 @@ MAIN_NAME=ser
77 77
 VERSION = 2
78 78
 PATCHLEVEL = 1
79 79
 SUBLEVEL =  0
80
-EXTRAVERSION = -dev13
80
+EXTRAVERSION = -dev14
81 81
 
82 82
 SER_VER = $(shell expr $(VERSION) \* 1000000 + $(PATCHLEVEL) \* 1000 + \
83 83
 			$(SUBLEVEL) )
... ...
@@ -120,6 +120,15 @@ new config variables:
120 120
      will be cached inside the process calling tcp_send (performance increase
121 121
      for sending over tcp at the cost of slightly slower connection closing and
122 122
      extra FDs kept open)
123
+  tcp_buf_write = yes | no (default no) - if enabled all the tcp  writes that 
124
+     would block / wait for connect to finish, will be queued and attempted
125
+     later (see also tcp_conn_wq_max and tcp_wq_max).
126
+  tcp_conn_wq_max = bytes (default 32 K) - maximum bytes queued for write 
127
+     allowed per connection. Attempting to queue more bytes would result
128
+     in an error and in the connection being closed (too slow). If 
129
+     tcp_buf_write is not enabled, it has no effect.
130
+  tcp_wq_max = bytes (default 10 Mb) - maximum bytes queued for write allowed
131
+     globally. It has no effect if tcp_buf_write is not enabled.
123 132
   tcp_defer_accept =  yes | no (default no) on freebsd  / number of seconds
124 133
         before timeout on linux (default disabled) - tcp accepts will be 
125 134
         delayed until some data is received (improves performance on proxies
... ...
@@ -293,6 +293,9 @@ TCP_MAX_CONNECTIONS	"tcp_max_connections"
293 293
 TCP_SOURCE_IPV4		"tcp_source_ipv4"
294 294
 TCP_SOURCE_IPV6		"tcp_source_ipv6"
295 295
 TCP_OPT_FD_CACHE	"tcp_fd_cache"
296
+TCP_OPT_BUF_WRITE	"tcp_buf_write"
297
+TCP_OPT_CONN_WQ_MAX	"tcp_conn_wq_max"
298
+TCP_OPT_WQ_MAX		"tcp_wq_max"
296 299
 TCP_OPT_DEFER_ACCEPT "tcp_defer_accept"
297 300
 TCP_OPT_DELAYED_ACK	"tcp_delayed_ack"
298 301
 TCP_OPT_SYNCNT		"tcp_syncnt"
... ...
@@ -561,6 +564,12 @@ EAT_ABLE	[\ \t\b\r]
561 561
 									return TCP_SOURCE_IPV6; }
562 562
 <INITIAL>{TCP_OPT_FD_CACHE}		{ count(); yylval.strval=yytext;
563 563
 									return TCP_OPT_FD_CACHE; }
564
+<INITIAL>{TCP_OPT_CONN_WQ_MAX}	{ count(); yylval.strval=yytext;
565
+									return TCP_OPT_CONN_WQ_MAX; }
566
+<INITIAL>{TCP_OPT_WQ_MAX}	{ count(); yylval.strval=yytext;
567
+									return TCP_OPT_WQ_MAX; }
568
+<INITIAL>{TCP_OPT_BUF_WRITE}	{ count(); yylval.strval=yytext;
569
+									return TCP_OPT_BUF_WRITE; }
564 570
 <INITIAL>{TCP_OPT_DEFER_ACCEPT}	{ count(); yylval.strval=yytext;
565 571
 									return TCP_OPT_DEFER_ACCEPT; }
566 572
 <INITIAL>{TCP_OPT_DELAYED_ACK}	{ count(); yylval.strval=yytext;
... ...
@@ -334,6 +334,9 @@ static struct socket_id* mk_listen_id(char*, int, int);
334 334
 %token TCP_SOURCE_IPV4
335 335
 %token TCP_SOURCE_IPV6
336 336
 %token TCP_OPT_FD_CACHE
337
+%token TCP_OPT_BUF_WRITE
338
+%token TCP_OPT_CONN_WQ_MAX
339
+%token TCP_OPT_WQ_MAX
337 340
 %token TCP_OPT_DEFER_ACCEPT
338 341
 %token TCP_OPT_DELAYED_ACK
339 342
 %token TCP_OPT_SYNCNT
... ...
@@ -803,6 +806,30 @@ assign_stm:
803 803
 		#endif
804 804
 	}
805 805
 	| TCP_OPT_FD_CACHE EQUAL error { yyerror("boolean value expected"); }
806
+	| TCP_OPT_BUF_WRITE EQUAL NUMBER {
807
+		#ifdef USE_TCP
808
+			tcp_options.tcp_buf_write=$3;
809
+		#else
810
+			warn("tcp support not compiled in");
811
+		#endif
812
+	}
813
+	| TCP_OPT_BUF_WRITE EQUAL error { yyerror("boolean value expected"); }
814
+	| TCP_OPT_CONN_WQ_MAX EQUAL NUMBER {
815
+		#ifdef USE_TCP
816
+			tcp_options.tcpconn_wq_max=$3;
817
+		#else
818
+			warn("tcp support not compiled in");
819
+		#endif
820
+	}
821
+	| TCP_OPT_CONN_WQ_MAX error { yyerror("boolean value expected"); }
822
+	| TCP_OPT_WQ_MAX EQUAL NUMBER {
823
+		#ifdef USE_TCP
824
+			tcp_options.tcp_wq_max=$3;
825
+		#else
826
+			warn("tcp support not compiled in");
827
+		#endif
828
+	}
829
+	| TCP_OPT_WQ_MAX error { yyerror("boolean value expected"); }
806 830
 	| TCP_OPT_DEFER_ACCEPT EQUAL NUMBER {
807 831
 		#ifdef USE_TCP
808 832
 			tcp_options.defer_accept=$3;
... ...
@@ -532,10 +532,11 @@ static void core_tcpinfo(rpc_t* rpc, void* c)
532 532
 	if (!tcp_disable){
533 533
 		tcp_get_info(&ti);
534 534
 		rpc->add(c, "{", &handle);
535
-		rpc->struct_add(handle, "ddd",
535
+		rpc->struct_add(handle, "dddd",
536 536
 			"readers", ti.tcp_readers,
537 537
 			"max_connections", ti.tcp_max_connections,
538
-			"opened_connections", ti.tcp_connections_no
538
+			"opened_connections", ti.tcp_connections_no,
539
+			"write_queued_bytes", ti.tcp_write_queued
539 540
 		);
540 541
 	}else{
541 542
 		rpc->fault(c, 500, "tcp support disabled");
... ...
@@ -561,8 +562,13 @@ static void core_tcp_options(rpc_t* rpc, void* c)
561 561
 	if (!tcp_disable){
562 562
 		tcp_options_get(&t);
563 563
 		rpc->add(c, "{", &handle);
564
-		rpc->struct_add(handle, "ddddddddd",
564
+		rpc->struct_add(handle, "ddddddddddddd",
565 565
 			"fd_cache",		t.fd_cache,
566
+			"tcp_buf_write",	t.tcp_buf_write,
567
+			"tcpconn_wq_max",	t.tcpconn_wq_max,
568
+			"tcp_wq_max",	t.tcp_wq_max,
569
+			"tcp_wq_timeout",	TICKS_TO_S(t.tcp_wq_timeout),
570
+			
566 571
 			"defer_accept",	t.defer_accept,
567 572
 			"delayed_ack",	t.delayed_ack,
568 573
 			"syncnt",		t.syncnt,
... ...
@@ -842,7 +842,6 @@ again_devpoll2:
842 842
 					h->poll_method);
843 843
 			goto error;
844 844
 	}
845
-	h->fd_no--;
846 845
 	return 0;
847 846
 error:
848 847
 	return -1;
... ...
@@ -34,6 +34,7 @@
34 34
  *  2007-07-26  improved tcp connection hash function; increased aliases
35 35
  *               hash size (andrei)
36 36
  *  2007-11-26  switched to local_timer (andrei)
37
+ *  2007-11-30  buffered write support (andrei)
37 38
  */
38 39
 
39 40
 
... ...
@@ -41,6 +42,8 @@
41 41
 #ifndef _tcp_conn_h
42 42
 #define _tcp_conn_h
43 43
 
44
+#include "tcp_options.h"
45
+
44 46
 #include "ip_addr.h"
45 47
 #include "locking.h"
46 48
 #include "atomic_ops.h"
... ...
@@ -67,6 +70,7 @@
67 67
 #define F_CONN_NON_BLOCKING 1
68 68
 #define F_CONN_REMOVED      2 /* no longer  in "main" listen fd list */
69 69
 #define F_CONN_READER       4 /* handled by a tcp reader */
70
+#define F_CONN_WRITE_W      8 /* watched for write (main) */
70 71
 
71 72
 
72 73
 enum tcp_req_errors {	TCP_REQ_INIT, TCP_REQ_OK, TCP_READ_ERROR,
... ...
@@ -86,7 +90,7 @@ enum tcp_conn_states { S_CONN_ERROR=-2, S_CONN_BAD=-1, S_CONN_OK=0,
86 86
 
87 87
 /* fd communication commands */
88 88
 enum conn_cmds { CONN_DESTROY=-3, CONN_ERROR=-2, CONN_EOF=-1, CONN_RELEASE, 
89
-					CONN_GET_FD, CONN_NEW };
89
+					CONN_GET_FD, CONN_NEW, CONN_QUEUED_WRITE };
90 90
 /* CONN_RELEASE, EOF, ERROR, DESTROY can be used by "reader" processes
91 91
  * CONN_GET_FD, NEW, ERROR only by writers */
92 92
 
... ...
@@ -121,6 +125,23 @@ struct tcp_conn_alias{
121 121
 };
122 122
 
123 123
 
124
#ifdef TCP_BUF_WRITE
/* one chunk of data queued for writing; chunks are chained through next */
struct tcp_wbuffer{
	struct tcp_wbuffer* next; /* following chunk, 0 for the last one */
	unsigned int b_size;      /* size of the data area starting at buf */
	char buf[1];              /* first byte of the data area (the chunk is
	                             allocated larger than the struct, see
	                             wbufq_add()) */
};

/* per connection queue of pending writes (list of tcp_wbuffer chunks) */
struct tcp_wbuffer_queue{
	struct tcp_wbuffer* first; /* oldest chunk, 0 if the queue is empty */
	struct tcp_wbuffer* last;  /* newest chunk, new data is appended here */
	unsigned int queued;       /* total size */
	unsigned int offset;       /* offset in the first wbuffer where data
	                              starts */
	unsigned int last_used;    /* how much of the last buffer is used */
};
#endif
140
+
124 141
 
125 142
 struct tcp_connection{
126 143
 	int s; /*socket, used by "tcp main" */
... ...
@@ -137,7 +158,7 @@ struct tcp_connection{
137 137
 	enum tcp_conn_states state; /* connection state */
138 138
 	void* extra_data; /* extra data associated to the connection, 0 for tcp*/
139 139
 	struct timer_ln timer;
140
-	unsigned int timeout;/* connection timeout, after this it will be removed*/
140
+	ticks_t timeout;/* connection timeout, after this it will be removed*/
141 141
 	unsigned id_hash; /* hash index in the id_hash */
142 142
 	struct tcp_connection* id_next; /* next, prev in id hash table */
143 143
 	struct tcp_connection* id_prev;
... ...
@@ -145,6 +166,10 @@ struct tcp_connection{
145 145
 	struct tcp_connection* c_prev;
146 146
 	struct tcp_conn_alias con_aliases[TCP_CON_MAX_ALIASES];
147 147
 	int aliases; /* aliases number, at least 1 */
148
+#ifdef TCP_BUF_WRITE
149
+	ticks_t last_write; /* time when the last write took place */
150
+	struct tcp_wbuffer_queue wbuf_q;
151
+#endif
148 152
 };
149 153
 
150 154
 
... ...
@@ -35,6 +35,8 @@ struct tcp_gen_info{
35 35
 	int tcp_readers;
36 36
 	int tcp_max_connections;
37 37
 	int tcp_connections_no; /* crt. number */
38
+	int tcp_write_queued; /* total bytes queued for write, 0 if no
39
+							 write queued support is enabled */
38 40
 };
39 41
 
40 42
 
... ...
@@ -87,6 +87,7 @@
87 87
  *  2007-11-27  added send fd cache and reader fd reuse (andrei)
88 88
  *  2007-11-28  added support for TCP_DEFER_ACCEPT, KEEPALIVE, KEEPINTVL,
89 89
  *               KEEPCNT, QUICKACK, SYNCNT, LINGER2 (andrei)
90
+ *  2007-12-04  support for queueing write requests (andrei)
90 91
  */
91 92
 
92 93
 
... ...
@@ -145,6 +146,7 @@
145 145
 
146 146
 #include "tcp_info.h"
147 147
 #include "tcp_options.h"
148
+#include "ut.h"
148 149
 
149 150
 #define local_malloc pkg_malloc
150 151
 #define local_free   pkg_free
... ...
@@ -177,6 +179,12 @@
177 177
 #define TCPCONN_TIMEOUT_MIN_RUN 1  /* once per tick */
178 178
 #define TCPCONN_WAIT_TIMEOUT 1 /* 1 tick */
179 179
 
180
+#ifdef TCP_BUF_WRITE
181
+#define TCP_WBUF_SIZE	1024 /* FIXME: after debugging switch to 16-32k */
182
+static unsigned int* tcp_total_wq=0;
183
+#endif
184
+
185
+
180 186
 enum fd_types { F_NONE, F_SOCKINFO /* a tcp_listen fd */,
181 187
 				F_TCPCONN, F_TCPCHILD, F_PROC };
182 188
 
... ...
@@ -542,6 +550,173 @@ end:
542 542
 
543 543
 
544 544
 
545
+inline static int _tcpconn_write_nb(int fd, struct tcp_connection* c,
546
+									char* buf, int len);
547
+
548
+
549
+#ifdef TCP_BUF_WRITE
550
+
551
+
552
+inline static int wbufq_add(struct  tcp_connection* c, char* data, 
553
+							unsigned int size)
554
+{
555
+	struct tcp_wbuffer_queue* q;
556
+	struct tcp_wbuffer* wb;
557
+	unsigned int last_free;
558
+	unsigned int wb_size;
559
+	unsigned int crt_size;
560
+	ticks_t t;
561
+	
562
+	q=&c->wbuf_q;
563
+	t=get_ticks_raw();
564
+	if (unlikely(	((q->queued+size)>tcp_options.tcpconn_wq_max) ||
565
+					((*tcp_total_wq+size)>tcp_options.tcp_wq_max) ||
566
+					(q->first &&
567
+					TICKS_GT(t, c->last_write+tcp_options.tcp_wq_timeout)) )){
568
+		LOG(L_ERR, "ERROR: wbufq_add(%d bytes): write queue full or timeout "
569
+					" (%d, total %d, last write %d s ago)\n",
570
+					size, q->queued, *tcp_total_wq,
571
+					TICKS_TO_S(t-c->last_write));
572
+		goto error;
573
+	}
574
+	
575
+	if (unlikely(q->last==0)){
576
+		wb_size=MAX_unsigned(TCP_WBUF_SIZE, size);
577
+		wb=shm_malloc(sizeof(*wb)+wb_size-1);
578
+		if (unlikely(wb==0))
579
+			goto error;
580
+		wb->b_size=wb_size;
581
+		wb->next=0;
582
+		q->last=wb;
583
+		q->first=wb;
584
+		q->last_used=0;
585
+		q->offset=0;
586
+		c->last_write=get_ticks_raw(); /* start with the crt. time */
587
+	}else{
588
+		wb=q->last;
589
+	}
590
+	
591
+	while(size){
592
+		last_free=wb->b_size-q->last_used;
593
+		if (last_free==0){
594
+			wb_size=MAX_unsigned(TCP_WBUF_SIZE, size);
595
+			wb=shm_malloc(sizeof(*wb)+wb_size-1);
596
+			if (unlikely(wb==0))
597
+				goto error;
598
+			wb->b_size=wb_size;
599
+			wb->next=0;
600
+			q->last->next=wb;
601
+			q->last=wb;
602
+			q->last_used=0;
603
+			last_free=wb->b_size;
604
+		}
605
+		crt_size=MIN_unsigned(last_free, size);
606
+		memcpy(wb->buf, data, crt_size);
607
+		q->last_used+=crt_size;
608
+		size-=crt_size;
609
+		data+=crt_size;
610
+		q->queued+=crt_size;
611
+		atomic_add_int((int*)tcp_total_wq, crt_size);
612
+	}
613
+	return 0;
614
+error:
615
+	return -1;
616
+}
617
+
618
+
619
+
620
+inline static void wbufq_destroy( struct  tcp_wbuffer_queue* q)
621
+{
622
+	struct tcp_wbuffer* wb;
623
+	struct tcp_wbuffer* next_wb;
624
+	int unqueued;
625
+	
626
+	unqueued=0;
627
+	if (likely(q->first)){
628
+		wb=q->first;
629
+		do{
630
+			next_wb=wb->next;
631
+			unqueued+=(wb==q->last)?q->last_used:wb->b_size;
632
+			if (wb==q->first)
633
+				unqueued-=q->offset;
634
+			shm_free(wb);
635
+			wb=next_wb;
636
+		}while(wb);
637
+	}
638
+	memset(q, 0, sizeof(*q));
639
+	atomic_add_int((int*)tcp_total_wq, -unqueued);
640
+}
641
+
642
+
643
+
644
+/* tries to empty the queue
645
+ * returns -1 on error, bytes written on success (>=0) 
646
+ * if the whole queue is emptied => sets *empty*/
647
+inline static int wbufq_run(int fd, struct tcp_connection* c, int* empty)
648
+{
649
+	struct tcp_wbuffer_queue* q;
650
+	struct tcp_wbuffer* wb;
651
+	int n;
652
+	int ret;
653
+	int block_size;
654
+	ticks_t t;
655
+	char* buf;
656
+	
657
+	*empty=0;
658
+	ret=0;
659
+	t=get_ticks_raw();
660
+	lock_get(&c->write_lock);
661
+	q=&c->wbuf_q;
662
+	while(q->first){
663
+		block_size=((q->first==q->last)?q->last_used:q->first->b_size)-
664
+						q->offset;
665
+		buf=q->first->buf+q->offset;
666
+		n=_tcpconn_write_nb(fd, c, buf, block_size);
667
+		if (likely(n>0)){
668
+			ret+=n;
669
+			if (likely(n==block_size)){
670
+				wb=q->first;
671
+				q->first=q->first->next; 
672
+				shm_free(wb);
673
+				q->offset=0;
674
+				q->queued-=block_size;
675
+				atomic_add_int((int*)tcp_total_wq, -block_size);
676
+			}else{
677
+				q->offset+=n;
678
+				q->queued-=n;
679
+				atomic_add_int((int*)tcp_total_wq, -n);
680
+				break;
681
+			}
682
+			c->last_write=t;
683
+			c->state=S_CONN_OK;
684
+		}else{
685
+			if (n<0){
686
+				/* EINTR is handled inside _tcpconn_write_nb */
687
+				if (!(errno==EAGAIN || errno==EWOULDBLOCK)){
688
+					ret=-1;
689
+					LOG(L_ERR, "ERROR: wbuf_runq: %s [%d]\n",
690
+						strerror(errno), errno);
691
+				}
692
+			}
693
+			break;
694
+		}
695
+	}
696
+	if (likely(q->first==0)){
697
+		q->last=0;
698
+		q->last_used=0;
699
+		q->offset=0;
700
+		*empty=1;
701
+	}
702
+	if (unlikely(c->state==S_CONN_CONNECT && (ret>0)))
703
+			c->state=S_CONN_OK;
704
+	lock_release(&c->write_lock);
705
+	return ret;
706
+}
707
+
708
+#endif /* TCP_BUF_WRITE */
709
+
710
+
711
+
545 712
 #if 0
546 713
 /* blocking write even on non-blocking sockets 
547 714
  * if TCP_TIMEOUT will return with error */
... ...
@@ -687,6 +862,10 @@ struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
687 687
 	socklen_t my_name_len;
688 688
 	struct tcp_connection* con;
689 689
 	struct ip_addr ip;
690
+	enum tcp_conn_states state;
691
+#ifdef TCP_BUF_WRITE
692
+	int n;
693
+#endif /* TCP_BUF_WRITE */
690 694
 
691 695
 	s=-1;
692 696
 	
... ...
@@ -710,11 +889,30 @@ struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
710 710
 	if (from && bind(s, &from->s, sockaddru_len(*from)) != 0)
711 711
 		LOG(L_WARN, "WARNING: tcpconn_connect: binding to source address"
712 712
 					" failed: %s [%d]\n", strerror(errno), errno);
713
-
714
-	if (tcp_blocking_connect(s, &server->s, sockaddru_len(*server))<0){
715
-		LOG(L_ERR, "ERROR: tcpconn_connect: tcp_blocking_connect failed\n");
716
-		goto error;
713
+#ifdef TCP_BUF_WRITE
714
+	if (likely(tcp_options.tcp_buf_write)){
715
+again:
716
+		n=connect(s, &server->s, sockaddru_len(*server));
717
+		if (unlikely(n==-1)){
718
+			if (errno==EINTR) goto again;
719
+			if (errno!=EINPROGRESS && errno!=EALREADY){
720
+				LOG(L_ERR, "ERROR: tcpconn_connect: connect: (%d) %s\n",
721
+						errno, strerror(errno));
722
+				goto error;
723
+			}
724
+			state=S_CONN_CONNECT;
725
+		}
726
+	}else{
727
+#endif /* TCP_BUF_WRITE */
728
+		if (tcp_blocking_connect(s, &server->s, sockaddru_len(*server))<0){
729
+			LOG(L_ERR, "ERROR: tcpconn_connect: tcp_blocking_connect"
730
+						" failed\n");
731
+			goto error;
732
+		}
733
+		state=S_CONN_OK;
734
+#ifdef TCP_BUF_WRITE
717 735
 	}
736
+#endif /* TCP_BUF_WRITE */
718 737
 	if (from){
719 738
 		su2ip_addr(&ip, from);
720 739
 		if (!ip_addr_any(&ip))
... ...
@@ -746,7 +944,7 @@ skip:
746 746
 		else si=sendipv6_tcp;
747 747
 #endif
748 748
 	}
749
-	con=tcpconn_new(s, server, from, si,  type, S_CONN_CONNECT);
749
+	con=tcpconn_new(s, server, from, si,  type, state);
750 750
 	if (con==0){
751 751
 		LOG(L_ERR, "ERROR: tcp_connect: tcpconn_new failed, closing the "
752 752
 				 " socket\n");
... ...
@@ -818,6 +1016,10 @@ static inline void _tcpconn_detach(struct tcp_connection *c)
818 818
 
819 819
 static inline void _tcpconn_free(struct tcp_connection* c)
820 820
 {
821
+#ifdef TCP_BUF_WRITE
822
+	if (unlikely(c->wbuf_q.first))
823
+		wbufq_destroy(&c->wbuf_q);
824
+#endif
821 825
 	lock_destroy(&c->write_lock);
822 826
 #ifdef USE_TLS
823 827
 	if (unlikely(c->type==PROTO_TLS)) tls_tcpconn_clean(c);
... ...
@@ -1134,6 +1336,9 @@ int tcp_send(struct dest_info* dst, union sockaddr_union* from,
1134 1134
 	long response[2];
1135 1135
 	int n;
1136 1136
 	int do_close_fd;
1137
+#ifdef TCP_BUF_WRITE
1138
+	int enable_write_watch;
1139
+#endif /* TCP_BUF_WRITE */
1137 1140
 #ifdef TCP_FD_CACHE
1138 1141
 	struct fd_cache_entry* fd_cache_e;
1139 1142
 	
... ...
@@ -1204,6 +1409,24 @@ no_id:
1204 1204
 			goto send_it;
1205 1205
 		}
1206 1206
 get_fd:
1207
+#ifdef TCP_BUF_WRITE
1208
+		/* if data is already queued, we don't need the fd any more */
1209
+		if (unlikely(tcp_options.tcp_buf_write && c->wbuf_q.first)){
1210
+			lock_get(&c->write_lock);
1211
+				if (likely(c->wbuf_q.first)){
1212
+					do_close_fd=0;
1213
+					if (unlikely(wbufq_add(c, buf, len)<0)){
1214
+						lock_release(&c->write_lock);
1215
+						n=-1;
1216
+						goto error;
1217
+					}
1218
+					n=len;
1219
+					lock_release(&c->write_lock);
1220
+					goto release_c;
1221
+				}
1222
+			lock_release(&c->write_lock);
1223
+		}
1224
+#endif /* TCP_BUF_WRITE */
1207 1225
 		/* check if this is not the same reader process holding
1208 1226
 		 *  c  and if so send directly on c->fd */
1209 1227
 		if (c->reader_pid==my_pid()){
... ...
@@ -1237,6 +1460,7 @@ get_fd:
1237 1237
 				LOG(L_ERR, "BUG: tcp_send: failed to get fd(receive_fd):"
1238 1238
 							" %s (%d)\n", strerror(errno), errno);
1239 1239
 				n=-1;
1240
+				do_close_fd=0;
1240 1241
 				goto release_c;
1241 1242
 			}
1242 1243
 			if (unlikely(c!=tmp)){
... ...
@@ -1256,6 +1480,21 @@ get_fd:
1256 1256
 send_it:
1257 1257
 	DBG("tcp_send: sending...\n");
1258 1258
 	lock_get(&c->write_lock);
1259
+#ifdef TCP_BUF_WRITE
1260
+	if (likely(tcp_options.tcp_buf_write)){
1261
+		if (c->wbuf_q.first){
1262
+			if (unlikely(wbufq_add(c, buf, len)<0)){
1263
+				lock_release(&c->write_lock);
1264
+				n=-1;
1265
+				goto error;
1266
+			}
1267
+			lock_release(&c->write_lock);
1268
+			n=len;
1269
+			goto end;
1270
+		}
1271
+		n=_tcpconn_write_nb(fd, c, buf, len);
1272
+	}else{
1273
+#endif /* TCP_BUF_WRITE */
1259 1274
 #ifdef USE_TLS
1260 1275
 	if (c->type==PROTO_TLS)
1261 1276
 		n=tls_blocking_write(c, fd, buf, len);
... ...
@@ -1263,10 +1502,39 @@ send_it:
1263 1263
 #endif
1264 1264
 		/* n=tcp_blocking_write(c, fd, buf, len); */
1265 1265
 		n=tsend_stream(fd, buf, len, tcp_send_timeout*1000); 
1266
+#ifdef TCP_BUF_WRITE
1267
+	}
1268
+#endif /* TCP_BUF_WRITE */
1266 1269
 	lock_release(&c->write_lock);
1267 1270
 	DBG("tcp_send: after write: c= %p n=%d fd=%d\n",c, n, fd);
1268 1271
 	DBG("tcp_send: buf=\n%.*s\n", (int)len, buf);
1269 1272
 	if (unlikely(n<0)){
1273
+#ifdef TCP_BUF_WRITE
1274
+		if (tcp_options.tcp_buf_write && 
1275
+				(errno==EAGAIN || errno==EWOULDBLOCK)){
1276
+			lock_get(&c->write_lock);
1277
+			enable_write_watch=(c->wbuf_q.first==0);
1278
+			if (unlikely(wbufq_add(c, buf, len)<0)){
1279
+				lock_release(&c->write_lock);
1280
+				n=-1;
1281
+				goto error;
1282
+			}
1283
+			lock_release(&c->write_lock);
1284
+			n=len;
1285
+			if (enable_write_watch){
1286
+				response[0]=(long)c;
1287
+				response[1]=CONN_QUEUED_WRITE;
1288
+				if (send_all(unix_tcp_sock, response, sizeof(response))<=0){
1289
+					LOG(L_ERR, "BUG: tcp_send: error return failed "
1290
+							"(write):%s (%d)\n", strerror(errno), errno);
1291
+					n=-1;
1292
+					goto error;
1293
+				}
1294
+			}
1295
+			goto end;
1296
+		}
1297
+error:
1298
+#endif /* TCP_BUF_WRITE */
1270 1299
 		LOG(L_ERR, "ERROR: tcp_send: failed to send\n");
1271 1300
 		/* error on the connection , mark it as bad and set 0 timeout */
1272 1301
 		c->state=S_CONN_BAD;
... ...
@@ -1294,6 +1562,13 @@ send_it:
1294 1294
 		if (do_close_fd) close(fd);
1295 1295
 		return n; /* error return, no tcpconn_put */
1296 1296
 	}
1297
+#ifdef TCP_BUF_WRITE
1298
+	if (likely(tcp_options.tcp_buf_write)){
1299
+		if (unlikely(c->state==S_CONN_CONNECT))
1300
+			c->state=S_CONN_OK;
1301
+		c->last_write=get_ticks_raw();
1302
+	}
1303
+#endif /* TCP_BUF_WRITE */
1297 1304
 end:
1298 1305
 #ifdef TCP_FD_CACHE
1299 1306
 	if (unlikely((fd_cache_e==0) && tcp_options.fd_cache)){
... ...
@@ -1465,23 +1740,40 @@ static void tcpconn_destroy(struct tcp_connection* tcpconn)
1465 1465
 	 *  (if the timer is already removed, nothing happens) */
1466 1466
 	if (likely(!(tcpconn->flags & F_CONN_READER)))
1467 1467
 		local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
1468
+#ifdef TCP_BUF_WRITE
1469
+	if (unlikely((tcpconn->flags & F_CONN_WRITE_W) ||
1470
+				!(tcpconn->flags & F_CONN_REMOVED))){
1471
+		LOG(L_CRIT, "tcpconn_destroy: possible BUG: flags = %0x\n",
1472
+					tcpconn->flags);
1473
+	}
1474
+	if (unlikely(tcpconn->wbuf_q.first)){
1475
+		lock_get(&tcpconn->write_lock);
1476
+			/* check again, while holding the lock */
1477
+			if (likely(tcpconn->wbuf_q.first))
1478
+				wbufq_destroy(&tcpconn->wbuf_q);
1479
+		lock_release(&tcpconn->write_lock);
1480
+	}
1481
+#endif /* TCP_BUF_WRITE */
1468 1482
 	TCPCONN_LOCK; /*avoid races w/ tcp_send*/
1469 1483
 	if (likely(atomic_dec_and_test(&tcpconn->refcnt))){ 
1470 1484
 		_tcpconn_detach(tcpconn);
1471 1485
 		TCPCONN_UNLOCK;
1472
-		DBG("tcpconn_destroy: destroying connection %p, flags %04x\n",
1473
-				tcpconn, tcpconn->flags);
1486
+		DBG("tcpconn_destroy: destroying connection %p (%d, %d) flags %04x\n",
1487
+				tcpconn, tcpconn->id, tcpconn->s, tcpconn->flags);
1474 1488
 		fd=tcpconn->s;
1475 1489
 #ifdef USE_TLS
1476 1490
 		/*FIXME: lock ->writelock ? */
1477 1491
 		if (tcpconn->type==PROTO_TLS)
1478 1492
 			tls_close(tcpconn, fd);
1479 1493
 #endif
1480
-		_tcpconn_free(tcpconn);
1494
+		_tcpconn_free(tcpconn); /* destroys also the wbuf_q if still present*/
1481 1495
 #ifdef TCP_FD_CACHE
1482 1496
 		if (likely(tcp_options.fd_cache)) shutdown(fd, SHUT_RDWR);
1483 1497
 #endif /* TCP_FD_CACHE */
1484
-		close(fd);
1498
+		if (unlikely(close(fd)<0)){
1499
+			LOG(L_ERR, "ERROR: tcpconn_destroy; close() failed: %s (%d)\n",
1500
+					strerror(errno), errno);
1501
+		}
1485 1502
 		(*tcp_connections_no)--;
1486 1503
 	}else{
1487 1504
 		TCPCONN_UNLOCK;
... ...
@@ -1627,6 +1919,13 @@ inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
1627 1627
 						   p->unix_sock, (long)(p-&q->data[0]), p->retries,
1628 1628
 						   p->tcp_conn, p->tcp_conn->s, errno,
1629 1629
 						   strerror(errno));
1630
+#ifdef TCP_BUF_WRITE
1631
+				if (p->tcp_conn->flags & F_CONN_WRITE_W){
1632
+					io_watch_del(&io_h, p->tcp_conn->s, -1, IO_FD_CLOSING);
1633
+					p->tcp_conn->flags &=~F_CONN_WRITE_W;
1634
+				}
1635
+#endif
1636
+				p->tcp_conn->flags &= ~F_CONN_READER;
1630 1637
 				tcpconn_destroy(p->tcp_conn);
1631 1638
 			}
1632 1639
 		}
... ...
@@ -1638,6 +1937,36 @@ inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
1638 1638
 #endif
1639 1639
 
1640 1640
 
1641
/* write on a tcp connection without taking any lock (unsafe version: it
 * should be called while holding c->write_lock). fd should be non-blocking.
 * A write interrupted by a signal (EINTR) is retried.
 *  returns number of bytes written on success, -1 on error (and sets errno)
 */
inline static int _tcpconn_write_nb(int fd, struct tcp_connection* c,
									char* buf, int len)
{
	int written;
	
	do{
#ifdef USE_TLS
		if (unlikely(c->type==PROTO_TLS))
			/* FIXME: tls_nonblocking_write !! */
			written=tls_blocking_write(c, fd, buf, len);
		else
#endif /* USE_TLS */
#ifdef HAVE_MSG_NOSIGNAL
			written=send(fd, buf, len, MSG_NOSIGNAL);
#else
			written=send(fd, buf, len, 0);
#endif /* HAVE_MSG_NOSIGNAL */
	}while(unlikely(written<0) && (errno==EINTR));
	return written;
}
1669
+
1670
+
1641 1671
 
1642 1672
 /* handles io from a tcp child process
1643 1673
  * params: tcp_c - pointer in the tcp_children array, to the entry for
... ...
@@ -1654,6 +1983,7 @@ inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
1654 1654
 	long response[2];
1655 1655
 	int cmd;
1656 1656
 	int bytes;
1657
+	int n;
1657 1658
 	ticks_t t;
1658 1659
 	
1659 1660
 	if (unlikely(tcp_c->unix_sock<=0)){
... ...
@@ -1715,6 +2045,12 @@ inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
1715 1715
 		case CONN_RELEASE:
1716 1716
 			tcp_c->busy--;
1717 1717
 			if (unlikely(tcpconn->state==S_CONN_BAD)){ 
1718
+#ifdef TCP_BUF_WRITE
1719
+				if (unlikely(tcpconn->flags & F_CONN_WRITE_W)){
1720
+					io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
1721
+					tcpconn->flags &= ~F_CONN_WRITE_W;
1722
+				}
1723
+#endif /* TCP_BUF_WRITE */
1718 1724
 				tcpconn_destroy(tcpconn);
1719 1725
 				break;
1720 1726
 			}
... ...
@@ -1729,12 +2065,22 @@ inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
1729 1729
 								tcp_con_lifetime, t);
1730 1730
 			/* must be after the de-ref*/
1731 1731
 			tcpconn->flags&=~(F_CONN_REMOVED|F_CONN_READER);
1732
-			if (unlikely(
1733
-					io_watch_add(&io_h, tcpconn->s, POLLIN,
1734
-												F_TCPCONN, tcpconn)<0)){
1732
+#ifdef TCP_BUF_WRITE
1733
+			if (unlikely(tcpconn->flags & F_CONN_WRITE_W))
1734
+				n=io_watch_chg(&io_h, tcpconn->s, POLLIN| POLLOUT, -1);
1735
+			else
1736
+#endif /* TCP_BUF_WRITE */
1737
+				n=io_watch_add(&io_h, tcpconn->s, POLLIN, F_TCPCONN, tcpconn);
1738
+			if (unlikely(n<0)){
1735 1739
 				LOG(L_CRIT, "ERROR: tcp_main: handle_tcp_child: failed to add"
1736 1740
 						" new socket to the fd list\n");
1737 1741
 				tcpconn->flags|=F_CONN_REMOVED;
1742
+#ifdef TCP_BUF_WRITE
1743
+				if (unlikely(tcpconn->flags & F_CONN_WRITE_W)){
1744
+					io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
1745
+					tcpconn->flags&=~F_CONN_WRITE_W;
1746
+				}
1747
+#endif /* TCP_BUF_WRITE */
1738 1748
 				tcpconn_destroy(tcpconn); /* closes also the fd */
1739 1749
 			}
1740 1750
 			DBG("handle_tcp_child: CONN_RELEASE  %p refcnt= %d\n", 
... ...
@@ -1749,6 +2095,12 @@ inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
1749 1749
 				 if (tcpconn->s!=-1)
1750 1750
 					io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
1751 1751
 				*/
1752
+#ifdef TCP_BUF_WRITE
1753
+				if ((tcpconn->flags & F_CONN_WRITE_W) && (tcpconn->s!=-1)){
1754
+					io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
1755
+					tcpconn->flags&=~F_CONN_WRITE_W;
1756
+				}
1757
+#endif /* TCP_BUF_WRITE */
1752 1758
 				tcpconn_destroy(tcpconn); /* closes also the fd */
1753 1759
 				break;
1754 1760
 		default:
... ...
@@ -1785,6 +2137,7 @@ inline static int handle_ser_child(struct process_table* p, int fd_i)
1785 1785
 	int bytes;
1786 1786
 	int ret;
1787 1787
 	int fd;
1788
+	int flags;
1788 1789
 	ticks_t t;
1789 1790
 	
1790 1791
 	ret=-1;
... ...
@@ -1844,10 +2197,15 @@ inline static int handle_ser_child(struct process_table* p, int fd_i)
1844 1844
 	}
1845 1845
 	switch(cmd){
1846 1846
 		case CONN_ERROR:
1847
-			if (!(tcpconn->flags & F_CONN_REMOVED) && (tcpconn->s!=-1)){
1847
+			if ( (!(tcpconn->flags & F_CONN_REMOVED) ||
1848
+					(tcpconn->flags & F_CONN_WRITE_W) ) && (tcpconn->s!=-1)){
1848 1849
 				io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
1849 1850
 				tcpconn->flags|=F_CONN_REMOVED;
1851
+				tcpconn->flags&=~F_CONN_WRITE_W;
1850 1852
 			}
1853
+			LOG(L_ERR, "handle_ser_child: ERROR: received CON_ERROR for %p"
1854
+					" (id %d), refcnt %d\n", 
1855
+					tcpconn, tcpconn->id, atomic_get(&tcpconn->refcnt));
1851 1856
 			tcpconn_destroy(tcpconn); /* will close also the fd */
1852 1857
 			break;
1853 1858
 		case CONN_GET_FD:
... ...
@@ -1879,15 +2237,53 @@ inline static int handle_ser_child(struct process_table* p, int fd_i)
1879 1879
 			local_timer_add(&tcp_main_ltimer, &tcpconn->timer, 
1880 1880
 								tcp_con_lifetime, t);
1881 1881
 			tcpconn->flags&=~F_CONN_REMOVED;
1882
+			flags=POLLIN 
1883
+#ifdef TCP_BUF_WRITE
1884
+					/* not used for now, the connection is sent to tcp_main
1885
+					 * before knowing if we can write on it or we should 
1886
+					 * wait */
1887
+					| (((int)!(tcpconn->flags & F_CONN_WRITE_W)-1) & POLLOUT)
1888
+#endif /* TCP_BUF_WRITE */
1889
+					;
1882 1890
 			if (unlikely(
1883
-					io_watch_add(&io_h, tcpconn->s, POLLIN,
1891
+					io_watch_add(&io_h, tcpconn->s, flags,
1884 1892
 												F_TCPCONN, tcpconn)<0)){
1885 1893
 				LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed to add"
1886 1894
 						" new socket to the fd list\n");
1887 1895
 				tcpconn->flags|=F_CONN_REMOVED;
1896
+				tcpconn->flags&=~F_CONN_WRITE_W;
1888 1897
 				tcpconn_destroy(tcpconn); /* closes also the fd */
1889 1898
 			}
1890 1899
 			break;
1900
+#ifdef TCP_BUF_WRITE
1901
+		case CONN_QUEUED_WRITE:
1902
+			if (!(tcpconn->flags & F_CONN_WRITE_W)){
1903
+				if (tcpconn->flags& F_CONN_REMOVED){
1904
+					if (unlikely(io_watch_add(&io_h, tcpconn->s, POLLOUT,
1905
+												F_TCPCONN, tcpconn)<0)){
1906
+						LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed"
1907
+								    " to enable write watch on socket\n");
1908
+						tcpconn_destroy(tcpconn);
1909
+						break;
1910
+					}
1911
+				}else{
1912
+					if (unlikely(io_watch_chg(&io_h, tcpconn->s,
1913
+												POLLIN|POLLOUT, -1)<0)){
1914
+						LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed"
1915
+								    " to change socket watch events\n");
1916
+						io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
1917
+						tcpconn->flags|=F_CONN_REMOVED;
1918
+						tcpconn_destroy(tcpconn);
1919
+						break;
1920
+					}
1921
+				}
1922
+				tcpconn->flags|=F_CONN_WRITE_W;
1923
+			}else{
1924
+				LOG(L_WARN, "tcp_main: hanlder_ser_child: connection %p"
1925
+							" already watched for write\n", tcpconn);
1926
+			}
1927
+			break;
1928
+#endif /* TCP_BUF_WRITE */
1891 1929
 		default:
1892 1930
 			LOG(L_CRIT, "BUG: handle_ser_child: unknown cmd %d\n", cmd);
1893 1931
 	}
... ...
@@ -2056,6 +2452,7 @@ static inline int handle_new_connect(struct socket_info* si)
2056 2056
 		if(unlikely(send2child(tcpconn)<0)){
2057 2057
 			LOG(L_ERR,"ERROR: handle_new_connect: no children "
2058 2058
 					"available\n");
2059
+			tcpconn->flags&=~F_CONN_READER;
2059 2060
 			tcpconn_destroy(tcpconn);
2060 2061
 		}
2061 2062
 #endif
... ...
@@ -2075,13 +2472,17 @@ static inline int handle_new_connect(struct socket_info* si)
2075 2075
  * params: tcpconn - pointer to the tcp_connection for which we have an io ev.
2076 2076
  *         fd_i    - index in the fd_array table (needed for delete)
2077 2077
  * returns:  handle_* return convention, but on success it always returns 0
2078
- *           (because it's one-shot, after a succesfull execution the fd is
2078
+ *           (because it's one-shot, after a successful execution the fd is
2079 2079
  *            removed from tcp_main's watch fd list and passed to a child =>
2080 2080
  *            tcp_main is not interested in further io events that might be
2081 2081
  *            queued for this fd)
2082 2082
  */
2083
-inline static int handle_tcpconn_ev(struct tcp_connection* tcpconn, int fd_i)
2083
+inline static int handle_tcpconn_ev(struct tcp_connection* tcpconn, short ev, 
2084
+										int fd_i)
2084 2085
 {
2086
+#ifdef TCP_BUF_WRITE
2087
+	int empty_q;
2088
+#endif /* TCP_BUF_WRITE */
2085 2089
 	/*  is refcnt!=0 really necessary? 
2086 2090
 	 *  No, in fact it's a bug: I can have the following situation: a send only
2087 2091
 	 *   tcp connection used by n processes simultaneously => refcnt = n. In 
... ...
@@ -2101,17 +2502,55 @@ inline static int handle_tcpconn_ev(struct tcp_connection* tcpconn, int fd_i)
2101 2101
 #endif
2102 2102
 	/* pass it to child, so remove it from the io watch list  and the local
2103 2103
 	 *  timer */
2104
-	DBG("handle_tcpconn_ev: data available on %p %d\n", tcpconn, tcpconn->s);
2105
-	if (unlikely(io_watch_del(&io_h, tcpconn->s, fd_i, 0)==-1)) goto error;
2106
-	tcpconn->flags|=F_CONN_REMOVED|F_CONN_READER;
2107
-	local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2108
-	tcpconn_ref(tcpconn); /* refcnt ++ */
2109
-	if (unlikely(send2child(tcpconn)<0)){
2110
-		LOG(L_ERR,"ERROR: handle_tcpconn_ev: no children available\n");
2111
-		tcpconn_destroy(tcpconn);
2104
+	DBG("handle_tcpconn_ev: ev (%0x) on %p %d\n", ev, tcpconn, tcpconn->s);
2105
+#ifdef TCP_BUF_WRITE
2106
+	if (unlikely((ev & POLLOUT) && (tcpconn->flags & F_CONN_WRITE_W))){
2107
+		if (unlikely(wbufq_run(tcpconn->s, tcpconn, &empty_q)<0)){
2108
+			io_watch_del(&io_h, tcpconn->s, fd_i, 0);
2109
+			tcpconn->flags|=F_CONN_REMOVED;
2110
+			tcpconn->flags&=~F_CONN_WRITE_W;
2111
+			tcpconn_destroy(tcpconn);
2112
+			goto error;
2113
+		}
2114
+		if (empty_q){
2115
+			if (tcpconn->flags & F_CONN_REMOVED){
2116
+				if (unlikely(io_watch_del(&io_h, tcpconn->s, fd_i, 0)==-1))
2117
+					goto error;
2118
+			}else{
2119
+				if (unlikely(io_watch_chg(&io_h, tcpconn->s,
2120
+											POLLIN, fd_i)==-1))
2121
+					goto error;
2122
+			}
2123
+		}
2124
+	}
2125
+	if (likely((ev & POLLIN) && !(tcpconn->flags & F_CONN_REMOVED))){
2126
+		if (unlikely(tcpconn->flags & F_CONN_WRITE_W)){
2127
+			if (unlikely(io_watch_chg(&io_h, tcpconn->s, POLLOUT, fd_i)==-1))
2128
+				goto error;
2129
+		}else
2130
+#else
2131
+	{
2132
+#endif /* TCP_BUF_WRITE */
2133
+			if (unlikely(io_watch_del(&io_h, tcpconn->s, fd_i, 0)==-1))
2134
+				goto error;
2135
+		tcpconn->flags|=F_CONN_REMOVED|F_CONN_READER;
2136
+		local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2137
+		tcpconn_ref(tcpconn); /* refcnt ++ */
2138
+		if (unlikely(send2child(tcpconn)<0)){
2139
+			LOG(L_ERR,"ERROR: handle_tcpconn_ev: no children available\n");
2140
+			tcpconn->flags&=~F_CONN_READER;
2141
+#ifdef TCP_BUF_WRITE
2142
+			if (tcpconn->flags & F_CONN_WRITE_W){
2143
+				io_watch_del(&io_h, tcpconn->s, fd_i, 0);
2144
+				tcpconn->flags&=~F_CONN_WRITE_W;
2145
+			}
2146
+#endif /* TCP_BUF_WRITE */
2147
+			tcpconn_destroy(tcpconn);
2148
+		}
2112 2149
 	}
2113 2150
 	return 0; /* we are not interested in possibly queued io events, 
2114
-				 the fd was either passed to a child, or closed */
2151
+				 the fd was either passed to a child, closed, or for writes,
2152
+				 everything possible was already written */
2115 2153
 error:
2116 2154
 	return -1;
2117 2155
 }
... ...
@@ -2131,7 +2570,7 @@ error:
2131 2131
 *         >0 on successful read from the fd (when there might be more io
2132 2132
  *            queued -- the receive buffer might still be non-empty)
2133 2133
  */
2134
-inline static int handle_io(struct fd_map* fm, short events, int idx)
2134
+inline static int handle_io(struct fd_map* fm, short ev, int idx)
2135 2135
 {	
2136 2136
 	int ret;
2137 2137
 	
... ...
@@ -2140,7 +2579,7 @@ inline static int handle_io(struct fd_map* fm, short events, int idx)
2140 2140
 			ret=handle_new_connect((struct socket_info*)fm->data);
2141 2141
 			break;
2142 2142
 		case F_TCPCONN:
2143
-			ret=handle_tcpconn_ev((struct tcp_connection*)fm->data, idx);
2143
+			ret=handle_tcpconn_ev((struct tcp_connection*)fm->data, ev, idx);
2144 2144
 			break;
2145 2145
 		case F_TCPCHILD:
2146 2146
 			ret=handle_tcp_child((struct tcp_child*)fm->data, idx);
... ...
@@ -2185,9 +2624,16 @@ static ticks_t tcpconn_main_timeout(ticks_t t, struct timer_ln* tl, void* data)
2185 2185
 				TCPCONN_UNLOCK; /* unlock as soon as possible */
2186 2186
 				fd=c->s;
2187 2187
 				if (likely(fd>0)){
2188
-					if (likely(!(c->flags & F_CONN_REMOVED))){
2188
+					if (likely(!(c->flags & F_CONN_REMOVED)
2189
+#ifdef TCP_BUF_WRITE
2190
+								|| (c->flags & F_CONN_WRITE_W)
2191
+#endif /* TCP_BUF_WRITE */
2192
+								)){
2189 2193
 						io_watch_del(&io_h, fd, -1, IO_FD_CLOSING);
2190 2194
 						c->flags|=F_CONN_REMOVED;
2195
+#ifdef TCP_BUF_WRITE
2196
+						c->flags&=~F_CONN_WRITE_W;
2197
+#endif /* TCP_BUF_WRITE */
2191 2198
 					}
2192 2199
 #ifdef USE_TLS
2193 2200
 					if (unlikely(c->type==PROTO_TLS ))
... ...
@@ -2250,9 +2696,16 @@ static inline void tcpconn_destroy_all()
2250 2250
 						local_timer_del(&tcp_main_ltimer, &c->timer);
2251 2251
 					/* else still in some reader */
2252 2252
 					fd=c->s;
2253
-					if (fd>0 && !(c->flags & F_CONN_REMOVED)){
2253
+					if (fd>0 && (!(c->flags & F_CONN_REMOVED)
2254
+#ifdef TCP_BUF_WRITE
2255
+								|| (c->flags & F_CONN_WRITE_W)
2256
+#endif /* TCP_BUF_WRITE */
2257
+								)){
2254 2258
 						io_watch_del(&io_h, fd, -1, IO_FD_CLOSING);
2255 2259
 						c->flags|=F_CONN_REMOVED;
2260
+#ifdef TCP_BUF_WRITE
2261
+						c->flags&=~F_CONN_WRITE_W;
2262
+#endif /* TCP_BUF_WRITE */
2256 2263
 					}
2257 2264
 				}else{
2258 2265
 					fd=-1;
... ...
@@ -2456,6 +2909,12 @@ void destroy_tcp()
2456 2456
 			shm_free(tcp_connections_no);
2457 2457
 			tcp_connections_no=0;
2458 2458
 		}
2459
+#ifdef TCP_BUF_WRITE
2460
+		if (tcp_total_wq){
2461
+			shm_free(tcp_total_wq);
2462
+			tcp_total_wq=0;
2463
+		}
2464
+#endif /* TCP_BUF_WRITE */
2459 2465
 		if (connection_id){
2460 2466
 			shm_free(connection_id);
2461 2467
 			connection_id=0;
... ...
@@ -2508,6 +2967,13 @@ int init_tcp()
2508 2508
 		goto error;
2509 2509
 	}
2510 2510
 	*connection_id=1;
2511
+#ifdef TCP_BUF_WRITE
2512
+	tcp_total_wq=shm_malloc(sizeof(*tcp_total_wq));
2513
+	if (tcp_total_wq==0){
2514
+		LOG(L_CRIT, "ERROR: init_tcp: could not alloc globals\n");
2515
+		goto error;
2516
+	}
2517
+#endif /* TCP_BUF_WRITE */
2511 2518
 	/* alloc hashtables*/
2512 2519
 	tcpconn_aliases_hash=(struct tcp_conn_alias**)
2513 2520
 			shm_malloc(TCP_ALIAS_HASH_SIZE* sizeof(struct tcp_conn_alias*));
... ...
@@ -2675,6 +3141,11 @@ void tcp_get_info(struct tcp_gen_info *ti)
2675 2675
 	ti->tcp_readers=tcp_children_no;
2676 2676
 	ti->tcp_max_connections=tcp_max_connections;
2677 2677
 	ti->tcp_connections_no=*tcp_connections_no;
2678
+#ifdef TCP_BUF_WRITE
2679
+	ti->tcp_write_queued=*tcp_total_wq;
2680
+#else
2681
+	ti->tcp_write_queued=0;
2682
+#endif /* TCP_BUF_WRITE */
2678 2683
 }
2679 2684
 
2680 2685
 #endif
... ...
@@ -25,6 +25,8 @@
25 25
 
26 26
 #include "tcp_options.h"
27 27
 #include "dprint.h"
28
+#include "globals.h"
29
+#include "timer_ticks.h"
28 30
 
29 31
 
30 32
 struct tcp_cfg_options tcp_options;
... ...
@@ -33,7 +35,12 @@ struct tcp_cfg_options tcp_options;
33 33
 /* set defaults */
34 34
 void init_tcp_options()
35 35
 {
36
-
36
+#ifdef TCP_BUF_WRITE
37
+	tcp_options.tcp_buf_write=0;
38
+	tcp_options.tcpconn_wq_max=32*1024; /* 32 k */
39
+	tcp_options.tcp_wq_max=10*1024*1024; /* 10 MB */
40
+	tcp_options.tcp_wq_timeout=S_TO_TICKS(tcp_send_timeout);
41
+#endif
37 42
 #ifdef TCP_FD_CACHE
38 43
 	tcp_options.fd_cache=1;
39 44
 #endif
... ...
@@ -54,7 +61,7 @@ void init_tcp_options()
54 54
 
55 55
 #define W_OPT_NC(option) \
56 56
 	if (tcp_options.option){\
57
-		WARN("tcp_options: tcp_" ##option \
57
+		WARN("tcp_options: tcp_" #option \
58 58
 				"cannot be enabled (recompile needed)\n"); \
59 59
 		tcp_options.option=0; \
60 60
 	}
... ...
@@ -63,7 +70,7 @@ void init_tcp_options()
63 63
 
64 64
 #define W_OPT_NS(option) \
65 65
 	if (tcp_options.option){\
66
-		WARN("tcp_options: tcp_" ##option \
66
+		WARN("tcp_options: tcp_" #option \
67 67
 				"cannot be enabled (no OS support)\n"); \
68 68
 		tcp_options.option=0; \
69 69
 	}
... ...
@@ -76,6 +83,13 @@ void tcp_options_check()
76 76
 	W_OPT_NC(defer_accept);
77 77
 #endif
78 78
 
79
+#ifndef TCP_BUF_WRITE
80
+	W_OPT_NC(tcp_buf_write);
81
+	W_OPT_NC(tcpconn_wq_max);
82
+	W_OPT_NC(tcp_wq_max);
83
+	W_OPT_NC(tcp_wq_timeout);
84
+#endif /* TCP_BUF_WRITE */
85
+
79 86
 #if ! defined HAVE_TCP_DEFER_ACCEPT && ! defined HAVE_TCP_ACCEPT_FILTER
80 87
 	W_OPT_NS(defer_accept);
81 88
 #endif
... ...
@@ -26,6 +26,11 @@
26 26
 #ifndef tcp_options_h
27 27
 #define tcp_options_h
28 28
 
29
+
30
+#ifndef NO_TCP_BUF_WRITE
31
+#define TCP_BUF_WRITE /* enable buffered writing */
32
+#endif 
33
+
29 34
 #ifndef NO_TCP_FD_CACHE
30 35
 #define TCP_FD_CACHE /* enable fd caching */
31 36
 #endif
... ...
@@ -95,6 +100,12 @@
95 95
 struct tcp_cfg_options{
96 96
 	/* ser tcp options */
97 97
 	int fd_cache; /* on /off */
98
+	/* tcp buf. write options */
99
+	int tcp_buf_write; /* on / off */
100
+	unsigned int tcpconn_wq_max; /* maximum queue len per connection */
101
+	unsigned int tcp_wq_max; /* maximum overall queued bytes */
102
+	unsigned int tcp_wq_timeout;      /* timeout for queue writes */
103
+
98 104
 	/* tcp socket options */
99 105
 	int defer_accept; /* on / off */
100 106
 	int delayed_ack; /* delay ack on connect */