Browse code

On popular demand: - added timeout on children shutdown and final cleanup: if it takes more than 60s => something is definitely wrong => kill all (if still waiting for children to finish) or abort (if the children are dead and we are stuck in cleanup) - force a shm_unlock before cleaning-up, in case we have a crashed child which still holds the lock (one more chance for a clean shutdown)

Andrei Pelinescu-Onciul authored on 11/09/2004 18:51:43
Showing 2 changed files
... ...
@@ -48,7 +48,7 @@ MAIN_NAME=ser
48 48
 VERSION = 0
49 49
 PATCHLEVEL = 8
50 50
 SUBLEVEL =   99
51
-EXTRAVERSION = -dev3
51
+EXTRAVERSION = -dev4
52 52
 
53 53
 RELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
54 54
 OS = $(shell uname -s | sed -e s/SunOS/solaris/ | tr "[A-Z]" "[a-z]")
... ...
@@ -49,7 +49,11 @@
49 49
  *              added support for increasing the open files limit    (andrei)
50 50
  *  2004-04-28  sock_{user,group,uid,gid,mode} added
51 51
  *              user2uid() & user2gid() added  (andrei)
52
- *
52
+ *  2004-09-11  added timeout on children shutdown and final cleanup
53
+ *               (if it takes more than 60s => something is definitely wrong
54
+ *                => kill all or abort)  (andrei)
55
+ *              force a shm_unlock before cleaning-up, in case we have a
56
+ *               crashed child which still holds the lock  (andrei)
53 57
  */
54 58
 
55 59
 
... ...
@@ -419,6 +423,9 @@ char* pgid_file = 0;
419 423
 void cleanup(show_status)
420 424
 {
421 425
 	/*clean-up*/
426
+	shm_unlock(); /* hack: force-unlock the shared memory lock in case
427
+					 some process crashed and left it locked; this will 
428
+					 allow an almost graceful shutdown */
422 429
 	destroy_modules();
423 430
 #ifdef USE_TCP
424 431
 	destroy_tcp();
... ...
@@ -451,7 +458,6 @@ void cleanup(show_status)
451 458
 }
452 459
 
453 460
 
454
-
455 461
 /* tries to send a signal to all our processes
456 462
  * if daemonized  is ok to send the signal to all the process group,
457 463
  * however if not daemonized we might end up sending the signal also
... ...
@@ -475,6 +481,29 @@ static void kill_all_children(int signum)
475 481
 
476 482
 
477 483
 
484
+/* if this handler is called, a critical timeout has occurred while
485
+ * waiting for the children to finish => we should kill everything and exit */
486
+static void sig_alarm_kill(int signo)
487
+{
488
+	kill_all_children(SIGKILL); /* this will kill the whole group
489
+								  including "this" process;
490
+								  for debugging replace with SIGABRT
491
+								  (but warning: it might generate lots
492
+								   of cores) */
493
+}
494
+
495
+
496
+/* like sig_alarm_kill, but the timeout has occurred when cleaning up
497
+ * => try to leave a core for future diagnostics */
498
+static void sig_alarm_abort(int signo)
499
+{
500
+	/* LOG is not signal safe, but who cares, we are abort-ing anyway :-) */
501
+	LOG(L_CRIT, "BUG: shutdown timeout triggered, dying...");
502
+	abort();
503
+}
504
+
505
+
506
+
478 507
 void handle_sigs()
479 508
 {
480 509
 	pid_t	chld;
... ...
@@ -548,8 +577,17 @@ void handle_sigs()
548 577
 #endif
549 578
 			/* exit */
550 579
 			kill_all_children(SIGTERM);
580
+			if (signal(SIGALRM, sig_alarm_kill) == SIG_ERR ) {
581
+				LOG(L_ERR, "ERROR: could not install SIGALARM handler\n");
582
+				/* continue, the process will die anyway if no
583
+				 * alarm is installed which is exactly what we want */
584
+			}
585
+			alarm(60); /* 1 minute close timeout */
551 586
 			while(wait(0) > 0); /* wait for all the children to terminate*/
587
+			signal(SIGALRM, sig_alarm_abort);
552 588
 			cleanup(1); /* cleanup & show status*/
589
+			alarm(0);
590
+			signal(SIGALRM, SIG_IGN);
553 591
 			DBG("terminating due to SIGCHLD\n");
554 592
 			exit(0);
555 593
 			break;