Browse code

By popular demand: - added a timeout on children shutdown and final cleanup: if it takes more than 60s, something is definitely wrong, so kill all (if still waiting for the children to finish) or abort (if the children are dead and we are stuck in cleanup) - force a shm_unlock before cleaning up, in case a crashed child still holds the lock (one more chance for a clean shutdown)

Andrei Pelinescu-Onciul authored on 11/09/2004 18:51:43
Showing 2 changed files
... ...
@@ -48,7 +48,7 @@ MAIN_NAME=ser
48 48
 VERSION = 0
49 49
 PATCHLEVEL = 8
50 50
 SUBLEVEL =   99
51
-EXTRAVERSION = -dev3
51
+EXTRAVERSION = -dev4
52 52
 
53 53
 RELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
54 54
 OS = $(shell uname -s | sed -e s/SunOS/solaris/ | tr "[A-Z]" "[a-z]")
... ...
@@ -49,7 +49,11 @@
49 49
  *              added support for increasing the open files limit    (andrei)
50 50
  *  2004-04-28  sock_{user,group,uid,gid,mode} added
51 51
  *              user2uid() & user2gid() added  (andrei)
52
- *
52
+ *  2004-09-11  added timeout on children shutdown and final cleanup
53
+ *               (if it takes more than 60s => something is definitely wrong
54
+ *                => kill all or abort)  (andrei)
55
+ *              force a shm_unlock before cleaning-up, in case we have a
56
+ *               crashed child which still holds the lock  (andrei)
53 57
  */
54 58
 
55 59
 
... ...
@@ -419,6 +423,9 @@ char* pgid_file = 0;
419 419
 void cleanup(show_status)
420 420
 {
421 421
 	/*clean-up*/
422
+	shm_unlock(); /* hack: force-unlock the shared memory lock in case
423
+					 some process crashed and left it locked; this will 
424
+					 allow an almost graceful shutdown */
422 425
 	destroy_modules();
423 426
 #ifdef USE_TCP
424 427
 	destroy_tcp();
... ...
@@ -451,7 +458,6 @@ void cleanup(show_status)
451 451
 }
452 452
 
453 453
 
454
-
455 454
 /* tries to send a signal to all our processes
456 455
  * if daemonized  is ok to send the signal to all the process group,
457 456
  * however if not daemonized we might end up sending the signal also
... ...
@@ -475,6 +481,29 @@ static void kill_all_children(int signum)
475 475
 
476 476
 
477 477
 
478
+/* if this handler is called, a critical timeout has occurred while
479
+ * waiting for the children to finish => we should kill everything and exit */
480
+static void sig_alarm_kill(int signo)
481
+{
482
+	kill_all_children(SIGKILL); /* this will kill the whole group
483
+								  including "this" process;
484
+								  for debugging replace with SIGABRT
485
+								  (but warning: it might generate lots
486
+								   of cores) */
487
+}
488
+
489
+
490
+/* like sig_alarm_kill, but the timeout has occurred when cleaning up
491
+ * => try to leave a core for future diagnostics */
492
+static void sig_alarm_abort(int signo)
493
+{
494
+	/* LOG is not signal safe, but who cares, we are abort-ing anyway :-) */
495
+	LOG(L_CRIT, "BUG: shutdown timeout triggered, dying...");
496
+	abort();
497
+}
498
+
499
+
500
+
478 501
 void handle_sigs()
479 502
 {
480 503
 	pid_t	chld;
... ...
@@ -548,8 +577,17 @@ void handle_sigs()
548 548
 #endif
549 549
 			/* exit */
550 550
 			kill_all_children(SIGTERM);
551
+			if (signal(SIGALRM, sig_alarm_kill) == SIG_ERR ) {
552
+				LOG(L_ERR, "ERROR: could not install SIGALARM handler\n");
553
+				/* continue, the process will die anyway if no
554
+				 * alarm is installed which is exactly what we want */
555
+			}
556
+			alarm(60); /* 1 minute close timeout */
551 557
 			while(wait(0) > 0); /* wait for all the children to terminate*/
558
+			signal(SIGALRM, sig_alarm_abort);
552 559
 			cleanup(1); /* cleanup & show status*/
560
+			alarm(0);
561
+			signal(SIGALRM, SIG_IGN);
553 562
 			DBG("terminating due to SIGCHLD\n");
554 563
 			exit(0);
555 564
 			break;