Browse code

p_usrloc: reworking mdb_availability_control

- simplify general implementation
- avoid deadlock due to a process trying to acquire the same lock twice

(cherry picked from commit 5789c506d022dd5713072cd1fcc07a25f42e098c)

Lucian Balaceanu authored on 14/07/2020 08:48:45
Showing 5 changed files
... ...
@@ -185,7 +185,7 @@ str default_db_type   = str_init(DEFAULT_DB_TYPE);
185 185
 str domain_db         = str_init(DEFAULT_DOMAIN_DB);
186 186
 int default_dbt       = 0;
187 187
 int expire            = 0;
188
-db_shared_param_t *write_on_master_db_shared;
188
+int *mdb_w_available;
189 189
 
190 190
 /*! \brief
191 191
  * Exported functions
... ...
@@ -311,13 +311,6 @@ static int mod_init(void)
311 311
 	}
312 312
 #endif
313 313
 
314
-	if((write_on_master_db_shared = shm_malloc(sizeof(db_shared_param_t))) == NULL) {
315
-		LM_ERR("couldn't allocate shared memory.\n");
316
-		return -1;
317
-	} else {
318
-		write_on_master_db_shared->val = db_master_write;
319
-	}
320
-
321 314
 	if(ul_hash_size<=1)
322 315
 		ul_hash_size = 512;
323 316
 	else
... ...
@@ -406,14 +399,18 @@ static int mod_init(void)
406 399
 		LM_ERR("could not init database watch environment.\n");
407 400
 		return -1;
408 401
 	}
409
-	if (lock_init(&write_on_master_db_shared->lock)==0){
410
-		LM_ERR("could not initialise lock\n");
402
+
403
+	if((mdb_w_available = shm_malloc(sizeof(int))) == NULL) {
404
+		LM_ERR("couldn't allocate shared memory. \n");
405
+		return -1;
411 406
 	}
412
-	if(write_on_master_db_shared->val){
407
+	if (db_master_write) {
413 408
 		/* register extra dummy timer to be created in init_db_check() */
414 409
 		register_dummy_timers(1);
410
+		if (mdb_availability_control) {
411
+			check_master_db();
412
+		}
415 413
 	}
416
-        check_master_db(db_master_write);
417 414
 	return 0;
418 415
 }
419 416
 
... ...
@@ -422,7 +419,7 @@ static int child_init(int _rank)
422 419
 {
423 420
 	if(_rank==PROC_INIT) {
424 421
 		if(init_db_check() < 0){
425
-				LM_ERR("could not initialise database check.\n");
422
+			LM_ERR("could not initialise database check.\n");
426 423
 			return -1;
427 424
 		}
428 425
 		return 0;
... ...
@@ -123,11 +123,7 @@ extern int connection_expires;
123 123
 extern int alg_location;
124 124
 
125 125
 extern int  max_loc_nr;
126
-typedef struct db_shared_param {
127
-	int val;
128
-	gen_lock_t lock;
129
-} db_shared_param_t;
130
-extern db_shared_param_t *write_on_master_db_shared;
126
+extern int * mdb_w_available;
131 127
 extern int mdb_availability_control;
132 128
 
133 129
 #endif /* UL_MOD_H */
... ...
@@ -60,7 +60,7 @@ int ul_db_init(void) {
60 60
 	
61 61
 	memset(results, 0, sizeof(results));
62 62
 
63
-	if(write_on_master_db_shared->val){
63
+	if(db_master_write){
64 64
 		if(db_bind_mod(mdb.write.url, &mdb.write.dbf) < 0) {
65 65
 			LM_ERR("could not bind api for write db.\n");
66 66
 			return -1;
... ...
@@ -102,16 +102,19 @@ int ul_db_child_init(void) {
102 102
 	if(ul_db_child_locnr_init() == -1) return -1;
103 103
 	
104 104
 	LM_INFO("location number is %d\n", max_loc_nr);
105
-        lock_get(&write_on_master_db_shared->lock);
106
-	if(write_on_master_db_shared->val){
105
+	if(db_master_write){
107 106
 		if((mdb.write.dbh  = mdb.write.dbf.init(mdb.write.url)) == NULL) {
108
-			LM_ERR("could not connect to sip master db (write).\n");
109
-			lock_release(&write_on_master_db_shared->lock);
110
-			return -1;
107
+			if (mdb_availability_control) {
108
+				LM_INFO("starting with no connection to sip master db write\n");
109
+				return 0;
110
+			}
111
+			else {
112
+				LM_ERR("could not connect to sip master db (write).\n");
113
+				return -1;
114
+			}
111 115
 		}
112 116
 		LM_INFO("write db connection for children initialized\n");
113 117
 	}
114
-	lock_release(&write_on_master_db_shared->lock);
115 118
 	return 0;
116 119
 }
117 120
 
... ...
@@ -138,6 +141,21 @@ void ul_db_shutdown(void) {
138 141
 	return;
139 142
 }
140 143
 
144
+int init_w_dbh(ul_master_db_t *write) {
145
+	if (mdb_availability_control) {
146
+		if (!(*mdb_w_available)) {
147
+			return -1;
148
+		}
149
+		if (write->dbh == NULL) {
150
+			if((write->dbh  = write->dbf.init(write->url)) == NULL) {
151
+				LM_ERR("Could not recreate connection to master write db.\n");
152
+				return -1;
153
+			}
154
+			LM_INFO("Recreated connection to master write db.\n");
155
+		}
156
+	}
157
+	return 0;
158
+}
141 159
 
142 160
 int db_handle_error(ul_db_handle_t * handle, int no) {
143 161
 	int query_len;
... ...
@@ -150,12 +168,9 @@ int db_handle_error(ul_db_handle_t * handle, int no) {
150 168
 		return -1;
151 169
 	}
152 170
 
153
-	lock_get(&write_on_master_db_shared->lock);
154
-	if(!write_on_master_db_shared->val){
155
-		lock_release(&write_on_master_db_shared->lock);
171
+	if (!db_master_write) {
156 172
 		return 0;
157 173
 	}
158
-	lock_release(&write_on_master_db_shared->lock);
159 174
 
160 175
 	query_len = 35 + reg_table.len
161 176
 			+ error_col.len * 2 + id_col.len;
... ...
@@ -186,7 +201,10 @@ int db_handle_error(ul_db_handle_t * handle, int no) {
186 201
 		tmp.s = query;
187 202
 		tmp.len = strlen(query);
188 203
 
189
-		if (mdb.write.dbf.raw_query (mdb.write.dbh, &tmp, NULL)) {
204
+		if (init_w_dbh(&mdb.write) < 0)
205
+			return -1;
206
+
207
+		if (mdb.write.dbf.raw_query(mdb.write.dbh, &tmp, NULL)) {
190 208
 			LM_ERR("error in database update.\n");
191 209
 			return -1;
192 210
 		}
... ...
@@ -209,6 +227,8 @@ int db_handle_error(ul_db_handle_t * handle, int no) {
209 227
 		handle->id, db->no, db->errors, cfg_get(p_usrloc, p_usrloc_cfg, db_err_threshold));
210 228
 	if(db->errors >= cfg_get(p_usrloc, p_usrloc_cfg, db_err_threshold)) {
211 229
 		LM_DBG("db_handle_error: now doing failover\n");
230
+		if (init_w_dbh(&mdb.write) < 0)
231
+			return -1;
212 232
 		if((db_failover(&mdb.write.dbf, mdb.write.dbh, handle, no)) < 0) {
213 233
 			LM_ERR("error in doing failover.\n");
214 234
 			return -1;
... ...
@@ -378,12 +398,9 @@ int ul_db_query(str * table, str * first, str * second, db1_con_t *** _r_h,
378 398
 		LM_ERR("could not retrieve db handle.\n");
379 399
 		return -1;
380 400
 	}
381
-	lock_get(&write_on_master_db_shared->lock);
382
-	if((ret = db_query(handle, _r_h, &f, table, _k, _op, _v, _c, _n, _nc, _o, _r, write_on_master_db_shared->val)) < 0){
383
-		lock_release(&write_on_master_db_shared->lock);
401
+	if((ret = db_query(handle, _r_h, &f, table, _k, _op, _v, _c, _n, _nc, _o, _r, db_master_write)) < 0){
384 402
 		return ret;
385 403
 	}
386
-	lock_release(&write_on_master_db_shared->lock);
387 404
 	add_dbf(*_r, f);
388 405
 	return ret;
389 406
 }
... ...
@@ -401,34 +418,31 @@ int ul_db_free_result(db1_con_t ** dbh, db1_res_t * res){
401 418
 }
402 419
 
403 420
 int db_reactivate(ul_db_handle_t * handle, int no){
404
-	lock_get(&write_on_master_db_shared->lock);
405
-	if(!write_on_master_db_shared->val){
406
-		lock_release(&write_on_master_db_shared->lock);
421
+	if(!db_master_write){
407 422
 		LM_ERR("running in read only mode, abort.\n");
408 423
 		return -1;
409 424
 	}
410
-	lock_release(&write_on_master_db_shared->lock);
425
+	if (init_w_dbh(&mdb.write) < 0)
426
+		return -1;
411 427
 	return db_failover_reactivate(&mdb.write.dbf, mdb.write.dbh, handle, no);
412 428
 }
413 429
 
414 430
 int db_reset_failover_time(ul_db_handle_t * handle, int no){
415
-	lock_get(&write_on_master_db_shared->lock);
416
-	if(!write_on_master_db_shared->val){
417
-		lock_release(&write_on_master_db_shared->lock);
431
+	if(!db_master_write){
418 432
 		LM_ERR("running in read only mode, abort.\n");
419 433
 		return -1;
420 434
 	}
421
-	lock_release(&write_on_master_db_shared->lock);
435
+	if (init_w_dbh(&mdb.write) < 0)
436
+		return -1;
422 437
 	return db_failover_reset(&mdb.write.dbf, mdb.write.dbh, handle->id, no);
423 438
 }
424 439
 
425 440
 int ul_db_check(ul_db_handle_t * handle){
426
-	lock_get(&write_on_master_db_shared->lock);
427
-	if(write_on_master_db_shared->val){
428
-		lock_release(&write_on_master_db_shared->lock);
441
+	if(db_master_write){
442
+		if (init_w_dbh(&mdb.write) < 0)
443
+			return -1;
429 444
 		return check_handle(&mdb.write.dbf, mdb.write.dbh, handle);
430 445
 	} else {
431
-		lock_release(&write_on_master_db_shared->lock);
432 446
 		LM_ERR("checking is useless in read-only mode\n");
433 447
 		return 0;
434 448
 	}
... ...
@@ -97,8 +97,8 @@ void check_dbs(unsigned int ticks, void *param){
97 97
 	int found;
98 98
 	int i;
99 99
 
100
-	if(mdb_availability_control) {
101
-		check_master_db(db_master_write);
100
+	if (db_master_write && mdb_availability_control) {
101
+		check_master_db();
102 102
 	}
103 103
 	if(!list_lock){
104 104
 		return;
... ...
@@ -152,20 +152,19 @@ void check_dbs(unsigned int ticks, void *param){
152 152
 	lock_release(list_lock);
153 153
 }
154 154
 
155
-void check_master_db(int dbm_write_default) {
155
+void check_master_db() {
156 156
 	if(mdb.write.dbh){
157 157
 		mdb.write.dbf.close(mdb.write.dbh);
158 158
 		mdb.write.dbh = NULL;
159 159
 	}
160 160
 
161
-	lock_get(&write_on_master_db_shared->lock);
162 161
 	if((mdb.write.dbh  = mdb.write.dbf.init(mdb.write.url)) == NULL) {
163
-		write_on_master_db_shared->val = 0;
164
-		LM_WARN("Master db is unavailable.\n");
162
+		LM_INFO("Master db is unavailable.\n");
163
+		*mdb_w_available = 0;
165 164
 	} else {
166
-		write_on_master_db_shared->val = dbm_write_default;
165
+		LM_INFO("Master db is available.\n");
166
+		*mdb_w_available = 1;
167 167
 	}
168
-	lock_release(&write_on_master_db_shared->lock);
169 168
 }
170 169
 
171 170
 int ul_register_watch_db(int id){
... ...
@@ -35,6 +35,6 @@ int ul_register_watch_db(int id);
35 35
 
36 36
 int ul_unregister_watch_db(int id);
37 37
 
38
-void check_master_db(int dbm_write_default);
38
+void check_master_db();
39 39
 
40 40
 #endif