Browse code

p_usrloc: reworking mdb_availability_control

- simplify general implementation
- avoid a deadlock caused by a process trying to acquire the same lock twice

(cherry picked from commit 5789c506d022dd5713072cd1fcc07a25f42e098c)
(cherry picked from commit ffb0576bd7d93303fb1068a332d9477a4d04471b)

Lucian Balaceanu authored on 14/07/2020 08:48:45
Showing 5 changed files
... ...
@@ -183,7 +183,7 @@ str default_db_type   = str_init(DEFAULT_DB_TYPE);
183 183
 str domain_db         = str_init(DEFAULT_DOMAIN_DB);
184 184
 int default_dbt       = 0;
185 185
 int expire            = 0;
186
-db_shared_param_t *write_on_master_db_shared;
186
+int *mdb_w_available;
187 187
 
188 188
 /*! \brief
189 189
  * Exported functions
... ...
@@ -307,13 +307,6 @@ static int mod_init(void)
307 307
 	}
308 308
 #endif
309 309
 
310
-	if((write_on_master_db_shared = shm_malloc(sizeof(db_shared_param_t))) == NULL) {
311
-		LM_ERR("couldn't allocate shared memory.\n");
312
-		return -1;
313
-	} else {
314
-		write_on_master_db_shared->val = db_master_write;
315
-	}
316
-
317 310
 	if(ul_hash_size<=1)
318 311
 		ul_hash_size = 512;
319 312
 	else
... ...
@@ -402,14 +395,18 @@ static int mod_init(void)
402 395
 		LM_ERR("could not init database watch environment.\n");
403 396
 		return -1;
404 397
 	}
405
-	if (lock_init(&write_on_master_db_shared->lock)==0){
406
-		LM_ERR("could not initialise lock\n");
398
+
399
+	if((mdb_w_available = shm_malloc(sizeof(int))) == NULL) {
400
+		LM_ERR("couldn't allocate shared memory. \n");
401
+		return -1;
407 402
 	}
408
-	if(write_on_master_db_shared->val){
403
+	if (db_master_write) {
409 404
 		/* register extra dummy timer to be created in init_db_check() */
410 405
 		register_dummy_timers(1);
406
+		if (mdb_availability_control) {
407
+			check_master_db();
408
+		}
411 409
 	}
412
-        check_master_db(db_master_write);
413 410
 	return 0;
414 411
 }
415 412
 
... ...
@@ -418,7 +415,7 @@ static int child_init(int _rank)
418 415
 {
419 416
 	if(_rank==PROC_INIT) {
420 417
 		if(init_db_check() < 0){
421
-				LM_ERR("could not initialise database check.\n");
418
+			LM_ERR("could not initialise database check.\n");
422 419
 			return -1;
423 420
 		}
424 421
 		return 0;
... ...
@@ -123,11 +123,7 @@ extern int connection_expires;
123 123
 extern int alg_location;
124 124
 
125 125
 extern int  max_loc_nr;
126
-typedef struct db_shared_param {
127
-	int val;
128
-	gen_lock_t lock;
129
-} db_shared_param_t;
130
-extern db_shared_param_t *write_on_master_db_shared;
126
+extern int * mdb_w_available;
131 127
 extern int mdb_availability_control;
132 128
 
133 129
 #endif /* UL_MOD_H */
... ...
@@ -60,7 +60,7 @@ int ul_db_init(void) {
60 60
 	
61 61
 	memset(results, 0, sizeof(results));
62 62
 
63
-	if(write_on_master_db_shared->val){
63
+	if(db_master_write){
64 64
 		if(db_bind_mod(mdb.write.url, &mdb.write.dbf) < 0) {
65 65
 			LM_ERR("could not bind api for write db.\n");
66 66
 			return -1;
... ...
@@ -102,16 +102,19 @@ int ul_db_child_init(void) {
102 102
 	if(ul_db_child_locnr_init() == -1) return -1;
103 103
 	
104 104
 	LM_INFO("location number is %d\n", max_loc_nr);
105
-        lock_get(&write_on_master_db_shared->lock);
106
-	if(write_on_master_db_shared->val){
105
+	if(db_master_write){
107 106
 		if((mdb.write.dbh  = mdb.write.dbf.init(mdb.write.url)) == NULL) {
108
-			LM_ERR("could not connect to sip master db (write).\n");
109
-			lock_release(&write_on_master_db_shared->lock);
110
-			return -1;
107
+			if (mdb_availability_control) {
108
+				LM_INFO("starting with no connection to sip master db write\n");
109
+				return 0;
110
+			}
111
+			else {
112
+				LM_ERR("could not connect to sip master db (write).\n");
113
+				return -1;
114
+			}
111 115
 		}
112 116
 		LM_INFO("write db connection for children initialized\n");
113 117
 	}
114
-	lock_release(&write_on_master_db_shared->lock);
115 118
 	return 0;
116 119
 }
117 120
 
... ...
@@ -138,6 +141,21 @@ void ul_db_shutdown(void) {
138 141
 	return;
139 142
 }
140 143
 
144
+int init_w_dbh(ul_master_db_t *write) {
145
+	if (mdb_availability_control) {
146
+		if (!(*mdb_w_available)) {
147
+			return -1;
148
+		}
149
+		if (write->dbh == NULL) {
150
+			if((write->dbh  = write->dbf.init(write->url)) == NULL) {
151
+				LM_ERR("Could not recreate connection to master write db.\n");
152
+				return -1;
153
+			}
154
+			LM_INFO("Recreated connection to master write db.\n");
155
+		}
156
+	}
157
+	return 0;
158
+}
141 159
 
142 160
 int db_handle_error(ul_db_handle_t * handle, int no) {
143 161
 	int query_len;
... ...
@@ -150,12 +168,9 @@ int db_handle_error(ul_db_handle_t * handle, int no) {
150 168
 		return -1;
151 169
 	}
152 170
 
153
-	lock_get(&write_on_master_db_shared->lock);
154
-	if(!write_on_master_db_shared->val){
155
-		lock_release(&write_on_master_db_shared->lock);
171
+	if (!db_master_write) {
156 172
 		return 0;
157 173
 	}
158
-	lock_release(&write_on_master_db_shared->lock);
159 174
 
160 175
 	query_len = 35 + reg_table.len
161 176
 			+ error_col.len * 2 + id_col.len;
... ...
@@ -186,7 +201,10 @@ int db_handle_error(ul_db_handle_t * handle, int no) {
186 201
 		tmp.s = query;
187 202
 		tmp.len = strlen(query);
188 203
 
189
-		if (mdb.write.dbf.raw_query (mdb.write.dbh, &tmp, NULL)) {
204
+		if (init_w_dbh(&mdb.write) < 0)
205
+			return -1;
206
+
207
+		if (mdb.write.dbf.raw_query(mdb.write.dbh, &tmp, NULL)) {
190 208
 			LM_ERR("error in database update.\n");
191 209
 			return -1;
192 210
 		}
... ...
@@ -209,6 +227,8 @@ int db_handle_error(ul_db_handle_t * handle, int no) {
209 227
 		handle->id, db->no, db->errors, cfg_get(p_usrloc, p_usrloc_cfg, db_err_threshold));
210 228
 	if(db->errors >= cfg_get(p_usrloc, p_usrloc_cfg, db_err_threshold)) {
211 229
 		LM_DBG("db_handle_error: now doing failover\n");
230
+		if (init_w_dbh(&mdb.write) < 0)
231
+			return -1;
212 232
 		if((db_failover(&mdb.write.dbf, mdb.write.dbh, handle, no)) < 0) {
213 233
 			LM_ERR("error in doing failover.\n");
214 234
 			return -1;
... ...
@@ -378,12 +398,9 @@ int ul_db_query(str * table, str * first, str * second, db1_con_t *** _r_h,
378 398
 		LM_ERR("could not retrieve db handle.\n");
379 399
 		return -1;
380 400
 	}
381
-	lock_get(&write_on_master_db_shared->lock);
382
-	if((ret = db_query(handle, _r_h, &f, table, _k, _op, _v, _c, _n, _nc, _o, _r, write_on_master_db_shared->val)) < 0){
383
-		lock_release(&write_on_master_db_shared->lock);
401
+	if((ret = db_query(handle, _r_h, &f, table, _k, _op, _v, _c, _n, _nc, _o, _r, db_master_write)) < 0){
384 402
 		return ret;
385 403
 	}
386
-	lock_release(&write_on_master_db_shared->lock);
387 404
 	add_dbf(*_r, f);
388 405
 	return ret;
389 406
 }
... ...
@@ -401,34 +418,31 @@ int ul_db_free_result(db1_con_t ** dbh, db1_res_t * res){
401 418
 }
402 419
 
403 420
 int db_reactivate(ul_db_handle_t * handle, int no){
404
-	lock_get(&write_on_master_db_shared->lock);
405
-	if(!write_on_master_db_shared->val){
406
-		lock_release(&write_on_master_db_shared->lock);
421
+	if(!db_master_write){
407 422
 		LM_ERR("running in read only mode, abort.\n");
408 423
 		return -1;
409 424
 	}
410
-	lock_release(&write_on_master_db_shared->lock);
425
+	if (init_w_dbh(&mdb.write) < 0)
426
+		return -1;
411 427
 	return db_failover_reactivate(&mdb.write.dbf, mdb.write.dbh, handle, no);
412 428
 }
413 429
 
414 430
 int db_reset_failover_time(ul_db_handle_t * handle, int no){
415
-	lock_get(&write_on_master_db_shared->lock);
416
-	if(!write_on_master_db_shared->val){
417
-		lock_release(&write_on_master_db_shared->lock);
431
+	if(!db_master_write){
418 432
 		LM_ERR("running in read only mode, abort.\n");
419 433
 		return -1;
420 434
 	}
421
-	lock_release(&write_on_master_db_shared->lock);
435
+	if (init_w_dbh(&mdb.write) < 0)
436
+		return -1;
422 437
 	return db_failover_reset(&mdb.write.dbf, mdb.write.dbh, handle->id, no);
423 438
 }
424 439
 
425 440
 int ul_db_check(ul_db_handle_t * handle){
426
-	lock_get(&write_on_master_db_shared->lock);
427
-	if(write_on_master_db_shared->val){
428
-		lock_release(&write_on_master_db_shared->lock);
441
+	if(db_master_write){
442
+		if (init_w_dbh(&mdb.write) < 0)
443
+			return -1;
429 444
 		return check_handle(&mdb.write.dbf, mdb.write.dbh, handle);
430 445
 	} else {
431
-		lock_release(&write_on_master_db_shared->lock);
432 446
 		LM_ERR("checking is useless in read-only mode\n");
433 447
 		return 0;
434 448
 	}
... ...
@@ -97,8 +97,8 @@ void check_dbs(unsigned int ticks, void *param){
97 97
 	int found;
98 98
 	int i;
99 99
 
100
-	if(mdb_availability_control) {
101
-		check_master_db(db_master_write);
100
+	if (db_master_write && mdb_availability_control) {
101
+		check_master_db();
102 102
 	}
103 103
 	if(!list_lock){
104 104
 		return;
... ...
@@ -152,20 +152,19 @@ void check_dbs(unsigned int ticks, void *param){
152 152
 	lock_release(list_lock);
153 153
 }
154 154
 
155
-void check_master_db(int dbm_write_default) {
155
+void check_master_db() {
156 156
 	if(mdb.write.dbh){
157 157
 		mdb.write.dbf.close(mdb.write.dbh);
158 158
 		mdb.write.dbh = NULL;
159 159
 	}
160 160
 
161
-	lock_get(&write_on_master_db_shared->lock);
162 161
 	if((mdb.write.dbh  = mdb.write.dbf.init(mdb.write.url)) == NULL) {
163
-		write_on_master_db_shared->val = 0;
164
-		LM_WARN("Master db is unavailable.\n");
162
+		LM_INFO("Master db is unavailable.\n");
163
+		*mdb_w_available = 0;
165 164
 	} else {
166
-		write_on_master_db_shared->val = dbm_write_default;
165
+		LM_INFO("Master db is available.\n");
166
+		*mdb_w_available = 1;
167 167
 	}
168
-	lock_release(&write_on_master_db_shared->lock);
169 168
 }
170 169
 
171 170
 int ul_register_watch_db(int id){
... ...
@@ -35,6 +35,6 @@ int ul_register_watch_db(int id);
35 35
 
36 36
 int ul_unregister_watch_db(int id);
37 37
 
38
-void check_master_db(int dbm_write_default);
38
+void check_master_db();
39 39
 
40 40
 #endif