Changeset 706:4ffbc9f1e922 in freeDiameter for libfdcore/p_psm.c
- Timestamp:
- Feb 9, 2011, 3:26:58 PM (13 years ago)
- Branch:
- default
- Phase:
- public
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
libfdcore/p_psm.c
r691 r706 36 36 #include "fdcore-internal.h" 37 37 38 /* 39 This file implements a Peer State Machine which is a mix of: 40 - the state machine described in rfc3588bis 41 - the state machine described in rfc3539#section-3.4 42 - the following observations. 43 44 The delivery of Diameter messages must not always be unordered: order is important at 45 the beginning and end of a connection lifetime. It means we need agility to 46 switch between "ordering enforced" and "ordering not enforced to counter 47 HotLB" modes of operation. 48 49 The connection state machine represented in RFC3588 (and rfc3588bis) is 50 incomplete, because it lacks the SUSPECT state and the 3 DWR/DWA 51 exchanges (section 5.1) when the peer recovers from this state. 52 Personally, I don't see the rationale for exchanging 3 messages (why 3?) 53 but, if we require at least 1 DWR/DWA exchange to be always performed 54 after the CER/CEA exchange (and initiated by the peer that sent the 55 CEA), we have a simple way to deal with our ordering problem, as summarized 56 below. Peers are: [i]nitiator, [r]esponder. 57 (1) [i] SCTP connection attempt. 58 (2) [r] accept the connection. 59 (3) [i,r] (if secure port) DTLS handshake, close on failure. 60 (4) [i] Send CER 61 (5) [r] Receive CER, send CEA using stream 0, flag "unordered" cleared. 62 [r] Immediately send a DWR after the CEA, also using stream 0, 63 flag "unordered" cleared. 64 [r] Move to STATE_OPEN_NEW state -- equivalent to OPEN except 65 that all messages are sent ordered at the moment. 66 (6) [i] receive CEA, move to OPEN state. All messages can be sent 67 unordered in OPEN state. 68 [i] As per normal operation, reply with DWA to the DWR. 69 (7) [r] Upon reception of the DWA, move to OPEN state, messages can be 70 sent unordered from this point. 
71 72 Note about (5) and (6): if the Diameter Identity received in CER or CEA 73 does not match the credentials from the certificate presented during 74 DTLS handshake, we may need to specify a path of clean disconnection 75 (not blocking the remote peer waiting for something). 76 77 This proposed mechanism removes the problem of application messages 78 received before the CEA by the initiator. Note that if the "old" inband 79 TLS handshake is used, this handshake plays the same synchronization 80 role as the new DWR/DWA, which becomes useless. 81 82 83 The other time where ordering is important is at the end of the connection 84 lifetime, when one peer is shutting down the link for some reason 85 (reboot, overload, no activity, etc...). In case of unordered delivery, 86 we may have: 87 - peer A sends an application message followed by a DPR. Peer B receives 88 the DPR first and tears down the connection. Application message is lost. 89 - Peer B sends an application message, then receives a DPR and answers a 90 DPA. Peer A receives the DPA before the application message. The 91 application message is lost. 92 93 This situation is actually quite possible because DPR/DPA messages are 94 very short, while application messages can be quite large. Therefore, 95 they require much more time to deliver. 96 97 I really cannot see a way to counter this effect by using the ordering 98 of the messages, except by applying a timer (state STATE_CLOSING_GRACE). 99 100 However, this problem must be balanced with the fact that the message 101 that is lost will be in many cases sent again as the failover mechanism 102 specifies. 
103 */ 104 38 105 /* The actual declaration of peer_state_str */ 39 106 DECLARE_STATE_STR(); … … 101 168 return 0; 102 169 } 170 103 171 /* Insert in the active peers list */ 104 172 CHECK_POSIX( pthread_rwlock_wrlock(&fd_g_activ_peers_rw) ); 105 173 for (li = fd_g_activ_peers.next; li != &fd_g_activ_peers; li = li->next) { 106 174 struct fd_peer * next_p = (struct fd_peer *)li->o; 107 int cmp = strcmp(peer->p_hdr.info.pi_diamid, next_p->p_hdr.info.pi_diamid); 175 int cmp = fd_os_cmp(peer->p_hdr.info.pi_diamid, peer->p_hdr.info.pi_diamidlen, 176 next_p->p_hdr.info.pi_diamid, next_p->p_hdr.info.pi_diamidlen); 108 177 if (cmp < 0) 109 178 break; … … 115 184 if (peer->p_cb) { 116 185 TRACE_DEBUG(FULL, "Calling add callback for peer %s", peer->p_hdr.info.pi_diamid); 117 (*peer->p_cb)(&peer->p_hdr.info, peer->p_cb_data); 186 (*peer->p_cb)(&peer->p_hdr.info, peer->p_cb_data); /* TODO: do this in a separate detached thread? */ 118 187 peer->p_cb = NULL; 119 188 peer->p_cb_data = NULL; … … 178 247 } 179 248 249 /* Read state */ 250 int fd_peer_get_state(struct peer_hdr *peer) 251 { 252 int ret; 253 254 struct fd_peer * p = (struct fd_peer *)peer; 255 256 if (!CHECK_PEER(p)) 257 return -1; 258 259 CHECK_POSIX_DO( pthread_mutex_lock(&p->p_state_mtx), return -1 ); 260 ret = p->p_state; 261 CHECK_POSIX_DO( pthread_mutex_unlock(&p->p_state_mtx), return -1 ); 262 263 return ret; 264 } 265 180 266 181 267 /* Change state */ … … 186 272 TRACE_ENTRY("%p %d(%s)", peer, new_state, STATE_STR(new_state)); 187 273 CHECK_PARAMS( CHECK_PEER(peer) ); 188 fd_cpu_flush_cache();189 old = peer->p_hdr.info.runtime.pir_state;274 275 old = fd_peer_getstate(peer); 190 276 if (old == new_state) 191 277 return 0; … … 196 282 peer->p_hdr.info.pi_diamid); 197 283 198 peer->p_hdr.info.runtime.pir_state = new_state; 199 fd_cpu_flush_cache(); 284 285 CHECK_POSIX( pthread_mutex_lock(&peer->p_state_mtx) ); 286 peer->p_state = new_state; 287 CHECK_POSIX( pthread_mutex_unlock(&peer->p_state_mtx) ); 200 288 
201 289 if (old == STATE_OPEN) { … … 255 343 { 256 344 /* Move to CLOSED state: failover messages, stop OUT thread, unlink peer from active list */ 257 fd_cpu_flush_cache(); 258 if (peer->p_hdr.info.runtime.pir_state != STATE_ZOMBIE) { 345 if (fd_peer_getstate(peer) != STATE_ZOMBIE) { 259 346 CHECK_FCT_DO( fd_psm_change_state(peer, STATE_CLOSED), /* continue */ ); 260 347 } … … 285 372 struct fd_peer * peer = (struct fd_peer *)arg; 286 373 CHECK_PARAMS_DO( CHECK_PEER(peer), return ); 287 peer->p_hdr.info.runtime.pir_state = STATE_ZOMBIE; 288 fd_cpu_flush_cache(); 374 CHECK_POSIX_DO( pthread_mutex_lock(&peer->p_state_mtx), ); 375 peer->p_state = STATE_ZOMBIE; 376 CHECK_POSIX_DO( pthread_mutex_unlock(&peer->p_state_mtx), ); 289 377 return; 290 378 } … … 298 386 size_t ev_sz; 299 387 void * ev_data; 388 int cur_state; 300 389 301 390 CHECK_PARAMS_DO( CHECK_PEER(peer), ASSERT(0) ); … … 306 395 { 307 396 char buf[48]; 308 s printf(buf, "PSM/%.*s", (int)sizeof(buf) - 5, peer->p_hdr.info.pi_diamid);397 snprintf(buf, sizeof(buf), "PSM/%s", peer->p_hdr.info.pi_diamid); 309 398 fd_log_threadname ( buf ); 310 399 } 311 400 312 401 /* The state machine starts in CLOSED state */ 313 peer->p_hdr.info.runtime.pir_state = STATE_CLOSED; 402 CHECK_POSIX_DO( pthread_mutex_lock(&peer->p_state_mtx), goto psm_end ); 403 peer->p_state = STATE_CLOSED; 404 CHECK_POSIX_DO( pthread_mutex_unlock(&peer->p_state_mtx), goto psm_end ); 314 405 315 406 /* Wait that the PSM are authorized to start in the daemon */ … … 326 417 /* Get next event */ 327 418 TRACE_DEBUG(FULL, "'%s' in state '%s' waiting for next event.", 328 peer->p_hdr.info.pi_diamid, STATE_STR( peer->p_hdr.info.runtime.pir_state));419 peer->p_hdr.info.pi_diamid, STATE_STR(fd_peer_getstate(peer))); 329 420 CHECK_FCT_DO( fd_event_timedget(peer->p_events, &peer->p_psm_timer, FDEVP_PSM_TIMEOUT, &event, &ev_sz, &ev_data), goto psm_end ); 421 422 cur_state = fd_peer_getstate(peer); 423 if (cur_state == -1) 424 goto psm_end; 425 330 426 
TRACE_DEBUG(FULL, "'%s'\t<-- '%s'\t(%p,%zd)\t'%s'", 331 STATE_STR( peer->p_hdr.info.runtime.pir_state),427 STATE_STR(cur_state), 332 428 fd_pev_str(event), ev_data, ev_sz, 333 429 peer->p_hdr.info.pi_diamid); … … 336 432 337 433 /* The following states are impossible */ 338 ASSERT( peer->p_hdr.info.runtime.pir_state != STATE_NEW );339 ASSERT( peer->p_hdr.info.runtime.pir_state != STATE_ZOMBIE );340 ASSERT( peer->p_hdr.info.runtime.pir_state != STATE_OPEN_HANDSHAKE ); /* because it should exist only between two loops */434 ASSERT( cur_state != STATE_NEW ); 435 ASSERT( cur_state != STATE_ZOMBIE ); 436 ASSERT( cur_state != STATE_OPEN_HANDSHAKE ); /* because it should exist only between two loops */ 341 437 342 438 /* Purge invalid events */ 343 439 if (!CHECK_PEVENT(event)) { 344 440 TRACE_DEBUG(INFO, "Invalid event received in PSM '%s' : %d", peer->p_hdr.info.pi_diamid, event); 441 ASSERT(0); /* we should investigate this situation */ 345 442 goto psm_loop; 346 443 } … … 354 451 /* Requests to terminate the peer object */ 355 452 if (event == FDEVP_TERMINATE) { 356 switch ( peer->p_hdr.info.runtime.pir_state) {453 switch (cur_state) { 357 454 case STATE_OPEN: 455 case STATE_OPEN_NEW: 358 456 case STATE_REOPEN: 359 /* We cannot just close the con enction, we have to send a DPR first */457 /* We cannot just close the connection, we have to send a DPR first */ 360 458 CHECK_FCT_DO( fd_p_dp_initiate(peer, ev_data), goto psm_end ); 361 459 goto psm_loop; … … 363 461 /* 364 462 case STATE_CLOSING: 463 case STATE_CLOSING_GRACE: 365 464 case STATE_WAITCNXACK: 366 465 case STATE_WAITCNXACK_ELEC: … … 380 479 struct msg_hdr * hdr; 381 480 382 /* If the current state does not allow receiving messages, just drop it */383 if (peer->p_hdr.info.runtime.pir_state == STATE_CLOSED) {384 TRACE_DEBUG(FULL, "Purging message in queue while in CLOSED state (%zdb)", ev_sz);385 free(ev_data);386 goto psm_loop;387 }388 389 481 /* Parse the received buffer */ 390 482 CHECK_FCT_DO( 
fd_msg_parse_buffer( (void *)&ev_data, ev_sz, &msg), … … 396 488 } ); 397 489 490 /* If the current state does not allow receiving messages, just drop it */ 491 if (cur_state == STATE_CLOSED) { 492 /* In such case, just discard the message */ 493 fd_msg_log( FD_MSG_LOG_DROPPED, msg, "Purged from peer '%s''s queue (CLOSED state).", peer->p_hdr.info.pi_diamid ); 494 fd_msg_free(msg); 495 goto psm_loop; 496 } 497 398 498 /* Log incoming message */ 399 fd_msg_log( FD_MSG_LOG_RECEIVED, msg, "Received %zdb from '%s' ", ev_sz, peer->p_hdr.info.pi_diamid);499 fd_msg_log( FD_MSG_LOG_RECEIVED, msg, "Received %zdb from '%s' (%s)", ev_sz, peer->p_hdr.info.pi_diamid, STATE_STR(cur_state) ); 400 500 401 501 /* Extract the header */ … … 417 517 } 418 518 519 if (cur_state == STATE_OPEN_NEW) { 520 /* OK, we have received something, so the connection is supposedly now in OPEN state at the remote site */ 521 fd_psm_change_state(peer, STATE_OPEN ); 522 } 523 419 524 /* Now handle non-link-local messages */ 420 525 if (fd_msg_is_routable(msg)) { 421 switch ( peer->p_hdr.info.runtime.pir_state) {526 switch (cur_state) { 422 527 /* To maximize compatibility -- should not be a security issue here */ 423 528 case STATE_REOPEN: 424 529 case STATE_SUSPECT: 425 530 case STATE_CLOSING: 531 case STATE_CLOSING_GRACE: 426 532 TRACE_DEBUG(FULL, "Accepted a message while not in OPEN state... 
"); 427 533 /* The standard situation : */ 534 case STATE_OPEN_NEW: 428 535 case STATE_OPEN: 429 536 /* We received a valid routable message, update the expiry timer */ … … 431 538 432 539 /* Set the message source and add the Route-Record */ 433 CHECK_FCT_DO( fd_msg_source_set( msg, peer->p_hdr.info.pi_diamid, 1, fd_g_config->cnf_dict ), goto psm_end);540 CHECK_FCT_DO( fd_msg_source_set( msg, peer->p_hdr.info.pi_diamid, peer->p_hdr.info.pi_diamidlen, 1, fd_g_config->cnf_dict ), goto psm_end); 434 541 435 542 /* Requeue to the global incoming queue */ … … 437 544 438 545 /* Update the peer timer (only in OPEN state) */ 439 if (( peer->p_hdr.info.runtime.pir_state == STATE_OPEN) && (!peer->p_flags.pf_dw_pending)) {546 if ((cur_state == STATE_OPEN) && (!peer->p_flags.pf_dw_pending)) { 440 547 fd_psm_next_timeout(peer, 1, peer->p_hdr.info.config.pic_twtimer ?: fd_g_config->cnf_timer_tw); 441 548 } … … 449 556 default: 450 557 /* In such case, just discard the message */ 451 fd_msg_log( FD_MSG_LOG_DROPPED, msg, "Received from peer '%s' while connection was not in OPEN state.", peer->p_hdr.info.pi_diamid);558 fd_msg_log( FD_MSG_LOG_DROPPED, msg, "Received from peer '%s' while connection was not in state %s.", peer->p_hdr.info.pi_diamid, STATE_STR(cur_state) ); 452 559 fd_msg_free(msg); 453 560 } … … 485 592 case CC_DISCONNECT_PEER: 486 593 CHECK_FCT_DO( fd_p_dp_handle(&msg, (hdr->msg_flags & CMD_FLAG_REQUEST), peer), goto psm_reset ); 487 if ( peer->p_hdr.info.runtime.pir_state== STATE_CLOSING)594 if (fd_peer_getstate(peer) == STATE_CLOSING) 488 595 goto psm_end; 596 489 597 break; 490 598 … … 494 602 495 603 default: 496 /* Unknown / unexpected / invalid message */604 /* Unknown / unexpected / invalid message -- but validated by our dictionary */ 497 605 TRACE_DEBUG(INFO, "Invalid non-routable command received: %u.", hdr->msg_code); 498 606 if (hdr->msg_flags & CMD_FLAG_REQUEST) { … … 502 610 503 611 /* Set the error code */ 504 CHECK_FCT_DO( fd_msg_rescode_set(msg, 
"DIAMETER_ INVALID_HDR_BITS", NULL, NULL, 1 ), break );612 CHECK_FCT_DO( fd_msg_rescode_set(msg, "DIAMETER_COMMAND_UNSUPPORTED", "Or maybe the P-bit or application Id are erroneous.", NULL, 1 ), break ); 505 613 506 614 /* Send the answer */ … … 509 617 } else { 510 618 /* We did ASK for it ??? */ 511 fd_log_debug("Invalid PXY flag in answer header ?\n");619 TRACE_DEBUG(INFO, "Received answer with erroneous 'is_routable' result..."); 512 620 } 513 621 … … 531 639 /* The connection object is broken */ 532 640 if (event == FDEVP_CNX_ERROR) { 533 switch ( peer->p_hdr.info.runtime.pir_state) {641 switch (cur_state) { 534 642 case STATE_WAITCNXACK_ELEC: 535 643 /* Abort the initiating side */ … … 541 649 case STATE_WAITCEA: 542 650 case STATE_OPEN: 651 case STATE_OPEN_NEW: 543 652 case STATE_REOPEN: 544 653 case STATE_WAITCNXACK: … … 558 667 goto psm_end; 559 668 669 case STATE_CLOSING_GRACE: 670 if (peer->p_flags.pf_localterm) /* initiated here */ 671 goto psm_end; 672 673 fd_psm_cleanup(peer, 0); 674 675 /* Reset the timer for next connection attempt */ 676 fd_psm_next_timeout(peer, 1, fd_p_dp_newdelay(peer)); 677 goto psm_loop; 560 678 } 561 679 goto psm_loop; … … 616 734 peer->p_ini_thr = (pthread_t)NULL; 617 735 618 switch ( peer->p_hdr.info.runtime.pir_state) {736 switch (cur_state) { 619 737 case STATE_WAITCNXACK_ELEC: 620 738 case STATE_WAITCNXACK: … … 624 742 default: 625 743 /* Just abort the attempt and continue */ 626 TRACE_DEBUG(FULL, "Connection attempt successful but current state is %s, closing... ", STATE_STR(peer->p_hdr.info.runtime.pir_state));744 TRACE_DEBUG(FULL, "Connection attempt successful but current state is %s, closing... 
(too slow?)", STATE_STR(cur_state)); 627 745 fd_cnx_destroy(cnx); 628 746 } … … 638 756 peer->p_ini_thr = (pthread_t)NULL; 639 757 640 switch ( peer->p_hdr.info.runtime.pir_state) {758 switch (cur_state) { 641 759 case STATE_WAITCNXACK_ELEC: 642 760 /* Abort the initiating side */ … … 653 771 default: 654 772 /* Just ignore */ 655 TRACE_DEBUG(FULL, "Connection attempt failed but current state is %s, ignoring...", STATE_STR( peer->p_hdr.info.runtime.pir_state));773 TRACE_DEBUG(FULL, "Connection attempt failed but current state is %s, ignoring...", STATE_STR(cur_state)); 656 774 } 657 775 … … 661 779 /* The timeout for the current state has been reached */ 662 780 if (event == FDEVP_PSM_TIMEOUT) { 663 switch ( peer->p_hdr.info.runtime.pir_state) {781 switch (cur_state) { 664 782 case STATE_OPEN: 665 783 case STATE_REOPEN: 784 case STATE_OPEN_NEW: 666 785 CHECK_FCT_DO( fd_p_dw_timeout(peer), goto psm_end ); 667 786 goto psm_loop; … … 676 795 /* Mark the connection problem */ 677 796 peer->p_flags.pf_cnx_pb = 1; 678 679 797 case STATE_CLOSING: 680 798 case STATE_WAITCNXACK: … … 683 801 fd_psm_next_timeout(peer, 1, peer->p_hdr.info.config.pic_tctimer ?: fd_g_config->cnf_timer_tc); 684 802 goto psm_reset; 803 804 case STATE_CLOSING_GRACE: 805 /* The grace period is completed, now close */ 806 if (peer->p_flags.pf_localterm) 807 goto psm_end; 808 809 fd_psm_cleanup(peer, 0); 810 /* Reset the timer for next connection attempt */ 811 fd_psm_next_timeout(peer, 1, fd_p_dp_newdelay(peer)); 812 goto psm_loop; 685 813 686 814 case STATE_WAITCNXACK_ELEC: … … 697 825 698 826 /* Default action : the handling has not yet been implemented. 
[for debug only] */ 699 TRACE_DEBUG(INFO, "Missing handler in PSM for '%s'\t<-- '%s'", STATE_STR( peer->p_hdr.info.runtime.pir_state), fd_pev_str(event));827 TRACE_DEBUG(INFO, "Missing handler in PSM for '%s'\t<-- '%s'", STATE_STR(cur_state), fd_pev_str(event)); 700 828 psm_reset: 701 829 if (peer->p_flags.pf_delete) … … 707 835 fd_psm_cleanup(peer, 1); 708 836 TRACE_DEBUG(INFO, "'%s'\t-> STATE_ZOMBIE (terminated)\t'%s'", 709 STATE_STR( peer->p_hdr.info.runtime.pir_state),837 STATE_STR(fd_peer_getstate(peer)), 710 838 peer->p_hdr.info.pi_diamid); 711 839 pthread_cleanup_pop(1); /* set STATE_ZOMBIE */ 712 fd_cpu_flush_cache();713 840 peer->p_psm = (pthread_t)NULL; 714 841 pthread_detach(pthread_self()); … … 726 853 727 854 /* Check the peer and state are OK */ 728 CHECK_PARAMS( CHECK_PEER(peer) && (peer->p_hdr.info.runtime.pir_state == STATE_NEW));855 CHECK_PARAMS( fd_peer_getstate(peer) == STATE_NEW ); 729 856 730 857 /* Create the FIFO for events */ … … 744 871 CHECK_PARAMS( CHECK_PEER(peer) ); 745 872 746 fd_cpu_flush_cache(); 747 if (peer->p_hdr.info.runtime.pir_state != STATE_ZOMBIE) { 873 if (fd_peer_getstate(peer) != STATE_ZOMBIE) { 748 874 CHECK_FCT( fd_event_send(peer->p_events, FDEVP_TERMINATE, 0, reason) ); 749 875 } else {
Note: See TracChangeset
for help on using the changeset viewer.