Changeset 706:4ffbc9f1e922 in freeDiameter for libfdcore/p_psm.c


Timestamp:
Feb 9, 2011, 3:26:58 PM (13 years ago)
Author:
Sebastien Decugis <sdecugis@nict.go.jp>
Branch:
default
Phase:
public
Message:

Large UNTESTED commit with the following changes:

  • Improved DiameterIdentity handling (esp. internationalization issues), and improved the efficiency of some string operations in peers, sessions, and dictionary modules (closes #7)
  • Cleanup in the session module to free only unreferenced sessions (#16)
  • Removed fd_cpu_flush_cache(), replaced by more robust alternatives.
  • Improved peer state machine algorithm to counter SCTP multistream race condition.
File:
1 edited

  • libfdcore/p_psm.c

    r691 r706  
    3636#include "fdcore-internal.h"
    3737
     38/*
      39This file implements a Peer State Machine which is a mix of:
     40 - the state machine described in rfc3588bis
     41 - the state machine described in rfc3539#section-3.4
     42 - the following observations.
     43 
     44The delivery of Diameter messages must not always be unordered: order is important at
      45the beginning and end of a connection lifetime. This means we need the agility to
     46switch between "ordering enforced" and "ordering not enforced to counter
     47HotLB" modes of operation.
     48
     49The connection state machine represented in RFC3588 (and rfc3588bis) is
     50incomplete, because it lacks the SUSPECT state and the 3 DWR/DWA
     51exchanges (section 5.1) when the peer recovers from this state.
      52Personally I don't see the rationale for exchanging 3 messages (why 3?)
     53but, if we require at least 1 DWR/DWA exchange to be always performed
     54after the CER/CEA exchange (and initiated by the peer that sent the
      55CEA), we have a simple way to deal with our ordering problem, as summarized
      56below. Peers are: [i]nitiator, [r]esponder.
     57 (1) [i] SCTP connection attempt.
     58 (2) [r] accept the connection.
     59 (3) [i,r] (if secure port) DTLS handshake, close on failure.
     60 (4) [i] Send CER
     61 (5) [r] Receive CER, send CEA using stream 0, flag "unordered" cleared.
     62       [r] Immediately send a DWR after the CEA, also using stream 0,
     63flag "unordered" cleared.
     64       [r] Move to STATE_OPEN_NEW state -- equivalent to OPEN except
     65that all messages are sent ordered at the moment.
     66 (6) [i] receive CEA, move to OPEN state. All messages can be sent
     67unordered in OPEN state.
     68       [i] As per normal operation, reply with DWA to the DWR.
     69 (7) [r] Upon reception of the DWA, move to OPEN state, messages can be
     70sent unordered from this point.
     71
     72Note about (5) and (6): if the Diameter Identity received in CER or CEA
     73does not match the credentials from the certificate presented during
     74DTLS handshake, we may need to specify a path of clean disconnection
     75(not blocking the remote peer waiting for something).
     76
     77This proposed mechanism removes the problem of application messages
     78received before the CEA by the initiator. Note that if the "old" inband
     79TLS handshake is used, this handshake plays the same synchronization
      80role as the new DWR/DWA, which then becomes unnecessary.
     81
     82
      83The other time when ordering is important is at the end of the connection
     84lifetime, when one peer is shutting down the link for some reason
     85(reboot, overload, no activity, etc...). In case of unordered delivery,
     86we may have:
     87- peer A sends an application message followed by a DPR. Peer B receives
     88the DPR first and tears down the connection. Application message is lost.
     89- Peer B sends an application message, then receives a DPR and answers a
     90DPA. Peer A receives the DPA before the application message. The
     91application message is lost.
     92
     93This situation is actually quite possible because DPR/DPA messages are
      94very short, while application messages can be quite large and therefore
      95take much more time to deliver.
     96
     97I really cannot see a way to counter this effect by using the ordering
     98of the messages, except by applying a timer (state STATE_CLOSING_GRACE).
     99
     100However, this problem must be balanced with the fact that the message
      101that is lost will in many cases be sent again, as the failover mechanism
     102specifies.
     103*/
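
The ordered/unordered switch described in the comment above maps onto the flags of the SCTP socket API. As a rough sketch only (plain lksctp-tools calls, not freeDiameter's cnx abstraction; the helper and its name are illustrative), sending a serialized Diameter message with ordering enforced during OPEN_NEW, and unordered once both peers are in OPEN, could look like:

    #include <sys/types.h>
    #include <stdint.h>
    #include <arpa/inet.h>      /* htonl() */
    #include <netinet/sctp.h>   /* sctp_sendmsg(), SCTP_UNORDERED */

    #define DIAMETER_SCTP_PPID 46   /* IANA payload protocol identifier for Diameter */

    /* Illustrative helper: send one encoded Diameter message on an established
     * one-to-one SCTP socket.  'enforce_order' stays 1 until the initial DWR/DWA
     * exchange completes (STATE_OPEN_NEW), then drops to 0 (STATE_OPEN). */
    static int send_diam_msg(int sock, const uint8_t *buf, size_t len, int enforce_order)
    {
            uint32_t flags = enforce_order ? 0 : SCTP_UNORDERED;

            int ret = sctp_sendmsg(sock, buf, len,
                                   NULL, 0,                    /* association already set up */
                                   htonl(DIAMETER_SCTP_PPID),  /* ppid */
                                   flags,
                                   0,                          /* stream 0, as in steps (5)-(6) */
                                   0, 0);                      /* ttl, context */
            return (ret == (int)len) ? 0 : -1;
    }
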
     104
    38105/* The actual declaration of peer_state_str */
    39106DECLARE_STATE_STR();
     
    101168                return 0;
    102169        }
     170       
    103171        /* Insert in the active peers list */
    104172        CHECK_POSIX( pthread_rwlock_wrlock(&fd_g_activ_peers_rw) );
    105173        for (li = fd_g_activ_peers.next; li != &fd_g_activ_peers; li = li->next) {
    106174                struct fd_peer * next_p = (struct fd_peer *)li->o;
    107                 int cmp = strcmp(peer->p_hdr.info.pi_diamid, next_p->p_hdr.info.pi_diamid);
     175                int cmp = fd_os_cmp(peer->p_hdr.info.pi_diamid, peer->p_hdr.info.pi_diamidlen,
     176                                        next_p->p_hdr.info.pi_diamid, next_p->p_hdr.info.pi_diamidlen);
    108177                if (cmp < 0)
    109178                        break;
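
The strcmp() to fd_os_cmp() change above follows from the DiameterIdentity rework in this commit: identities now carry an explicit length (pi_diamidlen) instead of relying on NUL termination. The real fd_os_cmp() lives in libfdproto and its exact ordering rules are not shown here; as an illustrative sketch, a length-aware comparison with a compatible call shape could be:

    #include <string.h>
    #include <sys/types.h>

    /* Illustrative only -- same call shape as fd_os_cmp(ptr, len, ptr, len) above,
     * comparing the common prefix first, then breaking ties on length. */
    static int os_cmp_sketch(const void *s1, size_t l1, const void *s2, size_t l2)
    {
            int cmp = memcmp(s1, s2, (l1 < l2) ? l1 : l2);
            if (cmp != 0)
                    return cmp;
            return (l1 < l2) ? -1 : ((l1 > l2) ? 1 : 0);
    }
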
     
    115184        if (peer->p_cb) {
    116185                TRACE_DEBUG(FULL, "Calling add callback for peer %s", peer->p_hdr.info.pi_diamid);
    117                 (*peer->p_cb)(&peer->p_hdr.info, peer->p_cb_data);
     186                (*peer->p_cb)(&peer->p_hdr.info, peer->p_cb_data); /* TODO: do this in a separate detached thread? */
    118187                peer->p_cb = NULL;
    119188                peer->p_cb_data = NULL;
     
    178247}
    179248
     249/* Read state */
     250int fd_peer_get_state(struct peer_hdr *peer)
     251{
     252        int ret;
     253       
     254        struct fd_peer * p = (struct fd_peer *)peer;
     255       
     256        if (!CHECK_PEER(p))
     257                return -1;
     258       
     259        CHECK_POSIX_DO( pthread_mutex_lock(&p->p_state_mtx), return -1 );
     260        ret = p->p_state;
     261        CHECK_POSIX_DO( pthread_mutex_unlock(&p->p_state_mtx), return -1 );
     262       
     263        return ret;
     264}
     265
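
This accessor is the "more robust alternative" to the removed fd_cpu_flush_cache(): the peer state is now read and written under p_state_mtx, so the mutex itself provides the required memory ordering. A minimal usage sketch (assuming, as the rest of this diff suggests, that fd_peer_getstate() is a thin wrapper or macro around fd_peer_get_state()):

    /* Sketch of a caller taking a consistent snapshot of the peer state;
     * no explicit cache flush is needed any more. */
    int state = fd_peer_get_state(&peer->p_hdr);
    if (state == -1) {
            /* invalid peer object */
    } else if (state == STATE_OPEN) {
            /* safe to consider the peer for routing */
    }
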
    180266
    181267/* Change state */
     
    186272        TRACE_ENTRY("%p %d(%s)", peer, new_state, STATE_STR(new_state));
    187273        CHECK_PARAMS( CHECK_PEER(peer) );
    188         fd_cpu_flush_cache();
    189         old = peer->p_hdr.info.runtime.pir_state;
     274       
     275        old = fd_peer_getstate(peer);
    190276        if (old == new_state)
    191277                return 0;
     
    196282                        peer->p_hdr.info.pi_diamid);
    197283       
    198         peer->p_hdr.info.runtime.pir_state = new_state;
    199         fd_cpu_flush_cache();
     284       
     285        CHECK_POSIX( pthread_mutex_lock(&peer->p_state_mtx) );
     286        peer->p_state = new_state;
     287        CHECK_POSIX( pthread_mutex_unlock(&peer->p_state_mtx) );
    200288       
    201289        if (old == STATE_OPEN) {
     
    255343{
    256344        /* Move to CLOSED state: failover messages, stop OUT thread, unlink peer from active list */
    257         fd_cpu_flush_cache();
    258         if (peer->p_hdr.info.runtime.pir_state != STATE_ZOMBIE) {
     345        if (fd_peer_getstate(peer) != STATE_ZOMBIE) {
    259346                CHECK_FCT_DO( fd_psm_change_state(peer, STATE_CLOSED), /* continue */ );
    260347        }
     
    285372        struct fd_peer * peer = (struct fd_peer *)arg;
    286373        CHECK_PARAMS_DO( CHECK_PEER(peer), return );
    287         peer->p_hdr.info.runtime.pir_state = STATE_ZOMBIE;
    288         fd_cpu_flush_cache();
     374        CHECK_POSIX_DO( pthread_mutex_lock(&peer->p_state_mtx), );
     375        peer->p_state = STATE_ZOMBIE;
     376        CHECK_POSIX_DO( pthread_mutex_unlock(&peer->p_state_mtx), );
    289377        return;
    290378}
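
This function (its name falls outside the displayed hunk) is presumably the thread-cancellation cleanup handler that marks the peer ZOMBIE; it pairs with the pthread_cleanup_pop(1) /* set STATE_ZOMBIE */ visible further down in this diff. Schematically, with hypothetical names:

    #include <pthread.h>
    #include <stddef.h>

    /* Hypothetical name; the real handler body is the hunk above. */
    static void psm_cleanup_setstate(void * arg)
    {
            /* ... set peer->p_state = STATE_ZOMBIE under p_state_mtx, as above ... */
            (void)arg;
    }

    static void * psm_thread_body(void * arg)   /* simplified PSM thread skeleton */
    {
            pthread_cleanup_push(psm_cleanup_setstate, arg);

            /* ... PSM event loop: fd_event_timedget(), state transitions ... */

            pthread_cleanup_pop(1);  /* pop(1) runs the handler: peer ends up STATE_ZOMBIE */
            return NULL;
    }
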
     
    298386        size_t ev_sz;
    299387        void * ev_data;
     388        int cur_state;
    300389       
    301390        CHECK_PARAMS_DO( CHECK_PEER(peer), ASSERT(0) );
     
    306395        {
    307396                char buf[48];
    308                 sprintf(buf, "PSM/%.*s", (int)sizeof(buf) - 5, peer->p_hdr.info.pi_diamid);
     397                snprintf(buf, sizeof(buf), "PSM/%s", peer->p_hdr.info.pi_diamid);
    309398                fd_log_threadname ( buf );
    310399        }
    311400       
    312401        /* The state machine starts in CLOSED state */
    313         peer->p_hdr.info.runtime.pir_state = STATE_CLOSED;
     402        CHECK_POSIX_DO( pthread_mutex_lock(&peer->p_state_mtx), goto psm_end );
     403        peer->p_state = STATE_CLOSED;
     404        CHECK_POSIX_DO( pthread_mutex_unlock(&peer->p_state_mtx), goto psm_end );
    314405
    315406        /* Wait that the PSM are authorized to start in the daemon */
     
    326417        /* Get next event */
    327418        TRACE_DEBUG(FULL, "'%s' in state '%s' waiting for next event.",
    328                         peer->p_hdr.info.pi_diamid, STATE_STR(peer->p_hdr.info.runtime.pir_state));
     419                        peer->p_hdr.info.pi_diamid, STATE_STR(fd_peer_getstate(peer)));
    329420        CHECK_FCT_DO( fd_event_timedget(peer->p_events, &peer->p_psm_timer, FDEVP_PSM_TIMEOUT, &event, &ev_sz, &ev_data), goto psm_end );
     421       
     422        cur_state = fd_peer_getstate(peer);
     423        if (cur_state == -1)
     424                goto psm_end;
     425       
    330426        TRACE_DEBUG(FULL, "'%s'\t<-- '%s'\t(%p,%zd)\t'%s'",
    331                         STATE_STR(peer->p_hdr.info.runtime.pir_state),
     427                        STATE_STR(cur_state),
    332428                        fd_pev_str(event), ev_data, ev_sz,
    333429                        peer->p_hdr.info.pi_diamid);
     
    336432
    337433        /* The following states are impossible */
    338         ASSERT( peer->p_hdr.info.runtime.pir_state != STATE_NEW );
    339         ASSERT( peer->p_hdr.info.runtime.pir_state != STATE_ZOMBIE );
    340         ASSERT( peer->p_hdr.info.runtime.pir_state != STATE_OPEN_HANDSHAKE ); /* because it should exist only between two loops */
     434        ASSERT( cur_state != STATE_NEW );
     435        ASSERT( cur_state != STATE_ZOMBIE );
     436        ASSERT( cur_state != STATE_OPEN_HANDSHAKE ); /* because it should exist only between two loops */
    341437
    342438        /* Purge invalid events */
    343439        if (!CHECK_PEVENT(event)) {
    344440                TRACE_DEBUG(INFO, "Invalid event received in PSM '%s' : %d", peer->p_hdr.info.pi_diamid, event);
     441                ASSERT(0); /* we should investigate this situation */
    345442                goto psm_loop;
    346443        }
     
    354451        /* Requests to terminate the peer object */
    355452        if (event == FDEVP_TERMINATE) {
    356                 switch (peer->p_hdr.info.runtime.pir_state) {
     453                switch (cur_state) {
    357454                        case STATE_OPEN:
     455                        case STATE_OPEN_NEW:
    358456                        case STATE_REOPEN:
    359                                 /* We cannot just close the conenction, we have to send a DPR first */
     457                                /* We cannot just close the connection, we have to send a DPR first */
    360458                                CHECK_FCT_DO( fd_p_dp_initiate(peer, ev_data), goto psm_end );
    361459                                goto psm_loop;
     
    363461                        /*     
    364462                        case STATE_CLOSING:
     463                        case STATE_CLOSING_GRACE:
    365464                        case STATE_WAITCNXACK:
    366465                        case STATE_WAITCNXACK_ELEC:
     
    380479                struct msg_hdr * hdr;
    381480               
    382                 /* If the current state does not allow receiving messages, just drop it */
    383                 if (peer->p_hdr.info.runtime.pir_state == STATE_CLOSED) {
    384                         TRACE_DEBUG(FULL, "Purging message in queue while in CLOSED state (%zdb)", ev_sz);
    385                         free(ev_data);
    386                         goto psm_loop;
    387                 }
    388                
    389481                /* Parse the received buffer */
    390482                CHECK_FCT_DO( fd_msg_parse_buffer( (void *)&ev_data, ev_sz, &msg),
     
    396488                        } );
    397489               
     490                /* If the current state does not allow receiving messages, just drop it */
     491                if (cur_state == STATE_CLOSED) {
     492                        /* In such case, just discard the message */
     493                        fd_msg_log( FD_MSG_LOG_DROPPED, msg, "Purged from peer '%s''s queue (CLOSED state).", peer->p_hdr.info.pi_diamid );
     494                        fd_msg_free(msg);
     495                        goto psm_loop;
     496                }
     497               
    398498                /* Log incoming message */
    399                 fd_msg_log( FD_MSG_LOG_RECEIVED, msg, "Received %zdb from '%s'", ev_sz, peer->p_hdr.info.pi_diamid );
     499                fd_msg_log( FD_MSG_LOG_RECEIVED, msg, "Received %zdb from '%s' (%s)", ev_sz, peer->p_hdr.info.pi_diamid, STATE_STR(cur_state) );
    400500       
    401501                /* Extract the header */
     
    417517                }
    418518               
     519                if (cur_state == STATE_OPEN_NEW) {
     520                        /* OK, we have received something, so the connection is supposedly now in OPEN state at the remote site */
     521                        fd_psm_change_state(peer, STATE_OPEN );
     522                }
     523               
    419524                /* Now handle non-link-local messages */
    420525                if (fd_msg_is_routable(msg)) {
    421                         switch (peer->p_hdr.info.runtime.pir_state) {
     526                        switch (cur_state) {
    422527                                /* To maximize compatibility -- should not be a security issue here */
    423528                                case STATE_REOPEN:
    424529                                case STATE_SUSPECT:
    425530                                case STATE_CLOSING:
     531                                case STATE_CLOSING_GRACE:
    426532                                        TRACE_DEBUG(FULL, "Accepted a message while not in OPEN state... ");
    427533                                /* The standard situation : */
     534                                case STATE_OPEN_NEW:
    428535                                case STATE_OPEN:
    429536                                        /* We received a valid routable message, update the expiry timer */
     
    431538
    432539                                        /* Set the message source and add the Route-Record */
    433                                         CHECK_FCT_DO( fd_msg_source_set( msg, peer->p_hdr.info.pi_diamid, 1, fd_g_config->cnf_dict ), goto psm_end);
     540                                        CHECK_FCT_DO( fd_msg_source_set( msg, peer->p_hdr.info.pi_diamid, peer->p_hdr.info.pi_diamidlen, 1, fd_g_config->cnf_dict ), goto psm_end);
    434541
    435542                                        /* Requeue to the global incoming queue */
     
    437544
    438545                                        /* Update the peer timer (only in OPEN state) */
    439                                         if ((peer->p_hdr.info.runtime.pir_state == STATE_OPEN) && (!peer->p_flags.pf_dw_pending)) {
     546                                        if ((cur_state == STATE_OPEN) && (!peer->p_flags.pf_dw_pending)) {
    440547                                                fd_psm_next_timeout(peer, 1, peer->p_hdr.info.config.pic_twtimer ?: fd_g_config->cnf_timer_tw);
    441548                                        }
     
    449556                                default:
    450557                                        /* In such case, just discard the message */
    451                                         fd_msg_log( FD_MSG_LOG_DROPPED, msg, "Received from peer '%s' while connection was not in OPEN state.", peer->p_hdr.info.pi_diamid );
      558                                        fd_msg_log( FD_MSG_LOG_DROPPED, msg, "Received from peer '%s' while connection was in state %s (not OPEN).", peer->p_hdr.info.pi_diamid, STATE_STR(cur_state) );
    452559                                        fd_msg_free(msg);
    453560                        }
     
    485592                        case CC_DISCONNECT_PEER:
    486593                                CHECK_FCT_DO( fd_p_dp_handle(&msg, (hdr->msg_flags & CMD_FLAG_REQUEST), peer), goto psm_reset );
    487                                 if (peer->p_hdr.info.runtime.pir_state == STATE_CLOSING)
     594                                if (fd_peer_getstate(peer) == STATE_CLOSING)
    488595                                        goto psm_end;
     596
    489597                                break;
    490598                       
     
    494602                       
    495603                        default:
    496                                 /* Unknown / unexpected / invalid message */
     604                                /* Unknown / unexpected / invalid message -- but validated by our dictionary */
    497605                                TRACE_DEBUG(INFO, "Invalid non-routable command received: %u.", hdr->msg_code);
    498606                                if (hdr->msg_flags & CMD_FLAG_REQUEST) {
     
    502610
    503611                                                /* Set the error code */
    504                                                 CHECK_FCT_DO( fd_msg_rescode_set(msg, "DIAMETER_INVALID_HDR_BITS", NULL, NULL, 1 ), break );
     612                                                CHECK_FCT_DO( fd_msg_rescode_set(msg, "DIAMETER_COMMAND_UNSUPPORTED", "Or maybe the P-bit or application Id are erroneous.", NULL, 1 ), break );
    505613
    506614                                                /* Send the answer */
     
    509617                                } else {
    510618                                        /* We did ASK for it ??? */
    511                                         fd_log_debug("Invalid PXY flag in answer header ?\n");
     619                                        TRACE_DEBUG(INFO, "Received answer with erroneous 'is_routable' result...");
    512620                                }
    513621                               
     
    531639        /* The connection object is broken */
    532640        if (event == FDEVP_CNX_ERROR) {
    533                 switch (peer->p_hdr.info.runtime.pir_state) {
     641                switch (cur_state) {
    534642                        case STATE_WAITCNXACK_ELEC:
    535643                                /* Abort the initiating side */
     
    541649                        case STATE_WAITCEA:
    542650                        case STATE_OPEN:
     651                        case STATE_OPEN_NEW:
    543652                        case STATE_REOPEN:
    544653                        case STATE_WAITCNXACK:
     
    558667                                goto psm_end;
    559668                               
     669                        case STATE_CLOSING_GRACE:
     670                                if (peer->p_flags.pf_localterm) /* initiated here */
     671                                        goto psm_end;
     672                               
     673                                fd_psm_cleanup(peer, 0);
     674                               
     675                                /* Reset the timer for next connection attempt */
     676                                fd_psm_next_timeout(peer, 1, fd_p_dp_newdelay(peer));
     677                                goto psm_loop;
    560678                }
    561679                goto psm_loop;
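
The STATE_CLOSING_GRACE branches added here (and in the FDEVP_PSM_TIMEOUT handler below) implement the grace period announced in the header comment. How the state is entered is not part of this changeset (presumably the DPR/DPA handling in p_dp.c); purely as a hypothetical sketch, the peer answering a DPR might arm the grace window along these lines, reusing helpers seen in this file:

    /* Hypothetical sketch only -- the real transition is outside this diff. */
    CHECK_FCT( fd_psm_change_state(peer, STATE_CLOSING_GRACE) );

    /* Let in-flight application messages drain before tearing the link down.
     * GRACE_TIMEOUT is an assumed constant; the second argument mirrors the
     * fd_psm_next_timeout() calls elsewhere in this file. */
    fd_psm_next_timeout(peer, 1, GRACE_TIMEOUT);
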
     
    616734                peer->p_ini_thr = (pthread_t)NULL;
    617735               
    618                 switch (peer->p_hdr.info.runtime.pir_state) {
     736                switch (cur_state) {
    619737                        case STATE_WAITCNXACK_ELEC:
    620738                        case STATE_WAITCNXACK:
     
    624742                        default:
    625743                                /* Just abort the attempt and continue */
    626                                 TRACE_DEBUG(FULL, "Connection attempt successful but current state is %s, closing...", STATE_STR(peer->p_hdr.info.runtime.pir_state));
     744                                TRACE_DEBUG(FULL, "Connection attempt successful but current state is %s, closing... (too slow?)", STATE_STR(cur_state));
    627745                                fd_cnx_destroy(cnx);
    628746                }
     
    638756                peer->p_ini_thr = (pthread_t)NULL;
    639757               
    640                 switch (peer->p_hdr.info.runtime.pir_state) {
     758                switch (cur_state) {
    641759                        case STATE_WAITCNXACK_ELEC:
    642760                                /* Abort the initiating side */
     
    653771                        default:
    654772                                /* Just ignore */
    655                                 TRACE_DEBUG(FULL, "Connection attempt failed but current state is %s, ignoring...", STATE_STR(peer->p_hdr.info.runtime.pir_state));
     773                                TRACE_DEBUG(FULL, "Connection attempt failed but current state is %s, ignoring...", STATE_STR(cur_state));
    656774                }
    657775               
     
    661779        /* The timeout for the current state has been reached */
    662780        if (event == FDEVP_PSM_TIMEOUT) {
    663                 switch (peer->p_hdr.info.runtime.pir_state) {
     781                switch (cur_state) {
    664782                        case STATE_OPEN:
    665783                        case STATE_REOPEN:
     784                        case STATE_OPEN_NEW:
    666785                                CHECK_FCT_DO( fd_p_dw_timeout(peer), goto psm_end );
    667786                                goto psm_loop;
     
    676795                                /* Mark the connection problem */
    677796                                peer->p_flags.pf_cnx_pb = 1;
    678                                
    679797                        case STATE_CLOSING:
    680798                        case STATE_WAITCNXACK:
     
    683801                                fd_psm_next_timeout(peer, 1, peer->p_hdr.info.config.pic_tctimer ?: fd_g_config->cnf_timer_tc);
    684802                                goto psm_reset;
     803                               
     804                        case STATE_CLOSING_GRACE:
     805                                /* The grace period is completed, now close */
     806                                if (peer->p_flags.pf_localterm)
     807                                        goto psm_end;
     808                               
     809                                fd_psm_cleanup(peer, 0);
     810                                /* Reset the timer for next connection attempt */
     811                                fd_psm_next_timeout(peer, 1, fd_p_dp_newdelay(peer));
     812                                goto psm_loop;
    685813                               
    686814                        case STATE_WAITCNXACK_ELEC:
     
    697825       
    698826        /* Default action : the handling has not yet been implemented. [for debug only] */
    699         TRACE_DEBUG(INFO, "Missing handler in PSM for '%s'\t<-- '%s'", STATE_STR(peer->p_hdr.info.runtime.pir_state), fd_pev_str(event));
     827        TRACE_DEBUG(INFO, "Missing handler in PSM for '%s'\t<-- '%s'", STATE_STR(cur_state), fd_pev_str(event));
    700828psm_reset:
    701829        if (peer->p_flags.pf_delete)
     
    707835        fd_psm_cleanup(peer, 1);
    708836        TRACE_DEBUG(INFO, "'%s'\t-> STATE_ZOMBIE (terminated)\t'%s'",
    709                         STATE_STR(peer->p_hdr.info.runtime.pir_state),
     837                        STATE_STR(fd_peer_getstate(peer)),
    710838                        peer->p_hdr.info.pi_diamid);
    711839        pthread_cleanup_pop(1); /* set STATE_ZOMBIE */
    712         fd_cpu_flush_cache();
    713840        peer->p_psm = (pthread_t)NULL;
    714841        pthread_detach(pthread_self());
     
    726853       
    727854        /* Check the peer and state are OK */
    728         CHECK_PARAMS( CHECK_PEER(peer) && (peer->p_hdr.info.runtime.pir_state == STATE_NEW) );
     855        CHECK_PARAMS( fd_peer_getstate(peer) == STATE_NEW );
    729856       
    730857        /* Create the FIFO for events */
     
    744871        CHECK_PARAMS( CHECK_PEER(peer) );
    745872       
    746         fd_cpu_flush_cache();
    747         if (peer->p_hdr.info.runtime.pir_state != STATE_ZOMBIE) {
     873        if (fd_peer_getstate(peer) != STATE_ZOMBIE) {
    748874                CHECK_FCT( fd_event_send(peer->p_events, FDEVP_TERMINATE, 0, reason) );
    749875        } else {