2544 lines
94 KiB
C
2544 lines
94 KiB
C
#ifdef __TANDEM
|
|
#pragma columns 79
|
|
#pragma page "srgpsm.c - T9050 - Regroup Module state machine routines"
|
|
#endif
|
|
|
|
/* @@@ START COPYRIGHT @@@
|
|
** Tandem Confidential: Need to Know only
|
|
** Copyright (c) 1995, Tandem Computers Incorporated
|
|
** Protected as an unpublished work.
|
|
** All Rights Reserved.
|
|
**
|
|
** The computer program listings, specifications, and documentation
|
|
** herein are the property of Tandem Computers Incorporated and shall
|
|
** not be reproduced, copied, disclosed, or used in whole or in part
|
|
** for any reason without the prior express written permission of
|
|
** Tandem Computers Incorporated.
|
|
**
|
|
** @@@ END COPYRIGHT @@@
|
|
**/
|
|
|
|
/*---------------------------------------------------------------------------
|
|
* This file (srgpsm.c) contains regroup state machine routines.
|
|
*---------------------------------------------------------------------------*/
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif /* __cplusplus */
|
|
|
|
|
|
#include <wrgp.h>
|
|
|
|
|
|
/*---------- arbitration algorithm ------------ */
|
|
|
|
/* Quorum arbitration timeout, expressed in seconds. */
DWORD MmQuorumArbitrationTimeout = CLUSTER_QUORUM_DEFAULT_ARBITRATION_TIMEOUT;

/*
 * Arbitration "equalizer", expressed in seconds.  DiskArbitrationThread
 * pads a winning arbitration that finished faster than this value.
 */
DWORD MmQuorumArbitrationEqualizer = 7;

/* Arbitration timeout converted from seconds to regroup ticks (tick == 300ms). */
#define RGP_ARBITRATION_TIMEOUT ((MmQuorumArbitrationTimeout * 100)/30)

/* Duration (seconds) that a successful arbitration is stretched to. */
#define AVERAGE_ARBITRATION_TIME_IN_SECONDS (MmQuorumArbitrationEqualizer)
|
|
|
|
void enter_first_cleanup_stage();
|
|
void regroup_restart();
|
|
int ClusterEmpty(cluster_t c);
|
|
|
|
DWORD
|
|
DiskArbitrationThread(
|
|
IN LPVOID param
|
|
) ;
|
|
|
|
_priv _resident static int
|
|
regroup_test_arbitrate_advance()
|
|
{
|
|
cluster_t temp;
|
|
int orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster);
|
|
int current_numnodes = ClusterNumMembers(rgp->rgppkt.pruning_result);
|
|
|
|
if( orig_numnodes == current_numnodes ) {
|
|
return 1;
|
|
}
|
|
//
|
|
// If somebody entered stage4 then our group owns the quorum
|
|
//
|
|
ClusterIntersection(
|
|
temp,
|
|
rgp->rgppkt.knownstage4,
|
|
rgp->rgppkt.pruning_result
|
|
);
|
|
|
|
return ClusterNumMembers(temp) != 0;
|
|
}
|
|
|
|
_priv _resident static int
|
|
regroup_start_arbitrate()
|
|
{
|
|
int orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster);
|
|
int current_numnodes = ClusterNumMembers(rgp->rgppkt.pruning_result);
|
|
|
|
if( orig_numnodes == current_numnodes ) {
|
|
enter_first_cleanup_stage();
|
|
return 0; // No Arbitration needed. Proceed to clean up stage //
|
|
}
|
|
else {
|
|
cluster_t arbitrators;
|
|
int n_arbitrators;
|
|
node_t arbitrator;
|
|
HANDLE thread;
|
|
DWORD threadId;
|
|
ULONG epoch;
|
|
|
|
RGP_LOCK;
|
|
|
|
epoch = rgp->OS_specific_control.EventEpoch;
|
|
|
|
if(rgp->arbitration_started) {
|
|
RGP_UNLOCK;
|
|
return 1; // stay in this stage for awhile
|
|
}
|
|
|
|
rgp->arbitration_ticks = 0;
|
|
rgp->arbitration_started = 1;
|
|
|
|
RGP_UNLOCK;
|
|
|
|
ClusterIntersection(
|
|
arbitrators,
|
|
rgp->rgppkt.pruning_result,
|
|
rgp->rgppkt.quorumowner
|
|
);
|
|
|
|
n_arbitrators = ClusterNumMembers(arbitrators);
|
|
|
|
if(n_arbitrators == 0) {
|
|
//
|
|
// If there are no quorum owners in this group //
|
|
// Let's take the guy with the lowest id //
|
|
//
|
|
arbitrator = rgp_select_tiebreaker(rgp->rgppkt.pruning_result);
|
|
} else {
|
|
//
|
|
// Otherwise we will take the quorum owner guy
|
|
// with the lowest id
|
|
//
|
|
arbitrator = rgp_select_tiebreaker(arbitrators);
|
|
|
|
if(n_arbitrators > 1) {
|
|
RGP_TRACE( "RGP !!! More than one quorum owner",
|
|
EXT_NODE(arbitrator), /* TRACE */
|
|
GetCluster( rgp->rgpinfo.cluster ), /* TRACE */
|
|
GetCluster( rgp->rgppkt.pruning_result ),/* TRACE */
|
|
GetCluster( rgp->rgppkt.knownstage2 ) ); /* TRACE */
|
|
// Do we need to kill all other arbitrators?
|
|
// No.
|
|
// ClusterDelete(arbitrators, arbitrator);
|
|
// ClusterUnion(
|
|
// rgp->poison_targets,
|
|
// rgp->poison_targets,
|
|
// arbitrators
|
|
// );
|
|
// rgp_broadcast(RGP_UNACK_POISON);
|
|
}
|
|
}
|
|
|
|
rgp->tiebreaker = arbitrator;
|
|
|
|
//
|
|
// Now we have an arbitrating node
|
|
// We will run a thread that will run arbitration algorithm
|
|
//
|
|
|
|
RGP_TRACE( "RGP Arbitration Delegated to",
|
|
EXT_NODE(arbitrator), /* TRACE */
|
|
GetCluster( rgp->rgpinfo.cluster ), /* TRACE */
|
|
GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */
|
|
GetCluster( rgp->rgppkt.knownstage2 ) ); /* TRACE */
|
|
|
|
// Fix Bug #460991
|
|
// regroup_restart on stage 4 or later will reset ArbitratingNode
|
|
// and if all the nodes are present after restart ApproxArbitrationWinner
|
|
// will be not set properly. Assign it here.
|
|
rgp->OS_specific_control.ApproxArbitrationWinner =
|
|
rgp->OS_specific_control.ArbitratingNode = (DWORD)EXT_NODE(arbitrator);
|
|
|
|
if(arbitrator != rgp->mynode) {
|
|
return 1;
|
|
}
|
|
|
|
thread = CreateThread( NULL, // security attributes
|
|
0, // stack_size = default
|
|
DiskArbitrationThread,
|
|
ULongToPtr(epoch),
|
|
0, // runs immediately
|
|
&threadId );
|
|
if(thread == NULL) {
|
|
//
|
|
// Force Others to regroup //
|
|
//
|
|
RGP_LOCK;
|
|
|
|
rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->mynode) );
|
|
|
|
RGP_UNLOCK;
|
|
|
|
//
|
|
// Kill this node
|
|
//
|
|
RGP_ERROR(RGP_ARBITRATION_FAILED);
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
CloseHandle(thread);
|
|
}
|
|
return TRUE;
|
|
}
|
|
|
|
DWORD
|
|
DiskArbitrationThread(
|
|
IN LPVOID param
|
|
)
|
|
{
|
|
cluster_t current_participants;
|
|
DWORD status;
|
|
int participant_count;
|
|
int delay;
|
|
ULONG_PTR startingEpoch = (ULONG_PTR) param;
|
|
BOOL EpochsEqual;
|
|
int orig_numnodes;
|
|
int current_numnodes;
|
|
LONGLONG Time1, Time2;
|
|
|
|
ClusterCopy(current_participants, rgp->rgppkt.pruning_result);
|
|
orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster);
|
|
current_numnodes = ClusterNumMembers(current_participants);
|
|
|
|
RGP_LOCK;
|
|
|
|
EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );
|
|
|
|
RGP_UNLOCK;
|
|
|
|
if(!EpochsEqual)
|
|
return 0;
|
|
|
|
delay = (orig_numnodes+1)/2 - current_numnodes;
|
|
|
|
if(delay < 0) delay = 0;
|
|
|
|
Sleep(delay * 6000);
|
|
|
|
RGP_LOCK;
|
|
|
|
EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );
|
|
if (EpochsEqual) {
|
|
rgp->OS_specific_control.ArbitrationInProgress += 1;
|
|
}
|
|
|
|
RGP_UNLOCK;
|
|
|
|
if(!EpochsEqual)
|
|
return 0;
|
|
|
|
GetSystemTimeAsFileTime((LPFILETIME)&Time1);
|
|
status = (*(rgp->OS_specific_control.QuorumCallback))();
|
|
GetSystemTimeAsFileTime((LPFILETIME)&Time2);
|
|
|
|
if (status != 0
|
|
&& startingEpoch == rgp->OS_specific_control.EventEpoch)
|
|
{
|
|
// If we won the arbitration and we are in the same epoch (approx check)
|
|
// we need to figure out whether we need to slow down a little
|
|
|
|
Time2 -= Time1;
|
|
|
|
// Convert to seconds
|
|
|
|
Time2 = Time2 / 10 / 1000 / 1000;
|
|
//
|
|
// [HACKHACK] GorN Oct/30/1999
|
|
// We had a weird timejump in the middle of the arbitration
|
|
// Arbitration was completed before it started, we slept for
|
|
// too long and regroup timed us out. Let's guard against it.
|
|
//
|
|
if ( (Time2 >= 0)
|
|
&& (Time2 < AVERAGE_ARBITRATION_TIME_IN_SECONDS) )
|
|
{
|
|
|
|
//
|
|
// Don't need to be better than the average
|
|
// If we are so fast, let's slow down
|
|
//
|
|
|
|
Time2 = AVERAGE_ARBITRATION_TIME_IN_SECONDS - Time2;
|
|
|
|
RGP_TRACE( "RGP sleeping",
|
|
(ULONG)Time2, /* TRACE */
|
|
0, /* TRACE */
|
|
0, /* TRACE */
|
|
0 ); /* TRACE */
|
|
Sleep( (ULONG)(Time2 * 1000) );
|
|
}
|
|
}
|
|
|
|
|
|
RGP_LOCK;
|
|
|
|
rgp->OS_specific_control.ArbitrationInProgress -= 1;
|
|
|
|
EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );
|
|
|
|
if(!EpochsEqual) {
|
|
RGP_UNLOCK;
|
|
return 0;
|
|
}
|
|
|
|
if(status) {
|
|
//
|
|
// We own the quorum device
|
|
// Let's proceed to the next stage
|
|
//
|
|
enter_first_cleanup_stage();
|
|
RGP_UNLOCK;
|
|
//
|
|
// All the rest will see that we are in cleanup stage and
|
|
// will proceed to it too
|
|
//
|
|
} else {
|
|
//
|
|
// Force Others to regroup //
|
|
//
|
|
rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->mynode) );
|
|
RGP_UNLOCK;
|
|
|
|
//
|
|
// Kill this node
|
|
//
|
|
RGP_ERROR(RGP_ARBITRATION_FAILED);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/************************************************************************
|
|
* rgp_check_packet
|
|
* rgp_print_packet
|
|
* =================
|
|
*
|
|
* Description:
|
|
*
|
|
* Forward declarations of functions used in rgp_sanity_check macro
|
|
*
|
|
************************************************************************/
|
|
/* Prints the packet together with a label and the check code. */
void rgp_print_packet(rgp_pkt_t *pkt, char *label, int code);

/* Checks a packet; a nonzero result makes rgp_sanity_check print it. */
int  rgp_check_packet(rgp_pkt_t *pkt);
|
|
|
|
/************************************************************************
|
|
* rgp_sanity_check
|
|
* =================
|
|
*
|
|
* Description:
|
|
*
|
|
* This macro prints RGP packet if it has unreasonable values in
|
|
* powerfail, knownstages, pruning_result, and connectivity_matrix fields.
|
|
*
|
|
* Parameters:
|
|
*
|
|
* rgp_pkt_t* pkt -
|
|
* packet to be checked
|
|
* char* label -
|
|
* label that will be printed together with a packet
|
|
*
|
|
* Returns:
|
|
*
|
|
* VOID
|
|
*
|
|
************************************************************************/
|
|
|
|
/*
 * rgp_sanity_check(pkt, label)
 *
 * Runs rgp_check_packet on pkt and, when it returns a nonzero code, prints
 * the packet with rgp_print_packet(pkt, label, code).
 *
 * The packet argument is captured in a local so that it is evaluated
 * exactly once, even though it is handed to two functions; the label is
 * evaluated only when the packet is printed.
 */
#define rgp_sanity_check(pkt_, label_)                                      \
    do {                                                                    \
        rgp_pkt_t* rgp_sanity_pkt_  = (pkt_);                               \
        int        rgp_sanity_code_ = rgp_check_packet(rgp_sanity_pkt_);    \
        if ( rgp_sanity_code_ ) {                                           \
            rgp_print_packet(rgp_sanity_pkt_, (label_), rgp_sanity_code_);  \
        }                                                                   \
    } while ( 0 )
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
|
|
/************************************************************************
|
|
* split_brain_avoidance_algorithm
|
|
* ===============================
|
|
*
|
|
* Description:
|
|
*
|
|
* This algorithm ensures that, after a regroup incident completes,
|
|
* at most one group of nodes will survive regardless of connectivity
|
|
* failures.
|
|
*
|
|
* Parameters:
|
|
*
|
|
* None
|
|
*
|
|
* Returns:
|
|
*
|
|
* void - no return value; The algorithm results in either this node
|
|
* halting (with the RGP_AVOID_SPLIT_BRAIN halt code) or this group
|
|
* being the only group that survives.
|
|
*
|
|
* Algorithm:
|
|
*
|
|
* The algorithm is described in detail in the Sierra Tech Memo S.84,
|
|
* "Modifications in Regroup Algorithm for Sierra".
|
|
*
|
|
* The algorithm looks at the set of nodes currently visible from the
|
|
* local cluster and compares it to the set of nodes alive before
|
|
* the regroup incident started (outerscreen). The decision to survive
|
|
* or halt depends on the number of nodes in the current group compared
|
|
* to the number of nodes in the original group.
|
|
*
|
|
* Case 1:
|
|
* If the current group contains > half the original number, this
|
|
* group survives.
|
|
*
|
|
* Case 2:
|
|
* If the current group contains < half the original number, this
|
|
* node (and group) halts.
|
|
*
|
|
* Case 3:
|
|
* If the current group contains exactly half the original number AND
|
|
* the current group has at least two members, then this group
|
|
* survives if and only if it contains the tie-breaker node (selected
|
|
* when the cluster is formed and after each regroup incident).
|
|
*
|
|
* Case 4:
|
|
* If the current group contains exactly half the original number AND
|
|
* the current group has exactly one member, then we will call the
|
|
* QuorumCallback procedure to check if the Quorum Disk is accessible
|
|
* from this node. If the procedure returns value TRUE we survive;
|
|
* else we halt.
|
|
*
|
|
*
|
|
************************************************************************/
|
|
_priv _resident static void
|
|
split_brain_avoidance_algorithm()
|
|
{
|
|
int orig_numnodes, current_numnodes;
|
|
|
|
RGP_TRACE( "RGP SpltBrainAlg",
|
|
EXT_NODE(rgp->tiebreaker), /* TRACE */
|
|
GetCluster( rgp->rgpinfo.cluster ), /* TRACE */
|
|
GetCluster( rgp->outerscreen ), /* TRACE */
|
|
GetCluster( rgp->rgppkt.knownstage2 ) ); /* TRACE */
|
|
|
|
/* Sanity checks:
|
|
* 1. The current set of nodes must be a subset of the original set
|
|
* of nodes.
|
|
* 2. My node must be in the current set. This was checked
|
|
* when stage2 was entered. No need to check again.
|
|
*/
|
|
if (!ClusterSubsetOf(rgp->rgpinfo.cluster, rgp->rgppkt.knownstage2))
|
|
RGP_ERROR(RGP_INTERNAL_ERROR);
|
|
|
|
orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster);
|
|
current_numnodes = ClusterNumMembers(rgp->rgppkt.knownstage2);
|
|
|
|
if (orig_numnodes == current_numnodes)
|
|
/* All nodes are alive. No split brain possibility. */
|
|
return;
|
|
|
|
else if (orig_numnodes == 2) /* Special 2-node case */
|
|
{
|
|
if ((*(rgp->OS_specific_control.QuorumCallback))())
|
|
return; /* we have access to Quorum disk. We survive. */
|
|
else {
|
|
#if defined( NT )
|
|
ClusnetHalt( NmClusnetHandle );
|
|
#endif
|
|
RGP_ERROR(RGP_AVOID_SPLIT_BRAIN);
|
|
}
|
|
} /* Special 2-node case */
|
|
|
|
else /* Multi (>2) node case */
|
|
{
|
|
if ((current_numnodes << 1) > orig_numnodes)
|
|
/* Our group has more than half the nodes => we are the majority.
|
|
* We can survive. Other group(s) will kill themselves.
|
|
*/
|
|
return;
|
|
else if ((current_numnodes << 1) < orig_numnodes)
|
|
/* Our group has less than half the nodes => there may be a
|
|
* larger group alive. We must halt and allow that group to
|
|
* survive.
|
|
*/
|
|
RGP_ERROR(RGP_AVOID_SPLIT_BRAIN);
|
|
else
|
|
{
|
|
/* Our group has exactly half the number of processors;
|
|
* We survive if we contain the tie-breaker node and halt otherwise.
|
|
*/
|
|
if (ClusterMember(rgp->rgppkt.knownstage2, rgp->tiebreaker))
|
|
return;
|
|
else
|
|
RGP_ERROR(RGP_AVOID_SPLIT_BRAIN);
|
|
}
|
|
} /* Multi (>2) node case */
|
|
|
|
}
|
|
|
|
|
|
/************************************************************************
|
|
* regroup_restart
|
|
* ===============
|
|
*
|
|
* Description:
|
|
*
|
|
* Starts a new regroup incident.
|
|
*
|
|
* Parameters:
|
|
*
|
|
* None
|
|
*
|
|
* Returns:
|
|
*
|
|
* void - no return value
|
|
*
|
|
* Algorithm:
|
|
*
|
|
* Sets the regroup state to RGP_ACTIVATED, pauses all IO and
|
|
* initializes the stage masks and connectivity matrix.
|
|
*
|
|
************************************************************************/
|
|
_priv _resident static void
|
|
regroup_restart()
|
|
{
|
|
cluster_t old_ignorescreen;
|
|
UnpackIgnoreScreen(&rgp->rgppkt, old_ignorescreen);
|
|
|
|
RGP_TRACE( "RGP (re)starting",
|
|
rgp->rgppkt.seqno, /* TRACE */
|
|
rgp->rgppkt.reason, /* TRACE */
|
|
rgp->rgppkt.activatingnode, /* TRACE */
|
|
rgp->rgppkt.causingnode ); /* TRACE */
|
|
|
|
RGP_TRACE( "RGP masks ",
|
|
RGP_MERGE_TO_32( rgp->outerscreen, /* TRACE */
|
|
rgp->innerscreen ), /* TRACE */
|
|
RGP_MERGE_TO_32( rgp->rgppkt.knownstage1, /* TRACE */
|
|
rgp->rgppkt.knownstage2 ), /* TRACE */
|
|
RGP_MERGE_TO_32( rgp->rgppkt.knownstage3, /* TRACE */
|
|
rgp->rgppkt.knownstage4 ), /* TRACE */
|
|
RGP_MERGE_TO_32( rgp->rgppkt.knownstage5, /* TRACE */
|
|
rgp->rgppkt.pruning_result ) ); /* TRACE */
|
|
|
|
/* We are about to start a new pass of the regroup algorithm.
|
|
* This does not necessarily mean we have finished the previous
|
|
* pass; i.e., in an abort situation we may be starting over.
|
|
* This may occur when some other node fails during the current
|
|
* pass through the algorithm leaving us hung up at one of the
|
|
* intermediate stages.
|
|
*/
|
|
|
|
//
|
|
// GN. When we do MM_LEAVE. Our state is COLDLOADED.
|
|
// Bailing out of regroup_restart here would prevent us from
|
|
// forming a regroup packet that would initate a banishing regroup incident
|
|
//
|
|
|
|
/* To avoid split brained nodes from corrupting data in storage
|
|
* devices, we request the transport subsystem to hold all IO requests
|
|
* in a queue and not transfer them over SNet. We will allow IO to
|
|
* be resumed when regroup can guarantee that there can no longer be
|
|
* split brains. This will be done when the final group is determined
|
|
* and regroup enters the RGP_PHASE1_CLEANUP stage.
|
|
*/
|
|
|
|
rgp_hold_all_io();
|
|
|
|
/* The following is a bit of history from the NSK regroup algorithm from
|
|
* pre-Sierra systems based on the InterProcessor Bus (IPB). Some of
|
|
* the particulars mentioned here have changed, but the principle remains.
|
|
*
|
|
* Previously, we used to mark all the known stages as zero, except for
|
|
* stage1. We used to mark only ourselves as in stage1. So, even if our
|
|
* bus reception logic is screwed up, and we are not receiving packets
|
|
* from anybody including ourselves, we would mark ourselves as being in
|
|
* stage1. And after (what used to be) six ticks, we would proceed into
|
|
* stage2 and mark ourselves as being in stage2. This would cause stage1
|
|
* and stage2 to be equal, and our world would constitute just
|
|
* ourselves. Thus we would go through regroup eliminating everybody
|
|
* else. However, since we are not receiving packets from anybody else,
|
|
* we would miss our own iamalive packets, and we too will soon die of
|
|
* %4032. Thus the symptoms would constitute everybody else dying of
|
|
* (%4040 + some node number), and that node dying with a %4032 halt.
|
|
* See TPR S 88070112309628 for more details.
|
|
*
|
|
* To avoid this situation, we now do not mark ourselves as in a
|
|
* particular stage until we get our own regroup packets indicating we
|
|
* are in that stage. Thus, in regroup_restart, all the stages are
|
|
* cleared. Previously, regroupbroadcaststatus in sendqueuedmessages
|
|
* used to send directly from the regroup_control structures.
|
|
* regroupbroadcaststatus has been modified to construct the unsequenced
|
|
* packets on its stack. It would first copy the state from the
|
|
* regroup_control structure, and then would LOR in our node into a known
|
|
* stage, if requested to do so. When we receive that packet, we would
|
|
* merge that information into our state, and thus we would be
|
|
* guaranteed that our bus sending and reception logic is working, and
|
|
* that we can legitimately mark ourselves as being in that stage. This
|
|
* whole change avoids problems where bus sending logic works, but bus
|
|
* reception logic is screwed up for both buses in a node.
|
|
*/
|
|
|
|
rgp->sendstage = 0; /* Don't let anyone know I am in stage 1 until
|
|
* I have seen a regroup clock tick; this is to
|
|
* cause this node to halt if it is not getting
|
|
* clock ticks. I will halt when the other nodes
|
|
* advance without me and send me a status packet
|
|
* indicating this or send me a poison packet
|
|
* after declaring me down.
|
|
*/
|
|
|
|
|
|
rgp->rgpcounter = 0;
|
|
ClusterInit(rgp->rgppkt.knownstage1);
|
|
ClusterInit(rgp->rgppkt.knownstage2);
|
|
ClusterInit(rgp->rgppkt.knownstage3);
|
|
ClusterInit(rgp->rgppkt.knownstage4);
|
|
ClusterInit(rgp->rgppkt.knownstage5);
|
|
ClusterInit(rgp->rgppkt.pruning_result);
|
|
|
|
MatrixInit(rgp->rgppkt.connectivity_matrix);
|
|
MatrixInit(rgp->internal_connectivity_matrix);
|
|
|
|
/* Just for ease of debugging, to send in our poison packets, we keep
|
|
* the known nodes mask at the start of regroup. poison packets contain
|
|
* known nodes at the beginning of regroup and at the end of it.
|
|
*/
|
|
|
|
ClusterCopy(rgp->initnodes, rgp->rgpinfo.cluster);
|
|
ClusterInit(rgp->endnodes);
|
|
|
|
#if defined( NT )
|
|
//
|
|
// increment the event epoch so we can detect stale events
|
|
// from clusnet
|
|
//
|
|
++rgp->OS_specific_control.EventEpoch;
|
|
#endif
|
|
|
|
if ( (rgp->rgppkt.stage >= RGP_CLOSING) &&
|
|
(rgp->rgppkt.stage <= RGP_PHASE2_CLEANUP) &&
|
|
ClusterCompare(rgp->rgppkt.knownstage1,
|
|
rgp->rgppkt.knownstage2) )
|
|
{
|
|
//
|
|
// If we were interrupted by this restart after we closed
|
|
// 1st stage regroup window, then no nodes can be added to group w/o joining.
|
|
//
|
|
// Thus we will add missing nodes into our ignorescreen.
|
|
// This will force the regroup not to wait for them in stage1
|
|
cluster_t tmp;
|
|
|
|
ClusterDifference(tmp, rgp->rgpinfo.cluster, rgp->innerscreen);
|
|
ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, tmp);
|
|
}
|
|
|
|
if ( ClusterMember(rgp->ignorescreen, rgp->mynode) ) {
|
|
// We shouldn't have get here, but since we are here
|
|
// Let's shield us from the outside world
|
|
RGP_TRACE( "Self Isolation", 0, 0, 0, 0 );
|
|
ClusterCopy(rgp->ignorescreen, rgp->rgpinfo.cluster);
|
|
ClusterDelete(rgp->ignorescreen, rgp->mynode);
|
|
}
|
|
|
|
if ( !ClusterEmpty(rgp->ignorescreen) ) {
|
|
// if we are ignoring somebody we have
|
|
// to be cautious. I.e. we will stay longer in the
|
|
// first stage to give a chance to everybody to learn about
|
|
// our ignorescreen
|
|
rgp->cautiousmode = 1;
|
|
}
|
|
|
|
if ( !ClusterCompare(old_ignorescreen, rgp->ignorescreen) ) {
|
|
// Ignore screen is changed, reset restart counter //
|
|
RGP_TRACE( "Ignorescreen->", GetCluster(old_ignorescreen), GetCluster(rgp->ignorescreen), 0, 0 );
|
|
rgp->restartcount = 0;
|
|
}
|
|
PackIgnoreScreen(&rgp->rgppkt, rgp->ignorescreen);
|
|
|
|
rgp->arbitration_started = 0;
|
|
|
|
rgp->OS_specific_control.ArbitratingNode = MM_INVALID_NODE;
|
|
if ( !rgp_is_perturbed() ) {
|
|
ResetEvent( rgp->OS_specific_control.Stabilized );
|
|
}
|
|
|
|
ClusterInit(rgp->rgppkt.quorumowner);
|
|
if( QuorumOwner == (DWORD)EXT_NODE(rgp->mynode) ) {
|
|
ClusterInsert(rgp->rgppkt.quorumowner, rgp->mynode);
|
|
}
|
|
|
|
|
|
if (rgp->rgppkt.stage == RGP_COLDLOADED)
|
|
{
|
|
if (!rgp->OS_specific_control.ShuttingDown) {
|
|
//
|
|
// Currently, RGP_RELOADFAILED calls ExitProcess
|
|
// During clean shutdown we would like to send the regroup packet
|
|
// out triggering a regroup. So we don't want to die.
|
|
//
|
|
// Since we are not resetting state to RGP_ACTIVATED, this
|
|
// node will not be able to participate in the regroup.
|
|
//
|
|
RGP_ERROR(RGP_RELOADFAILED);
|
|
}
|
|
} else {
|
|
rgp->rgppkt.stage = RGP_ACTIVATED;
|
|
}
|
|
|
|
}
|
|
|
|
/************************************************************************
|
|
* regroup_test_stage2_advance
|
|
* ===========================
|
|
*
|
|
* Description:
|
|
*
|
|
* Checks to see if we can advance to regroup stage 2.
|
|
*
|
|
* Parameters:
|
|
*
|
|
* None
|
|
*
|
|
* Returns:
|
|
*
|
|
* int - 1 if stage 2 can be entered and 0 if not.
|
|
*
|
|
* Algorithm:
|
|
*
|
|
* Stage 2 can be entered if one of the following conditions is true.
|
|
*
|
|
* (a) all nodes are present and accounted for and at least one
|
|
* regroup clock tick has occurred
|
|
* (b) we are not in cautious mode, all but one node are present
|
|
* and accounted for, AND a minimum number of ticks
|
|
* (rgp_quickdecisionlegit) have elapsed.
|
|
* (c) if RGP_MUST_ENTER_STAGE2 ticks have elapsed.
|
|
*
|
|
************************************************************************/
|
|
_priv _resident static int
|
|
regroup_test_stage2_advance()
|
|
{
|
|
|
|
cluster_t stragglers; /* set of nodes not yet checkd in */
|
|
int num_stragglers; /* # of nodes not yet checkd in */
|
|
|
|
/* Stage 2 must be entered after some interval regardless of any
|
|
* other conditions.
|
|
*/
|
|
if (rgp->rgpcounter == 0)
|
|
return(0);
|
|
if (rgp->rgpcounter >= RGP_MUST_ENTER_STAGE2)
|
|
{
|
|
RGP_TRACE( "RGP S->2cautious",
|
|
rgp->rgpcounter, /* TRACE */
|
|
rgp->cautiousmode, /* TRACE */
|
|
GetCluster( rgp->outerscreen ), /* TRACE */
|
|
GetCluster( rgp->rgppkt.knownstage1 ) ); /* TRACE */
|
|
return(1);
|
|
}
|
|
|
|
/* The number of ticks is between 1 and RGP_MUST_ENTER_STAGE2.
|
|
* We need to examine the stage1 mask to decide if we can
|
|
* advance.
|
|
*
|
|
* If every node in the old configuration has checked in, I can
|
|
* advance at once. This is either a false alarm or caused by
|
|
* power failure or connectivity failures.
|
|
*/
|
|
|
|
/* Compute the set of nodes from the original configuration not yet
|
|
* recognized.
|
|
*/
|
|
ClusterDifference(stragglers, rgp->outerscreen,
|
|
rgp->rgppkt.knownstage1);
|
|
|
|
//
|
|
// We shouldn't wait for the nodes we are ignoring,
|
|
// since we cannot get a packet from them anyway
|
|
//
|
|
ClusterDifference(stragglers, stragglers,
|
|
rgp->ignorescreen);
|
|
|
|
if ((num_stragglers = ClusterNumMembers(stragglers)) == 0)
|
|
{
|
|
RGP_TRACE( "RGP S->2 all in ",
|
|
rgp->rgpcounter, /* TRACE */
|
|
GetCluster( rgp->outerscreen ), 0, 0 ); /* TRACE */
|
|
|
|
return(1); /* all present and accounted for */
|
|
}
|
|
|
|
/* If stragglers is non-empty, perhaps I can still advance to stage 2
|
|
* if I am not in cautious mode (no recent power fail and not
|
|
* aborting and rerunning the regroup algorithm) AND all nodes but
|
|
* one have checked in AND some minimum number of ticks have elapsed.
|
|
*
|
|
* The minimum number of ticks is selected to be 1 greater than the
|
|
* the LATEPOLL inititiation period (allowed consecutive missed IamAlive time)
|
|
* since that should guarantee that, if the
|
|
* cluster has broken off into multiple disconnected clusters,
|
|
* the other clusters would have detected the missing IamAlives,
|
|
* started regroup and paused IO, thus preventing the possibility
|
|
* of data corruption caused by a split brain situation.
|
|
*/
|
|
|
|
if (!(rgp->cautiousmode) &&
|
|
(num_stragglers == 1) &&
|
|
(rgp->rgpcounter > rgp->rgpinfo.Min_Stage1_ticks))
|
|
{
|
|
RGP_TRACE( "RGP S->2 1 miss ",
|
|
rgp->rgpcounter, /* TRACE */
|
|
GetCluster( rgp->outerscreen ), /* TRACE */
|
|
GetCluster( rgp->rgppkt.knownstage1 ), 0 ); /* TRACE */
|
|
return(1); /* advance - all but one checked in */
|
|
}
|
|
|
|
return(0); /* sorry cannot advance yet */
|
|
|
|
}
|
|
|
|
|
|
/************************************************************************
|
|
* regroup_stage3_advance
|
|
* ===========================
|
|
*
|
|
* Description:
|
|
*
|
|
* This function is called after the split brain avoidance algorithm
|
|
* is run and the tie-breaker is selected in stage 2. It checks if
|
|
* we can proceed to stage 3 (RGP_PRUNING) and advances to stage 3
|
|
* if possible.
|
|
*
|
|
* Parameters:
|
|
*
|
|
* None
|
|
*
|
|
* Returns:
|
|
*
|
|
* int - 1 if the regroup stage has been advanced to RGP_PRUNING;
|
|
* 0 if the stage cannot be advanced yet.
|
|
*
|
|
* Algorithm:
|
|
*
|
|
* The algorithm depends on whether we are the tie-breaker or not.
|
|
*
|
|
* On the tie-breaker node, we first check if there are any
|
|
* disconnects in the cluster. If there aren't any, there is no need
|
|
* for pruning. We can then set pruning_result to knownstage2,
|
|
* advance to the RGP_PRUNING stage and return 1. If there are
|
|
* disconnects, we must wait a certain number of ticks to collect
|
|
* connectivity info from all nodes. If the number of ticks have not
|
|
* passed, return 0. If the required number of ticks have elapsed,
|
|
* we must call the pruning algorithm to get the list of potential
|
|
* groups. After that, the select_cluster() routine is called to
|
|
* pick one from the set of possible clusters. After this is done,
|
|
* pruning_result is set to the selected cluster and we return 1.
|
|
*
|
|
* On a non-tiebreaker node, nothing is done till a stage3 packet is
|
|
* received from the tie-breaker node or another node which got a
|
|
* stage 3 packet. If a stage 3 packet has not been received, we
|
|
* simply return 0. If a stage 3 packet is received, RGP_PRUNING
|
|
* stage is entered and we return 1.
|
|
*
|
|
************************************************************************/
|
|
_priv _resident int
|
|
regroup_stage3_advance()
|
|
{
|
|
int stage_advanced = 0, numgroups, groupnum;
|
|
|
|
if (rgp->tiebreaker == rgp->mynode)
|
|
{
|
|
if (connectivity_complete(rgp->rgppkt.connectivity_matrix))
|
|
{
|
|
|
|
/* No disconnects. All nodes in knownstage2 survive. */
|
|
rgp->rgppkt.stage = RGP_PRUNING;
|
|
|
|
ClusterCopy(rgp->rgppkt.pruning_result,
|
|
rgp->rgppkt.knownstage2);
|
|
stage_advanced = 1;
|
|
|
|
RGP_TRACE( "RGP S->3 NoPrune", rgp->rgpcounter, 0, 0, 0 );
|
|
}
|
|
|
|
/* There are disconnects; must wait for connectivity
|
|
* information to be complete. The info is deemed
|
|
* complete after a fixed number of ticks have
|
|
* elapsed.
|
|
*/
|
|
|
|
else if (rgp->pruning_ticks >= RGP_CONNECTIVITY_TICKS)
|
|
{ /* connectivity info collection complete; enter stage 3 */
|
|
|
|
RGP_TRACE( "RGP Con. matrix1",
|
|
RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[0], /*TRACE*/
|
|
rgp->rgppkt.connectivity_matrix[1] ), /*TRACE*/
|
|
RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[2], /*TRACE*/
|
|
rgp->rgppkt.connectivity_matrix[3] ), /*TRACE*/
|
|
RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[4], /*TRACE*/
|
|
rgp->rgppkt.connectivity_matrix[5] ), /*TRACE*/
|
|
RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[6], /*TRACE*/
|
|
rgp->rgppkt.connectivity_matrix[7])); /*TRACE*/
|
|
RGP_TRACE( "RGP Con. matrix2",
|
|
RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[8], /*TRACE*/
|
|
rgp->rgppkt.connectivity_matrix[9] ), /*TRACE*/
|
|
RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[10], /*TRACE*/
|
|
rgp->rgppkt.connectivity_matrix[11]), /*TRACE*/
|
|
RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[12], /*TRACE*/
|
|
rgp->rgppkt.connectivity_matrix[13]), /*TRACE*/
|
|
RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[14], /*TRACE*/
|
|
rgp->rgppkt.connectivity_matrix[15]));/*TRACE*/
|
|
|
|
numgroups = find_all_fully_connected_groups(
|
|
rgp->rgppkt.connectivity_matrix,
|
|
rgp->tiebreaker,
|
|
rgp->potential_groups);
|
|
|
|
if ((void *)rgp->select_cluster == RGP_NULL_PTR)
|
|
{
|
|
node_t keynode;
|
|
cluster_t temp;
|
|
ClusterIntersection(
|
|
temp,
|
|
rgp->rgppkt.knownstage2,
|
|
rgp->rgppkt.quorumowner
|
|
);
|
|
if ( ClusterEmpty(temp) ) {
|
|
keynode = RGP_NULL_NODE;
|
|
} else {
|
|
keynode = rgp_select_tiebreaker(temp);
|
|
}
|
|
RGP_TRACE( "RGP keynode ng ", keynode, numgroups, 0, 0); /*TRACE*/
|
|
/* No callback specified; use regroup's own routine. */
|
|
groupnum = rgp_select_cluster_ex(
|
|
rgp->potential_groups, numgroups, keynode);
|
|
}
|
|
else
|
|
{
|
|
/* Call routine specified at rgp_start() time. */
|
|
groupnum = (*(rgp->select_cluster))(
|
|
rgp->potential_groups, numgroups);
|
|
}
|
|
|
|
if (groupnum >= 0)
|
|
ClusterCopy(rgp->rgppkt.pruning_result,
|
|
rgp->potential_groups[groupnum]);
|
|
else
|
|
/* No group can survive. Can't halt yet.
|
|
* Need to tell everyone else.
|
|
*/
|
|
ClusterInit(rgp->rgppkt.pruning_result);
|
|
|
|
rgp->rgppkt.stage = RGP_PRUNING;
|
|
|
|
stage_advanced = 1;
|
|
|
|
RGP_TRACE( "RGP S->3 Pruned ",
|
|
rgp->rgpcounter, /* TRACE */
|
|
GetCluster( rgp->rgppkt.knownstage2 ), /* TRACE */
|
|
GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */
|
|
numgroups ); /* TRACE */
|
|
|
|
} /* connectivity info collection complete; enter stage 3 */
|
|
|
|
} /* tie-breaker node */
|
|
|
|
else
|
|
|
|
{ /* not tie-breaker node */
|
|
|
|
if (ClusterNumMembers(rgp->rgppkt.knownstage3) != 0)
|
|
{
|
|
/* We got a stage 3 packet from someone. Enter stage 3. */
|
|
rgp->rgppkt.stage = RGP_PRUNING;
|
|
|
|
stage_advanced = 1;
|
|
|
|
RGP_TRACE( "RGP Got S3 pkt ",
|
|
rgp->rgpcounter, /* TRACE */
|
|
GetCluster( rgp->rgppkt.knownstage2 ), /* TRACE */
|
|
GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */
|
|
GetCluster( rgp->rgppkt.knownstage3 ) ); /* TRACE */
|
|
}
|
|
|
|
} /* not tie-breaker node */
|
|
|
|
return(stage_advanced);
|
|
}
|
|
|
|
|
|
/************************************************************************
|
|
* enter_first_cleanup_stage
|
|
* =========================
|
|
*
|
|
* Description:
|
|
*
|
|
* This function performs the actions required when entering the
|
|
* first of the message clean up stages.
|
|
*
|
|
* Parameters:
|
|
*
|
|
* None
|
|
*
|
|
* Returns:
|
|
*
|
|
* void - no return value
|
|
*
|
|
* Algorithm:
|
|
*
|
|
* There are many actions to be performed after the final cluster
|
|
* is selected. The actions are described in comments throughout
|
|
* this routine.
|
|
*
|
|
************************************************************************/
|
|
_priv _resident void
|
|
enter_first_cleanup_stage()
|
|
{
|
|
cluster_t banishees;
|
|
node_t failer;
|
|
|
|
rgp->rgppkt.stage = RGP_PHASE1_CLEANUP;
|
|
|
|
RGP_TRACE( "RGP S->4 ", rgp->rgpcounter, 0, 0, 0 );
|
|
|
|
/* The packets we send now will not indicate we are in the phase 1
|
|
* cleanup stage yet. We indicate we are in this stage only after
|
|
* we have completed the clean up action associated with the stage.
|
|
* This is done in rgp_event_handler, under the
|
|
* RGP_EVT_PHASE1_CLEANUP_DONE event.
|
|
*/
|
|
rgp->sendstage = 0;
|
|
|
|
/* Now, we can resume IO since we have passed the split brain danger.
|
|
* New split brain situations will result in regroup restarting and
|
|
* pausing IO again.
|
|
*/
|
|
|
|
rgp_resume_all_io();
|
|
|
|
/* Compute in banishees the set of nodes being lost from the old
|
|
* configuration.
|
|
*/
|
|
|
|
ClusterDifference(banishees, rgp->rgpinfo.cluster,
|
|
rgp->rgppkt.pruning_result);
|
|
|
|
/* Install the new configuration into the masks. */
|
|
|
|
ClusterCopy(rgp->outerscreen, rgp->rgppkt.pruning_result);
|
|
|
|
#if defined( NT )
|
|
ClusnetSetOuterscreen(
|
|
NmClusnetHandle,
|
|
(ULONG)*((PUSHORT)rgp->outerscreen)
|
|
);
|
|
#endif
|
|
|
|
ClusterCopy(rgp->innerscreen, rgp->rgppkt.pruning_result);
|
|
ClusterCopy(rgp->endnodes, rgp->rgppkt.pruning_result);
|
|
ClusterCopy(rgp->rgpinfo.cluster, rgp->rgppkt.pruning_result);
|
|
|
|
/* Select a new tiebreaker because the previous one may have been */
|
|
/* pruned out. Note: tiebreaker_selected has already been set in S2. */
|
|
rgp->tiebreaker =
|
|
rgp_select_tiebreaker(rgp->rgppkt.pruning_result);
|
|
/* F40 Bug FixID KCY0833 */
|
|
|
|
/* Mark the state of the banishees as dead and invoke the
|
|
* node down callback routine.
|
|
*/
|
|
for (failer = 0; failer < (node_t) rgp->num_nodes; failer++)
|
|
if (ClusterMember(banishees, failer)
|
|
|| rgp->node_states[failer].status == RGP_NODE_COMING_UP // fix bug#265069
|
|
)
|
|
{
|
|
rgp->node_states[failer].status = RGP_NODE_DEAD;
|
|
rgp->node_states[failer].pollstate = AWAITING_IAMALIVE;
|
|
rgp->node_states[failer].lostHBs = 0;
|
|
|
|
#if !defined(NT)
|
|
(*(rgp->nodedown_callback))(EXT_NODE(failer));
|
|
#else
|
|
|
|
ClusnetSetNodeMembershipState(NmClusnetHandle,
|
|
EXT_NODE( failer ),
|
|
ClusnetNodeStateDead);
|
|
|
|
//
|
|
// On NT we do the nodedown callback at the end of stage 5.
|
|
// This allows the cleanup phases to complete before we let
|
|
// the "upper" layers know that a node went down.
|
|
//
|
|
if ( ClusterMember(rgp->OS_specific_control.CPUUPMASK,failer) )
|
|
ClusterInsert(
|
|
rgp->OS_specific_control.NeedsNodeDownCallback,
|
|
failer
|
|
);
|
|
|
|
#endif // !defined(NT)
|
|
|
|
}
|
|
|
|
/* If some nodes have been lost from the configuration, then I will
|
|
* queue regroup status packets to them. This is a best efforts
|
|
* attempt to ensure that they get quickly taken out if they
|
|
* do in fact continue to run.
|
|
*/
|
|
|
|
ClusterUnion(rgp->status_targets, banishees, rgp->status_targets);
|
|
|
|
//
|
|
// In NT, we are using rgp->rgppkt.hadpowerfail to transmit
|
|
// quorum ownership information
|
|
//
|
|
#if !defined(NT)
|
|
|
|
/* I should inform the message system of any node that experienced a
|
|
* power on recovery. The message system can use this to clear error
|
|
* counters so that a link will not be declared down due to errors
|
|
* which may have been caused by the power failure.
|
|
*/
|
|
|
|
for (failer = 0; failer < (node_t) rgp->num_nodes; failer++)
|
|
if ((ClusterMember(rgp->rgppkt.hadpowerfail, failer)) &&
|
|
!(ClusterMember(banishees, failer)))
|
|
/* This survivor had a power failure. */
|
|
rgp_had_power_failure( EXT_NODE(failer) );
|
|
|
|
#endif // NT
|
|
|
|
/* Tell the OS to start clean up operations for the failed nodes. */
|
|
rgp_start_phase1_cleanup();
|
|
}
|
|
|
|
|
|
/************************************************************************
|
|
* evaluatestageadvance
|
|
* ====================
|
|
*
|
|
* Description:
|
|
*
|
|
* This function evaluates whether additional state transitions are
|
|
* possible as a result of the info just received.
|
|
*
|
|
* Parameters:
|
|
*
|
|
* None
|
|
*
|
|
* Returns:
|
|
*
|
|
* void - no return value
|
|
*
|
|
* Algorithm:
|
|
*
|
|
* To evaluate whether we can advance through the stages, a loop is
|
|
* used with a case entry for each stage. If an entry decides not to
|
|
* advance to the next stage, it must return from the function. If
|
|
* it does advance, it should not return but remain in the loop
|
|
* since it is possible to have cascaded stage transitions
|
|
* especially in a two node system. Thus, the loop is exited when no
|
|
* more stage transitions are possible.
|
|
*
|
|
************************************************************************/
|
|
_priv _resident static void
|
|
evaluatestageadvance()
|
|
{
|
|
cluster_t temp_cluster;
|
|
node_t node;
|
|
node_t i;
|
|
|
|
for (;;) /* loop until someone exits by returning */
|
|
{
|
|
switch (rgp->rgppkt.stage)
|
|
{
|
|
|
|
case RGP_COLDLOADED :
|
|
{
|
|
if (!rgp->OS_specific_control.ShuttingDown) {
|
|
RGP_ERROR(RGP_RELOADFAILED);
|
|
}
|
|
return;
|
|
}
|
|
|
|
|
|
case RGP_ACTIVATED :
|
|
{ /* evaluate whether to go to stage RGP_CLOSING */
|
|
|
|
if (!regroup_test_stage2_advance())
|
|
return;
|
|
|
|
if (!ClusterMember(rgp->rgppkt.knownstage1, rgp->mynode))
|
|
RGP_ERROR(RGP_MISSED_POLL_TO_SELF);
|
|
|
|
rgp->rgppkt.stage = RGP_CLOSING;
|
|
|
|
rgp->rgpcounter = 0;
|
|
rgp->tiebreaker_selected = 0;
|
|
|
|
/* If we abort the regroup, and there's somebody that everybody
|
|
* banished on this regroup, the following line keeps him from
|
|
* joining up on the next regroup.
|
|
*/
|
|
ClusterCopy(rgp->innerscreen, rgp->rgppkt.knownstage1);
|
|
|
|
break;
|
|
|
|
} /* evaluate whether to go to stage RGP_CLOSING */
|
|
|
|
|
|
case RGP_CLOSING :
|
|
{ /* evaluate whether to go to stage RGP_PRUNING */
|
|
|
|
if (rgp->tiebreaker_selected)
|
|
{
|
|
if (regroup_stage3_advance())
|
|
break; /* try to advance further */
|
|
else
|
|
return; /* cannot advance any more */
|
|
}
|
|
|
|
if (!ClusterCompare(rgp->rgppkt.knownstage1,
|
|
rgp->rgppkt.knownstage2))
|
|
return;
|
|
|
|
//
|
|
// In NT, we no longer use the split-brain avoidance algorithm.
|
|
// We use a cluster-wide arbitration algorithm instead.
|
|
//
|
|
#if !defined(NT)
|
|
/* When the known stage 1 and known stage 2 sets are the
|
|
* same, we have the complete set of nodes that are
|
|
* connected to us. It is time to execute the split-
|
|
* brain avoidance algorithm. If we are a splinter group
|
|
* cut off from the main group, we will not survive this
|
|
* algorithm.
|
|
*/
|
|
|
|
split_brain_avoidance_algorithm();
|
|
|
|
#endif // NT
|
|
|
|
/* We are the lucky survivors of the split brain avoidance
|
|
* algorithm. Now, we must proceed to elect a new tie-breaker
|
|
* since the current tie-breaker may no longer be with us.
|
|
*/
|
|
|
|
rgp->tiebreaker =
|
|
rgp_select_tiebreaker(rgp->rgppkt.knownstage2);
|
|
|
|
rgp->tiebreaker_selected = 1;
|
|
|
|
RGP_TRACE( "RGP S2 tiebr sel",
|
|
rgp->rgpcounter, /* TRACE */
|
|
EXT_NODE(rgp->tiebreaker), /* TRACE */
|
|
0, 0 ); /* TRACE */
|
|
|
|
rgp->pruning_ticks = 0;
|
|
break;
|
|
|
|
} /* evaluate whether to go to stage 3 */
|
|
|
|
|
|
case RGP_PRUNING :
|
|
{ /* evaluate whether to go to RGP_PHASE1_CLEANUP stage */
|
|
|
|
if (rgp->arbitration_started) {
|
|
if (regroup_test_arbitrate_advance()) {
|
|
enter_first_cleanup_stage();
|
|
break;
|
|
} else {
|
|
return; // Stay in this stage //
|
|
}
|
|
}
|
|
|
|
if (rgp->has_unreachable_nodes)
|
|
{
|
|
RGP_TRACE( "RGP Unreach Node",
|
|
GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */
|
|
GetCluster( rgp->unreachable_nodes ), 0, 0 ); /* TRACE */
|
|
|
|
/* Must check if the unreachable nodes are in the
|
|
* selected final group. If so, we must restart
|
|
* regroup.
|
|
*/
|
|
ClusterIntersection(temp_cluster, rgp->unreachable_nodes,
|
|
rgp->rgppkt.pruning_result);
|
|
|
|
/* Clear the unreachable node mask and flag after examining
|
|
* them. If we restart, we will start with a clean slate.
|
|
*/
|
|
rgp->has_unreachable_nodes = 0;
|
|
ClusterInit(rgp->unreachable_nodes);
|
|
|
|
if (ClusterNumMembers(temp_cluster) != 0)
|
|
{
|
|
/* We have a node unreachable event to a node
|
|
* selected to survive. We must regenerate
|
|
* the connectivity matrix and re-run the node
|
|
* pruning algorithm. Start a new regroup incident.
|
|
* All restarts are in cautious mode.
|
|
*/
|
|
rgp->cautiousmode = 1;
|
|
rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
|
|
rgp->rgppkt.reason = RGP_EVT_NODE_UNREACHABLE;
|
|
rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
|
|
|
|
/* For causingnode, pick the first unreachable node
|
|
* in temp_cluster.
|
|
*/
|
|
for (node = 0; node < (node_t) rgp->num_nodes; node++)
|
|
{
|
|
if (ClusterMember(temp_cluster, node))
|
|
{
|
|
rgp->rgppkt.causingnode = (uint8) EXT_NODE(node);
|
|
break;
|
|
}
|
|
}
|
|
regroup_restart();
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (!ClusterCompare(rgp->rgppkt.knownstage2,
|
|
rgp->rgppkt.knownstage3))
|
|
return;
|
|
|
|
/* All nodes in the connected cluster have been notified
|
|
* of the pruning decision (entered stage 3). If we are
|
|
* selected to survive, we can now enter stage 4. If we are
|
|
* not in the selected group (pruning_result), we must halt.
|
|
* Wait for at least one node in PRUNING_RESULT to get into
|
|
* stage 4 before halting. This ensures that the algorithm
|
|
* does not stall in stage 3 with all pruned out nodes
|
|
* halting before ANY of the survivors finds that all nodes
|
|
* entered stage 3.
|
|
*/
|
|
|
|
if (!ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode))
|
|
{
|
|
/* Wait for at least one node in PRUNING_RESULT
|
|
* to get into stage 4 before halting. Since only
|
|
* nodes in PRUNING_RESULT get into stage 4, it is
|
|
* sufficient to check if knownstage4 has any members.
|
|
*/
|
|
if (ClusterNumMembers(rgp->rgppkt.knownstage4) != 0)
|
|
RGP_ERROR(RGP_PRUNED_OUT);
|
|
return;
|
|
}
|
|
|
|
// proceed to second stage of pruning - arbitration
|
|
if( regroup_start_arbitrate() ) {
|
|
return; // stay in this stage
|
|
} else {
|
|
break; // either proceed to the next, or restart
|
|
}
|
|
|
|
break;
|
|
|
|
} /* evaluate whether to go to RGP_PHASE1_CLEANUP stage */
|
|
|
|
|
|
case RGP_PHASE1_CLEANUP :
|
|
{ /* evaluate whether to go to RGP_PHASE2_CLEANUP stage */
|
|
|
|
if (!ClusterCompare(rgp->rgppkt.pruning_result,
|
|
rgp->rgppkt.knownstage4))
|
|
return;
|
|
|
|
rgp->rgppkt.stage = RGP_PHASE2_CLEANUP;
|
|
|
|
RGP_TRACE( "RGP S->5 ", rgp->rgpcounter, 0, 0, 0 );
|
|
|
|
/* The packets we send now will not indicate we are in the phase 2
|
|
* cleanup stage yet. We indicate we are in this stage only after
|
|
* we have completed the clean up action associated with the stage.
|
|
* This is done in rgp_event_handler, under the
|
|
* RGP_EVT_PHASE2_CLEANUP_DONE event.
|
|
*/
|
|
rgp->sendstage = 0;
|
|
|
|
rgp_start_phase2_cleanup();
|
|
|
|
break;
|
|
|
|
} /* evaluate whether to go to RGP_PHASE2_CLEANUP stage */
|
|
|
|
|
|
case RGP_PHASE2_CLEANUP :
|
|
{ /* evaluate whether to go to RGP_STABILIZED stage */
|
|
|
|
if (!ClusterCompare(rgp->rgppkt.knownstage4,
|
|
rgp->rgppkt.knownstage5))
|
|
return;
|
|
|
|
RGP_LOCK;
|
|
|
|
//
|
|
// [HACKHACK] This is not necessary anymore, since we
|
|
// are holding the lock in message.c when delivering
|
|
// regroup packet received event
|
|
//
|
|
if (RGP_PHASE2_CLEANUP != rgp->rgppkt.stage) {
|
|
RGP_TRACE( "RGP S->6 (race) ", rgp->rgpcounter, rgp->rgppkt.stage, 0, 0 );
|
|
break;
|
|
}
|
|
|
|
rgp->rgppkt.stage = RGP_STABILIZED;
|
|
|
|
RGP_TRACE( "RGP S->6 ", rgp->rgpcounter, 0, 0, 0 );
|
|
|
|
rgp->rgpcounter = 0;
|
|
rgp->restartcount = 0;
|
|
|
|
/* Reset the regroup flags which have not yet been cleared. */
|
|
rgp->cautiousmode = 0;
|
|
|
|
/* Clear the mask indicating nodes which own the quorum resrc. */
|
|
ClusterInit(rgp->rgppkt.quorumowner);
|
|
|
|
/* Copy the sequence number into the rgpinfo area. */
|
|
rgp->rgpinfo.seqnum = rgp->rgppkt.seqno;
|
|
|
|
SetEvent( rgp->OS_specific_control.Stabilized );
|
|
if (rgp->OS_specific_control.ArbitratingNode != MM_INVALID_NODE) {
|
|
// Somebody was arbitrating //
|
|
rgp->OS_specific_control.ApproxArbitrationWinner =
|
|
rgp->OS_specific_control.ArbitratingNode;
|
|
if (rgp->OS_specific_control.ArbitratingNode == (DWORD)EXT_NODE(rgp->mynode)) {
|
|
//
|
|
// [HackHack] To close 422405
|
|
// when 421828 is fixed, please uncomment the following line
|
|
//
|
|
// QuorumOwner = rgp->OS_specific_control.ArbitratingNode;
|
|
} else {
|
|
if (QuorumOwner != MM_INVALID_NODE) {
|
|
ClRtlLogPrint(LOG_UNUSUAL,
|
|
"[MM] : clearing quorum owner var (winner is %1!u!), %.\n",
|
|
rgp->OS_specific_control.ArbitratingNode
|
|
);
|
|
}
|
|
QuorumOwner = MM_INVALID_NODE;
|
|
}
|
|
}
|
|
|
|
rgp_cleanup_complete();
|
|
|
|
#if defined(NT)
|
|
//
|
|
// On NT we deferred doing the node down callback until all the
|
|
// cleanup phases have been complete.
|
|
//
|
|
ClusterCopy(
|
|
rgp->OS_specific_control.CPUUPMASK,
|
|
rgp->rgpinfo.cluster
|
|
);
|
|
|
|
(*(rgp->nodedown_callback))(
|
|
rgp->OS_specific_control.NeedsNodeDownCallback
|
|
);
|
|
|
|
//
|
|
// Clear the down node mask
|
|
//
|
|
ClusterInit(rgp->OS_specific_control.NeedsNodeDownCallback);
|
|
|
|
//
|
|
// finally, tell clusnet that regroup has finished
|
|
//
|
|
ClusnetRegroupFinished(NmClusnetHandle,
|
|
rgp->OS_specific_control.EventEpoch,
|
|
rgp->rgppkt.seqno);
|
|
|
|
rgp->last_stable_seqno = rgp->rgppkt.seqno;
|
|
|
|
|
|
RGP_UNLOCK;
|
|
#endif
|
|
|
|
return;
|
|
|
|
} /* evaluate whether to go to RGP_STABILIZED stage */
|
|
|
|
|
|
case RGP_STABILIZED :
|
|
return; /* stabilized, so I am all done */
|
|
|
|
default :
|
|
RGP_ERROR(RGP_INTERNAL_ERROR); /* unknown stage */
|
|
|
|
} /* switch (rgp->rgppkt.stage) */
|
|
|
|
} /* loop until someone exits by returning */
|
|
}
|
|
|
|
|
|
/************************************************************************
|
|
* rgp_event_handler
|
|
* =================
|
|
*
|
|
* Description:
|
|
*
|
|
* The state machine and the heart of the regroup algorithm.
|
|
*
|
|
* Parameters:
|
|
*
|
|
* int event -
|
|
* which event happened
|
|
*
|
|
* node_t causingnode -
|
|
* node causing the event: node which sent a regroup status
|
|
* packet or whose IamAlives are missed; if the causing node is
|
|
* not relevant information, RGP_NULL_NODE can be passed and
|
|
* is ignored. *This node ID is in external format.*
|
|
*
|
|
* Returns:
|
|
*
|
|
* void - no return value
|
|
*
|
|
* Algorithm:
|
|
*
|
|
* The state machine is the heart of the regroup algorithm.
|
|
* It is organized as a switch statement with the regroup event as
|
|
* both the switch variable and the case labels; the regroup stage is
|
|
* examined inside the individual cases.
|
|
* Events could cause regroup to start a new incident, to advance
|
|
* through stages or to update information without advancing to
|
|
* another stage. This routine also arranges for regroup status
|
|
* packets to be sent to all relevant nodes including our own
|
|
* node.
|
|
*
|
|
************************************************************************/
|
|
_priv _resident void
|
|
RGP_EVENT_HANDLER_EX(int event, node_t causingnode, void *arg)
|
|
{
|
|
|
|
rgp_pkt_t *rcvd_pkt_p;
|
|
cluster_t ignorescreen_rcvd;
|
|
uint8 oldstage;
|
|
int send_status_pkts = 0;
|
|
|
|
|
|
/* Note: arg is only used when event == RGP_EVT_RECEIVED_PACKET. It is the pointer to the received packet. */
|
|
|
|
/* Trace unusual invocations of this routine. */
|
|
if (event != RGP_EVT_RECEIVED_PACKET && event != RGP_EVT_CLOCK_TICK)
|
|
RGP_TRACE( "RGP Event ", event, causingnode, rgp->rgppkt.stage, rgp->rgpcounter ); /* TRACE */
|
|
|
|
switch (event)
|
|
{
|
|
case RGP_EVT_NODE_UNREACHABLE :
|
|
{ /* All paths to a node are unreachable */
|
|
|
|
/* Ignore the event if the unreachable node has been eliminated
|
|
* from our outerscreen. The message system probably doesn't
|
|
* know it yet.
|
|
*/
|
|
if (ClusterMember(rgp->outerscreen, INT_NODE(causingnode)))
|
|
{
|
|
/* Store this event and check after node pruning (when
|
|
* entering the RGP_PRUNING stage). If a regroup incident
|
|
* is in progress and we haven't entered the RGP_PRUNING
|
|
* stage yet, this will happen in the current incident.
|
|
* If not, it will happen in the next regroup incident
|
|
* which will surely start soon due to this disconnect.
|
|
*
|
|
* We do not start a regroup incident for this event. We will
|
|
* wait for IamAlives to be missed for starting a new regroup
|
|
* incident. This is due to the requirement that, in case
|
|
* of a total disconnect resulting in multiple groups, we must
|
|
* stay in stage 1 till we can guarantee that the other group(s)
|
|
* has started regroup and paused IO. We assume that the
|
|
* regroup incident started at the IamAlive check tick and
|
|
* use the periodic nature of the IamAlive sends and
|
|
* IamAlive checks to limit the stage1 pause to the period
|
|
* of IamAlive sends (+ 1 tick to drain IO). If we started
|
|
* a regroup incident due to the node unreachable event, we
|
|
* have to stay in stage1 longer.
|
|
*/
|
|
rgp->has_unreachable_nodes = 1;
|
|
ClusterInsert(rgp->unreachable_nodes, INT_NODE(causingnode));
|
|
|
|
break;
|
|
}
|
|
} /* All paths to a node are unreachable */
|
|
|
|
|
|
case RGP_EVT_PHASE1_CLEANUP_DONE :
|
|
{
|
|
/* The following checks are needed in case we restarted
|
|
* regroup and asked for phase1 cleanup multiple times.
|
|
* We must make sure that all such requests have been
|
|
* completed.
|
|
*/
|
|
if ( (rgp->rgppkt.stage == RGP_PHASE1_CLEANUP) &&
|
|
(rgp->rgp_msgsys_p->phase1_cleanup == 0) )
|
|
{ /* all caught up */
|
|
|
|
/* Let others and ourselves get packets indicating we are in
|
|
* this stage. When we get that packet, we will update our
|
|
* knownstage field. If our sending or receiving apparatus
|
|
* failed meanwhile and we don't get our own packet, it
|
|
* will cause regroup to be restarted.
|
|
*/
|
|
rgp->sendstage = 1;
|
|
send_status_pkts = 1;
|
|
evaluatestageadvance();
|
|
} /* all caught up */
|
|
|
|
break;
|
|
}
|
|
|
|
|
|
case RGP_EVT_PHASE2_CLEANUP_DONE :
|
|
{
|
|
|
|
/* The following checks are needed in case we restarted
|
|
* regroup and asked for phase2 cleanup multiple times.
|
|
* We must make sure that all such requests have been
|
|
* completed.
|
|
*/
|
|
if ( (rgp->rgppkt.stage == RGP_PHASE2_CLEANUP) &&
|
|
(rgp->rgp_msgsys_p->phase2_cleanup == 0) )
|
|
{ /* all caught up */
|
|
/* Let others and ourselves get packets indicating we are
|
|
* in this stage.
|
|
*/
|
|
rgp->sendstage = 1;
|
|
send_status_pkts = 1;
|
|
evaluatestageadvance();
|
|
} /* all caught up */
|
|
break;
|
|
}
|
|
|
|
|
|
case RGP_EVT_LATEPOLLPACKET :
|
|
{ /* some node is late with IamAlives */
|
|
|
|
RGP_LOCK; // to ensure that the packet receive does not initiate
|
|
// regroup asynchronously.
|
|
/* Start a new regroup incident if not already active. */
|
|
if (rgp->rgppkt.stage == RGP_STABILIZED)
|
|
{
|
|
rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
|
|
rgp->rgppkt.reason = RGP_EVT_LATEPOLLPACKET;
|
|
rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
|
|
rgp->rgppkt.causingnode = (uint8) causingnode;
|
|
regroup_restart();
|
|
send_status_pkts = 1;
|
|
} else if (rgp->rgppkt.stage == RGP_COLDLOADED)
|
|
{
|
|
RGP_ERROR(RGP_RELOADFAILED);
|
|
}
|
|
RGP_UNLOCK;
|
|
break;
|
|
} /* some node is late with IamAlives */
|
|
|
|
case MM_EVT_LEAVE:
|
|
rgp->OS_specific_control.ShuttingDown = TRUE;
|
|
case RGP_EVT_BANISH_NODE :
|
|
{ /* assumes that the lock is held */
|
|
|
|
rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
|
|
rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
|
|
// Pack Ignore Screen in the regroup_restart will
|
|
// fill reason and causingnode fields of the packet
|
|
ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );
|
|
regroup_restart();
|
|
send_status_pkts = 1;
|
|
break;
|
|
}
|
|
#if 0
|
|
case MM_EVT_LEAVE: // this node needs to leave the cluster gracefully
|
|
{
|
|
// Initiate a Regroup Event amongst remaining members if any
|
|
// Start a new regroup incident if not already active.
|
|
if (rgp->rgppkt.stage == RGP_STABILIZED)
|
|
{
|
|
rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
|
|
rgp->rgppkt.reason = MM_EVT_LEAVE;
|
|
rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
|
|
rgp->rgppkt.causingnode = (uint8) EXT_NODE(rgp->mynode);
|
|
regroup_restart();
|
|
send_status_pkts = 1;
|
|
}
|
|
break;
|
|
}
|
|
#endif
|
|
|
|
case RGP_EVT_CLOCK_TICK :
|
|
{ /* called on regroup clock tick when regroup is active */
|
|
|
|
if( (rgp->rgppkt.stage == RGP_PRUNING) &&
|
|
(rgp->arbitration_started)
|
|
)
|
|
{
|
|
rgp->arbitration_ticks++;
|
|
|
|
if (rgp->arbitration_ticks >= RGP_ARBITRATION_TIMEOUT) {
|
|
//
|
|
// Kill timed-out arbitrator
|
|
//
|
|
if(rgp->tiebreaker == rgp->mynode) {
|
|
//
|
|
// If this node was arbitrating, then die
|
|
//
|
|
if ( IsDebuggerPresent() ) {
|
|
DebugBreak();
|
|
}
|
|
|
|
RGP_ERROR(RGP_ARBITRATION_STALLED);
|
|
}
|
|
else {
|
|
//
|
|
// Kill the arbitrator and initiate another regroup
|
|
//
|
|
RGP_TRACE(
|
|
"RGP arbitration stalled ",
|
|
rgp->rgppkt.stage, 0, 0, 0
|
|
);
|
|
|
|
rgp_event_handler(
|
|
RGP_EVT_BANISH_NODE,
|
|
EXT_NODE(rgp->tiebreaker)
|
|
);
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
evaluatestageadvance();
|
|
|
|
//
|
|
// No need to send packets while we are waiting for
|
|
// the arbitrator to win
|
|
//
|
|
// send_status_pkts = rgp->rgppkt.stage != RGP_PRUNING;
|
|
//
|
|
// [GN] Wrong. We do have to send status packets.
|
|
// If we have partial connectivity, we need to
|
|
// continue exchanging packets, so that the pruner,
|
|
// can learn indirectly that all nodes got the pruning results.
|
|
//
|
|
send_status_pkts = 1;
|
|
|
|
break;
|
|
}
|
|
else {
|
|
rgp->rgpcounter++; /* increment the counter */
|
|
}
|
|
|
|
if ( (rgp->rgppkt.stage == RGP_ACTIVATED) && (rgp->sendstage == 0) )
|
|
{
|
|
/* To detect the potential failure of my timer pop mechanism
|
|
* (such as by the corruption of the time list), I wait for
|
|
* at least one regroup clock tick before I let myself and
|
|
* others know I am in stage 1.
|
|
*/
|
|
// [GorN Jan14/2000]
|
|
// We don't send our connectivity information,
|
|
// before we get the first clock tick.
|
|
// However we collect this information in
|
|
// rgp->internal_connectivity_matrix.
|
|
// Let's put it in the outgoing packet
|
|
// so that everybody will see what we think about them.
|
|
|
|
MatrixOr(rgp->rgppkt.connectivity_matrix,
|
|
rgp->internal_connectivity_matrix);
|
|
|
|
rgp->sendstage = 1; /* let everyone know we are in stage 1 */
|
|
}
|
|
else if ( (rgp->rgppkt.stage >= RGP_CLOSING) &&
|
|
(rgp->rgppkt.stage <= RGP_PHASE2_CLEANUP) )
|
|
{ /* check for possible abort and restart */
|
|
|
|
if (rgp->rgpcounter >= RGP_MUST_RESTART)
|
|
{
|
|
/* Stalled out. Probably someone died after starting
|
|
* or another node is still in stage 1 cautious mode
|
|
*/
|
|
|
|
if ( ++(rgp->restartcount) > RGP_RESTART_MAX ) {
|
|
// It is not a good idea to die, because somebody
|
|
// is stalling. Let's add stallees into ignore mask and restart
|
|
//
|
|
// RGP_ERROR(RGP_INTERNAL_ERROR); // [Fixed]
|
|
cluster_t tmp, *stage;
|
|
|
|
switch (rgp->rgppkt.stage) {
|
|
case RGP_CLOSING: stage = &rgp->rgppkt.knownstage2; break;
|
|
case RGP_PRUNING: stage = &rgp->rgppkt.knownstage3; break;
|
|
case RGP_PHASE1_CLEANUP: stage = &rgp->rgppkt.knownstage4; break;
|
|
case RGP_PHASE2_CLEANUP: stage = &rgp->rgppkt.knownstage5; break;
|
|
}
|
|
ClusterDifference(tmp, rgp->rgpinfo.cluster, *stage);
|
|
|
|
//
|
|
// If we stalled during closing, due to the tiebreaker running
|
|
// the pruning algorithm going bonkers, we can have tmp = 0
|
|
// In this case, we need to ignore somebody to guarantee that
|
|
// the algorithm completes.
|
|
//
|
|
if ( ClusterEmpty(tmp) && rgp->tiebreaker_selected) {
|
|
ClusterInsert(tmp, rgp->tiebreaker);
|
|
}
|
|
|
|
ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, tmp);
|
|
}
|
|
|
|
/* If we are stalling in stage 3 and we have been pruned out,
|
|
* it is possible that we are stalling because we have been
|
|
* isolated from all other nodes. We must halt in this case.
|
|
*/
|
|
if ( (rgp->rgppkt.stage == RGP_PRUNING) &&
|
|
!ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode) )
|
|
RGP_ERROR(RGP_PRUNED_OUT);
|
|
|
|
rgp->cautiousmode = 1;
|
|
rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
|
|
|
|
RGP_TRACE( "RGP stalled ", rgp->rgppkt.stage, 0, 0, 0 );
|
|
|
|
regroup_restart();
|
|
|
|
} /* Stalled out ... */
|
|
} /* check for possible abort and restart */
|
|
|
|
if ((rgp->rgppkt.stage == RGP_CLOSING) && rgp->tiebreaker_selected)
|
|
rgp->pruning_ticks++;
|
|
|
|
evaluatestageadvance();
|
|
|
|
send_status_pkts = 1; /* send rgp packets regardless of progress */
|
|
|
|
break;
|
|
|
|
} /* called on regroup clock tick when regroup is active */
|
|
|
|
|
|
case RGP_EVT_RECEIVED_PACKET :
|
|
{ /* received an rgp packet */
|
|
|
|
/* If the sending node is excluded by the outer screen, then it is
|
|
* not even part of the current (most recently known) configuration.
|
|
* Therefore the packet should not be honored, and a poison message
|
|
* should be sent to try to kill this renegade processor.
|
|
* That is done in the calling routine that processes all incoming
|
|
* regroup module packets (IamAlive, regroup and poison packets).
|
|
*/
|
|
|
|
/* If the sending node was accepted by the outer screen but then
|
|
* excluded by the inner screen, then the packet will be disregarded
|
|
* but no poison message sent. This phenomenon may occur when this
|
|
* node has entered stage 2 without having heard from (recognized)
|
|
* the sending node and then a message arrives late from that
|
|
* sending node. In this case the fate of the sending node, i.e.
|
|
* whether it gets ruled out of the global configuration or not is
|
|
* unknown at this point. If the sender can get itself recognized
|
|
* by some node before that node enters stage 2, then it will be
|
|
* saved. Otherwise it will be declared down and subsequently shot
|
|
* with poison packets if it ever tries to assert itself.
|
|
*/
|
|
|
|
/* Remember the arg to this routine is the packet pointer */
|
|
rcvd_pkt_p = (rgp_pkt_t *)arg; /* address of pkt just received */
|
|
if ( rgp->rgppkt.seqno != rcvd_pkt_p->seqno)
|
|
RGP_TRACE( "RGP Event ", event, causingnode, rgp->rgppkt.stage, rgp->rgpcounter ); /* TRACE */
|
|
|
|
UnpackIgnoreScreen(rcvd_pkt_p, ignorescreen_rcvd);
|
|
if ( !ClusterEmpty(ignorescreen_rcvd) ) {
|
|
RGP_TRACE( "RGP Incoming pkt", GetCluster(ignorescreen_rcvd),
|
|
rcvd_pkt_p->seqno, rgp->rgppkt.stage, causingnode);
|
|
}
|
|
|
|
if ( !ClusterMember(rgp->innerscreen, INT_NODE(causingnode))) {
|
|
RGP_TRACE( "RGP Ignoring !inner", causingnode, rgp->rgppkt.stage,
|
|
GetCluster(rgp->innerscreen), GetCluster(ignorescreen_rcvd) );
|
|
return;
|
|
}
|
|
|
|
RGP_LOCK; // To ensure that the timer thread does not initiate
|
|
// regroup asynchronously at this time.
|
|
|
|
//////////////////////////// New Ignore Screen Stuff /////////////////////////////////
|
|
|
|
if (ClusterMember(rgp->ignorescreen, INT_NODE(causingnode) )) {
|
|
RGP_UNLOCK;
|
|
RGP_TRACE( "RGP Ignoring", causingnode, rgp->rgppkt.stage,
|
|
GetCluster(rgp->ignorescreen), GetCluster(ignorescreen_rcvd) );
|
|
return;
|
|
}
|
|
|
|
if (rcvd_pkt_p->seqno < rgp->last_stable_seqno ) {
|
|
RGP_UNLOCK;
|
|
RGP_TRACE( "RGP old packet", causingnode, rcvd_pkt_p->seqno, rgp->last_stable_seqno, 0);
|
|
// This is a late packet from the previous regroup incident
|
|
// from the node that is currently in my outerscreen.
|
|
// This node could not have sent it now, this is probably a packet
|
|
// that got stuck somewhere and was delivered eons later.
|
|
// Simply ignore it.
|
|
return;
|
|
}
|
|
|
|
|
|
if ( ClusterMember(ignorescreen_rcvd, rgp->mynode ) ) {
|
|
//
|
|
// Sender ignores me. We will do the same to him.
|
|
//
|
|
ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );
|
|
rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
|
|
regroup_restart();
|
|
send_status_pkts = 1;
|
|
RGP_UNLOCK;
|
|
break;
|
|
}
|
|
|
|
if ( ClusterCompare(ignorescreen_rcvd, rgp->ignorescreen) ) {
|
|
// We have the same ignore screen.
|
|
// No work needs to be done
|
|
} else if ( ClusterSubsetOf(rgp->ignorescreen, ignorescreen_rcvd) ) {
|
|
// Incoming packet has smaller ignore screen
|
|
// Ignore this packet, but reply to its sender with
|
|
// our current regroup packet to force to upgrade to
|
|
// our view of the world.
|
|
|
|
// do so only if we are properly initialized
|
|
if (rgp->rgppkt.stage == RGP_COLDLOADED && !rgp->OS_specific_control.ShuttingDown) {
|
|
RGP_ERROR(RGP_RELOADFAILED);
|
|
}
|
|
|
|
RGP_TRACE( "RGP smaller ignore mask ",
|
|
GetCluster(ignorescreen_rcvd), GetCluster(rgp->ignorescreen), /* TRACE */
|
|
rgp->rgppkt.stage, rcvd_pkt_p->stage ); /* TRACE */
|
|
|
|
ClusterInsert(rgp->status_targets, INT_NODE(causingnode));
|
|
rgp_broadcast(RGP_UNACK_REGROUP);
|
|
RGP_UNLOCK;
|
|
return;
|
|
} else if ( ClusterSubsetOf(ignorescreen_rcvd, rgp->ignorescreen) ) {
|
|
RGP_TRACE( "RGP bigger ignore mask ",
|
|
GetCluster(ignorescreen_rcvd), GetCluster(rgp->ignorescreen), /* TRACE */
|
|
rgp->rgppkt.stage, causingnode ); /* TRACE */
|
|
// Incoming packet has bigger ignore screen.
|
|
// Upgrade to this information and process the packet
|
|
rgp->rgppkt.seqno = rcvd_pkt_p->seqno;
|
|
|
|
/* Somebody else activated regroup. So, let's just copy */
|
|
/* the sender's reason code and reason nodes. */
|
|
|
|
//
|
|
// Ignore mask parts are in the reason and activatingnode fields
|
|
//
|
|
|
|
ClusterCopy(rgp->ignorescreen, ignorescreen_rcvd); // fix bug #328216
|
|
rgp->rgppkt.reason = rcvd_pkt_p->reason;
|
|
rgp->rgppkt.activatingnode = rcvd_pkt_p->activatingnode;
|
|
rgp->rgppkt.causingnode = rcvd_pkt_p->causingnode;
|
|
regroup_restart();
|
|
send_status_pkts = 1;
|
|
} else {
|
|
RGP_TRACE( "RGP different ignore masks ",
|
|
GetCluster(ignorescreen_rcvd), GetCluster(rgp->ignorescreen), /* TRACE */
|
|
rgp->rgppkt.stage, causingnode ); /* TRACE */
|
|
// Ignore masks are different and neither of them is
|
|
// a subset of another.
|
|
//
|
|
// We need to merge information out of these masks
|
|
// and restart the regroup.
|
|
//
|
|
// Packet that we just received will be ignored
|
|
|
|
ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, ignorescreen_rcvd);
|
|
rgp->rgppkt.seqno = max(rgp->rgppkt.seqno, rcvd_pkt_p->seqno) + 1;
|
|
regroup_restart();
|
|
send_status_pkts = 1;
|
|
RGP_UNLOCK;
|
|
break;
|
|
}
|
|
|
|
//////////////////////////// End of new Ignore Screen Stuff /////////////////////////////////
|
|
|
|
// Now ignorescreens of this node packet and incoming packet are the same //
|
|
// proceed with regular regroup processing //
|
|
|
|
/* Since the packet is acceptable, the regroup sequence number
|
|
* must be compared to that of this node. If the incoming message
|
|
* has a higher sequence number, then a new pass of the regroup
|
|
* algorithm has started. This node must accept the new sequence
|
|
* number, reinitialize its data, and start participating in
|
|
* the new pass. Also, the incoming message must be processed
|
|
* since, once the algorithm reinitializes, the sequence numbers
|
|
* now match.
|
|
*
|
|
* If the incoming packet has a matching sequence number, then it
|
|
* should be accepted. The knowledge of the global state of the
|
|
* algorithm it reflects must be merged with that already present
|
|
* in this node. Then this node must evaluate whether further
|
|
* state transitions are possible.
|
|
*
|
|
* Finally, if the incoming packet has a lower sequence number, then
|
|
* it comes from a node unaware of the current level of the global
|
|
* algorithm. The data in it should be ignored, but a packet should
|
|
* be sent to it so that it will reinitialize its algorithm.
|
|
*
|
|
* The sequence number is a 32 bit algebraic value - hopefully it
|
|
* will never wrap around.
|
|
*/
|
|
|
|
|
|
if (rcvd_pkt_p->seqno < rgp->rgppkt.seqno)
|
|
{ /* sender below current level - ignore but let him know it*/
|
|
|
|
RGP_TRACE( "RGP lower seqno ",
|
|
rgp->rgppkt.seqno, rcvd_pkt_p->seqno, /* TRACE */
|
|
rgp->rgppkt.stage, rcvd_pkt_p->stage ); /* TRACE */
|
|
|
|
ClusterInsert(rgp->status_targets, INT_NODE(causingnode));
|
|
rgp_broadcast(RGP_UNACK_REGROUP);
|
|
RGP_UNLOCK;
|
|
return;
|
|
}
|
|
|
|
if (rcvd_pkt_p->seqno > rgp->rgppkt.seqno)
|
|
{ /* sender above current level - I must upgrade to it*/
|
|
|
|
// The node that forces a restart is responsible for keeping
|
|
// track of restarts and making a decision who will die/be ignored
|
|
// if ( ++(rgp->restartcount) > RGP_RESTART_MAX )
|
|
// RGP_ERROR(RGP_INTERNAL_ERROR);
|
|
|
|
if ( (rgp->rgppkt.stage != RGP_STABILIZED) ||
|
|
((rcvd_pkt_p->seqno - rgp->rgppkt.seqno) > 1) )
|
|
{
|
|
RGP_TRACE( "RGP higher seqno",
|
|
rgp->rgppkt.seqno, rcvd_pkt_p->seqno, /* TRACE */
|
|
rgp->rgppkt.stage, rcvd_pkt_p->stage );/* TRACE */
|
|
rgp->cautiousmode = 1;
|
|
}
|
|
|
|
rgp->rgppkt.seqno = rcvd_pkt_p->seqno;
|
|
|
|
/* Somebody else activated regroup. So, let's just copy */
|
|
/* the sender's reason code and reason nodes. */
|
|
|
|
rgp->rgppkt.reason = rcvd_pkt_p->reason;
|
|
rgp->rgppkt.activatingnode = rcvd_pkt_p->activatingnode;
|
|
rgp->rgppkt.causingnode = rcvd_pkt_p->causingnode;
|
|
regroup_restart();
|
|
send_status_pkts = 1;
|
|
|
|
} /* sender above current level - I must upgrade to it*/
|
|
|
|
/* Now we are at the same level - even if we weren't at first.
|
|
*
|
|
* If the sender has already committed to a view of the world
|
|
* that excludes me, I must halt in order to keep the system in
|
|
* a consistent state.
|
|
*
|
|
* This is true even with the split brain avoidance algorithm.
|
|
* The fact that stage1 = stage2 in the packet implies that the
|
|
* sender has already run the split brain avoidance algorithm
|
|
* and decided that he should survive.
|
|
*/
|
|
|
|
if ( (rcvd_pkt_p->stage > RGP_ACTIVATED) &&
|
|
ClusterCompare(rcvd_pkt_p->knownstage1,
|
|
rcvd_pkt_p->knownstage2) &&
|
|
!ClusterMember(rcvd_pkt_p->knownstage1, rgp->mynode) )
|
|
{
|
|
ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );
|
|
rgp->rgppkt.seqno ++;
|
|
regroup_restart();
|
|
send_status_pkts = 1;
|
|
RGP_UNLOCK;
|
|
// /* I must die for overall consistency. */
|
|
// RGP_ERROR((uint16) (RGP_PARIAH + causingnode)); // [Fixed]
|
|
break;
|
|
}
|
|
RGP_UNLOCK;
|
|
|
|
|
|
/* If I have terminated the active part of the algorithm, I
|
|
* am in stage 6 and am not routinely broadcasting my status
|
|
* anymore. If I get a packet from someone else who has not
|
|
* yet terminated, then I must send him the word. But if he
|
|
* has terminated, I must not send any packet or else there
|
|
* will be an infinite loop of packets bouncing back and forth.
|
|
*/
|
|
|
|
if (rgp->rgppkt.stage == RGP_STABILIZED)
|
|
{ /* I have terminated so can't learn anything more. */
|
|
if (!ClusterCompare(rcvd_pkt_p->knownstage5,
|
|
rgp->rgppkt.knownstage5))
|
|
{ /* but sender has not so I must notify him */
|
|
ClusterInsert(rgp->status_targets, INT_NODE(causingnode));
|
|
rgp_broadcast(RGP_UNACK_REGROUP);
|
|
}
|
|
return;
|
|
}
|
|
|
|
/* At this point, the packet is from a legal node within the
|
|
* current round of the algorithm and I have not terminated
|
|
* at stage RGP_STABILIZED so I need to absorb whatever new
|
|
* info is in this packet.
|
|
*
|
|
* The way to merge what this packet says with what I already
|
|
* know is to just logically OR the known stage x fields
|
|
* together.
|
|
*/
|
|
{
|
|
int seqno = rcvd_pkt_p->seqno&0xffff;
|
|
int stage = rcvd_pkt_p->stage&0xffff;
|
|
int trgs = *(int*)rgp->status_targets & 0xffff;
|
|
int node = INT_NODE(causingnode)&0xffff;
|
|
|
|
RGP_TRACE( "RGP recv pkt ",
|
|
((seqno << 16) | stage),
|
|
RGP_MERGE_TO_32(
|
|
rcvd_pkt_p->knownstage1,
|
|
rcvd_pkt_p->knownstage2
|
|
),
|
|
RGP_MERGE_TO_32(
|
|
rcvd_pkt_p->knownstage3,
|
|
rcvd_pkt_p->knownstage4
|
|
),
|
|
(trgs << 16) | node
|
|
);
|
|
}
|
|
|
|
rgp_sanity_check(rcvd_pkt_p, "RGP Received packet");
|
|
rgp_sanity_check(&(rgp->rgppkt), "RGP Internal packet");
|
|
|
|
ClusterUnion(rgp->rgppkt.quorumowner, rcvd_pkt_p->quorumowner,
|
|
rgp->rgppkt.quorumowner);
|
|
ClusterUnion(rgp->rgppkt.knownstage1, rcvd_pkt_p->knownstage1,
|
|
rgp->rgppkt.knownstage1);
|
|
ClusterUnion(rgp->rgppkt.knownstage2, rcvd_pkt_p->knownstage2,
|
|
rgp->rgppkt.knownstage2);
|
|
ClusterUnion(rgp->rgppkt.knownstage3, rcvd_pkt_p->knownstage3,
|
|
rgp->rgppkt.knownstage3);
|
|
ClusterUnion(rgp->rgppkt.knownstage4, rcvd_pkt_p->knownstage4,
|
|
rgp->rgppkt.knownstage4);
|
|
ClusterUnion(rgp->rgppkt.knownstage5, rcvd_pkt_p->knownstage5,
|
|
rgp->rgppkt.knownstage5);
|
|
ClusterUnion(rgp->rgppkt.pruning_result, rcvd_pkt_p->pruning_result,
|
|
rgp->rgppkt.pruning_result);
|
|
|
|
/* But when I am in stage 2, it is possible that I can learn to
|
|
* recognize some node I have not previously recognized by hearing
|
|
* of it indirectly from some other node that I have recognized.
|
|
* To handle this case, I always merge knownstage1 info into
|
|
* the inner screen so that subsequent messages from the newly
|
|
* recognized node will be accepted and processed.
|
|
*/
|
|
if ((rgp->rgppkt.stage == RGP_CLOSING) &&
|
|
!(rgp->tiebreaker_selected))
|
|
ClusterUnion(rgp->innerscreen, rgp->rgppkt.knownstage1,
|
|
rgp->innerscreen);
|
|
|
|
/* In the first two stages of regroup, the inter-node connectivity
|
|
* information is collected and propagated. When we get a regroup
|
|
* packet, we turn ON the bit corresponding to the [our-node,
|
|
* sender-node] entry in the connectivity matrix. We also OR in
|
|
* the matrix sent by the sender node in the regroup packet.
|
|
*
|
|
* The matrix is not updated if we are in stage 1 and haven't
|
|
* received the first clock tick. This is to prevent the
|
|
* node pruning algorithm from considering us alive if our
|
|
* timer mechanism is disrupted, but the IPC mechanism is OK.
|
|
*/
|
|
|
|
/* [GorN 01/07/2000] If we are not collecting connectivity information,
|
|
* until we receive a first tick we can run into problems if the node is
|
|
* killed right after it sends out its first timer driven packet
|
|
* (which doesn't have any connectivity info yet). This can cause a
|
|
* confusion. See bug 451792.
|
|
*
|
|
* What we will do is we will collect connectivity information on
|
|
* the side even when rgp->sendstage is FALSE and move it into the regroup
|
|
* packet if we ever get a clock tick
|
|
*/
|
|
|
|
if (rgp->rgppkt.stage < RGP_PRUNING && !rgp->sendstage)
|
|
{
|
|
MatrixSet(rgp->internal_connectivity_matrix,
|
|
rgp->mynode, INT_NODE(causingnode));
|
|
if (causingnode != EXT_NODE(rgp->mynode))
|
|
MatrixOr(rgp->internal_connectivity_matrix,
|
|
rcvd_pkt_p->connectivity_matrix);
|
|
}
|
|
|
|
if ((rgp->rgppkt.stage < RGP_PRUNING) && rgp->sendstage)
|
|
{
|
|
MatrixSet(rgp->rgppkt.connectivity_matrix,
|
|
rgp->mynode, INT_NODE(causingnode));
|
|
if (causingnode != EXT_NODE(rgp->mynode))
|
|
MatrixOr(rgp->rgppkt.connectivity_matrix,
|
|
rcvd_pkt_p->connectivity_matrix);
|
|
}
|
|
|
|
/* Now, I can evaluate whether additional state transitions are
|
|
* possible as a result of the info just received.
|
|
*/
|
|
oldstage = rgp->rgppkt.stage;
|
|
|
|
// QuorumCheck now runs in a separate thread
|
|
// if (oldstage != RGP_CLOSING) // Cannot run Quorumcheck from here.
|
|
evaluatestageadvance();
|
|
|
|
/* To speed things up, let us broadcast our status if our
|
|
* stage has changed and we are willing to let others and
|
|
* ourselves see it.
|
|
*/
|
|
|
|
if ( (oldstage != rgp->rgppkt.stage) && rgp->sendstage )
|
|
send_status_pkts = 1; /* broadcast at once to speed things up */
|
|
|
|
break;
|
|
} /* received an rgp packet */
|
|
|
|
//
|
|
// We do not support power failure notifications in NT
|
|
//
|
|
#if defined(NT)
|
|
|
|
CL_ASSERT(event != RGP_EVT_POWERFAIL);
|
|
//
|
|
// Fall thru to default case
|
|
//
|
|
|
|
#else // NT
|
|
|
|
case RGP_EVT_POWERFAIL :
|
|
{ /* Our node got a power up interrupt or an indication of power
|
|
* failure from another node. */
|
|
|
|
/* Note that this code will unconditionally abort and restart
|
|
* the algorithm even if it was active before the power failure.
|
|
* The new incident must be in cautious mode.
|
|
*/
|
|
|
|
rgp->cautiousmode = 1;
|
|
rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
|
|
rgp->rgppkt.reason = RGP_EVT_POWERFAIL;
|
|
rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
|
|
rgp->rgppkt.causingnode = (uint8) causingnode;
|
|
|
|
/* rgp->pfail_state is set to a non-zero value when a pfail event
|
|
* is reported to regroup. It is decremented at every regroup clock
|
|
* tick till it reaches zero. While this number is non-zero, missing
|
|
* self IamAlives are ignored and do not cause the node to halt.
|
|
* This gives the sending hardware some time to recover from power
|
|
* failures before self IamAlives are checked.
|
|
*/
|
|
if (causingnode == EXT_NODE(rgp->mynode))
|
|
rgp->pfail_state = RGP_PFAIL_TICKS;
|
|
|
|
/* Store the fact that causingnode experienced a PFAIL,
|
|
* for reporting to the message system when regroup stabilizes.
|
|
*/
|
|
ClusterInsert(rgp->rgppkt.hadpowerfail, INT_NODE(causingnode));
|
|
|
|
regroup_restart();
|
|
send_status_pkts = 1;
|
|
break;
|
|
} /* power failure */
|
|
|
|
#endif // NT
|
|
|
|
default :
|
|
{
|
|
RGP_ERROR(RGP_INTERNAL_ERROR);
|
|
}
|
|
}
|
|
|
|
if (send_status_pkts) /* significant change - send status at once */
|
|
{
|
|
ClusterUnion(rgp->status_targets,
|
|
rgp->outerscreen, rgp->status_targets);
|
|
rgp_broadcast(RGP_UNACK_REGROUP);
|
|
}
|
|
}
|
|
|
|
/************************************************************************
|
|
* rgp_check_packet
|
|
* =================
|
|
*
|
|
* Description:
|
|
*
|
|
* verifies that RGP packet has reasonable values in
|
|
* powerfail, knownstages, pruning_result, and connectivity_matrix fields
|
|
*
|
|
* Parameters:
|
|
*
|
|
* rgp_pkt_t* pkt -
|
|
* packet to be checked
|
|
*
|
|
* Returns:
|
|
*
|
|
* 0 - packet looks good
|
|
* 1,2,3... - strange looking packet
|
|
*
|
|
************************************************************************/
|
|
int rgp_check_packet(rgp_pkt_t* pkt) {
|
|
node_t i;
|
|
|
|
//
|
|
// Verify that
|
|
// knownstage5 \subset knownstage4 \subset knownstage3 \subset
|
|
// knownstage2 \subset knownstage1 \subset rgp->rgpinfo.cluster
|
|
//
|
|
// int ClusterSubsetOf(cluster_t big, cluster_t small)
|
|
// Returns 1 if set small = set big or small is a subset of big.
|
|
//
|
|
|
|
if( !ClusterSubsetOf(pkt->knownstage4, pkt->knownstage5) ) {
|
|
return 5;
|
|
}
|
|
if( !ClusterSubsetOf(pkt->knownstage3, pkt->knownstage4) ) {
|
|
return 4;
|
|
}
|
|
if( !ClusterSubsetOf(pkt->knownstage2, pkt->knownstage3) ) {
|
|
return 3;
|
|
}
|
|
if( !ClusterSubsetOf(pkt->knownstage1, pkt->knownstage2) ) {
|
|
return 2;
|
|
}
|
|
if( !ClusterSubsetOf(rgp->rgpinfo.cluster, pkt->knownstage1) ) {
|
|
return 1;
|
|
}
|
|
|
|
//
|
|
// pruning_result has to be a subset of knownstage2
|
|
//
|
|
if( !ClusterSubsetOf(pkt->knownstage2, pkt->pruning_result) ) {
|
|
return 9;
|
|
}
|
|
|
|
//
|
|
// quorumowner has to be a subset of original cluster
|
|
//
|
|
if(!ClusterSubsetOf(rgp->rgpinfo.cluster, pkt->quorumowner)) {
|
|
return 8;
|
|
}
|
|
//
|
|
// Check connectivity matrix
|
|
//
|
|
for(i = 0; i < MAX_CLUSTER_SIZE; ++i) {
|
|
if( ClusterMember( rgp->rgpinfo.cluster, i ) ) {
|
|
//
|
|
// Node i is a member of a cluster
|
|
// Its connectivity bitmap has to be a subset of rgp->rgpinfo.cluster
|
|
//
|
|
if(!ClusterSubsetOf(rgp->rgpinfo.cluster, pkt->connectivity_matrix[i])) {
|
|
return 10;
|
|
}
|
|
} else {
|
|
//
|
|
// Node i is not a member of a cluster
|
|
// Its connectivity bitmap has to be 0
|
|
//
|
|
if(!ClusterEmpty(pkt->connectivity_matrix[i]))
|
|
return 11;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/************************************************************************
|
|
* rgp_print_packet
|
|
* =================
|
|
*
|
|
* Description:
|
|
*
|
|
* Prints RGP packet fields
|
|
*
|
|
* Parameters:
|
|
*
|
|
* rgp_pkt_t* pkt -
|
|
* packet to be printed
|
|
* char* label -
|
|
* label to be printed together with a packet
|
|
* int code -
|
|
* a number to be printed together with a packet
|
|
*
|
|
* Returns:
|
|
*
|
|
* VOID
|
|
*
|
|
************************************************************************/
|
|
void rgp_print_packet(rgp_pkt_t* pkt, char* label, int code)
|
|
{
|
|
uint8 pktsubtype;
|
|
uint8 stage;
|
|
uint16 reason;
|
|
uint32 seqno;
|
|
uint8 activatingnode;
|
|
uint8 causingnode;
|
|
cluster_t quorumowner;
|
|
|
|
RGP_TRACE( label,
|
|
pkt->seqno, /* TRACE */
|
|
code,
|
|
(pkt->stage << 16) |
|
|
(pkt->activatingnode << 8) |
|
|
(pkt->causingnode), /* TRACE */
|
|
RGP_MERGE_TO_32( rgp->outerscreen,
|
|
rgp->innerscreen )
|
|
);
|
|
RGP_TRACE( "RGP CHK masks ",
|
|
RGP_MERGE_TO_32( rgp->rgpinfo.cluster, /* TRACE */
|
|
pkt->quorumowner ), /* TRACE */
|
|
RGP_MERGE_TO_32( pkt->knownstage1, /* TRACE */
|
|
pkt->knownstage2 ), /* TRACE */
|
|
RGP_MERGE_TO_32( pkt->knownstage3, /* TRACE */
|
|
pkt->knownstage4 ), /* TRACE */
|
|
RGP_MERGE_TO_32( pkt->knownstage5, /* TRACE */
|
|
pkt->pruning_result ) ); /* TRACE */
|
|
RGP_TRACE( "RGP CHK Con. matrix1",
|
|
RGP_MERGE_TO_32( pkt->connectivity_matrix[0], /*TRACE*/
|
|
pkt->connectivity_matrix[1] ), /*TRACE*/
|
|
RGP_MERGE_TO_32( pkt->connectivity_matrix[2], /*TRACE*/
|
|
pkt->connectivity_matrix[3] ), /*TRACE*/
|
|
RGP_MERGE_TO_32( pkt->connectivity_matrix[4], /*TRACE*/
|
|
pkt->connectivity_matrix[5] ), /*TRACE*/
|
|
RGP_MERGE_TO_32( pkt->connectivity_matrix[6], /*TRACE*/
|
|
pkt->connectivity_matrix[7])); /*TRACE*/
|
|
RGP_TRACE( "RGP CHK Con. matrix2",
|
|
RGP_MERGE_TO_32( pkt->connectivity_matrix[8], /*TRACE*/
|
|
pkt->connectivity_matrix[9] ), /*TRACE*/
|
|
RGP_MERGE_TO_32( pkt->connectivity_matrix[10], /*TRACE*/
|
|
pkt->connectivity_matrix[11]), /*TRACE*/
|
|
RGP_MERGE_TO_32( pkt->connectivity_matrix[12], /*TRACE*/
|
|
pkt->connectivity_matrix[13]), /*TRACE*/
|
|
RGP_MERGE_TO_32( pkt->connectivity_matrix[14], /*TRACE*/
|
|
pkt->connectivity_matrix[15]));/*TRACE*/
|
|
}
|
|
|
|
|
|
/************************************************************************
|
|
* UnpackIgnoreScreen
|
|
* =================
|
|
*
|
|
* Description:
|
|
*
|
|
* Extracts ignorescreen out of regroup packet
|
|
*
|
|
* Parameters:
|
|
*
|
|
* rgp_pkt_t* from -
|
|
* source packet
|
|
* cluster_t to -
|
|
* target node set
|
|
*
|
|
* Returns:
|
|
*
|
|
* VOID
|
|
*
|
|
* Comments:
|
|
*
|
|
* If the packet is received from NT4 node, unpacked ignorescreen
|
|
* will ne always 0.
|
|
*
|
|
************************************************************************/
|
|
void UnpackIgnoreScreen(rgp_pkt_t* from, cluster_t to)
|
|
{
|
|
#pragma warning( push )
|
|
#pragma warning( disable : 4244 )
|
|
if (from->reason < RGP_EVT_IGNORE_MASK) {
|
|
ClusterInit(to);
|
|
} else {
|
|
to[0] = ((uint16)from->reason) >> 8;
|
|
to[1] = (uint8)from->causingnode;
|
|
}
|
|
#pragma warning( pop )
|
|
}
|
|
|
|
/************************************************************************
|
|
* rgp_print_packet
|
|
* =================
|
|
*
|
|
* Description:
|
|
*
|
|
* Put an ignorescreen back into a regroup packet
|
|
*
|
|
* Parameters:
|
|
*
|
|
* rgp_pkt_t* to -
|
|
* packet to be updated
|
|
* cluster_t from -
|
|
* source node set
|
|
*
|
|
* Returns:
|
|
*
|
|
* VOID
|
|
*
|
|
************************************************************************/
|
|
void PackIgnoreScreen(rgp_pkt_t* to, cluster_t from)
|
|
{
|
|
if ( ClusterEmpty(from) ) {
|
|
to->reason &= 255;
|
|
to->causingnode = 0;
|
|
} else {
|
|
to->reason = (uint8)RGP_EVT_IGNORE_MASK | (from[0] << 8);
|
|
to->causingnode = from[1];
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif /* __cplusplus */
|
|
|
|
|
|
#if 0
|
|
|
|
History of changes to this file:
|
|
-------------------------------------------------------------------------
|
|
1995, December 13 F40:KSK0610 /*F40:KSK06102.2*/
|
|
|
|
This file is part of the portable Regroup Module used in the NonStop
|
|
Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There
|
|
are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h,
|
|
srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c.
|
|
The last two are simulation files to test the Regroup Module on a
|
|
UNIX workstation in user mode with processes simulating processor nodes
|
|
and UDP datagrams used to send unacknowledged datagrams.
|
|
|
|
This file was first submitted for release into NSK on 12/13/95.
|
|
------------------------------------------------------------------------------
|
|
This change occurred on 19 Jan 1996 /*F40:MB06458.1*/
|
|
Changes for phase IV Sierra message system release. Includes: /*F40:MB06458.2*/
|
|
- Some cleanup of the code /*F40:MB06458.3*/
|
|
- Increment KCCB counters to count the number of setup messages and /*F40:MB06458.4*/
|
|
unsequenced messages sent. /*F40:MB06458.5*/
|
|
- Fixed some bugs /*F40:MB06458.6*/
|
|
- Disable interrupts before allocating broadcast sibs. /*F40:MB06458.7*/
|
|
- Change per-packet-timeout to 5ms /*F40:MB06458.8*/
|
|
- Make the regroup and powerfail broadcast use highest priority /*F40:MB06458.9*/
|
|
tnet services queue. /*F40:MB06458.10*/
|
|
- Call the millicode backdoor to get the processor status from SP /*F40:MB06458.11*/
|
|
- Fixed expand bug in msg_listen_ and msg_readctrl_ /*F40:MB06458.12*/
|
|
- Added enhancement to msngr_sendmsg_ so that clients do not need /*F40:MB06458.13*/
|
|
to be unstoppable before calling this routine. /*F40:MB06458.14*/
|
|
- Added new steps in the build file called /*F40:MB06458.15*/
|
|
MSGSYS_C - compiles all the message system C files /*F40:MB06458.16*/
|
|
MSDRIVER - compiles all the MSDriver files /*F40:MB06458.17*/
|
|
REGROUP - compiles all the regroup files /*F40:MB06458.18*/
|
|
- remove #pragma env libspace because we set it as a command line /*F40:MB06458.19*/
|
|
parameter. /*F40:MB06458.20*/
|
|
----------------------------------------------------------------------- /*F40:MB06458.21*/
|
|
|
|
#endif /* 0 - change descriptions */
|