2020-09-30 16:53:55 +02:00

737 lines
31 KiB
C

#ifndef _WRGP_H_
#define _WRGP_H_
#ifdef __TANDEM
#pragma columns 79
#pragma page "wrgp.h - T9050 - internal declarations for Regroup Module"
#endif
/* @@@ START COPYRIGHT @@@
** Tandem Confidential: Need to Know only
** Copyright (c) 1995, Tandem Computers Incorporated
** Protected as an unpublished work.
** All Rights Reserved.
**
** The computer program listings, specifications, and documentation
** herein are the property of Tandem Computers Incorporated and shall
** not be reproduced, copied, disclosed, or used in whole or in part
** for any reason without the prior express written permission of
** Tandem Computers Incorporated.
**
** @@@ END COPYRIGHT @@@
**/
/*---------------------------------------------------------------------------
* This file (wrgp.h) contains the cluster_t data type and types used for the
* node pruning algorithm and declares the routines exported by the Cluster
* data type and the node pruning algorithm.
*---------------------------------------------------------------------------*/
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <jrgp.h>
#include <wrgpos.h>
#include <bitset.h>
/* Version number of the Regroup data structures. */
#define RGP_VERSION      1

/* Starting sequence number of regroup. */
#define RGP_INITSEQNUM   0

/* Byte lengths of the three kinds of datagrams sent by Regroup. */
#define RGPPKTLEN        sizeof(rgp_pkt_t)       /* regroup pkts  */
#define IAMALIVEPKTLEN   sizeof(iamalive_pkt_t)  /* IamAlive pkts */
#define POISONPKTLEN     sizeof(poison_pkt_t)    /* poison pkts   */
/*-------------------------------------------------------*/
/* Stages of the regroup algorithm, in numeric order.    */
/* NOTE(review): values 1..5 presumably line up with the */
/* knownstage1..knownstage5 masks of rgp_pkt_t - confirm.*/
/*-------------------------------------------------------*/
#define RGP_COLDLOADED       0
#define RGP_ACTIVATED        1
#define RGP_CLOSING          2
#define RGP_PRUNING          3
#define RGP_PHASE1_CLEANUP   4
#define RGP_PHASE2_CLEANUP   5
#define RGP_STABILIZED       6
/*--------------------------------------------------------------------*/
/* Node-number translation between the OS and the Regroup module.     */
/* Regroup's internal node numbers start at 0 while the OS starts     */
/* node numbers at LOWEST_NODENUM.                                    */
/*                                                                    */
/*   EXT_NODE(n) - internal (0-based) node number -> OS node number   */
/*   INT_NODE(n) - OS node number -> internal (0-based) node number   */
/*                                                                    */
/* The argument and LOWEST_NODENUM are parenthesized inside the       */
/* expansion so that an expression argument (e.g. "1 << k") is        */
/* offset as a whole instead of being re-associated by precedence.    */
/* The result is cast to node_t.                                      */
/*--------------------------------------------------------------------*/
#define EXT_NODE(int_node) ((node_t)((int_node) + (LOWEST_NODENUM)))
#define INT_NODE(ext_node) ((node_t)((ext_node) - (LOWEST_NODENUM)))
/*----------------------------------------*/
/* Defines for the node pruning algorithm */
/*----------------------------------------*/
/* cluster_t is a bit array whose size equals the maximum number of
 * nodes in the cluster; it is implemented as an array of uint8s.
 *
 * To find the bit for a node#, first pick the array element
 * (node# / BYTEL), then the bit within that element (node# % BYTEL).
 * Bits within a byte are numbered 0..7 from left to right, so node 0
 * lives in byte 0, bit 0: the left-most bit of the whole bit array.
 */
#define BYTE(set, n)  ( (set)[(n) / BYTEL] )  /* array element for node n */
#define BIT(n)        ( (n) % BYTEL )         /* bit# within that element */
/* The connectivity matrix is an array of MAX_CLUSTER_SIZE elements of
 * type cluster_t. cluster_t is equivalent to a bit array with one bit
 * per node, so the matrix is equivalent to a two-dimensional bit array
 * with each dimension being MAX_CLUSTER_SIZE large.
 *
 * A bit value of 1 for matrix[i][j] represents a unidirectional
 * connection between nodes i and j: a regroup packet was received on
 * node i from node j. Row i therefore records what node i has heard.
 */
typedef cluster_t connectivity_matrix_t[MAX_CLUSTER_SIZE];
/* connected(i, j): nonzero iff nodes i and j are connected in BOTH
 * directions, i.e. row i of the matrix contains j and row j contains i.
 *
 * NOTE: this macro reads a connectivity matrix named `c` from the scope
 * in which it is expanded; it is not passed as an argument.
 * NOTE: each argument is evaluated twice, so avoid side effects.
 * Arguments are parenthesized so that the (int) cast applies to the
 * whole argument expression.
 */
#define connected(i,j) (ClusterMember(c[(int)(i)], (j)) && \
                        ClusterMember(c[(int)(j)], (i)))  /* bidirectional */
/* Should a node that cannot receive its own regroup packets be
 * considered dead? Not necessarily. It may be able to send packets to
 * others and be considered alive by everyone. There is no real need
 * for the ability to send to yourself on the network, and software
 * bugs could result in such a situation.
 *
 * The strictly correct liveness check would therefore look for a
 * non-zero bit in either the row or the column corresponding to the
 * node: if the node has received regroup packets from, or sent regroup
 * packets to, any node, it may be considered alive.
 *
 * For simplicity, the macro below only tests the diagonal: a node that
 * does not receive its own regroup packets is considered dead.
 *
 * NOTE: like connected(), this macro reads a connectivity matrix named
 * `c` from the expanding scope and evaluates its argument twice. The
 * argument is parenthesized so the (int) cast covers all of it.
 */
#define node_considered_alive(i) ClusterMember(c[(int)(i)], (i))
/* The upper bound on the number of potential fully-connected groups is
 * the lower of 2**N and 2**D, where N is the number of live nodes and
 * D is the number of disconnects. If this number exceeds MAX_GROUPS,
 * do not attempt to exhaustively generate all possible groups; just
 * return an arbitrary fully-connected group which includes a node
 * selected by the cluster manager.
 *
 * too_many_groups(nodes, disconnects) is nonzero exactly when both
 * counts exceed LOG2_MAX_GROUPS, i.e. when min(2**N, 2**D) exceeds
 * MAX_GROUPS. Both arguments are parenthesized so that expression
 * arguments (e.g. "a & b") are compared as a whole.
 */
#define MAX_GROUPS      256  /* if more than these, pick arbitrary group */
#define LOG2_MAX_GROUPS 8    /* log (base 2) of MAX_GROUPS */
#define too_many_groups(nodes, disconnects) \
    (((nodes) > LOG2_MAX_GROUPS) && ((disconnects) > LOG2_MAX_GROUPS))
/* The disconnect array is an array of (i,j) pairs which represent a
 * break in connectivity between nodes i and j.
 *
 * The first dimension, LOG2_MAX_GROUPS * (LOG2_MAX_GROUPS-1)/2, is the
 * number of distinct node pairs among LOG2_MAX_GROUPS nodes (28 for 8).
 * NOTE(review): presumably this suffices because exhaustive group
 * generation is skipped when too_many_groups() is true, which leaves
 * either at most LOG2_MAX_GROUPS live nodes or at most LOG2_MAX_GROUPS
 * disconnects - confirm against the pruning code.
 */
typedef node_t disconnect_array [LOG2_MAX_GROUPS * (LOG2_MAX_GROUPS-1)/2] [2];
/*---------------------------------------------------------------------------*/
/* Following are templates for three kinds of unacknowledged datagrams sent */
/* by the regroup module (regroup pkts, IamAlive pkts and poison pkts). */
/*---------------------------------------------------------------------------*/
//
// All on-the-wire structures below have already been packed by hand.
// packon.h tells the compiler not to alter field alignment (more or less).
//
#include <packon.h>
/************************************************************************
* rgp_pkt_t (regroup status packet)
* ---------------------------------
* This structure is used to send the current state of the regroup state
* machine to other nodes.
*
* ___________________________________________________________
* wd0 | pktsubtype | stage | reason | Low8 ignscr |
* |_____________|_______________|_____________________________|
* wd1 | seqno |
* |_____________________________|_____________________________|
* wd2 | activa- | causingnode | quorumowner |
* | tingnode | Hi8 ignscr | (was hadpowerfail) |
* |_____________|_______________|_____________________________|
* wd3 | knownstage1 | knownstage2 |
* |_____________________________|_____________________________|
* wd4 | knownstage3 | knownstage4 |
* |_____________________________|_____________________________|
* wd5 | knownstage5 | pruning_result |
* |_____________________________|_____________________________|
* wd6 : :
* | connectivity_matrix |
* : :
* wd13|___________________________________________________________|
*
*
* pktsubtype - packet subtype = RGP_UNACK_REGROUP
* stage - current stage (state) of the regroup algorithm
* reason - reason for the activation of regroup
* seqno - sequence number of current regroup incident
* activatingnode - node that calls for a regroup incident
* causingnode - node whose poll packet was missed or which
* had a power failure or otherwise caused
* a regroup incident being called for
* quorumowner - mask of nodes that think they own the quorum resrc
* knownstage1 - mask of nodes known to have entered stage 1
* knownstage2 - mask of nodes known to have entered stage 2
* knownstage3 - mask of nodes known to have entered stage 3
* knownstage4 - mask of nodes known to have entered stage 4
* knownstage5 - mask of nodes known to have entered stage 5
* pruning_result - result of node pruning by tie-breaker node
* connectivity_matrix - current connectivity info for entire cluster
*
*/
#ifdef __TANDEM
#pragma fieldalign shared8 rgp_pkt
#endif /* __TANDEM */
/* Regroup status packet: carries the current state of the regroup state
 * machine to other nodes. This is a hand-packed on-the-wire structure;
 * member order and types define the packet layout.
 * The layout picture shows ignore-screen bits sharing wd0 and wd2; they
 * are stored and retrieved via PackIgnoreScreen / UnpackIgnoreScreen.
 */
typedef struct rgp_pkt
{
/* packet subtype = RGP_UNACK_REGROUP */
uint8 pktsubtype;
/* current stage (state) of the regroup algorithm */
uint8 stage;
/* reason for the activation of regroup */
uint16 reason;
/* sequence number of current regroup incident */
uint32 seqno;
/* node that calls for a regroup incident */
uint8 activatingnode;
/* node whose poll packet was missed, which had a power failure, or
 * which otherwise caused a regroup incident to be called for */
uint8 causingnode;
/* mask of nodes that think they own the quorum resource */
cluster_t quorumowner;
/* knownstageN: mask of nodes known to have entered stage N */
cluster_t knownstage1;
cluster_t knownstage2;
cluster_t knownstage3;
cluster_t knownstage4;
cluster_t knownstage5;
/* result of node pruning by the tie-breaker node */
cluster_t pruning_result;
/* current connectivity info for the entire cluster */
connectivity_matrix_t connectivity_matrix;
} rgp_pkt_t;
/************************************************************************
* iamalive_pkt_t
* --------------
* This structure is used by a node to indicate to another node that it
* is alive and well.
*
* ___________________________________________________________
* wd0 | pktsubtype | filler |
* |_____________|_____________________________________________|
* wd1 : :
* | testpattern |
* : :
* wd13|___________________________________________________________|
*
*
* pktsubtype - packet subtype = RGP_UNACK_IAMALIVE
* testpattern - a bit pattern used for testing
*
*/
#ifdef __TANDEM
#pragma fieldalign shared8 iamalive_pkt
#endif /* __TANDEM */
/* IamAlive packet: sent by a node to tell another node that it is alive
 * and well. Hand-packed on-the-wire structure; do not reorder members.
 */
typedef struct iamalive_pkt
{
/* packet subtype = RGP_UNACK_IAMALIVE */
uint8 pktsubtype;
/* pads the header to 4 bytes ahead of the test pattern */
uint8 filler[3];
/* bit pattern used for testing; fills the remaining
 * RGP_UNACK_PKTLEN - 4 bytes, viewable as bytes or as 32-bit words.
 * NOTE(review): the words[] view covers the whole pattern only if
 * RGP_UNACK_PKTLEN - 4 is a multiple of 4 - confirm. */
union
{
uint8 bytes[RGP_UNACK_PKTLEN - 4];
uint32 words[(RGP_UNACK_PKTLEN - 4)/4];
} testpattern;
} iamalive_pkt_t;
/************************************************************************
* poison_pkt_t
* ------------
* This structure is used to send a poison packet to another node to
* force the other node to halt.
*
* ___________________________________________________________
* wd0 | pktsubtype | unused1 | reason |
* |_____________|_______________|_____________________________|
* wd1 | seqno |
* |_____________________________|_____________________________|
* wd2 | activa- | causingnode | |
* | tingnode | | unused2 |
* |_____________|_______________|_____________________________|
* wd3 | initnodes | endnodes |
* |_____________________________|_____________________________|
*
*
* pktsubtype - packet subtype = RGP_UNACK_POISON
* reason - reason for the last activation of regroup
* seqno - current regroup sequence number
* (sequence number of last regroup incident)
* activatingnode - node which called for last regroup incident
* causingnode - node whose poll packet was missed or which
* had a power failure or otherwise caused
* the last regroup incident being called for
* initnodes - mask of nodes at beginning of last regroup
* endnodes - mask of nodes at end of last regroup
*
*/
#ifdef __TANDEM
#pragma fieldalign shared8 poison_pkt
#endif /* __TANDEM */
/* Poison packet: sent to another node to force that node to halt.
 * Hand-packed on-the-wire structure; do not reorder members.
 */
typedef struct poison_pkt
{
/* packet subtype = RGP_UNACK_POISON */
uint8 pktsubtype;
/* unused byte (keeps reason in the second half of wd0) */
uint8 unused1;
/* reason for the last activation of regroup */
uint16 reason;
/* current regroup sequence number
 * (sequence number of last regroup incident) */
uint32 seqno;
/* node which called for last regroup incident */
uint8 activatingnode;
/* node whose poll packet was missed, which had a power failure, or
 * which otherwise caused the last regroup incident */
uint8 causingnode;
/* unused half-word completing wd2 */
uint16 unused2;
/* mask of nodes at beginning of last regroup */
cluster_t initnodes;
/* mask of nodes at end of last regroup */
cluster_t endnodes;
} poison_pkt_t;
#include <packoff.h>
/*
 * The rgp_pkt_t structure has no room for a 16 bit ignorescreen mask,
 * so a few bits from several of its fields are used to store the
 * ignore screen. The two routines below pack the ignorescreen into,
 * and unpack it from, a packet.
 */
extern void PackIgnoreScreen(rgp_pkt_t *to, cluster_t from);
extern void UnpackIgnoreScreen(rgp_pkt_t *from, cluster_t to);

/* NOTE(review): semantics of `mask` are not visible in this header. */
extern void SetMulticastReachable(uint32 mask);
/*---------------------------------------------------------------------------*/
/* This struct keeps track of the state of each node in the cluster.        */
/* One instance per node is held in rgp_control_t.node_states[].            */
/*---------------------------------------------------------------------------*/
typedef struct
{
uint16 status; /* state of node - alive, dead etc. (RGP_NODE_* values) */
uint16 pollstate; /* whether IamAlives have been received */
uint16 lostHBs; /* tracks the number of consecutive IamAlives lost */
} node_state_t;
/* Values taken by the status and pollstate fields of node_state_t. */

/* node_state_t.status: node status of nodes */
#define RGP_NODE_ALIVE            1  /* node is considered alive     */
#define RGP_NODE_COMING_UP        2  /* node is coming up            */
#define RGP_NODE_DEAD             3  /* node has failed              */
#define RGP_NODE_NOT_CONFIGURED   4  /* node is not even configured  */

/* node_state_t.pollstate: IamAlive status codes of nodes */
#define AWAITING_IAMALIVE         1  /* awaiting IamAlives           */
#define IAMALIVE_RECEIVED         2  /* got IamAlive                 */

/* After getting this many IamAlives, we check if every node has sent
 * at least one.
 */
#define RGP_IAMALIVE_THRESHOLD    100
/************************************************************************
* rgp_control_t (regroup's only global data structure)
* ----------------------------------------------------
* This structure holds all the Regroup state and other info.
* This is the only global data structure used by Regroup.
*
* NOTE: The word offsets shown in this picture assume that
* MAX_CLUSTER_SIZE is 16.
*
* ___________________________________________________________
* wd0 | |
* : rgpinfo structure :
* : :
* |___________________________________________________________|
* wd3 | mynode | tiebreaker |
* |_____________________________|_____________________________|
* wd4 | num_nodes |
* |___________________________________________________________|
* wd5 | clock_ticks | rgpcounter |
* |_____________________________|_____________________________|
* wd6 | restartcount | pruning_ticks |
* |_____________________________|_____________________________|
* wd7 | pfail_state | flags |
* |_____________________________|_____________________________|
* wd8 | outerscreen | innerscreen |
* |_____________________________|_____________________________|
* wd9 | status_targets | poison_targets |
* |_____________________________|_____________________________|
* wd10| initnodes | endnodes |
* |_____________________________|_____________________________|
* wd11| unreachable_nodes | arbitration_ticks |
* |_____________________________|_____________________________|
* wd12| ignorescreen | filler[0] |
* |_____________________________|_____________________________|
* wd13| filler[1] | filler[2] |
* |_____________________________|_____________________________|
* wd14| |
* : node_states[MAX_CLUSTER_SIZE] :
* : :
* |___________________________________________________________|
* wd30| *nodedown_callback() |
* |___________________________________________________________|
* wd31| *select_cluster() |
* |___________________________________________________________|
* wd32| *rgp_msgsys_p |
* |___________________________________________________________|
* wd33| *received_pktaddr |
* |___________________________________________________________|
* wd34| |
* : rgppkt :
* : :
* |___________________________________________________________|
* wd48| |
* : rgppkt_to_send :
* : :
* |___________________________________________________________|
* wd62| |
* : iamalive_pkt :
* : :
* |___________________________________________________________|
* wd76| |
* : poison_pkt :
* |___________________________________________________________|
* wd80| |
* : :
* : potential_groups[MAX_GROUPS] :
* : :
* |___________________________________________________________|
*wd208| |
* : last_stable_seqno :
* |___________________________________________________________|
*wd212| |
* : internal_connectivity_matrix :
* |___________________________________________________________|
*wdyyy| |
* : OS_specific_control :
*wdxxx|___________________________________________________________|
*
*
* rgpinfo - contains regroup timing parameters and mask of
* fully-integrated cluster (to send IamAlives and monitor)
*
* mynode - node number of local node
*
* tiebreaker - node selected to act as a tie-breaker in the
* split-brain avoidance algorithm and to run the
* pruning algorithm
*
* num_nodes - number of nodes configured in the system, including
* any unused node numbers in the middle; this is equal
* to (the largest configured node# in the system -
* lowest possible node # + 1).
*
* clock_ticks- regroup's internal clock used for checking if it is
* time to send IamAlive packets and to check if IamAlives
* have been received. It is incremented every
* RGP_CLOCK_PERIOD and reset to 0 after checking
* for IamAlives.
*
* rgpcounter - counts regroup clock ticks in a regroup incident in
* order to detect if the algorithm is stalling.
* This is reset when a new regroup incident begins and
* is incremented at each regroup clock tick while
* regroup is perturbed.
*
* restartcount - counts # of regroup algorithm restarts in each regroup
* incident; the node is halted if there are too many
* restarts.
*
* pruning_ticks - number of regroup clock ticks after the tie-breaker
* has been selected; if there are disconnects, the
* tie-breaker should wait a fixed number of ticks
* before running the pruning algorithm.
*
* pfail_state - set to a +ve value when a pfail event is reported
* to regroup. It is decremented at every regroup
* clock tick till it reaches zero. While this number
* is +ve, missing self IamAlives are ignored and
* do not cause the node to halt. This gives the
* sending hardware some time to recover from power
* failures before self IamAlives are checked.
*
* outerscreen - outer recognition mask: nodes not in this mask are
* considered dead or outcasts; if they try to contact
* us, send them poison packets to make sure they stay down
*
* innerscreen - inner recognition mask: nodes not in this mask are
* considered tardy. Regroup packets from them will be
* ignored. They may survive if they can find some
* node which hasn't eliminated them from this screen.
*
* status_targets - nodes to send regroup status packets to
*
* poison_targets - nodes to send poison packets to
*
* initnodes - nodes alive at the beginning of last regroup incident
*
* endnodes - nodes alive at the end of last regroup incident
*
* unreachable_nodes - stores unreachable_node events till the events
* can be processed
*
* arbitration_ticks - number of regroup clock ticks after the arbitration
* started. If arbitration_ticks counter exceeds
* RGP_ARBITRATION_TIMEOUT number of ticks,
* the arbitrating node will shoot itself, and the rest
* of the group will restart the regroup ignoring stalled
* arbitrator
*
* ignorescreen - this is a local copy of ignorescreen passed as
* a part of the regroup packet. The packets from
* the nodes in this screen are ignored and no wait
* for the nodes in ignorescreen is performed in stage 1
*
* last_stable_seqno - the sequence number of the last successful regroup.
* It allows really outdated packets to be detected
*
* flags:
*
* cautiousmode - need to be "cautious"; wait longer in stage 1
*
* sendstage - This flag is used to indicate whether the
* regroup status packets should indicate we
* are in the current stage. When we enter the
* cleanup stages, we don't let others know we
* are in the stage until the cleanup actions
* are completed.
*
* This flag is set when a new regroup incident
* is started. It is then cleared when we enter
* a cleanup stage and set again when the
* cleanup operations are completed.
*
* tiebreaker_selected - set in stage 2 after tie-breaker is selected
*
* has_unreachable_nodes - set when a node_unreachable event is detected
* in stages 1 or 2. checked in stage 3.
*
* flags_unused - 11 unused bits
*
* node_states[MAX_CLUSTER_SIZE] - state of all the nodes
*
* *nodedown_callback() - registered callback routine to be invoked
* to report node failure
*
* *select_cluster() - registered callback routine to be invoked
* when multiple cluster options exist
*
* *rgp_msgsys_p - pointer to struct shared by regroup and message system
*
* *received_pktaddr - address of rgp packet received
*
* rgp_lock - lock to serialize access to this struct
*
* rgppkt - regroup status in the form of a packet
*
* rgppkt_to_send - regroup packet to be broadcast
*
* iamalive_pkt - I am alive packet to be broadcast
*
* poison_pkt - poison packet to be sent
*
* potential_groups[MAX_GROUPS] - scratch pad for pruning algorithm
*
*/
#ifdef __TANDEM
#pragma fieldalign shared8 rgp_control
#endif /* __TANDEM */
/* Regroup's only global data structure: holds all Regroup state.
 * Member order matters (shared8 field alignment on Tandem, and the
 * layout picture above); do not reorder members.
 *
 * NOTE(review): the field list above mentions rgp_lock, but no member
 * of that name is declared here - presumably it lives inside
 * OS_specific_control; confirm.
 */
typedef struct rgp_control
{
/* timing parameters and cluster membership */
rgpinfo_t rgpinfo;
/* node numbers: local node, and node chosen as tie-breaker */
node_t mynode;
node_t tiebreaker;
/* number of configured node numbers (largest configured node# -
 * lowest possible node# + 1) */
uint32 num_nodes;
/* various counters, mostly counting regroup clock ticks
 * (restartcount counts algorithm restarts; pfail_state counts down) */
uint16 clock_ticks;
uint16 rgpcounter;
uint16 restartcount;
uint16 pruning_ticks;
uint16 pfail_state;
/* rgpflags: 5 one-bit flags + 11 unused bits = one uint16 */
uint16 cautiousmode : 1;
uint16 sendstage : 1;
uint16 tiebreaker_selected : 1;
uint16 has_unreachable_nodes : 1;
/* NOTE(review): arbitration_started is not described in the field
 * list above; presumably it marks that arbitration has begun (see
 * arbitration_ticks) - confirm. */
uint16 arbitration_started : 1;
uint16 flags_unused : 11;
/* cluster masks */
cluster_t outerscreen;
cluster_t innerscreen;
cluster_t status_targets;
cluster_t poison_targets;
cluster_t initnodes;
cluster_t endnodes;
cluster_t unreachable_nodes;
/* regroup clock ticks since arbitration started */
uint16 arbitration_ticks;
/* local copy of the ignorescreen passed in the regroup packet */
cluster_t ignorescreen;
uint16 filler[3]; /* for alignment and future use */
/* node states
 * NOTE(review): node_state_t has three uint16 members, so for
 * MAX_CLUSTER_SIZE == 16 this array looks larger than the 16 words
 * (wd14..wd29) the layout picture allots; the word offsets shown from
 * here on appear stale - confirm. */
node_state_t node_states[MAX_CLUSTER_SIZE];
/* callback routines: node-failure report, and choice among multiple
 * cluster options */
void (*nodedown_callback)(cluster_t failed_nodes);
int (*select_cluster)(cluster_t cluster_choices[], int num_clusters);
/* pointers to other structures */
rgp_msgsys_p rgp_msgsys_p;
rgp_pkt_t *received_pktaddr;
/* current status in the form of a regroup packet */
rgp_pkt_t rgppkt;
/* packets to be sent */
rgp_pkt_t rgppkt_to_send;
iamalive_pkt_t iamalive_pkt;
poison_pkt_t poison_pkt;
/* scratch pad for node pruning algorithm */
cluster_t potential_groups[MAX_GROUPS];
/* sequence number of the last successful regroup; used to detect
 * really outdated packets */
uint32 last_stable_seqno;
/* temporary place to collect connectivity information
* while sendstage = 0. (Can't use rgp_pkt conn.matrix,
* because we don't want to see our info until we get
* the first timer tick */
connectivity_matrix_t internal_connectivity_matrix;
/* The rest of the struct is an OS-specific substruct
* (defined in wrgpos.h).
*/
OS_specific_rgp_control_t OS_specific_control;
} rgp_control_t;
/*---------------------------------------------------------------------------*/
/* Procedures exported by the Cluster type implementation.
 *
 * Only prototypes are visible in this header; the grouping remarks below
 * are inferred from routine and parameter names - TODO confirm against
 * the implementation.
 */

/* Initialization of a cluster_t. */
_priv _resident extern void ClusterInit(cluster_t c);

/* Two-source operations; presumably dst receives the result. */
_priv _resident extern void ClusterUnion(
    cluster_t dst, cluster_t src1, cluster_t src2);
_priv _resident extern void ClusterIntersection(
    cluster_t dst, cluster_t src1, cluster_t src2);
_priv _resident extern void ClusterDifference(
    cluster_t dst, cluster_t src1, cluster_t src2);

/* Comparisons between two cluster_t values. */
_priv _resident extern int ClusterCompare(cluster_t c1, cluster_t c2);
_priv _resident extern int ClusterSubsetOf(cluster_t big, cluster_t small);

/* One-source operation; presumably dst receives the result. */
_priv _resident extern void ClusterComplement(cluster_t dst, cluster_t src);

/* Single-node operations. ClusterMember is used by connected() and
 * node_considered_alive() as a truth value for "node i is in c".
 */
_priv _resident extern int ClusterMember(cluster_t c, node_t i);
_priv _resident extern void ClusterInsert(cluster_t c, node_t i);
_priv _resident extern void ClusterDelete(cluster_t c, node_t i);

/* Whole-value transfer between two cluster_t values. */
_priv _resident extern void ClusterCopy(cluster_t dst, cluster_t src);
_priv _resident extern void ClusterSwap(cluster_t c1, cluster_t c2);

/* Queries on the population of a cluster_t. */
_priv _resident extern int ClusterNumMembers(cluster_t c);
extern int ClusterEmpty(cluster_t c);
/*---------------------------------------------------------------------------*/
/* rgp_select_tiebreaker: selects the tie-breaker node. The tie-breaker
 * is used both by the split-brain avoidance algorithm and by the node
 * pruning algorithm.
 */
_priv _resident extern node_t rgp_select_tiebreaker(cluster_t cluster);
/*---------------------------------------------------------------------------*/
/* Procedures exported by the node pruning algorithm */

/* Initialize the matrix c to show 0 connectivity. */
_priv _resident extern void MatrixInit(connectivity_matrix_t c);

/* Set c[row,column] to 1. */
_priv _resident extern void MatrixSet(
    connectivity_matrix_t c, int row, int column);

/* OR in s into t. */
_priv _resident extern void MatrixOr(
    connectivity_matrix_t t, connectivity_matrix_t s);

/* Returns 1 if all live nodes are connected to all other live nodes
 * and 0 if there is at least one disconnect.
 */
_priv _resident extern int connectivity_complete(connectivity_matrix_t c);

/* Analyzes the connectivity matrix and comes up with the list of
 * all maximal, fully-connected groups. Returns the number of
 * such groups found. 0 is returned iff there are no live nodes.
 * NOTE(review): selected_node is presumably the node that an
 * arbitrarily chosen group must include when there are too many
 * groups to enumerate (see too_many_groups) - confirm.
 */
_priv _resident extern int find_all_fully_connected_groups(
    connectivity_matrix_t c,
    node_t selected_node,
    cluster_t groups[]);
/*---------------------------------------------------------------------------*/
/* Declaration of Regroup's global data structure.
 * `rgp` always denotes a pointer to the rgp_control_t:
 *  - under NSK it is a macro that casts MSGROOT->RegroupControlAddr;
 *  - otherwise it is an ordinary pointer variable defined elsewhere.
 */
#ifdef NSK
#include <wmsgsac.h>
#define rgp ((rgp_control_t *) MSGROOT->RegroupControlAddr)
#else
extern rgp_control_t *rgp;
#endif /* NSK */
/*---------------------------------------------------------------------------*/
#ifdef __cplusplus
}
#endif /* __cplusplus */
#if 0
History of changes to this file:
-------------------------------------------------------------------------
1995, December 13 F40:KSK0610 /*F40:KSK06102.1*/
This file is part of the portable Regroup Module used in the NonStop
Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There
are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h,
srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c.
The last two are simulation files to test the Regroup Module on a
UNIX workstation in user mode with processes simulating processor nodes
and UDP datagrams used to send unacknowledged datagrams.
This file was first submitted for release into NSK on 12/13/95.
------------------------------------------------------------------------------
#endif /* 0 - change descriptions */
#endif /* _WRGP_H_ defined */