NT4/private/ntos/nthals/halalpha/errframe.h
2020-09-30 17:12:29 +02:00

698 lines
18 KiB
C

/*++
Copyright (c) 1995,1996 Digital Equipment Corporation
Module Name:
errframe.h
Abstract:
Definitions for both the correctable and uncorrectable error
frames.
Author:
Joe Notarangelo 10-Mar-1995
Chao Chen 24-Apr-1995
Environment:
Kernel mode only.
Revision History:
0.1 28-Feb-1995 Joe Notarangelo Initial version.
0.2 10-Mar-1995 Joe Notarangelo Incorporate initial review comments from
6-Mar-95 review with: C. Chen, S. Jenness,
Bala, E. Rehm.
0.3 24-Apr-1995 Chao Chen Made into .h file for inclusion by other
modules.
0.4 16-Apr-1996 Gene Morgan Add fields to SystemError struct for PSU,
Wachdog events.
--*/
#ifndef ERRFRAME_H
#define ERRFRAME_H
#include "errplat.h"
/*
*
* Common defines.
*
*/
#define ERROR_FRAME_VERSION 0x1
#define UNIDENTIFIED 0x0
#define MEMORY_SPACE 0x1
#define IO_SPACE 0x2
#define PROCESSOR_CACHE 0x2
#define SYSTEM_CACHE 0x3
#define PROCESSOR_MEMORY 0x4
#define SYSTEM_MEMORY 0x5
#define CACHE_ERROR_MASK 0x2
#define MEMORY_ERROR_MASK 0x4
#define BUS_DMA_READ 0x1
#define BUS_DMA_WRITE 0x2
#define BUS_DMA_OP 0x3
#define BUS_IO_READ 0x4
#define BUS_IO_WRITE 0x5
#define BUS_IO_OP 0x6
#define TLB_MISS_READ 0x7
#define TLB_MISS_WRITE 0x8
#define VICTIM_WRITE 0x9
#define MAX_UNCORRERR_STRLEN 128
#define ERROR_FRAME_SIGNATURE 0xE22F2F2A // ERRORFRA R=2 (18%16)
/*
*
* Definitions for the correctable error frame.
*
*/
//
// Correctable Error Flags
//
typedef struct _CORRECTABLE_FLAGS{
//
// Address space
// Unidentified: 00
// Memory Space: 01
// I/O Space: 10
// Reserved: 11
//
ULONGLONG AddressSpace: 2;
//
// Error Interpretation Validity.
//
ULONGLONG ServerManagementInformationValid: 1;
ULONGLONG PhysicalAddressValid: 1;
ULONGLONG ErrorBitMasksValid: 1;
ULONGLONG ExtendedErrorValid: 1;
ULONGLONG ProcessorInformationValid: 1;
ULONGLONG SystemInformationValid: 1;
//
// Memory Space Error Source:
//
// Unidentified: 000
// ProcessorCache: 010
// SystemCache: 011
// ProcessorMemory: 100
// SystemMemory: 101
//
ULONGLONG MemoryErrorSource: 4;
//
// Driver Actions.
//
ULONGLONG ScrubError: 1;
//
// Lost errors.
//
ULONGLONG LostCorrectable: 1;
ULONGLONG LostAddressSpace: 2;
ULONGLONG LostMemoryErrorSource: 4;
} CORRECTABLE_FLAGS, *PCORRECTABLE_FLAGS;
//
// Description of CORRECTABLE_FLAG structure:
//
// AddressSpace
//
// Identifies the system address space that was the source of the
// correctable error.
//
// PhysicalAddressValid
//
// Indicates if the physical address in the CORRECTABLE_ERROR
// structure is valid. A value of 1 indicates the physical address
// is valid, a value of 0 indicates the physical address is not
// valid.
//
// ErrorBitMasksValid
//
// Indicates if the error bit mask fields in the CORRECTABLE_ERROR
// structure are valid. A value of 1 indicates the DataBitErrorMask
// and the CheckBitErrorMask are valid, a value of 0 indicates they
// are not valid.
//
// ExtendedErrorValid
//
// Indicates if the extended error information structure in the
// CORRECTABLE_ERROR structure is valid. A value of 1 indicates the
// extended error information structure is valid, a value of 0
// indicates that it is not.
//
// ProcessorInformationValid
//
// Indicates if the raw processor information pointer in the
// CORRECTABLE_ERROR structure is valid. A value of 1 indicates the
// processor information is valid, a value of 0 indicates it is not.
//
// SystemInformationValid
//
// Indicates if the raw system information pointer in the
// CORRECTABLE_ERROR structure is valid. A value of 1 indicates the
// system information is valid, a value of 0 indicates it is not.
//
// ServerManagementInformationValid
//
// Indicates that the server management information in the extended
// error information structure is valid. The server management
// information relays information about failed fans or high
// temperature in the system.
//
//
// MemoryErrorSource
//
// Identifies the source of a memory error as either main error
// or cache for either a system or processor.
//
// ScrubError
//
// Instructs the driver to scrub the correctable error. If the
// value is 1 the driver should scrub the error, if the value is
// 0 the driver must not scrub the error.
//
// LostCorrectable
//
// Identifies if a lost correctable error has been reported. A
// lost error is an error that reported by the hardware while
// correctable error handling for a previous error was in progress.
// A value of 1 indicates that a correctable error report was lost.
//
// LostAddressSpace
//
// Identifies the system address space that was the source of the
// correctable error. Valid only if LostCorrectable == 1.
//
// LostMemoryErrorSource
//
// Identifies the source of a memory error as either main error
// or cache for either a system or processor. Valid only if
// LostCorrectable == 1.
//
//
// Processor information.
//
typedef struct _PROCESSOR_INFO{
ULONG ProcessorType;
ULONG ProcessorRevision;
ULONG PhysicalProcessorNumber;
ULONG LogicalProcessorNumber;
} PROCESSOR_INFO, *PPROCESSOR_INFO;
//
// Description of PROCESSOR_INFO structure:
//
// ProcessorType
//
// Identifies the type of processor running on the system
// (eg. 21064, 21066, 21164). Note that the type of processor
// is expected to be consistent across the system for MP machines.
//
// ProcessorRevision
//
// Identifies the revision number of the processor running on
// the system. Note that the revision is expected to be consistent
// across the system for MP machines.
//
// PhysicalProcessorNumber
//
// The physical processor number as numbered in the hardware
// specifications.
//
// LogicalProcessorNumber
//
// The logical processor number assigned to the processor by NT.
//
//
// System Information.
//
typedef struct _SYSTEM_INFORMATION{
UCHAR SystemType[8];
ULONG ClockSpeed;
ULONG OsRevisionId;
ULONG PalMajorVersion;
ULONG PalMinorVersion;
UCHAR FirmwareRevisionId[16];
ULONG SystemVariant;
ULONG SystemRevision;
UCHAR SystemSerialNumber[16];
ULONG ModuleVariant;
ULONG ModuleRevision;
ULONG ModuleSerialNumber;
} SYSTEM_INFORMATION, *PSYSTEM_INFORMATION;
//
// Description of SYSTEM_INFORMATION structure:
//
// SystemType
//
// Identifies the type of system that reported the error
// (eg. "Sable", "Gamma"). The actual format and value of the
// SystemType string is system-specific.
//
// OsRevisionId
//
// A numeric value that identifies the OS revision executing
// on the system that reported the fault.
//
// PalRevisionId
//
// A numeric value that identifies the pal revision executing
// on the system that reported the fault.
//
// FirmwareRevisionId
//
// A numeric value that identifies the firmware revision executing
// on the system that reported the fault.
//
// SystemVariant
//
// A numeric value used to distinguish variants of the same system
// type. The values and their interpretation are system_specific.
//
// SystemRevision
//
// A numeric value used to distinguish revisions of the same system
// type. The values and their interpretation are system_specific.
//
//
//
// Extended Error Information.
//
typedef union _EXTENDED_ERROR{
struct{
struct{
ULONG CacheLevelValid: 1;
ULONG CacheBoardValid: 1;
ULONG CacheSimmValid: 1;
} Flags;
PROCESSOR_INFO ProcessorInfo;
ULONG CacheLevel;
ULONG CacheBoardNumber;
ULONG CacheSimm;
ULONG TransferType;
ULONG Reserved;
} CacheError;
struct{
struct{
ULONG MemoryBoardValid: 1;
ULONG MemorySimmValid: 1;
} Flags;
PROCESSOR_INFO ProcessorInfo;
ULONG MemoryBoard;
ULONG MemorySimm;
ULONG TransferType;
ULONG Reserved[2];
} MemoryError;
struct{
INTERFACE_TYPE Interface;
ULONG BusNumber;
PHYSICAL_ADDRESS BusAddress;
ULONG TransferType;
ULONG Reserved[5];
} IoError;
struct{
struct{
ULONG FanNumberValid: 1;
ULONG TempSensorNumberValid: 1;
ULONG PowerSupplyNumberValid: 1;
ULONG WatchDogExpiredValid: 1;
} Flags;
ULONG FanNumber;
ULONG TempSensorNumber;
ULONG PowerSupplyNumber;
ULONG WatchdogExpiration;
ULONG Reserved[5];
} SystemError;
} EXTENDED_ERROR, *PEXTENDED_ERROR;
//
// Description of EXTENDED_ERROR union:
//
// The EXTENDED_ERROR union has different interpretation depending
// upon the AddressSpace and MemoryErrorSource fields of the
// CORRECTABLE_FLAGS structure according to the following table:
//
// 1. AddressSpace=MemorySpace
// MemoryErrorSource = 01x use CacheError structure
//
// 2. AddressSpace=MemorySpace
// MemoryErrorSource = 10x use MemoryError structure
//
// 3. AddressSpace=IoSpace use IoError
//
// 4. AddressSpace=Unidentified use SystemError
// MemoryErrorSource = 0x0 (note: ServerManagementInformationValid
// should be set)
//
//
// CacheError.Flags
//
// Identifies which fields of the CacheError structure are valid.
// CacheLevelValid = 1 indicates CacheLevel is valid.
// CacheBoardValid = 1 indicates CacheBoardNumber is valid.
// CacheSimmValid = 1 indicates CacheSimm is valid.
//
// CacheError.ProcessorInfo
//
// Identifies the processor associated with the error. Most
// frequently will identify the processor that experienced and
// reported the error. However, it is possible that the processor
// that is reporting has experienced the error from another
// processor's cache. This field is valid only if the
// MemoryErrorSource = 010 .
//
// CacheError.CacheLevel
//
// Identifies the level of the cache that caused the error. Primary
// caches are Level 1, Secondary are Level 2, etc.. This field
// only valid if CacheError.Flags.CacheLevelValid == 1.
//
// CacheError.CacheBoardNumber
//
// Identifies the board number of the cache that caused the error.
// This field only valid if
// CacheError.Flags.CacheBoardNumberValid == 1.
//
// CacheError.CacheSimm
//
// Identifies the Cache Simm that caused the error.
// This field only valid if CacheError.Flags.CacheSimmValid == 1.
//
//
// MemoryError.Flags
//
// Identifies which fields of the CacheError structure are valid.
// CacheLevelValid = 1 indicates CacheLevel is valid.
// CacheBoardValid = 1 indicates CacheBoardNumber is valid.
// CacheSimmValid = 1 indicates CacheSimm is valid.
//
// MemoryError.ProcessorInfo
//
// Identifies the processor associated with the error. Most
// frequently will identify the processor that experienced and
// reported the error. However, it is possible that the processor
// that is reporting has experienced the error from another
// processor's cache. This field is valid only if the
// MemoryErrorSource = 010 .
//
// MemoryError.MemoryBoardNumber
//
// Identifies the board number of the cache that caused the error.
// This field only valid if MemoryError.Flags.MemoryBoardValid == 1.
//
// MemoryError.MemorySimm
//
// Identifies the memory SIMM that caused the error.
// This field only valid if MemoryError.Flags.MemorySimmValid == 1.
//
//
// IoError.Interface
//
// Identifies the bus interface type (eg. PCI) of the bus that caused
// the correctable error.
//
// IoError.BusNumber
//
// Identifies the bus number of the bus that caused the correctable
// error.
//
// IoError.BusAddress
//
// Identifies the bus address of the bus that caused the correctable
// error.
//
//
// Correctable Error Frame.
//
typedef struct _CORRECTABLE_ERROR{
CORRECTABLE_FLAGS Flags;
ULONGLONG PhysicalAddress;
ULONGLONG DataBitErrorMask;
ULONGLONG CheckBitErrorMask;
EXTENDED_ERROR ErrorInformation;
PROCESSOR_INFO ReportingProcessor;
SYSTEM_INFORMATION System;
ULONG RawProcessorInformationLength;
PVOID RawProcessorInformation;
ULONG RawSystemInformationLength;
PVOID RawSystemInformation;
ULONG Reserved;
} CORRECTABLE_ERROR, *PCORRECTABLE_ERROR;
//
// Description of CORRECTABLE_ERROR structure:
//
// Flags
//
// The flags describe the various aspects of the error report. The
// CORRECTABLE_FLAGS structure is described below.
//
// PhysicalAddress
//
// The physical CPU address of the quadword that caused the correctable
// error report. This value is optional, its validiity is indicated
// in the Flags.
//
// DataBitErrorMask
//
// A mask that describes which data bits in the corrected word were
// in error. A value of one in the mask indicates that the
// corresponding bit in the data word were in error.
//
// CheckBitErrorMask
//
// A mask that describes which check bits in the corrected word were
// in error. A value of one in the mask indicates that the
// corresponding bit in the check bits were in error.
//
// ErrorInformation
//
// A structure that desribes interpretation of the error. The values
// in the structure are optional, the EXTENDED_ERROR structure is
// described below.
//
// ReportingProcessor
//
// A structure that describes the processor type on the system and
// the processor that reported the error. PROCESSOR_INFO structure
// is described below.
//
// RawProcessorInformationLength
//
// The length of the raw processor error information structure in
// bytes.
//
// RawProcessorInformation
//
// A pointer to the raw processor error information structure. The
// definition of the structure is processor-specific. The definitions
// for the known processors is defined in Appendix A below.
//
// System
//
// A structure that describes the type of system for which the
// error was reported. SYSTEM_INFORMATION structure is described below.
//
// RawSystemInformationLength
//
// The length of the raw processor error information structure in
// bytes.
//
// RawSystemInformation
//
// A pointer to the raw system error information structure. The
// definition of the structure is system/ASIC-specific. The
// definitions for the known systems/ASICs is defined in Appendix B.
//
//
/*
*
* Definitions for the uncorrectable error frame.
*
*/
//
// Uncorrectable Error Flags.
//
typedef struct _UNCORRECTABLE_FLAGS{
//
// Address space
// Unidentified: 00
// Memory Space: 01
// I/O Space: 10
// Reserved: 11
//
ULONGLONG AddressSpace: 2;
//
// Error Interpretation Validity.
//
ULONGLONG ErrorStringValid: 1;
ULONGLONG PhysicalAddressValid: 1;
ULONGLONG ErrorBitMasksValid: 1;
ULONGLONG ExtendedErrorValid: 1;
ULONGLONG ProcessorInformationValid: 1;
ULONGLONG SystemInformationValid: 1;
//
// Memory Space Error Source:
//
// Unidentified: 000
// ProcessorCache: 010
// SystemCache: 011
// ProcessorMemory: 100
// SystemMemory: 101
//
ULONGLONG MemoryErrorSource: 4;
} UNCORRECTABLE_FLAGS, *PUNCORRECTABLE_FLAGS;
//
// The extended error information, processor information and system
// information structures are identical for correctable and uncorrectable
// errors. The rules for printing the failing FRU are also identical
// to the rules for the correctable errors.
//
//
// Uncorrectable Error Frame.
//
typedef struct _UNCORRECTABLE_ERROR{
UNCORRECTABLE_FLAGS Flags;
CHAR ErrorString[MAX_UNCORRERR_STRLEN];
ULONGLONG PhysicalAddress;
ULONGLONG DataBitErrorMask;
ULONGLONG CheckBitErrorMask;
EXTENDED_ERROR ErrorInformation;
PROCESSOR_INFO ReportingProcessor;
SYSTEM_INFORMATION System;
ULONG RawProcessorInformationLength;
PVOID RawProcessorInformation;
ULONG RawSystemInformationLength;
PVOID RawSystemInformation;
ULONG Reserved;
} UNCORRECTABLE_ERROR, *PUNCORRECTABLE_ERROR;
/*
*
* Generic definitions for the error frame.
*
*/
//
// Generic error frame.
//
typedef enum _FRAME_TYPE{
CorrectableFrame,
UncorrectableFrame
} FRAME_TYPE, *PFRAME_TYPE;
typedef struct _ERROR_FRAME{
ULONG Signature; // Needed to make sure that the buffer is infact
// an error frame.
ULONG LengthOfEntireErrorFrame;
FRAME_TYPE FrameType;
ULONG VersionNumber;
ULONG SequenceNumber;
ULONGLONG PerformanceCounterValue;
union {
CORRECTABLE_ERROR CorrectableFrame;
UNCORRECTABLE_ERROR UncorrectableFrame;
};
} ERROR_FRAME, *PERROR_FRAME;
//
// Description of the generic error frame structure:
//
// FrameType
//
// Specify which frame type we have. Either correctable or
// uncorrectable.
//
// VersionNumber
//
// Defines the version of the error structure. Current version
// number = 1.
//
// SequenceNumber
//
// A number that identifies a particular error frame.
//
// PerformanceCounterValue
//
// The value of the system performance counter when the error was
// reported. The value is determined during error processing by the
// HAL and so may be captured significantly after the hardware
// detected the error. The value cannot be used for fine grained
// timing of when errors occurred but can be used for coarse grained
// timing and approximate timing.
//
// CorrectableFrame
//
// Shared common area for a correctable frame.
//
// UncorrectableFrame
//
// Shared common area for an uncorrectable frame.
//
#endif //ERRFRAME_H