698 lines
18 KiB
C
698 lines
18 KiB
C
/*++
|
|
|
|
Copyright (c) 1995,1996 Digital Equipment Corporation
|
|
|
|
Module Name:
|
|
|
|
errframe.h
|
|
|
|
Abstract:
|
|
|
|
Definitions for both the correctable and uncorrectable error
|
|
frames.
|
|
|
|
Author:
|
|
|
|
Joe Notarangelo 10-Mar-1995
|
|
Chao Chen 24-Apr-1995
|
|
|
|
Environment:
|
|
|
|
Kernel mode only.
|
|
|
|
Revision History:
|
|
|
|
0.1 28-Feb-1995 Joe Notarangelo Initial version.
|
|
|
|
0.2 10-Mar-1995 Joe Notarangelo Incorporate initial review comments from
|
|
6-Mar-95 review with: C. Chen, S. Jenness,
|
|
Bala, E. Rehm.
|
|
|
|
0.3 24-Apr-1995 Chao Chen Made into .h file for inclusion by other
|
|
modules.
|
|
|
|
0.4 16-Apr-1996 Gene Morgan Add fields to SystemError struct for PSU,
|
|
Wachdog events.
|
|
|
|
--*/
|
|
|
|
#ifndef ERRFRAME_H
|
|
#define ERRFRAME_H
|
|
|
|
#include "errplat.h"
|
|
|
|
/*
|
|
*
|
|
* Common defines.
|
|
*
|
|
*/
|
|
|
|
#define ERROR_FRAME_VERSION 0x1
|
|
|
|
#define UNIDENTIFIED 0x0
|
|
#define MEMORY_SPACE 0x1
|
|
#define IO_SPACE 0x2
|
|
#define PROCESSOR_CACHE 0x2
|
|
#define SYSTEM_CACHE 0x3
|
|
#define PROCESSOR_MEMORY 0x4
|
|
#define SYSTEM_MEMORY 0x5
|
|
|
|
#define CACHE_ERROR_MASK 0x2
|
|
#define MEMORY_ERROR_MASK 0x4
|
|
|
|
#define BUS_DMA_READ 0x1
|
|
#define BUS_DMA_WRITE 0x2
|
|
#define BUS_DMA_OP 0x3
|
|
#define BUS_IO_READ 0x4
|
|
#define BUS_IO_WRITE 0x5
|
|
#define BUS_IO_OP 0x6
|
|
#define TLB_MISS_READ 0x7
|
|
#define TLB_MISS_WRITE 0x8
|
|
#define VICTIM_WRITE 0x9
|
|
|
|
#define MAX_UNCORRERR_STRLEN 128
|
|
|
|
#define ERROR_FRAME_SIGNATURE 0xE22F2F2A // ERRORFRA R=2 (18%16)
|
|
|
|
/*
|
|
*
|
|
* Definitions for the correctable error frame.
|
|
*
|
|
*/
|
|
|
|
//
|
|
// Correctable Error Flags
|
|
//
|
|
|
|
typedef struct _CORRECTABLE_FLAGS{
|
|
|
|
//
|
|
// Address space
|
|
// Unidentified: 00
|
|
// Memory Space: 01
|
|
// I/O Space: 10
|
|
// Reserved: 11
|
|
//
|
|
|
|
ULONGLONG AddressSpace: 2;
|
|
|
|
//
|
|
// Error Interpretation Validity.
|
|
//
|
|
|
|
ULONGLONG ServerManagementInformationValid: 1;
|
|
ULONGLONG PhysicalAddressValid: 1;
|
|
ULONGLONG ErrorBitMasksValid: 1;
|
|
ULONGLONG ExtendedErrorValid: 1;
|
|
ULONGLONG ProcessorInformationValid: 1;
|
|
ULONGLONG SystemInformationValid: 1;
|
|
|
|
//
|
|
// Memory Space Error Source:
|
|
//
|
|
// Unidentified: 000
|
|
// ProcessorCache: 010
|
|
// SystemCache: 011
|
|
// ProcessorMemory: 100
|
|
// SystemMemory: 101
|
|
//
|
|
|
|
ULONGLONG MemoryErrorSource: 4;
|
|
|
|
//
|
|
// Driver Actions.
|
|
//
|
|
|
|
ULONGLONG ScrubError: 1;
|
|
|
|
//
|
|
// Lost errors.
|
|
//
|
|
|
|
ULONGLONG LostCorrectable: 1;
|
|
ULONGLONG LostAddressSpace: 2;
|
|
ULONGLONG LostMemoryErrorSource: 4;
|
|
|
|
} CORRECTABLE_FLAGS, *PCORRECTABLE_FLAGS;
|
|
|
|
//
|
|
// Description of CORRECTABLE_FLAG structure:
|
|
//
|
|
// AddressSpace
|
|
//
|
|
// Identifies the system address space that was the source of the
|
|
// correctable error.
|
|
//
|
|
// PhysicalAddressValid
|
|
//
|
|
// Indicates if the physical address in the CORRECTABLE_ERROR
|
|
// structure is valid. A value of 1 indicates the physical address
|
|
// is valid, a value of 0 indicates the physical address is not
|
|
// valid.
|
|
//
|
|
// ErrorBitMasksValid
|
|
//
|
|
// Indicates if the error bit mask fields in the CORRECTABLE_ERROR
|
|
// structure are valid. A value of 1 indicates the DataBitErrorMask
|
|
// and the CheckBitErrorMask are valid, a value of 0 indicates they
|
|
// are not valid.
|
|
//
|
|
// ExtendedErrorValid
|
|
//
|
|
// Indicates if the extended error information structure in the
|
|
// CORRECTABLE_ERROR structure is valid. A value of 1 indicates the
|
|
// extended error information structure is valid, a value of 0
|
|
// indicates that it is not.
|
|
//
|
|
// ProcessorInformationValid
|
|
//
|
|
// Indicates if the raw processor information pointer in the
|
|
// CORRECTABLE_ERROR structure is valid. A value of 1 indicates the
|
|
// processor information is valid, a value of 0 indicates it is not.
|
|
//
|
|
// SystemInformationValid
|
|
//
|
|
// Indicates if the raw system information pointer in the
|
|
// CORRECTABLE_ERROR structure is valid. A value of 1 indicates the
|
|
// system information is valid, a value of 0 indicates it is not.
|
|
//
|
|
// ServerManagementInformationValid
|
|
//
|
|
// Indicates that the server management information in the extended
|
|
// error information structure is valid. The server management
|
|
// information relays information about failed fans or high
|
|
// temperature in the system.
|
|
//
|
|
//
|
|
// MemoryErrorSource
|
|
//
|
|
// Identifies the source of a memory error as either main error
|
|
// or cache for either a system or processor.
|
|
//
|
|
// ScrubError
|
|
//
|
|
// Instructs the driver to scrub the correctable error. If the
|
|
// value is 1 the driver should scrub the error, if the value is
|
|
// 0 the driver must not scrub the error.
|
|
//
|
|
// LostCorrectable
|
|
//
|
|
// Identifies if a lost correctable error has been reported. A
|
|
// lost error is an error that reported by the hardware while
|
|
// correctable error handling for a previous error was in progress.
|
|
// A value of 1 indicates that a correctable error report was lost.
|
|
//
|
|
// LostAddressSpace
|
|
//
|
|
// Identifies the system address space that was the source of the
|
|
// correctable error. Valid only if LostCorrectable == 1.
|
|
//
|
|
// LostMemoryErrorSource
|
|
//
|
|
// Identifies the source of a memory error as either main error
|
|
// or cache for either a system or processor. Valid only if
|
|
// LostCorrectable == 1.
|
|
//
|
|
|
|
//
|
|
// Processor information.
|
|
//
|
|
|
|
typedef struct _PROCESSOR_INFO{
|
|
|
|
ULONG ProcessorType;
|
|
ULONG ProcessorRevision;
|
|
ULONG PhysicalProcessorNumber;
|
|
ULONG LogicalProcessorNumber;
|
|
|
|
} PROCESSOR_INFO, *PPROCESSOR_INFO;
|
|
|
|
//
|
|
// Description of PROCESSOR_INFO structure:
|
|
//
|
|
// ProcessorType
|
|
//
|
|
// Identifies the type of processor running on the system
|
|
// (eg. 21064, 21066, 21164). Note that the type of processor
|
|
// is expected to be consistent across the system for MP machines.
|
|
//
|
|
// ProcessorRevision
|
|
//
|
|
// Identifies the revision number of the processor running on
|
|
// the system. Note that the revision is expected to be consistent
|
|
// across the system for MP machines.
|
|
//
|
|
// PhysicalProcessorNumber
|
|
//
|
|
// The physical processor number as numbered in the hardware
|
|
// specifications.
|
|
//
|
|
// LogicalProcessorNumber
|
|
//
|
|
// The logical processor number assigned to the processor by NT.
|
|
//
|
|
|
|
//
|
|
// System Information.
|
|
//
|
|
|
|
typedef struct _SYSTEM_INFORMATION{
|
|
|
|
UCHAR SystemType[8];
|
|
ULONG ClockSpeed;
|
|
ULONG OsRevisionId;
|
|
ULONG PalMajorVersion;
|
|
ULONG PalMinorVersion;
|
|
UCHAR FirmwareRevisionId[16];
|
|
ULONG SystemVariant;
|
|
ULONG SystemRevision;
|
|
UCHAR SystemSerialNumber[16];
|
|
ULONG ModuleVariant;
|
|
ULONG ModuleRevision;
|
|
ULONG ModuleSerialNumber;
|
|
} SYSTEM_INFORMATION, *PSYSTEM_INFORMATION;
|
|
|
|
//
|
|
// Description of SYSTEM_INFORMATION structure:
|
|
//
|
|
// SystemType
|
|
//
|
|
// Identifies the type of system that reported the error
|
|
// (eg. "Sable", "Gamma"). The actual format and value of the
|
|
// SystemType string is system-specific.
|
|
//
|
|
// OsRevisionId
|
|
//
|
|
// A numeric value that identifies the OS revision executing
|
|
// on the system that reported the fault.
|
|
//
|
|
// PalRevisionId
|
|
//
|
|
// A numeric value that identifies the pal revision executing
|
|
// on the system that reported the fault.
|
|
//
|
|
// FirmwareRevisionId
|
|
//
|
|
// A numeric value that identifies the firmware revision executing
|
|
// on the system that reported the fault.
|
|
//
|
|
// SystemVariant
|
|
//
|
|
// A numeric value used to distinguish variants of the same system
|
|
// type. The values and their interpretation are system_specific.
|
|
//
|
|
// SystemRevision
|
|
//
|
|
// A numeric value used to distinguish revisions of the same system
|
|
// type. The values and their interpretation are system_specific.
|
|
//
|
|
//
|
|
|
|
//
|
|
// Extended Error Information.
|
|
//
|
|
|
|
typedef union _EXTENDED_ERROR{
|
|
|
|
struct{
|
|
struct{
|
|
ULONG CacheLevelValid: 1;
|
|
ULONG CacheBoardValid: 1;
|
|
ULONG CacheSimmValid: 1;
|
|
} Flags;
|
|
PROCESSOR_INFO ProcessorInfo;
|
|
ULONG CacheLevel;
|
|
ULONG CacheBoardNumber;
|
|
ULONG CacheSimm;
|
|
ULONG TransferType;
|
|
ULONG Reserved;
|
|
} CacheError;
|
|
|
|
struct{
|
|
struct{
|
|
ULONG MemoryBoardValid: 1;
|
|
ULONG MemorySimmValid: 1;
|
|
} Flags;
|
|
PROCESSOR_INFO ProcessorInfo;
|
|
ULONG MemoryBoard;
|
|
ULONG MemorySimm;
|
|
ULONG TransferType;
|
|
ULONG Reserved[2];
|
|
} MemoryError;
|
|
|
|
struct{
|
|
INTERFACE_TYPE Interface;
|
|
ULONG BusNumber;
|
|
PHYSICAL_ADDRESS BusAddress;
|
|
ULONG TransferType;
|
|
ULONG Reserved[5];
|
|
} IoError;
|
|
|
|
struct{
|
|
struct{
|
|
ULONG FanNumberValid: 1;
|
|
ULONG TempSensorNumberValid: 1;
|
|
ULONG PowerSupplyNumberValid: 1;
|
|
ULONG WatchDogExpiredValid: 1;
|
|
} Flags;
|
|
ULONG FanNumber;
|
|
ULONG TempSensorNumber;
|
|
ULONG PowerSupplyNumber;
|
|
ULONG WatchdogExpiration;
|
|
ULONG Reserved[5];
|
|
} SystemError;
|
|
|
|
} EXTENDED_ERROR, *PEXTENDED_ERROR;
|
|
|
|
//
|
|
// Description of EXTENDED_ERROR union:
|
|
//
|
|
// The EXTENDED_ERROR union has different interpretation depending
|
|
// upon the AddressSpace and MemoryErrorSource fields of the
|
|
// CORRECTABLE_FLAGS structure according to the following table:
|
|
//
|
|
// 1. AddressSpace=MemorySpace
|
|
// MemoryErrorSource = 01x use CacheError structure
|
|
//
|
|
// 2. AddressSpace=MemorySpace
|
|
// MemoryErrorSource = 10x use MemoryError structure
|
|
//
|
|
// 3. AddressSpace=IoSpace use IoError
|
|
//
|
|
// 4. AddressSpace=Unidentified use SystemError
|
|
// MemoryErrorSource = 0x0 (note: ServerManagementInformationValid
|
|
// should be set)
|
|
//
|
|
//
|
|
// CacheError.Flags
|
|
//
|
|
// Identifies which fields of the CacheError structure are valid.
|
|
// CacheLevelValid = 1 indicates CacheLevel is valid.
|
|
// CacheBoardValid = 1 indicates CacheBoardNumber is valid.
|
|
// CacheSimmValid = 1 indicates CacheSimm is valid.
|
|
//
|
|
// CacheError.ProcessorInfo
|
|
//
|
|
// Identifies the processor associated with the error. Most
|
|
// frequently will identify the processor that experienced and
|
|
// reported the error. However, it is possible that the processor
|
|
// that is reporting has experienced the error from another
|
|
// processor's cache. This field is valid only if the
|
|
// MemoryErrorSource = 010 .
|
|
//
|
|
// CacheError.CacheLevel
|
|
//
|
|
// Identifies the level of the cache that caused the error. Primary
|
|
// caches are Level 1, Secondary are Level 2, etc.. This field
|
|
// only valid if CacheError.Flags.CacheLevelValid == 1.
|
|
//
|
|
// CacheError.CacheBoardNumber
|
|
//
|
|
// Identifies the board number of the cache that caused the error.
|
|
// This field only valid if
|
|
// CacheError.Flags.CacheBoardNumberValid == 1.
|
|
//
|
|
// CacheError.CacheSimm
|
|
//
|
|
// Identifies the Cache Simm that caused the error.
|
|
// This field only valid if CacheError.Flags.CacheSimmValid == 1.
|
|
//
|
|
//
|
|
// MemoryError.Flags
|
|
//
|
|
// Identifies which fields of the CacheError structure are valid.
|
|
// CacheLevelValid = 1 indicates CacheLevel is valid.
|
|
// CacheBoardValid = 1 indicates CacheBoardNumber is valid.
|
|
// CacheSimmValid = 1 indicates CacheSimm is valid.
|
|
//
|
|
// MemoryError.ProcessorInfo
|
|
//
|
|
// Identifies the processor associated with the error. Most
|
|
// frequently will identify the processor that experienced and
|
|
// reported the error. However, it is possible that the processor
|
|
// that is reporting has experienced the error from another
|
|
// processor's cache. This field is valid only if the
|
|
// MemoryErrorSource = 010 .
|
|
//
|
|
// MemoryError.MemoryBoardNumber
|
|
//
|
|
// Identifies the board number of the cache that caused the error.
|
|
// This field only valid if MemoryError.Flags.MemoryBoardValid == 1.
|
|
//
|
|
// MemoryError.MemorySimm
|
|
//
|
|
// Identifies the memory SIMM that caused the error.
|
|
// This field only valid if MemoryError.Flags.MemorySimmValid == 1.
|
|
//
|
|
//
|
|
// IoError.Interface
|
|
//
|
|
// Identifies the bus interface type (eg. PCI) of the bus that caused
|
|
// the correctable error.
|
|
//
|
|
// IoError.BusNumber
|
|
//
|
|
// Identifies the bus number of the bus that caused the correctable
|
|
// error.
|
|
//
|
|
// IoError.BusAddress
|
|
//
|
|
// Identifies the bus address of the bus that caused the correctable
|
|
// error.
|
|
//
|
|
|
|
//
|
|
// Correctable Error Frame.
|
|
//
|
|
|
|
typedef struct _CORRECTABLE_ERROR{
|
|
|
|
CORRECTABLE_FLAGS Flags;
|
|
ULONGLONG PhysicalAddress;
|
|
ULONGLONG DataBitErrorMask;
|
|
ULONGLONG CheckBitErrorMask;
|
|
EXTENDED_ERROR ErrorInformation;
|
|
PROCESSOR_INFO ReportingProcessor;
|
|
SYSTEM_INFORMATION System;
|
|
ULONG RawProcessorInformationLength;
|
|
PVOID RawProcessorInformation;
|
|
ULONG RawSystemInformationLength;
|
|
PVOID RawSystemInformation;
|
|
ULONG Reserved;
|
|
|
|
} CORRECTABLE_ERROR, *PCORRECTABLE_ERROR;
|
|
|
|
//
|
|
// Description of CORRECTABLE_ERROR structure:
|
|
//
|
|
// Flags
|
|
//
|
|
// The flags describe the various aspects of the error report. The
|
|
// CORRECTABLE_FLAGS structure is described below.
|
|
//
|
|
// PhysicalAddress
|
|
//
|
|
// The physical CPU address of the quadword that caused the correctable
|
|
// error report. This value is optional, its validiity is indicated
|
|
// in the Flags.
|
|
//
|
|
// DataBitErrorMask
|
|
//
|
|
// A mask that describes which data bits in the corrected word were
|
|
// in error. A value of one in the mask indicates that the
|
|
// corresponding bit in the data word were in error.
|
|
//
|
|
// CheckBitErrorMask
|
|
//
|
|
// A mask that describes which check bits in the corrected word were
|
|
// in error. A value of one in the mask indicates that the
|
|
// corresponding bit in the check bits were in error.
|
|
//
|
|
// ErrorInformation
|
|
//
|
|
// A structure that desribes interpretation of the error. The values
|
|
// in the structure are optional, the EXTENDED_ERROR structure is
|
|
// described below.
|
|
//
|
|
// ReportingProcessor
|
|
//
|
|
// A structure that describes the processor type on the system and
|
|
// the processor that reported the error. PROCESSOR_INFO structure
|
|
// is described below.
|
|
//
|
|
// RawProcessorInformationLength
|
|
//
|
|
// The length of the raw processor error information structure in
|
|
// bytes.
|
|
//
|
|
// RawProcessorInformation
|
|
//
|
|
// A pointer to the raw processor error information structure. The
|
|
// definition of the structure is processor-specific. The definitions
|
|
// for the known processors is defined in Appendix A below.
|
|
//
|
|
// System
|
|
//
|
|
// A structure that describes the type of system for which the
|
|
// error was reported. SYSTEM_INFORMATION structure is described below.
|
|
//
|
|
// RawSystemInformationLength
|
|
//
|
|
// The length of the raw processor error information structure in
|
|
// bytes.
|
|
//
|
|
// RawSystemInformation
|
|
//
|
|
// A pointer to the raw system error information structure. The
|
|
// definition of the structure is system/ASIC-specific. The
|
|
// definitions for the known systems/ASICs is defined in Appendix B.
|
|
//
|
|
//
|
|
|
|
|
|
/*
|
|
*
|
|
* Definitions for the uncorrectable error frame.
|
|
*
|
|
*/
|
|
|
|
//
|
|
// Uncorrectable Error Flags.
|
|
//
|
|
|
|
typedef struct _UNCORRECTABLE_FLAGS{
|
|
|
|
//
|
|
// Address space
|
|
// Unidentified: 00
|
|
// Memory Space: 01
|
|
// I/O Space: 10
|
|
// Reserved: 11
|
|
//
|
|
|
|
ULONGLONG AddressSpace: 2;
|
|
|
|
//
|
|
// Error Interpretation Validity.
|
|
//
|
|
|
|
ULONGLONG ErrorStringValid: 1;
|
|
ULONGLONG PhysicalAddressValid: 1;
|
|
ULONGLONG ErrorBitMasksValid: 1;
|
|
ULONGLONG ExtendedErrorValid: 1;
|
|
ULONGLONG ProcessorInformationValid: 1;
|
|
ULONGLONG SystemInformationValid: 1;
|
|
|
|
//
|
|
// Memory Space Error Source:
|
|
//
|
|
// Unidentified: 000
|
|
// ProcessorCache: 010
|
|
// SystemCache: 011
|
|
// ProcessorMemory: 100
|
|
// SystemMemory: 101
|
|
//
|
|
|
|
ULONGLONG MemoryErrorSource: 4;
|
|
|
|
} UNCORRECTABLE_FLAGS, *PUNCORRECTABLE_FLAGS;
|
|
|
|
//
|
|
// The extended error information, processor information and system
|
|
// information structures are identical for correctable and uncorrectable
|
|
// errors. The rules for printing the failing FRU are also identical
|
|
// to the rules for the correctable errors.
|
|
//
|
|
|
|
//
|
|
// Uncorrectable Error Frame.
|
|
//
|
|
|
|
typedef struct _UNCORRECTABLE_ERROR{
|
|
|
|
UNCORRECTABLE_FLAGS Flags;
|
|
CHAR ErrorString[MAX_UNCORRERR_STRLEN];
|
|
ULONGLONG PhysicalAddress;
|
|
ULONGLONG DataBitErrorMask;
|
|
ULONGLONG CheckBitErrorMask;
|
|
EXTENDED_ERROR ErrorInformation;
|
|
PROCESSOR_INFO ReportingProcessor;
|
|
SYSTEM_INFORMATION System;
|
|
ULONG RawProcessorInformationLength;
|
|
PVOID RawProcessorInformation;
|
|
ULONG RawSystemInformationLength;
|
|
PVOID RawSystemInformation;
|
|
ULONG Reserved;
|
|
|
|
} UNCORRECTABLE_ERROR, *PUNCORRECTABLE_ERROR;
|
|
|
|
|
|
/*
|
|
*
|
|
* Generic definitions for the error frame.
|
|
*
|
|
*/
|
|
|
|
//
|
|
// Generic error frame.
|
|
//
|
|
|
|
typedef enum _FRAME_TYPE{
|
|
CorrectableFrame,
|
|
UncorrectableFrame
|
|
} FRAME_TYPE, *PFRAME_TYPE;
|
|
|
|
typedef struct _ERROR_FRAME{
|
|
|
|
ULONG Signature; // Needed to make sure that the buffer is infact
|
|
// an error frame.
|
|
ULONG LengthOfEntireErrorFrame;
|
|
FRAME_TYPE FrameType;
|
|
ULONG VersionNumber;
|
|
ULONG SequenceNumber;
|
|
ULONGLONG PerformanceCounterValue;
|
|
|
|
union {
|
|
CORRECTABLE_ERROR CorrectableFrame;
|
|
UNCORRECTABLE_ERROR UncorrectableFrame;
|
|
};
|
|
|
|
} ERROR_FRAME, *PERROR_FRAME;
|
|
|
|
//
|
|
// Description of the generic error frame structure:
|
|
//
|
|
// FrameType
|
|
//
|
|
// Specify which frame type we have. Either correctable or
|
|
// uncorrectable.
|
|
//
|
|
// VersionNumber
|
|
//
|
|
// Defines the version of the error structure. Current version
|
|
// number = 1.
|
|
//
|
|
// SequenceNumber
|
|
//
|
|
// A number that identifies a particular error frame.
|
|
//
|
|
// PerformanceCounterValue
|
|
//
|
|
// The value of the system performance counter when the error was
|
|
// reported. The value is determined during error processing by the
|
|
// HAL and so may be captured significantly after the hardware
|
|
// detected the error. The value cannot be used for fine grained
|
|
// timing of when errors occurred but can be used for coarse grained
|
|
// timing and approximate timing.
|
|
//
|
|
// CorrectableFrame
|
|
//
|
|
// Shared common area for a correctable frame.
|
|
//
|
|
// UncorrectableFrame
|
|
//
|
|
// Shared common area for an uncorrectable frame.
|
|
//
|
|
|
|
#endif //ERRFRAME_H
|