NT4/private/ntos/nthals/halraw/alpha/ioderr.c

2305 lines
61 KiB
C
Raw Normal View History

2001-01-01 00:00:00 +01:00
/*++
Copyright (c) 1995 Digital Equipment Corporation
Module Name:
ioderr.c
Abstract:
This module implements error handling functions for the Rawhide
IOD (CAP and MDP ASICs).
Author:
Eric Rehm 13-Apr-1995
Environment:
Kernel mode
Revision History:
--*/
#include "halp.h"
//#include "iod.h"
#include "rawhide.h"
#include "stdio.h"
//
// Externals and globals.
//
//
// Pointer to the uncorrectable error frame, defined outside this module.
// HalpIodReportFatalError() fills the frame in when this pointer is non-NULL.
//
extern PERROR_FRAME PUncorrectableError;
//
// Defined outside this module.
// NOTE(review): not referenced in the portion of this file reviewed here -
// confirm whether it is still needed.
//
extern ULONG HalDisablePCIParityChecking;
//
// Running count of IOD correctable errors; incremented by
// HalpIodSoftErrorInterrupt().
//
ULONG IodCorrectedErrors = 0;
//
// Define the function-pointer type of a second-level interrupt dispatch
// routine: it receives the interrupt object and a service context and
// returns a BOOLEAN.
//
typedef BOOLEAN (*PSECOND_LEVEL_DISPATCH)(
PKINTERRUPT InterruptObject,
PVOID ServiceContext
);
//
// The Soft Error interrupt is always turned on for Rawhide. When a
// Soft Error interrupt occurs, HalpIodSoftErrorInterrupt() must
// be called to reset the error condition on the offending IOD to
// ensure system integrity.
//
// A Correctable Error Driver might also connect to the Soft Error interrupt
// via the Internal Bus interface. When a Soft Error interrupt occurs,
// this boolean determines whether an ISR for the Correctable Error Driver
// must also be dispatched.
//
BOOLEAN HalpLogCorrectableErrors = FALSE;
//
// Keep the value from the first time we read the WhoAmI register,
// since it does not always read the same the second time.
//
// A zero value indicates that we haven't read WhoAmI yet and that
// this global variable is not valid.
//
// (On machine checks that we dismiss, we must remember
// to reset this to zero.)
//
IOD_WHOAMI HalpIodWhoAmIOnError = { 0 };
//
// Function prototypes for routines defined outside this module.
//
// HalpSetMachineCheckEnables is called at the end of
// HalpInitializeIodMachineChecks() to set the machine check enables
// within the EV5.
//
VOID
HalpSetMachineCheckEnables(
IN BOOLEAN DisableMachineChecks,
IN BOOLEAN DisableProcessorCorrectables,
IN BOOLEAN DisableSystemCorrectables
);
//
// NOTE(review): HalpUpdateMces is not called in the portion of this file
// reviewed here.
//
VOID
HalpUpdateMces(
IN BOOLEAN ClearMachineCheck,
IN BOOLEAN ClearCorrectableError
);
//
// Function prototypes for routines not visible outside this module
//
//
// Fills an IOD_ERROR_FRAME with a snapshot of the CSRs of the given IOD.
//
VOID
HalpBuildIodErrorFrame(
MC_DEVICE_ID McDeviceId,
PIOD_ERROR_FRAME IodErrorFrame
);
//
// Callers in this file treat the return value as "an IOD with a latched
// error was found" and then use the device id and CAP_ERR value written
// through the two pointers.
//
BOOLEAN
bFindIodError(
PMC_DEVICE_ID pMcDeviceId,
PIOD_CAP_ERR pIodCapErr
);
//
// HalpIodMachineCheck() calls this with bMachineCheck == TRUE and returns
// its result as the "machine check handled" indication.
//
BOOLEAN
bHandleFatalIodError(
MC_DEVICE_ID McDeviceId,
BOOLEAN bMachineCheck
);
//
// HalpIodMachineCheck() dismisses the machine check when this returns TRUE.
// NOTE(review): the first parameter is named pMcDeviceId but is passed by
// value, not by pointer.
//
BOOLEAN
bHandleIsaError(
MC_DEVICE_ID pMcDeviceId,
IOD_CAP_ERR IodCapErr
);
//
// HalpIodReportFatalError() calls this with a NULL OutBuffer to initialize
// the "error string accumulator", and then once per message it displays.
//
VOID
HalpErrorFrameString(
PUNCORRECTABLE_ERROR uncorr,
PUCHAR OutBuffer
);
//
// The result is stored in the CudHeader.ActiveCpus field of the Rawhide
// uncorrectable frame.
//
ULONG
BuildActiveCpus (
VOID
);
//
// Allocate a flag that indicates when a PCI Master Abort is expected.
// PCI Master Aborts are signaled on configuration reads to non-existent
// PCI slots. A cardinal value (0-128) indicates that a Master Abort is expected.
// A value of 0xffffffff indicates that a Master Abort is *not* expected.
//
// HalpIodMachineCheck() compares the Number field with the current processor
// number and, on a match, derives the target IOD from the Addr field.
// NOTE(review): presumably the first initializer below is Number and the
// second is Addr - confirm against the IOD_EXPECTED_ERROR definition.
//
IOD_EXPECTED_ERROR HalpMasterAbortExpected = {MASTER_ABORT_NOT_EXPECTED, 0x0};
VOID
HalpInitializeIodMachineChecks(
    IN BOOLEAN ReportCorrectableErrors,
    IN BOOLEAN PciParityChecking
    )
/*++

Routine Description:

    This routine initializes machine check handling for an IOD-based
    system by clearing all pending errors in the IOD registers and
    enabling correctable errors according to the caller's specification.

    For every IOD found by the MC bus enumerator it:
      - writes the "clear" mask to CAP_ERR,
      - programs CAP_CTRL and the MDPA/MDPB diagnostic registers,
      - programs the SoftErr/HardErr bits of IntMask0.
    Finally it sets the machine check enables within the EV5.

Arguments:

    ReportCorrectableErrors - Supplies a boolean value which specifies
                              if correctable error reporting should be
                              enabled.

    PciParityChecking - Supplies a boolean value which specifies if PCI
                        address and data parity checking should be enabled.

Return Value:

    None.

--*/
{
    IOD_CAP_CONTROL IodCapControl;
    IOD_CAP_ERR IodCapError;
    IOD_MDPA_DIAG IodMdpaDiag;
    IOD_MDPB_DIAG IodMdpbDiag;
    IOD_INT_MASK IodIntMask;
    MC_DEVICE_ID McDeviceId;
    MC_ENUM_CONTEXT mcCtx;
    ULONG numIods;
    BOOLEAN bfoundIod;

    //
    // Build the mask used to clear any pending error bits in the
    // IOD_CAP_ERR register:
    //

    IodCapError.all = 0;            // Clear all bits
    IodCapError.Perr = 1;           // PCI bus perr detected
    IodCapError.Serr = 1;           // PCI bus serr detected
    IodCapError.Mab = 1;            // PCI bus master abort detected
    IodCapError.PteInv = 1;         // Invalid Pte
    IodCapError.PioOvfl = 1;        // Pio Ovfl
    IodCapError.LostMcErr = 1;      // Lost error
    IodCapError.McAddrPerr = 1;     // MC bus comd/addr parity error
    IodCapError.Nxm = 1;            // Non-existent memory error
    IodCapError.CrdA = 1;           // Correctable ECC error on MDPA
    IodCapError.CrdB = 1;           // Correctable ECC error on MDPB
    IodCapError.RdsA = 1;           // Uncorrectable ECC error on MDPA

    //
    // The original code set RdsA a second time here, so a pending
    // uncorrectable ECC error on MDPB was never part of the clear mask.
    //

    IodCapError.RdsB = 1;           // Uncorrectable ECC error on MDPB

    //
    // Initialize enumerator.
    //

    numIods = HalpMcBusEnumStart ( HalpIodMask, &mcCtx );

    //
    // Initialize each Iod
    //

    while ( bfoundIod = HalpMcBusEnum( &mcCtx ) ) {

        McDeviceId = mcCtx.McDeviceId;

        //
        // Initialize IOD_CAP_ERR
        //

        WRITE_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr,
            IodCapError.all );

        //
        // Set the Iod error enable bits in the IOD_CAP_CTRL and
        // IOD_MDPA/B_DIAG registers. The configuration bits in the IOD
        // will be left as set by the Extended SROM, with the few
        // exceptions documented below.
        //

        IodCapControl.all = READ_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->CapCtrl );

#if 0 // CAP/MDP Bug
        IodMdpaDiag.all =
            READ_IOD_REGISTER_NEW( McDeviceId,
                &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaDiag );
        IodMdpbDiag.all =
            READ_IOD_REGISTER_NEW( McDeviceId,
                &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpbDiag );
#else
        //
        // Clear Mdp Diagnostic Check Registers....
        //

        IodMdpaDiag.all = 0;
        IodMdpbDiag.all = 0;

        //
        // Enable ECC checking on all MC Bus transactions
        //

        IodMdpaDiag.EccCkEn = 1;
        IodMdpbDiag.EccCkEn = 1;
#endif

        //
        // Disable/enable PCI parity checking as requested
        //

        if (PciParityChecking == FALSE) {
            IodCapControl.PciAddrPe= 0;  // Do *not* check PCI address parity
            IodMdpaDiag.ParCkEn = 0;     // Do *not* check PCI data parity
            IodMdpbDiag.ParCkEn = 0;     // Do *not* check PCI data parity
        } else {
            IodCapControl.PciAddrPe= PciParityChecking;
            IodMdpaDiag.ParCkEn = PciParityChecking;
            IodMdpbDiag.ParCkEn = PciParityChecking;
        }

        //
        // Disable McBus NXM's
        //
        // (If enabled, accesses to non-existent McBus device will cause an
        // EV5 fill error. Non existent CSRs will return all 0s most of the
        // time and not fill error.)
        //

        IodCapControl.McNxmEn = 0;

        //
        // Disable monitoring of McBus bystander errors.
        //
        // That means the IOD will not capture the failing address in the
        // event of an MC bus NXM. It has no effect on what the IOD does in
        // the event of a PCI NXM (which causes a Master Abort).
        //
        // Regardless of how McBusMonEn is set, PCI PERR, SERR, MAB, and
        // PTE_INV will only show up in IOD CAP_ERR of the participant in
        // the transaction.
        //
        // If McBusMonEn is set, there can be a difference between the
        // bystander CAP_ERR state and the participant CAP_ERR state
        // (as per Sam Duncan, 5/3/95); it shows up in an unlikely situation:
        // "Cache single bit or double bit error: read is dirty in a cache
        // and the fill has an ecc error, don't want to indict a memory for
        // this (very unlikely) error."
        // Thus, we choose not to be able to correctly detect this situation
        // in order to make machine check and error handling easier, i.e., we
        // always need to clear only one IOD's CAP_ERROR.
        //

        IodCapControl.McBusMonEn= 0;

        WRITE_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->CapCtrl,
            IodCapControl.all );
        WRITE_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaDiag,
            IodMdpaDiag.all );
        WRITE_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpbDiag,
            IodMdpbDiag.all );

        //
        // Soft and Hard Error handling
        //
        // ecrfix - IntMask0 on Bus 0 only.

        IodIntMask.all = READ_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_INT_CSRS)(IOD_INT_CSRS_QVA))->IntMask0 );

        IodIntMask.SoftErr = (ReportCorrectableErrors == TRUE);
        IodIntMask.HardErr = 0;   // ecrfix - Mask Hard Errors for now

        WRITE_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_INT_CSRS)(IOD_INT_CSRS_QVA))->IntMask0,
            IodIntMask.all );

    } // while ( HalpMcBusEnum ( &mcCtx ) )

    //
    // Set the machine check enables within the EV5.
    //

    if( ReportCorrectableErrors == TRUE ){
        HalpSetMachineCheckEnables( FALSE, FALSE, FALSE );
    } else {
        HalpSetMachineCheckEnables( FALSE, TRUE, TRUE );
    }

    return;
}
//
// Size, in bytes, of the on-stack buffers into which this module formats
// error messages with sprintf().
//
#define MAX_ERROR_STRING 128
BOOLEAN
HalpIodUncorrectableError(
    PMC_DEVICE_ID pMcDeviceId
    )
/*++

Routine Description:

    Read the WhoAmI register, remember the value in HalpIodWhoAmIOnError,
    and determine whether it reports a duplicate tag parity error on the
    current cached CPU.

Arguments:

    pMcDeviceId - Receives the device id taken from WhoAmI when a duplicate
                  tag parity error is detected.  Not written otherwise.

Return Value:

    TRUE is returned if an uncorrectable error has been detected. FALSE
    is returned otherwise.

--*/
{
    IOD_WHOAMI IodWhoAmI;

    //
    // Check for a duplicate tag parity error on this (in the Smalltalk
    // sense) processor.  The value is saved globally because WhoAmI does
    // not always read the same the second time.
    //

    IodWhoAmI.all = HalpReadWhoAmI();
    HalpIodWhoAmIOnError.all = IodWhoAmI.all;

    if ( IodWhoAmI.CpuInfo & CACHED_CPU_DTAG_PARITY_ERROR ) {
        pMcDeviceId->all = IodWhoAmI.Devid;
        return TRUE;
    }

    //
    // None of the uncorrectable error conditions were detected.
    //

    return FALSE;
}
VOID
HalpBuildIodErrorFrame(
    MC_DEVICE_ID McDeviceId,
    PIOD_ERROR_FRAME IodErrorFrame
    )
/*++

Routine Description:

    Capture a snapshot of the general, interrupt and error CSRs of one IOD
    into an IOD_ERROR_FRAME.  The frame is zeroed first, all fields are then
    marked valid, and finally the MDPA/MDPB status and syndrome fields are
    marked invalid because those registers are not read (CAP/MDP bug).

Arguments:

    McDeviceId - Supplies the MC Bus Device ID of the IOD to read.

    IodErrorFrame - Supplies a pointer to the IOD_ERROR_FRAME to fill in.

Return Value:

    None.

--*/
{
    PIOD_GENERAL_CSRS GeneralCsrs = (PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA);
    PIOD_INT_CSRS IntCsrs = (PIOD_INT_CSRS)(IOD_INT_CSRS_QVA);
    PIOD_ERROR_CSRS ErrorCsrs = (PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA);

    //
    // Start from a clean frame; the caller may hand us one it used before.
    //

    RtlZeroMemory(IodErrorFrame, sizeof(IOD_ERROR_FRAME));

    //
    // Assume every field is valid until shown otherwise below.
    //

    IodErrorFrame->ValidBits.all = 0xffffffff;

    //
    // General registers.
    //

    IodErrorFrame->BaseAddress = IOD_IO_SPACE_START |
                                 IOD_SPARSE_CSR_OFFSET |
                                 MCDEVID_TO_PHYS_ADDR(McDeviceId.all);

    IodErrorFrame->WhoAmI =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &GeneralCsrs->WhoAmI );
    IodErrorFrame->PciRevision =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &GeneralCsrs->PciRevision );
    IodErrorFrame->CapCtrl =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &GeneralCsrs->CapCtrl );
    IodErrorFrame->HaeMem =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &GeneralCsrs->HaeMem );
    IodErrorFrame->HaeIo =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &GeneralCsrs->HaeIo );

    //
    // Interrupt control and status registers.
    //

    IodErrorFrame->IntCtrl =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &IntCsrs->IntCtrl );
    IodErrorFrame->IntReq =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &IntCsrs->IntReq );
    IodErrorFrame->IntMask0 =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &IntCsrs->IntMask0 );
    IodErrorFrame->IntMask1 =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &IntCsrs->IntMask1 );

    //
    // Error registers.  This routine only reads them; it does not write
    // CAP_ERR.
    //

    IodErrorFrame->CapErr =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &ErrorCsrs->CapErr );
    IodErrorFrame->PciErr1 =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &ErrorCsrs->PciErr1 );
    IodErrorFrame->McErr0 =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &ErrorCsrs->McErr0 );
    IodErrorFrame->McErr1 =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &ErrorCsrs->McErr1 );

#if 0 // CAP/MDP Bug
    IodErrorFrame->MdpaStat =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &ErrorCsrs->MdpaStat );
    IodErrorFrame->MdpaSyn =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &ErrorCsrs->MdpaSyn );
    IodErrorFrame->MdpbStat =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &ErrorCsrs->MdpbStat );
    IodErrorFrame->MdpbSyn =
        (ULONG)READ_IOD_REGISTER_NEW( McDeviceId, &ErrorCsrs->MdpbSyn );
#else
    //
    // CAP/MDP Bug - the MDP registers are not read, so flag the
    // corresponding frame fields as not valid.
    //

    IodErrorFrame->ValidBits.MdpaStatValid = 0;
    IodErrorFrame->ValidBits.MdpbStatValid = 0;
    IodErrorFrame->ValidBits.MdpaSynValid = 0;
    IodErrorFrame->ValidBits.MdpbSynValid = 0;
#endif // CAP/MDP Bug
}
VOID
HalpIodReportFatalError(
    MC_DEVICE_ID ErrorMcDeviceId
    )
/*++

Routine Description:

    This function reports and interprets a fatal hardware error
    detected by the IOD chipset.  It is assumed that HalGetDisplayOwnership()
    has been called prior to this function.

    When an uncorrectable error frame is available (PUncorrectableError is
    non-NULL) its fields are filled in as the error is interpreted.  When no
    frame is available the messages are still displayed, but no frame field
    is written.

Arguments:

    ErrorMcDeviceId - Supplies the MC Bus Device ID of the IOD
                      where the error was found
                    - In the case of a Duplicate Tag Parity Error, supplies
                      the CPU that took the error.  Note, in this case
                      the ErrorMcDeviceId will never match a IOD McDeviceId.
                      No MC Bus snapshot is present in this case.

Return Value:

    None.

--*/
{
    UCHAR OutBuffer[ MAX_ERROR_STRING ];
    IOD_ERROR_FRAME IodErrorFrame;
    PIOD_ERROR_FRAME pCurrentIodErrorFrame = NULL;
    MC_ENUM_CONTEXT mcCtx;
    MC_DEVICE_ID McDeviceId;
    ULONG numIods;
    PUNCORRECTABLE_ERROR uncorr = NULL;
    PRAWHIDE_UNCORRECTABLE_FRAME rawerr = NULL;
    PEXTENDED_ERROR PExtErr = NULL;

    //
    // Do we have an uncorrectable error frame?
    // (uncorr and PExtErr are either both set or both NULL.)
    //

    if (PUncorrectableError) {
        uncorr = (PUNCORRECTABLE_ERROR)
                    &PUncorrectableError->UncorrectableFrame;
        rawerr = (PRAWHIDE_UNCORRECTABLE_FRAME)
                    PUncorrectableError->UncorrectableFrame.RawSystemInformation;
        PExtErr = &PUncorrectableError->UncorrectableFrame.ErrorInformation;
    }

    //
    // Validate the ProcessorInfo portion of the Error Frame.
    //

    if (uncorr) {
        uncorr->Flags.ProcessorInformationValid = 1;
        HalpGetProcessorInfo(&uncorr->ReportingProcessor);

        //
        // Initialize our "error string accumulator"
        //

        HalpErrorFrameString( uncorr, NULL );
    }

    //
    // Validate the Rawhide Uncorrectable Frame
    // (Common RCUD Header was already set up.)
    //

    if (rawerr) {
        rawerr->Revision = RAWHIDE_UNCORRECTABLE_FRAME_REVISION;
        rawerr->WhoAmI = HalpIodWhoAmIOnError.all;
        rawerr->ErrorSubpacketFlags.all = 0;
        rawerr->CudHeader.ActiveCpus = BuildActiveCpus();
    }

    //
    // Handle cached CPU duplicate tag parity error.
    // (Note that a DTAG parity error implies that we don't
    // take an MC Bus Snapshot.)
    //

    if ( HalpIodWhoAmIOnError.CpuInfo & CACHED_CPU_DTAG_PARITY_ERROR ) {

        sprintf( OutBuffer, "Duplicate Tag Parity Error on CPU %x\n",
                 MCDEVID_TO_PHYS_CPU(HalpIodWhoAmIOnError.McDevId.all) );
        HalDisplayString( OutBuffer );
#if HALDBG
        DbgPrint( "Duplicate Tag Parity Error on CPU (%d, %d)\n",
                  HalpIodWhoAmIOnError.McDevId.Gid, HalpIodWhoAmIOnError.McDevId.Mid);
#endif
        HalpErrorFrameString( uncorr, OutBuffer );

        //
        // OK. This is tedious:
        // * Error is in memory space and is the system (external) cache.
        // * And we know this is the L3 cache.
        // * And we'll subvert the "CacheBoard" to squirrel away the
        //   Cached CPU Revision Info and Cache size.
        //
        // Only touch the frame when one exists.
        //

        if (uncorr) {
            uncorr->Flags.AddressSpace = MEMORY_SPACE;
            uncorr->Flags.ExtendedErrorValid = 1;
            uncorr->Flags.MemoryErrorSource = SYSTEM_CACHE;
            PExtErr->CacheError.Flags.CacheLevelValid = 1;
            PExtErr->CacheError.Flags.CacheBoardValid = 1;
            PExtErr->CacheError.Flags.CacheSimmValid = 0;
            PExtErr->CacheError.CacheLevel = 3;
            PExtErr->CacheError.CacheBoardNumber = HalpIodWhoAmIOnError.CpuInfo;
        }

        return;
    }

    //
    // Handle cached CPU fill error.
    // Since this could be caused by an MC Bus or PCI error,
    // we continue to create an MC Bus snapshot.
    //

    if ( HalpIodWhoAmIOnError.CpuInfo & CACHED_CPU_FILL_ERROR ) {

        sprintf( OutBuffer, "Fill Error on CPU %x\n",
                 MCDEVID_TO_PHYS_CPU(HalpIodWhoAmIOnError.McDevId.all) );
        HalDisplayString( OutBuffer );
#if HALDBG
        DbgPrint( "Fill Error on CPU (%d, %d)\n",
                  HalpIodWhoAmIOnError.McDevId.Gid, HalpIodWhoAmIOnError.McDevId.Mid);
#endif
        HalpErrorFrameString( uncorr, OutBuffer );

        //
        // * WhoAmI tells us Addr<38:33> of reference causing error.
        // * However, PciErr1 and/or McErr0/McErr1 give us more bits,
        //   so the data entered here may get overwritten later.
        //

        if (uncorr) {
            uncorr->Flags.PhysicalAddressValid = 1;
            uncorr->PhysicalAddress =
                ( ((ULONGLONG)(HalpIodWhoAmIOnError.CpuInfo & 0x3f)) << 33 );
        }
    }

    //
    // Validate the MCBusSnapshot header.  The per-IOD error frames are
    // stored immediately after the Rawhide uncorrectable frame.
    //

    if (rawerr) {
        rawerr->ErrorSubpacketFlags.McBusPresent = 1;
        rawerr->McBusSnapshot.ReportingCpuBaseAddr =
            IOD_IO_SPACE_START |
            MCDEVID_TO_PHYS_ADDR( HalpIodWhoAmIOnError.Devid );
        pCurrentIodErrorFrame = (PIOD_ERROR_FRAME) (rawerr + 1);
    }

    //
    // Initialize enumerator.
    //

    numIods = HalpMcBusEnumStart ( HalpIodMask, &mcCtx );
    ASSERT( numIods == HalpNumberOfIods);

    //
    // Gather data from each Iod
    //

    while ( HalpMcBusEnum( &mcCtx ) ) {

        McDeviceId.all = mcCtx.McDeviceId.all;
        HalpBuildIodErrorFrame( McDeviceId, &IodErrorFrame );

        //
        // Fill in IOD_ERROR_FRAME portion of the RAWHIDE_UNCORRECTABLE_FRAME
        //

        if (rawerr) {
            RtlCopyMemory( pCurrentIodErrorFrame,
                           &IodErrorFrame,
                           sizeof(IOD_ERROR_FRAME));
            pCurrentIodErrorFrame++;
        }

        //
        // If this is the IOD where we found the error
        //   a. clear the error
        //   b. complete the uncorrectable error frame processing
        //   c. Display an interpretation of the error to the screen
        //

        if (ErrorMcDeviceId.all == McDeviceId.all) {

            // ecrfix Put below into HalpInterpretIodError(McDeviceId, IodErrorFrame) ???
            IOD_WHOAMI IodWhoAmI;
            IOD_CAP_CONTROL IodCapCtrl;
            IOD_CAP_ERR IodCapErr;
            IOD_PCI_ERR1 IodPciErr1;
            IOD_MC_ERR0 IodMcErr0;
            IOD_MC_ERR1 IodMcErr1;
            IOD_MDPA_STAT IodMdpaStat;
            IOD_MDPB_STAT IodMdpbStat;
            ULONG HwBusNumber = ErrorMcDeviceId.Mid & 0x3;

            //
            // Copy error frame variables in locals for bitfield access
            //

            IodWhoAmI.all = IodErrorFrame.WhoAmI;
            IodCapCtrl.all = IodErrorFrame.CapCtrl;
            IodCapErr.all = IodErrorFrame.CapErr;
            IodPciErr1.PciAddress = IodErrorFrame.PciErr1;
            IodMcErr0.all = IodErrorFrame.McErr0;
            IodMcErr1.all = IodErrorFrame.McErr1;
#if 0 // CAP/MDP Bug
            IodMdpaStat.all = IodErrorFrame.MdpaStat;
            IodMdpbStat.all = IodErrorFrame.MdpbStat;
            IodMdpaSyn.all = IodErrorFrame.MdpaSyn;
            IodMdpbSyn.all = IodErrorFrame.MdpbSyn;
#else
            IodMdpaStat.all = 0xffffffff;
            IodMdpbStat.all = 0xffffffff;
#endif // CAP/MDP Bug

            //
            // Clear state in MDPA and MDPB before clearing CAP_ERR
            //

            WRITE_IOD_REGISTER_NEW( McDeviceId,
                &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaStat,
                IodErrorFrame.MdpaStat
                );
            WRITE_IOD_REGISTER_NEW( McDeviceId,
                &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpbStat,
                IodErrorFrame.MdpbStat
                );
            WRITE_IOD_REGISTER_NEW( McDeviceId,
                &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr,
                IodErrorFrame.CapErr
                );

            sprintf( OutBuffer,
                     "IOD MC_DEVICE_ID : (%x, %x) CAP_CTRL : %08x CAP_ERR : %08x\n",
                     McDeviceId.Gid, McDeviceId.Mid,
                     IodCapCtrl.all,
                     IodCapErr.all );
            HalDisplayString( OutBuffer );
#if HALDBG
            DbgPrint( OutBuffer );
#endif

            sprintf( OutBuffer,
                     "PCI_ERR1 %08x MC_ERR0 : %08x MC_ERR1 : %08x\n",
                     IodPciErr1.PciAddress,
                     IodMcErr0.all,
                     IodMcErr1.all );
            HalDisplayString( OutBuffer );
#if HALDBG
            DbgPrint( OutBuffer );
#endif

#if 0 // CAP/MDP Bug
            sprintf( OutBuffer,
                     "MDPA_STAT : %08x MDPA_SYN : %08x MDPB_STAT : %08x MDPB_SYN : %08x\n",
                     IodMdpaStat.all,
                     IodMdpaSyn.all,
                     IodMdpbStat.all,
                     IodMdpbSyn.all );
            HalDisplayString( OutBuffer );
#if HALDBG
            DbgPrint( OutBuffer );
#endif
#endif

            //
            // If no valid error then no interpretation.
            //

            if (( IodCapErr.PciErrValid == 0 ) && ( IodCapErr.McErrValid == 0 ) ){
                return;     // No IOD error detected
            }

            //
            // Interpret any detected errors:
            //

            if (IodCapErr.McErrValid == 1) {

                if ( IodMcErr1.Dirty != 1 ) {
                    sprintf( OutBuffer,
                             "MC Bus Error, Bus Master=(%x,%x)\n",
                             ( ( IodMcErr1.DevId & 0x38) >> 3 ),
                             ( IodMcErr1.DevId & 0x07)
                             );
                } else {
                    sprintf( OutBuffer,
                             "MC bus error on a read/dirty transaction\n"
                             );
                }

                //
                // Output the detected error message:
                //

                HalDisplayString( OutBuffer );
#if HALDBG
                DbgPrint( OutBuffer );
#endif
                HalpErrorFrameString( uncorr, OutBuffer);

                sprintf( OutBuffer,
                         "IOD Addr=%x%x, Cmd=%x\n",
                         IodMcErr1.Addr39_32,     // bits 39:32
                         IodMcErr0.Addr,          // bits 31:4
                         IodMcErr1.McCmd
                         );

                //
                // Output the detected error message:
                //

                HalDisplayString( OutBuffer );
#if HALDBG
                DbgPrint( OutBuffer );
#endif
                HalpErrorFrameString( uncorr, OutBuffer);

                //
                // Record the MC bus address in the frame, if there is one.
                //

                if (uncorr) {
                    uncorr->Flags.PhysicalAddressValid = 1;
                    uncorr->PhysicalAddress = (
                        (((ULONGLONG)IodMcErr1.Addr39_32) << 32) |
                        ((ULONGLONG)IodMcErr0.Addr) );

                    //
                    // McAddr<39> indicates whether this was a
                    // memory or I/O transaction.  The masked value is
                    // either 0 or 0x80, so it must be compared against
                    // zero; comparing it with 1 (as the code once did)
                    // can never succeed.
                    //

                    if ( (IodMcErr1.Addr39_32 & 0x80) != 0 ) {
                        uncorr->Flags.AddressSpace = IO_SPACE;
                    } else {
                        uncorr->Flags.AddressSpace = MEMORY_SPACE;
                    }
                }

                //
                // Interpret specific MC bus error.  Start from an empty
                // buffer so that, when no specific cause bit is set, the
                // previous message is not displayed and logged again.
                //

                OutBuffer[0] = '\0';

                if ( IodCapErr.PioOvfl == 1 ){
                    sprintf( OutBuffer,
                             "IOD PIO Overflow, PendNumb=%x\n",
                             IodCapCtrl.PendNum
                             );
                } else if ( IodCapErr.McAddrPerr == 1 ){
                    sprintf( OutBuffer,
                             "MC bus parity error\n"
                             );
                } else if ( IodCapErr.Nxm == 1 ){
                    sprintf( OutBuffer,
                             "MC bus NXM\n"
                             );
                } else if ( IodCapErr.CrdA == 1 ){
                    sprintf( OutBuffer,
                             "IOD Correctable ECC error in MDPA\n"
                             );
                } else if ( IodCapErr.CrdB == 1 ){
                    sprintf( OutBuffer,
                             "IOD Correctable ECC error in MDPB\n"
                             );
                } else if ( IodCapErr.RdsA == 1 ){
                    sprintf( OutBuffer,
                             "IOD Uncorrectable ECC error in MDPA\n"
                             );
                } else if ( IodCapErr.RdsB == 1 ){
                    sprintf( OutBuffer,
                             "IOD Uncorrectable ECC error in MDPB\n"
                             );
                }

                //
                // Output the detected error message:
                //

                if ( OutBuffer[0] != '\0' ) {
                    HalDisplayString( OutBuffer );
#if HALDBG
                    DbgPrint( OutBuffer );
#endif
                    HalpErrorFrameString( uncorr, OutBuffer);
                }
            }

            if ( IodCapErr.PciErrValid == 1 ){

                //
                // Interpret specific PCI bus error
                //

                if (uncorr) {
                    uncorr->Flags.AddressSpace = IO_SPACE;
                    uncorr->Flags.PhysicalAddressValid = 1;

                    //
                    // Widen the 32-bit PCI address before shifting so that
                    // its upper bits are not shifted out.
                    //

                    uncorr->PhysicalAddress = IOD_IO_SPACE_START |
                        MCDEVID_TO_PHYS_ADDR(IodWhoAmI.McDevId.all) |
                        ((ULONGLONG)IodPciErr1.PciAddress << IO_BIT_SHIFT);
                    uncorr->Flags.ExtendedErrorValid = 1;
                    PExtErr->IoError.Interface = PCIBus;
                    PExtErr->IoError.BusNumber = HwBusNumber;
                    PExtErr->IoError.BusAddress.LowPart = IodPciErr1.PciAddress;
                }

                //
                // As above: empty the buffer so a stale message is not
                // repeated when no specific PCI cause bit is set.
                //

                OutBuffer[0] = '\0';

                if ( IodCapErr.Perr == 1 ){
                    sprintf( OutBuffer,
                             "PERR detected on PCI-%d, Addr=%x\n",
                             HwBusNumber,
                             IodPciErr1.PciAddress
                             );
                } else if ( IodCapErr.Serr == 1 ){
                    sprintf( OutBuffer,
                             "SERR detected on PCI-%d, Addr=%x\n",
                             HwBusNumber,
                             IodPciErr1.PciAddress
                             );
                } else if ( IodCapErr.Mab == 1 ){
                    sprintf( OutBuffer,
                             "Master Abort on PCI-%d, Addr=%x\n",
                             HwBusNumber,
                             IodPciErr1.PciAddress
                             );
                } else if ( IodCapErr.PteInv == 1 ){
                    sprintf( OutBuffer,
                             "Invalid Scatter/Gather PTE on PCI-%d, Addr=%x\n",
                             HwBusNumber,
                             IodPciErr1.PciAddress
                             );
                }

                //
                // Output the detected error message:
                //

                if ( OutBuffer[0] != '\0' ) {
                    HalDisplayString( OutBuffer );
#if HALDBG
                    DbgPrint( OutBuffer );
#endif
                    HalpErrorFrameString( uncorr, OutBuffer);
                }
            }

            //
            // Check for lost errors and output message if any occurred:
            //

            if ( IodCapErr.LostMcErr == 1 ){
                HalDisplayString("IOD Lost errors were detected\n");
#if HALDBG
                DbgPrint("IOD Lost errors were detected\n");
#endif
                HalpErrorFrameString(uncorr, "IOD Lost errors were detected\n");
            }

        } // if (ErrorMcDeviceID == McDeviceId)

    } // while (HalpMcBusEnum)

    return;     // Fatal error detected
}
BOOLEAN
HalpIodMachineCheck(
    IN PEXCEPTION_RECORD ExceptionRecord,
    IN PKEXCEPTION_FRAME ExceptionFrame,
    IN PKTRAP_FRAME TrapFrame
    )
/*++

Routine Description:

    This routine is given control when a hard error is acknowledged
    by the IOD chipset.  The routine is given the chance to
    correct and dismiss the error.

    An expected PCI Master Abort (a PCI configuration read flagged in
    HalpMasterAbortExpected by the current processor) is fixed up by
    loading all ones into v0 in the trap frame and clearing the IOD errors.
    Errors that bHandleIsaError() accepts are dismissed.  Everything else
    is passed to bHandleFatalIodError().

Arguments:

    ExceptionRecord - Supplies a pointer to the exception record generated
                      at the point of the exception.

    ExceptionFrame - Supplies a pointer to the exception frame generated
                     at the point of the exception.

    TrapFrame - Supplies a pointer to the trap frame generated
                at the point of the exception.

Return Value:

    TRUE is returned if the machine check has been handled and dismissed -
    indicating that execution can continue.  FALSE is returned otherwise.

--*/
{
    IOD_CAP_ERR IodCapErr;
    IOD_CAP_ERR IodCapErrMask;
    IOD_MC_ERR1 IodMcErr1;
    IOD_WHOAMI IodWhoAmI;
    MC_DEVICE_ID McDeviceId;
    BOOLEAN ExpectedMchk;
    BOOLEAN ExpectedMcAddrPerr;
    BOOLEAN bfoundIod;

    //
    // We don't expect a machine check yet...
    //

    ExpectedMchk = FALSE;
    ExpectedMcAddrPerr = FALSE;

    //
    // Make sure any error due to 2Mb/4Mb Cached CUD bug is latched.
    //
    // At this point, WhoAmI may indicate the symptoms of a fill_error
    // and CUD cache size is not available.  We'll read it again when
    // we need to know the Cache size.  However, we save it here so we
    // can figure out if this was a fill error or not.
    //

    HalpIodWhoAmIOnError.all = HalpReadWhoAmI();

    //
    // Where do we look for the error symptoms?
    //
    // 1. If we expected this machine check, then we know which
    //    IOD to check.
    // 2. If we didn't expect this machine check, find the IOD that
    //    generated the error.
    //

    //
    // For an expected machine check, HalpMasterAbortExpected will
    // contain the processor number and address of a PCI config
    // space read.  CAP_ERR will indicate a MasterAbort.
    //

    if( HalpMasterAbortExpected.Number == (ULONG)KeGetCurrentProcessorNumber() ) {

        //
        // Determine expected IOD from the address of the PCI config read
        //

        McDeviceId.all = MCDEVID_FROM_PHYS_ADDR(HalpMasterAbortExpected.Addr);

        //
        // Now get the Bcache size information.
        //

        IodWhoAmI.all = READ_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->WhoAmI);

        //
        // Make sure there is a Master abort on this IOD
        //

        IodCapErr.all = READ_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr );

        if( IodCapErr.Mab == 1 ) {

            ExpectedMchk = TRUE;

            //
            // If 2Mb or 4 Mb cached CUD, we may also get an MCbus address
            // parity error with MC command signature in MC_ERR1 equal to
            // zero (cached CPU idle transaction).  Also dismiss this error
            // that's the result of the cached 2Mb/4Mb cached CPU VCTY bug.
            //

            IodMcErr1.all = READ_IOD_REGISTER_NEW( McDeviceId,
                &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->McErr1 );

            if ((IodWhoAmI.CpuInfo & 0x7) &&    // Cached CPU?
                IodCapErr.McAddrPerr &&         // McAddrPerr?
                (IodMcErr1.McCmd == 0) ) {      // McCmd is zero?
                ExpectedMcAddrPerr = TRUE;      // All yes, then dismiss it!
            }
        }

#if HALDBG
        DbgPrint( "Expected Mchk (Mab) on IOD (%x, %x), Processor number %x\n",
                  McDeviceId.Gid,
                  McDeviceId.Mid,
                  HalpMasterAbortExpected.Number);
#endif //HALDBG
    }

    //
    // If this isn't the machine check we expected, then
    // we must find the IOD that took the error.
    //

    if (!ExpectedMchk) {

        bfoundIod = bFindIodError( &McDeviceId, &IodCapErr );

        //
        // Check that we found an IOD that has a valid PCI or MC error.
        // If it is not this is a pretty weird (fatal???) condition.
        // For now, we'll just go return TRUE.
        //
        // ecrfix - should we check the error interrupts? probably not...
        //

        if( !bfoundIod ) {
#if HALDBG
            DbgPrint( "HalpIodMachineCheck called but no PCI or MC error found\n");
#endif
            return (TRUE);
        }

#if 0 // HALDBG
        DbgPrint( "Unexpected Mchk on IOD (%x, %x)\n",
                  McDeviceId.Gid,
                  McDeviceId.Mid );
#endif //HALDBG

        //
        // Case: Unexpected Master Abort, e.g. a PCI memory or I/O space
        // read to legacy ISA space (0 - 1 Mb) on PCI-1,2,3.
        //

        if ( bHandleIsaError( McDeviceId, IodCapErr) ) {
            return TRUE;
        }
    }

    //
    // Case: PCI or MC Bus error other than master abort
    //
    // At this point we have either:
    //   (a) an expected PCI Master Abort (ExpectedMchk == TRUE), or
    //   (b) an unexpected PCI or MC Bus error.
    //
    // However, it's possible that we have *both* (a) AND (b).
    // So, even if ExpectedMchk == TRUE, check for other PCI or MC Bus
    // errors.  Any of these other errors indicate a
    // fatal condition.
    //
    // Cached CUD with 2 Mb and 4 Mb Cache may also assert an MCAddrPerr
    // or Nxm upon a config space read.  Lost Error bit will also be set.
    // Those three are therefore ignored when ExpectedMcAddrPerr is set.
    //
    // The last term tests RdsB; the original tested RdsA twice, so an
    // uncorrectable ECC error on MDPB was missed by this check.
    //

    if( (IodCapErr.Perr == 1) ||        // PCI bus perr detected
        (IodCapErr.Serr == 1) ||        // PCI bus serr detected
        (IodCapErr.PteInv == 1) ||      // Invalid Pte
        (IodCapErr.PioOvfl == 1) ||     // Pio Ovfl
        ( (IodCapErr.LostMcErr == 1) && !ExpectedMcAddrPerr) ||
                                        // Lost error
        ( (IodCapErr.McAddrPerr == 1) && !ExpectedMcAddrPerr ) ||
                                        // MC bus comd/addr parity error
        ( (IodCapErr.Nxm == 1) && !ExpectedMcAddrPerr ) ||
                                        // Non-existent memory error
        (IodCapErr.CrdA == 1) ||        // Correctable ECC error on MDPA
        (IodCapErr.CrdB == 1) ||        // Correctable ECC error on MDPB
        (IodCapErr.RdsA == 1) ||        // Uncorrectable ECC error on MDPA
        (IodCapErr.RdsB == 1)           // Uncorrectable ECC error on MDPB
        ){
        return ( bHandleFatalIodError(McDeviceId, TRUE) );
    }

    //
    // At this point, we have either an expected or unexpected Master
    // abort.  There are three cases:
    //   1. Expected MAB from a PCI config space read that must be handled
    //   2. Unexpected MAB from a PCI memory or I/O space read in ISA legacy
    //      space that can be handled.
    //   3. Unexpected MAB.  Don't handle or fix up this error condition.
    //      (Really take the machine check.)
    //

    //
    // Case 1: Expected Master Abort, e.g. a PCI configuration read error.
    //

    if ( (IodCapErr.Mab == 1) && ExpectedMchk ){

        //
        // Here's how a PCI config space read to an empty slot will transpire:
        //
        // READ_CONFIG_Usize indicates the issuing CPU and address in
        // HalpMasterAbortExpected.Number and HalpMasterAbortExpected.Addr.
        //
        // PCI config space read will cause a MC Bus FILL_ERROR on the
        // issuing CPU.  FILL_ERROR causes a machine check.
        //
        // The targeted MC-PCI bus bridge will set CAP_ERR<MasterAbort> bit.
        //
        // So far, the error looks like a PCI configuration space read
        // that accessed a device that does not exist.  In order to fix
        // this up we expect that the original faulting instruction must
        // be a load with v0 as the destination register.  Unfortunately,
        // machine checks are not precise exceptions so we may have executed
        // many instructions since the faulting load.  For EV5 a pair of
        // memory barrier instructions following the load will stall the pipe
        // waiting for load completion before the second memory barrier can
        // be issued.  Therefore, we expect the exception PC to point to
        // either the load instruction or one of the two memory barriers.
        // We will assume that if the exception pc is not an mb that instead
        // it points to the load that machine checked.  We must be careful to
        // not reexecute the load.
        //

        ALPHA_INSTRUCTION FaultingInstruction;

        FaultingInstruction.Long = *(PULONG)((ULONG)TrapFrame->Fir);

        if( FaultingInstruction.Memory.Opcode != MEMSPC_OP ){

            //
            // Exception pc does not point to a memory barrier, return
            // to the instruction after the exception pc.
            //

            TrapFrame->Fir += 4;
        }

        //
        // The error has matched all of our conditions.  Fix it up by
        // writing all ones into the destination of the load.
        //

        TrapFrame->IntV0 = (ULONGLONG)0xffffffffffffffff;

        //
        // Clear all error conditions in CAP_ERR.
        // (McAddrPerr, LostMcErr, Mab)
        //

#if 0
        WRITE_IOD_REGISTER_NEW( McDeviceId,
            &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr,
            IodCapErr.all );
#else
        IodCapErrMask.all = ALL_CAP_ERRORS;
        HalpClearAllIods( IodCapErrMask );
#endif

        //
        // Clear the hard error interrupt.
        // ecrfix - For now, the Hard error interrupt is masked, so
        // we don't have to clear it.
        //

        return TRUE;
    }

#if 0
    //
    // Case 2: Unexpected Master Abort, e.g. a PCI memory or I/O space read
    // to legacy ISA space (0 - 1 Mb) on PCI-1,2,3.
    //

    if ( bHandleIsaError( McDeviceId, IodCapErr) ) {
        return TRUE;
    }
#endif

    //
    // Case 3: Unexpected Master abort.
    // (Or anything I might have missed.... )
    //

#if (DBG) || (HALDBG)
    DbgPrint( "Unexpected PCI master abort\n" );
#endif

    return ( bHandleFatalIodError(McDeviceId, TRUE) );
}
//
// Combined size of an ERROR_FRAME and a RAWHIDE_CORRECTABLE_FRAME; used to
// size the frame buffers declared in HalpIodSoftErrorInterrupt().
//
#define ENTIRE_FRAME_SIZE (sizeof(ERROR_FRAME) + sizeof(RAWHIDE_CORRECTABLE_FRAME))
VOID
HalpIodSoftErrorInterrupt(
VOID
)
/*++
Routine Description:
Handle a IOD soft (correctable) error interrupt.
Arguments:
None.
Return Value:
None.
--*/
{
BOOLEAN bfoundIod;
MC_DEVICE_ID McDeviceId;
static UCHAR Frame[ENTIRE_FRAME_SIZE];
static PERROR_FRAME pFrame;
static RAWHIDE_CORRECTABLE_FRAME RawhideFrame;
static BOOLEAN RawhideFrameInitialized = FALSE;
UCHAR TempFrame[ENTIRE_FRAME_SIZE];
PERROR_FRAME pTempFrame;
PCORRECTABLE_ERROR pCorr;
PRAWHIDE_CORRECTABLE_FRAME pRawCorr;
PBOOLEAN ErrorlogBusy;
PULONG DispatchCode;
PKINTERRUPT InterruptObject;
PKSPIN_LOCK ErrorlogSpinLock;
PRAWHIDE_UNCORRECTABLE_FRAME rawerr;
IOD_CAP_ERR IodCapErr;
IOD_MDPA_STAT IodMdpaStat;
IOD_MDPA_STAT IodMdpbStat;
IOD_MC_ERR0 IodMcErr0;
IOD_MC_ERR1 IodMcErr1;
KIRQL Irql;
#if 0 // CAP/MDP Bug
IOD_MDPA_SYN IodMdpaSyn;
IOD_MDPB_SYN IodMdpbSyn;
#endif
//ecrfix - later we should log the error, throttle the logging and turn off
// correctable error reporting if the frequency is too high
//
// The error is expected to be a corrected ECC error on a DMA or
// Scatter/Gather TLB read/write. Read the error registers relevant
// to this error.
//
//
// Find the IOD that latched the error.
//
bfoundIod = bFindIodError( &McDeviceId, &IodCapErr );
#ifdef FORCE_CORRECTABLE_ERROR
IodCapErr.all = 0x88000000;
bfoundIod = 1;
#endif // FORCE_CORRECTABLE_ERROR
//
// Check that we found an IOD that has a valid PCI or MC error.
// If it is not this is a pretty weird (fatal???) condition.
// For now, we'll just go return TRUE.
//
if( !bfoundIod ) {
#if 0 //HALDBG
DbgPrint( "HalpIodSoftErrorInterrupt: no PCI or MC error found.\n");
#endif
return;
}
//
// Check if an error is latched into the IOD. If not, goodbye.
//
if( IodCapErr.McErrValid == 0 ){
#if HALDBG
DbgPrint( "Iod soft error interrupt without valid MC error\n" );
#endif //HALDBG
return;
}
//
// Check for the correctable error bit.
//
if( (IodCapErr.CrdA == 0) && (IodCapErr.CrdB == 0) ){
#if HALDBG
DbgPrint( "Iod soft error interrupt without correctable error indicated in CapErr\n" );
#endif //HALDBG
}
//
// Increment the number of IOD correctable errors.
//
IodCorrectedErrors += 1;
//
// Read the rest of the error registers
//
IodMcErr0.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
&((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->McErr0
);
IodMcErr1.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
&((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->McErr1
);
#ifdef FORCE_CORRECTABLE_ERROR
IodMcErr0.all = 0x00bebad0;
IodMcErr1.all = 0x800f3f00;
#endif // FORCE_CORRECTABLE_ERROR
#if 0 // CAP/MDP Bug
IodMdpaStat.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
&((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->MdpaStat
);
IodMdpaSyn.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
&((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->MdpaSyn
);
IodMdpbStat.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
&((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->MdpbStat
);
IodMdpbSyn.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
&((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->MdpbSyn
);
#endif
#if HALDBG
//
// Print a correctable error message to the debugger.
//
DbgPrint( "IOD Correctable Error Number %d, state follows: \n",
IodCorrectedErrors );
DbgPrint( "\tIOD_CAP_ERR: 0x%x\n", IodCapErr.all );
DbgPrint( "\tIOD_MC_ERR0: 0x%x\n", IodMcErr0.all );
DbgPrint( "\tIOD_MC_ERR1: 0x%x\n", IodMcErr1.all );
// DbgPrint( "\tIOD_MDPA_STAT: 0x%x\n", IodMdpaStat.all );
// DbgPrint( "\tIOD_MDPA_SYN: 0x%x\n", IodMdpaSyn.all );
// DbgPrint( "\tIOD_MDPB_STAT: 0x%x\n", IodMdpbStat.all );
// DbgPrint( "\tIOD_MDPB_SYN: 0x%x\n", IodMdpbSyn.all );
#endif //HALDBG
//
// Fill in the Correctable Error frame only if we've connected
// to the Correctable Error interrupt.
//
if (HalpLogCorrectableErrors) {
//
// Real error, get the interrupt object.
//
DispatchCode = (PULONG)PCR->InterruptRoutine[RawhideSoftErrVector];
InterruptObject = CONTAINING_RECORD(
DispatchCode,
KINTERRUPT,
DispatchCode
);
//
// Set various pointers so we can use them later.
//
pFrame = (PERROR_FRAME) Frame;
pTempFrame = (PERROR_FRAME) TempFrame;
pCorr = (PCORRECTABLE_ERROR) &pTempFrame->CorrectableFrame;
pRawCorr = (PRAWHIDE_CORRECTABLE_FRAME) (TempFrame +
sizeof(ERROR_FRAME) );
ErrorlogBusy = (PBOOLEAN)((PUCHAR)InterruptObject->ServiceContext +
sizeof(PERROR_FRAME));
ErrorlogSpinLock = (PKSPIN_LOCK)((PUCHAR)ErrorlogBusy + sizeof(PBOOLEAN));
//
// Clear the data structures that we will use.
//
RtlZeroMemory(&TempFrame, ENTIRE_FRAME_SIZE);
//
// Fill in the error frame information.
//
pTempFrame->Signature = ERROR_FRAME_SIGNATURE;
pTempFrame->LengthOfEntireErrorFrame = ENTIRE_FRAME_SIZE;
pTempFrame->FrameType = CorrectableFrame;
pTempFrame->VersionNumber = ERROR_FRAME_VERSION;
pTempFrame->SequenceNumber = IodCorrectedErrors;
pTempFrame->PerformanceCounterValue =
KeQueryPerformanceCounter(NULL).QuadPart;
//
// Check for lost error.
//
if( IodCapErr.LostMcErr ) {
//
// Since the error registers are locked from a previous error,
// we do not know where the error came from. Mark everything
// as UNIDENTIFIED.
//
pCorr->Flags.LostCorrectable = 1;
pCorr->Flags.LostAddressSpace = UNIDENTIFIED;
pCorr->Flags.LostMemoryErrorSource = UNIDENTIFIED;
}
pCorr->Flags.ErrorBitMasksValid = 0;
//
// Determine error type.
//
if (IodMcErr1.Addr39_32 & 0x80) {
//
// I/O ECC error occurred.
//
pCorr->Flags.AddressSpace = IO_SPACE;
pCorr->Flags.ExtendedErrorValid = 1;
pCorr->ErrorInformation.IoError.Interface = PCIBus;
pCorr->ErrorInformation.IoError.BusNumber = IodMcErr1.DevId & 0x3;
// We never alloc PCI address higher than 1 Gb for any PCI
// address space (sparse mem, dense mem, sparse I/O), so this
// trick works.
pCorr->ErrorInformation.IoError.BusAddress.LowPart =
((IodMcErr0.Addr & 0x3FFFFFFF) >> IO_BIT_SHIFT);
// The code below is not strictly correct. Based on the MC Bus
// spec, p.32, we can roughly say that McCmd<3> tells us whether
// there was a write or read transaction on the bus. If I looked
// at the spec harder, I might be able to distinguish a PIO op
// from a DMA operation.
pCorr->ErrorInformation.IoError.TransferType
= ((IodMcErr1.McCmd & 0x8) ? BUS_IO_READ : BUS_IO_WRITE);
} else {
//
// Memory ECC error occurred.
//
pCorr->Flags.AddressSpace = MEMORY_SPACE;
}
//
// Get the physical address where the error occurred.
//
if (IodMcErr1.Valid) {
pCorr->Flags.PhysicalAddressValid = 1;
pCorr->PhysicalAddress =
((ULONGLONG) (IodMcErr1.Addr39_32)) << 32;
pCorr->PhysicalAddress |= IodMcErr0.all;
}
//
// Scrub the error if it's any type of memory error.
//
if ( pCorr->Flags.AddressSpace == MEMORY_SPACE &&
pCorr->Flags.PhysicalAddressValid ) {
pCorr->Flags.ScrubError = 1;
}
//
// Acquire the spinlock.
//
KeAcquireSpinLock(ErrorlogSpinLock, &Irql );
//
// Check to see if an errorlog operation is in progress already.
//
if (!*ErrorlogBusy) {
//
// Set reporting processor information. Disregard at the moment.
//
pCorr->Flags.ProcessorInformationValid = 0;
//
// Copy the SYSTEM_INFORMATION from the uncorrectable frame
//
pCorr->System = PUncorrectableError->UncorrectableFrame.System;
//
//
// Set raw system information flag.
//
pCorr->Flags.SystemInformationValid = 1;
//
// Do the Rawhide-specific stuff here
//
pRawCorr->Revision = RAWHIDE_CORRECTABLE_FRAME_REVISION;
//
// Copy the CUD header from the uncorrectable frame
//
rawerr = (PRAWHIDE_UNCORRECTABLE_FRAME)
PUncorrectableError->UncorrectableFrame.RawSystemInformation;
if (rawerr) {
pRawCorr->CudHeader = rawerr->CudHeader;
}
//
// Fill in the rest of the dynamic portion of the
// correctable frame.
//
pRawCorr->CudHeader.ActiveCpus = BuildActiveCpus();
pRawCorr->ErrorSubpacketFlags.all = 0;
pRawCorr->ErrorSubpacketFlags.IodSubpacketPresent = 1;
pRawCorr->WhoAmI = HalpReadWhoAmI();
HalpBuildIodErrorFrame( McDeviceId, &(pRawCorr->IodErrorFrame) );
//
// Copy the information that we need to log.
//
RtlCopyMemory(&Frame,
&TempFrame,
ENTIRE_FRAME_SIZE);
pFrame->CorrectableFrame.RawSystemInformation =
(PVOID)((PUCHAR)pFrame + sizeof(ERROR_FRAME) );
pFrame->CorrectableFrame.RawSystemInformationLength =
sizeof(RAWHIDE_CORRECTABLE_FRAME);
//
// Put frame into ISR service context.
//
*(PERROR_FRAME *)InterruptObject->ServiceContext = pFrame;
} else {
//
// An errorlog operation is in progress already. We will
// set various lost bits and then get out without doing
// an actual errorloging call.
//
pFrame->CorrectableFrame.Flags.LostCorrectable = TRUE;
pFrame->CorrectableFrame.Flags.LostAddressSpace =
pTempFrame->CorrectableFrame.Flags.AddressSpace;
pFrame->CorrectableFrame.Flags.LostMemoryErrorSource =
pTempFrame->CorrectableFrame.Flags.MemoryErrorSource;
}
//
// Release the spinlock.
//
KeReleaseSpinLock(ErrorlogSpinLock, Irql );
//
// Dispatch to the secondary correctable interrupt service routine.
// The assumption here is that if this interrupt ever happens, then
// some driver enabled it, and the driver should have the ISR connected.
//
((PSECOND_LEVEL_DISPATCH)InterruptObject->DispatchAddress)(
InterruptObject,
InterruptObject->ServiceContext
);
}
//
// Clear state in MDPA and MDPB before clearing CAP_ERR
//
IodCapErr.all = 0;
IodCapErr.CrdA = 1;
IodCapErr.CrdB = 1;
IodMdpaStat.all = 0xffffffff;
IodMdpbStat.all = 0xffffffff;
WRITE_IOD_REGISTER_NEW( McDeviceId,
&((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaStat,
IodMdpaStat.all
);
WRITE_IOD_REGISTER_NEW( McDeviceId,
&((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaStat,
IodMdpbStat.all
);
WRITE_IOD_REGISTER_NEW( McDeviceId,
&((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr,
IodCapErr.all
);
return;
}
VOID
HalpIodHardErrorInterrupt(
    VOID
    )

/*++

Routine Description:

    Service an IOD hard (uncorrectable) error interrupt.

    Processing is serialized by raising to HIGH_LEVEL and taking
    HalpSystemInterruptLock.  The interrupt is dismissed (lock released,
    IRQL restored) when no IOD reports a latched error or when
    bHandleIsaError() accepts the error as an ISA legacy space access.
    Otherwise the WhoAmI value is recorded in HalpIodWhoAmIOnError, the
    fatal error is reported, and the system bugchecks with
    DATA_BUS_ERROR.

Arguments:

    None.

Return Value:

    None.  On the fatal path, KeBugCheckEx is the final call.

--*/

{
    MC_DEVICE_ID McDeviceId;
    IOD_CAP_ERR IodCapErr;
    KIRQL PreviousIrql;

    //
    // Run at the highest IRQL so no further hard error interrupts are
    // taken while this one is processed, and hold the system interrupt
    // lock so entry to this code is serialized.
    //

    KeRaiseIrql(HIGH_LEVEL, &PreviousIrql);
    KiAcquireSpinLock(&HalpSystemInterruptLock);

    //
    // Locate the IOD that latched the error.  Nothing found is a pretty
    // weird (fatal???) condition, but for now it is simply dismissed.
    // An error that is found is also dismissed when it turns out to be
    // an ISA legacy space access on PCI-1,2,3.  (The ISA check is only
    // made when an IOD was found.)
    //

    if( !bFindIodError( &McDeviceId, &IodCapErr ) ||
        bHandleIsaError( McDeviceId, IodCapErr ) ) {

        //
        // Drop the lock and return to the IRQL we entered at.
        //

        KiReleaseSpinLock(&HalpSystemInterruptLock);
        KeLowerIrql(PreviousIrql);
        return;
    }

#if HALDBG
    DbgPrint( "Hard Error Found on IOD (%x, %x)\n",
              McDeviceId.Gid,
              McDeviceId.Mid );
#endif //HALDBG

    //
    // Remember the WhoAmI value read at the time of the error.
    //

    HalpIodWhoAmIOnError.all = HalpReadWhoAmI();

    //
    // Report the fatal error, then bring the system down.
    //

    bHandleFatalIodError( McDeviceId, FALSE );

    KeBugCheckEx( DATA_BUS_ERROR,
                  0xbeadfeed,       //ecrfix - quick error interrupt id
                  McDeviceId.all,
                  0,
                  (ULONG) PUncorrectableError );
}
BOOLEAN
bHandleFatalIodError(
    MC_DEVICE_ID McDeviceId,
    BOOLEAN bMachineCheck
    )

/*++

Routine Description:

    Common epilogue for a fatal, uncorrectable IOD error, reached from
    either a machine check or an IOD hard error interrupt.  Clears the
    MCES error state, takes ownership of the display, prints the fatal
    error banner and reports the error for the given IOD.

Arguments:

    McDeviceId - IOD on which the error was found.

    bMachineCheck - TRUE when called for a fatal machine check,
                    FALSE when called for a fatal hard error interrupt.
                    Only used to select the debug message.

Return Value:

    Always FALSE: the error has not been dismissed and execution
    cannot continue.

--*/

{

#if HALDBG
    DbgPrint( bMachineCheck
                  ? "Handling fatal error - machine check\n"
                  : "Handling fatal error - hard error interrupt\n" );
#endif

    //
    // Clear the error condition in the MCES register.
    //
    // ecrfix - as written, this also happens for hard error interrupts
    // (where there has been *no* machine check).  Hopefully that is
    // benign....
    //

    HalpUpdateMces( TRUE, TRUE );

    //
    // Take over the display and put up the dreaded banner.
    //

    HalAcquireDisplayOwnership(NULL);
    HalDisplayString( "\nFatal system hardware error.\n" );

#ifdef DUMPIODS
    DumpAllIods(AllRegisters);
#endif

    //
    // Report the details for the offending IOD.
    //

    HalpIodReportFatalError( McDeviceId );

    return FALSE;
}
BOOLEAN
bFindIodError(
    PMC_DEVICE_ID pMcDeviceId,
    PIOD_CAP_ERR pIodCapErr
    )

/*++

Routine Description:

    Determines which IOD has an error latched in it by enumerating the
    IODs in HalpIodMask and reading each one's CAP_ERR register until
    one reports a valid PCI or MC error.

Arguments:

    pMcDeviceId - Receives the MC device id held by the enumeration
                  context when the search stops.  Only meaningful when
                  TRUE is returned.

    pIodCapErr - Receives the last CAP_ERR value read (the erring IOD's
                 CAP_ERR when TRUE is returned), or zero if no IOD was
                 enumerated at all.

Return Value:

    TRUE if an IOD was found with an error latched in CAP_ERR.
    FALSE otherwise.

--*/

{
    MC_ENUM_CONTEXT mcCtx;
    ULONG numIods;
    BOOLEAN bfoundIod;
    IOD_CAP_ERR IodCapErr;

    //
    // Start from a defined value so that the caller never receives an
    // indeterminate CAP_ERR if the enumerator yields no IODs.
    //

    IodCapErr.all = 0;

    //
    // Initialize enumerator.
    //

    numIods = HalpMcBusEnumStart ( HalpIodMask, &mcCtx );

#if 0 // HALDBG
    DbgPrint( "FindIodError: Searching: %d Iods: ", numIods);
#endif // HALDBG

    //
    // Search each Iod and look for a PCI or McBus error.
    //

    while ( (bfoundIod = HalpMcBusEnum( &mcCtx )) != FALSE ) {

        //
        // Read the IOD error register to determine the source of the
        // error.
        //

#if 0 //HALDBG
        DbgPrint( "(%d, %d) ", mcCtx.McDeviceId.Gid, mcCtx.McDeviceId.Mid);
#endif // HALDBG

        IodCapErr.all = READ_IOD_REGISTER_NEW( mcCtx.McDeviceId,
                      &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr );

        if( (IodCapErr.PciErrValid != 0) || (IodCapErr.McErrValid != 0) ){
            break;
        }
    }

#if 0 // HALDBG
    if (bfoundIod) {
        DbgPrint( "Found!\n");
    } else {
        DbgPrint( "Error Not Found!\n");
    }
#endif // HALDBG

    //
    // Return the McDeviceId and CapErr register contents
    // of the first IOD that has an error.
    //

    *pMcDeviceId = mcCtx.McDeviceId;
    pIodCapErr->all = IodCapErr.all;

    return (bfoundIod);
}
BOOLEAN
bHandleIsaError(
    MC_DEVICE_ID McDeviceId,
    IOD_CAP_ERR IodCapErrIn
    )

/*++

Routine Description:

    Gives PCI-1,2,3 ISA legacy semantics for I/O and memory accesses.

    Enumerates the IODs in HalpIodMask looking for one whose CAP_ERR
    shows a PCI master abort and nothing else (PciErrValid and Mab set;
    Perr, Serr and PteInv clear).  If that IOD is not PCI0 and the
    faulting PCI address is below 1 MB, all CAP_ERR errors are cleared
    on all IODs and the error is considered handled.

Arguments:

    McDeviceId - Not referenced; the routine performs its own search.

    IodCapErrIn - Not referenced; CAP_ERR is re-read from each IOD.

Return Value:

    TRUE if the error was handled.
    FALSE otherwise.

--*/

{
    MC_ENUM_CONTEXT EnumContext;
    MC_DEVICE_ID MabDeviceId;
    ULONG IodCount;
    IOD_CAP_ERR CapErr;
    IOD_CAP_ERR ClearMask;
    IOD_PCI_ERR1 PciErr1;

    //
    // Walk the IODs until one is found with only a master abort
    // latched.  Running out of IODs means this is not our error.
    //

    IodCount = HalpMcBusEnumStart ( HalpIodMask, &EnumContext );

    for (;;) {

        if ( !HalpMcBusEnum( &EnumContext ) ) {
            return FALSE;
        }

        //
        // Read the IOD error register to determine who has Mab set
        //

        CapErr.all = READ_IOD_REGISTER_NEW( EnumContext.McDeviceId,
                      &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr );

        if( (CapErr.PciErrValid == 1) &&
            (CapErr.Perr == 0) &&
            (CapErr.Serr == 0) &&
            (CapErr.Mab == 1) &&
            (CapErr.PteInv == 0) ) {
            break;
        }
    }

    //
    // Remember which IOD took the master abort.
    //

    MabDeviceId.all = EnumContext.McDeviceId.all;

    //
    // Only buses other than PCI0 qualify.  (PCI0 reads to non-existent
    // ISA addresses are fixed up by the PCI-EISA bridge, so a master
    // abort on PCI0 really is an error.)
    //

    if ( MabDeviceId.Mid == MidPci0 ) {
        return FALSE;
    }

    //
    // Fetch the PCI address of the transaction that caused the MAB.
    //

    PciErr1.PciAddress =
        (ULONG) READ_IOD_REGISTER_NEW( MabDeviceId,
                    &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->PciErr1 );

    //
    // An ISA legacy memory or I/O space read must fall in the
    // range 0-1 Mb; anything else is not handled here.
    //

    if( !(PciErr1.PciAddress < __1MB) ) {

#if HALDBG
        DbgPrint( "Failed checking for legacy ISA read:\n");
        DbgPrint( "PciErr1 : %08x\n", PciErr1.PciAddress );
#endif //HALDBG

        return FALSE;
    }

    //
    // The error has matched all of our conditions.  Assume that
    // V0 has already been set to 0xffffffff.  (This is a contract
    // with the HAL access routines in iodio.s.)
    //

    ClearMask.all = ALL_CAP_ERRORS;
    HalpClearAllIods( ClearMask );

    return TRUE;
}
VOID
HalpErrorFrameString(
    PUNCORRECTABLE_ERROR uncorr,
    PUCHAR OutBuffer
    )

/*++

Routine Description:

    Append a message to the error string held in the Uncorrectable
    Error Frame.  The position for the next append is remembered
    between calls in a static pointer.

Arguments:

    uncorr - Pointer to the UNCORRECTABLE_ERROR frame.  If NULL,
             nothing is appended.

    OutBuffer - Zero-terminated message to be appended.
                If NULL, no string is appended; the append position is
                reset and, when uncorr is not NULL, its
                ErrorStringValid flag is cleared.

Return Value:

    None.

--*/

{
    static PCHAR pAppendAt = NULL;
    ULONG MessageLength;

    //
    // A NULL message is a request to reset the append state.
    //

    if (OutBuffer == NULL) {
        pAppendAt = NULL;
        if (uncorr) {
            uncorr->Flags.ErrorStringValid = 0;
        }
        return;
    }

    //
    // Without a frame there is nowhere to put the message.
    //

    if (!uncorr) {
        return;
    }

    //
    // First message since the last reset: start at the beginning of
    // ErrorString and mark the string valid.
    //

    if (pAppendAt == NULL) {
        pAppendAt = uncorr->ErrorString;
        uncorr->Flags.ErrorStringValid = 1;
    }

    //
    // Copy the message characters (without the terminator), advance the
    // append position past them and zero-terminate the string there.
    //
    // NOTE(review): the append is not bounded by the capacity of
    // ErrorString - confirm that callers keep the accumulated length
    // within the buffer.
    //

    MessageLength = strlen(OutBuffer);
    strncpy(pAppendAt, OutBuffer, MessageLength);

    pAppendAt += MessageLength;
    *pAppendAt = 0;
}
ULONG
BuildActiveCpus (
    VOID
    )

/*++

Routine Description:

    Build a mask of active physical CPUs from the logical processor
    mask in HalpActiveProcessors.  For every logical processor whose
    bit is set, the corresponding physical CPU number is obtained via
    MCDEVID_TO_PHYS_CPU from HalpLogicalToPhysicalProcessor[] and that
    bit is set in the result.

Arguments:

    None.

Return Value:

    Bit mask of active physical CPUs.

--*/

{
    ULONG ActiveLogicalProcessors = HalpActiveProcessors;
    ULONG ActivePhysicalCpus = 0;
    ULONG i;

    //
    // Make a physical processor mask from the logical processor mask.
    // The logical mask is shifted right each iteration so that bit 0
    // always corresponds to logical processor i.
    //

    for (i = 0; i < HalpNumberOfCpus; i++, ActiveLogicalProcessors >>= 1) {

        if (ActiveLogicalProcessors & 0x1) {
            ActivePhysicalCpus |= (1 << (ULONG) (MCDEVID_TO_PHYS_CPU(
                               HalpLogicalToPhysicalProcessor[i].all)));
        }
    }

    return (ActivePhysicalCpus);
}