NT4/private/ntos/nthals/halalpha/ev4mchk.c

710 lines
18 KiB
C
Raw Normal View History

2001-01-01 00:00:00 +01:00
/*++
Copyright (c) 1994 Digital Equipment Corporation
Module Name:
ev4mchk.c
Abstract:
This module implements generalized machine check handling for
platforms based on the DECchip 21064 (EV4) microprocessor.
Author:
Joe Notarangelo 14-Feb-1994
Environment:
Kernel mode only.
Revision History:
--*/
#include "halp.h"
#include "axp21064.h"
#include "stdio.h"
//
// Declare the extern variable UncorrectableError declared in
// inithal.c.
//
extern PERROR_FRAME PUncorrectableError;
VOID
HalpDisplayLogout21064(
IN PLOGOUT_FRAME_21064 LogoutFrame );
BOOLEAN
HalpPlatformMachineCheck(
IN PEXCEPTION_RECORD ExceptionRecord,
IN PKEXCEPTION_FRAME ExceptionFrame,
IN PKTRAP_FRAME TrapFrame
);
VOID
HalpUpdateMces(
IN BOOLEAN ClearMachineCheck,
IN BOOLEAN ClearCorrectableError
);
//
// MAX_RETRYABLE_ERRORS is the number of times to retry machine checks before
// just giving up.
//
#define MAX_RETRYABLE_ERRORS 32
//
// System-wide controls for machine check reporting.
//
ProcessorCorrectableDisable = FALSE;
SystemCorrectableDisable = FALSE;
MachineCheckDisable = FALSE;
//
//jnfix - temporary counts
//
ULONG CorrectableErrors = 0;
ULONG RetryableErrors = 0;
VOID
HalpSetMachineCheckEnables(
IN BOOLEAN DisableMachineChecks,
IN BOOLEAN DisableProcessorCorrectables,
IN BOOLEAN DisableSystemCorrectables
)
/*++
Routine Description:
This function sets the enables that define which machine check
errors will be signaled by the processor.
N.B. - The system has the capability to ignore all machine checks
by indicating DisableMachineChecks = TRUE. This is intended
for debugging purposes on broken hardware. If you disable
this you will get no machine check no matter what error the
system/processor detects. Consider the consequences.
Arguments:
DisableMachineChecks - Supplies a boolean which indicates if all
machine checks should be disabled and not
reported. (see note above).
DisableProcessorCorrectables - Supplies a boolean which indicates if
processor correctable error reporting
should be disabled.
DisableSystemCorrectables - Supplies a boolean which indicates if
system correctable error reporting
should be disabled.
Return Value:
None.
--*/
{
ProcessorCorrectableDisable = DisableProcessorCorrectables;
SystemCorrectableDisable = DisableSystemCorrectables;
MachineCheckDisable = DisableMachineChecks;
HalpUpdateMces( FALSE, FALSE );
return;
}
VOID
HalpUpdateMces(
IN BOOLEAN ClearMachineCheck,
IN BOOLEAN ClearCorrectableError
)
/*++
Routine Description:
This function updates the state of the MCES internal processor
register.
Arguments:
ClearMachineCheck - Supplies a boolean that indicates if the machine
check indicator in the MCES should be cleared.
ClearCorrectableError - Supplies a boolean that indicates if the
correctable error indicators in the MCES should
be cleared.
Return Value:
None.
--*/
{
MCES Mces;
Mces.MachineCheck = ClearMachineCheck;
Mces.SystemCorrectable = ClearCorrectableError;
Mces.ProcessorCorrectable = ClearCorrectableError;
Mces.DisableProcessorCorrectable = ProcessorCorrectableDisable;
Mces.DisableSystemCorrectable = SystemCorrectableDisable;
Mces.DisableMachineChecks = MachineCheckDisable;
HalpWriteMces( Mces );
}
BOOLEAN
HalMachineCheck (
IN PEXCEPTION_RECORD ExceptionRecord,
IN PKEXCEPTION_FRAME ExceptionFrame,
IN PKTRAP_FRAME TrapFrame
)
/*++
Routine Description:
This function fields machine check for 21064-based machines with
parity memory protection.
Arguments:
ExceptionRecord - Supplies a pointer to the exception record for the
machine check. Included in the exception information
is the pointer to the logout frame.
ExceptionFrame - Supplies a pointer to the kernel exception frame.
TrapFrame - Supplies a pointer to the kernel trap frame.
Return Value:
A value of TRUE is returned if the machine check has been
handled by the HAL. If it has been handled then execution may
resume at the faulting address. Otherwise, a value of FALSE
is returned.
N.B. - under some circumstances this routine may not return at
all.
--*/
{
PBIU_STAT_21064 BiuStat;
BOOLEAN Handled;
PLOGOUT_FRAME_21064 LogoutFrame;
PMCHK_STATUS MachineCheckStatus;
MCES Mces;
BOOLEAN UnhandledPlatformError = FALSE;
PUNCORRECTABLE_ERROR uncorrerr = NULL;
PPROCESSOR_EV4_UNCORRECTABLE ev4uncorr = NULL;
//
// This is a parity machine. If we receive a machine check it
// is uncorrectable. We will print the logout frame and then we
// will bugcheck.
//
// However, we will first check that the machine check is not
// marked as correctable. If the machine check is correctable it uses
// a differently formatted logout frame. We will ignore the frame
// for any machine check marked as correctable since it is an impossible
// condition on a parity machine.
//
MachineCheckStatus =
(PMCHK_STATUS)&ExceptionRecord->ExceptionInformation[0];
//
// Handle any processor correctable errors.
//
if( MachineCheckStatus->Correctable == 1 ){
//
// Log the error.
//
//jnfix - temporary code
// simply assume this was a fill ecc correctable for now, print
// a debug message periodically
CORRECTABLE_FRAME_21064 *CorrectableFrame;
CorrectableFrame =
(CORRECTABLE_FRAME_21064 *)ExceptionRecord->ExceptionInformation[1];
CorrectableErrors += 1;
#if (DBG) || (HALDBG)
if( (CorrectableErrors % 32) == 0 ){
DbgPrint( "Correctable errors = %d, Error Address = 0x%016Lx\n",
CorrectableFrame->FillAddr );
}
#endif //DBG || HALDBG
//
// Acknowledge receipt of the correctable error by clearing
// the error in the MCES register.
//
HalpUpdateMces( FALSE, TRUE );
return TRUE;
}
//
// Handle any retryable errors.
//
if( MachineCheckStatus->Retryable == 1 ){
//
// Increment the number of retryable machine checks.
//
RetryableErrors += 1;
#if (DBG) || (HALDBG)
DbgPrint( "HAL Retryable Errors = %d\n", RetryableErrors );
#endif //DBG || HALDBG
//
// We'll retry MAX_RETRYABLE_ERRORS times and then give up. We
// do this to avoid infinite retry loops.
//
if( RetryableErrors <= MAX_RETRYABLE_ERRORS ){
//
// Acknowledge receipt of the retryable machine check.
//
HalpUpdateMces( TRUE, TRUE );
return TRUE;
}
}
//
// Capture the logout frame pointer.
//
LogoutFrame =
(PLOGOUT_FRAME_21064)ExceptionRecord->ExceptionInformation[1];
if(PUncorrectableError) {
//
// Fill in the processor specific uncorrectable error frame
//
uncorrerr = (PUNCORRECTABLE_ERROR)
&PUncorrectableError->UncorrectableFrame;
//
// first fill in some generic processor Information.
// For the Current (Reporting) Processor.
//
HalpGetProcessorInfo(&uncorrerr->ReportingProcessor);
uncorrerr->Flags.ProcessorInformationValid = 1;
ev4uncorr = (PPROCESSOR_EV4_UNCORRECTABLE)
uncorrerr->RawProcessorInformation;
}
if(ev4uncorr) {
ev4uncorr->BiuStat = ((ULONGLONG)LogoutFrame->BiuStat.all.HighPart <<
32) | LogoutFrame->BiuStat.all.LowPart;
ev4uncorr->BiuAddr = ((ULONGLONG)LogoutFrame->BiuAddr.HighPart << 32) |
LogoutFrame->BiuAddr.LowPart;
ev4uncorr->AboxCtl = ((ULONGLONG)LogoutFrame->AboxCtl.all.HighPart <<
32) | LogoutFrame->AboxCtl.all.LowPart;
ev4uncorr->BiuCtl = ((ULONGLONG)LogoutFrame->BiuCtl.HighPart << 32) |
LogoutFrame->BiuCtl.LowPart;
ev4uncorr->CStat = ((ULONGLONG)LogoutFrame->DcStat.all.HighPart <<
32) | LogoutFrame->DcStat.all.LowPart;
ev4uncorr->BcTag = ((ULONGLONG)LogoutFrame->BcTag.all.HighPart <<
32 ) | LogoutFrame->BcTag.all.LowPart;
ev4uncorr->FillAddr = ((ULONGLONG)LogoutFrame->FillAddr.HighPart <<
32) | LogoutFrame->FillAddr.LowPart;
ev4uncorr->FillSyndrome =
((ULONGLONG)LogoutFrame->FillSyndrome.all.HighPart << 32) |
LogoutFrame->FillSyndrome.all.LowPart;
}
//
// Check for any hard errors that cannot be dismissed.
// They are:
// Tag parity error
// Tag control parity error
// Multiple external errors
// Fill ECC error
// Fill parity error
// Multiple fill errors
//
BiuStat = (PBIU_STAT_21064)&LogoutFrame->BiuStat;
if( (BiuStat->bits.BcTperr == 1) ||
(BiuStat->bits.BcTcperr == 1) ||
(BiuStat->bits.Fatal1 == 1) ||
(BiuStat->bits.FillEcc == 1) ||
(BiuStat->bits.FillDperr == 1) ||
(BiuStat->bits.Fatal2 == 1) ) {
//
// set the flag to indicate that the processor information is valid.
//
uncorrerr->Flags.ProcessorInformationValid = 1;
uncorrerr->Flags.ErrorStringValid = 1;
sprintf(uncorrerr->ErrorString,"Uncorrectable Error From "
"Processor Detected");
//
// A serious, uncorrectable error has occured, under no circumstances
// can it be simply dismissed.
//
goto FatalError;
}
//jnfix - dcache parity error checking for EV45?
//
// It is possible that the system has experienced a hard error and
// that nonetheless the error is recoverable. This is a system-specific
// decision - allow it to be handled as such.
//
UnhandledPlatformError = TRUE;
if( (Handled = HalpPlatformMachineCheck(
ExceptionRecord,
ExceptionFrame,
TrapFrame )) == TRUE ){
//
// The system-specific code has handled the error. Dismiss
// the error and continue execution.
//
HalpUpdateMces( TRUE, TRUE );
return TRUE;
}
//
// The system has experienced a fatal error that cannot be corrected.
// Print any possible relevant information and crash the system.
//
// N.B. - In the future some of these fatal errors could be potential
// recovered. Example, a user process gets a fatal error on one
// of its pages - we kill the user process, mark the page as bad
// and continue executing.
//
FatalError:
//
// Begin the error output if this is a processor error. If this is
// an unhandled platform error than that code is responsible for
// beginning the error output.
//
if( UnhandledPlatformError == FALSE ){
//
// Acquire ownership of the display. This is done here in case we take
// a machine check before the display has been taken away from the HAL.
// When the HAL begins displaying strings after it has lost the
// display ownership then the HAL will be careful not to scroll
// information off of the screen.
//
HalAcquireDisplayOwnership(NULL);
//
// Display the dreaded banner.
//
HalDisplayString( "\nFatal system hardware error.\n\n" );
}
//
// Display the EV4 logout frame.
//
HalpDisplayLogout21064( LogoutFrame );
//
// Bugcheck to dump the rest of the machine state, this will help
// if the machine check is software-related.
//
KeBugCheckEx( DATA_BUS_ERROR,
(ULONG)MachineCheckStatus->Correctable,
(ULONG)MachineCheckStatus->Retryable,
(ULONG)0,
(ULONG)PUncorrectableError );
}
#define MAX_ERROR_STRING 100
VOID
HalpDisplayLogout21064 (
IN PLOGOUT_FRAME_21064 LogoutFrame
)
/*++
Routine Description:
This function displays the logout frame for a 21064.
Arguments:
LogoutFrame - Supplies a pointer to the logout frame generated
by the 21064.
Return Value:
None.
--*/
{
UCHAR OutBuffer[ MAX_ERROR_STRING ];
ULONG Index;
PKPRCB Prcb;
extern HalpLogicalToPhysicalProcessor[HAL_MAXIMUM_PROCESSOR+1];
sprintf( OutBuffer, "BIU_STAT : %016Lx BIU_ADDR: %016Lx\n",
BIUSTAT_ALL_21064( LogoutFrame->BiuStat ),
LogoutFrame->BiuAddr.QuadPart );
HalDisplayString( OutBuffer );
sprintf( OutBuffer, "FILL_ADDR: %016Lx FILL_SYN: %016Lx\n",
LogoutFrame->FillAddr.QuadPart,
FILLSYNDROME_ALL_21064(LogoutFrame->FillSyndrome) );
HalDisplayString( OutBuffer );
sprintf( OutBuffer, "DC_STAT : %016Lx BC_TAG : %016Lx\n",
DCSTAT_ALL_21064(LogoutFrame->DcStat),
BCTAG_ALL_21064(LogoutFrame->BcTag) );
HalDisplayString( OutBuffer );
sprintf( OutBuffer, "ICCSR : %016Lx ABOX_CTL: %016Lx EXC_SUM: %016Lx\n",
ICCSR_ALL_21064(LogoutFrame->Iccsr),
ABOXCTL_ALL_21064(LogoutFrame->AboxCtl),
EXCSUM_ALL_21064(LogoutFrame->ExcSum) );
HalDisplayString( OutBuffer );
sprintf( OutBuffer, "EXC_ADDR : %016Lx VA : %016Lx MM_CSR : %016Lx\n",
LogoutFrame->ExcAddr.QuadPart,
LogoutFrame->Va.QuadPart,
MMCSR_ALL_21064(LogoutFrame->MmCsr) );
HalDisplayString( OutBuffer );
sprintf( OutBuffer, "HIRR : %016Lx HIER : %016Lx PS : %016Lx\n",
IRR_ALL_21064(LogoutFrame->Hirr),
IER_ALL_21064(LogoutFrame->Hier),
PS_ALL_21064(LogoutFrame->Ps) );
HalDisplayString( OutBuffer );
sprintf( OutBuffer, "Waiting 15 seconds...\n");
HalDisplayString( OutBuffer );
for( Index=0; Index<1500; Index++)
KeStallExecutionProcessor( 10000 ); // ~15 second delay
sprintf( OutBuffer, "PAL_BASE : %016Lx \n",
LogoutFrame->PalBase.QuadPart );
HalDisplayString( OutBuffer );
//
// Print out interpretation of the error.
//
//
// First print the processor on which we saw the error
//
Prcb = PCR->Prcb;
sprintf( OutBuffer, "Error on processor number: %d\n",
HalpLogicalToPhysicalProcessor[Prcb->Number] );
HalDisplayString( OutBuffer );
//
// If we got a Data Cache Parity Error print a message on screen.
//
if( DCSTAT_DCPARITY_ERROR_21064(LogoutFrame->DcStat) ){
sprintf( OutBuffer, "Data Cache Parity Error.\n" );
HalDisplayString( OutBuffer );
}
//
// Check for tag control parity error.
//
if( BIUSTAT_TCPERR_21064(LogoutFrame->BiuStat) == 1 ){
sprintf( OutBuffer,
"Tag control parity error, Tag control: P: %1x D: %1x S: %1x V: %1x\n",
BCTAG_TAGCTLP_21064( LogoutFrame->BcTag ),
BCTAG_TAGCTLD_21064( LogoutFrame->BcTag ),
BCTAG_TAGCTLS_21064( LogoutFrame->BcTag ),
BCTAG_TAGCTLV_21064( LogoutFrame->BcTag ) );
HalDisplayString( OutBuffer );
}
//
// Check for tag parity error.
//
if( BIUSTAT_TPERR_21064(LogoutFrame->BiuStat) == 1 ){
sprintf( OutBuffer,
"Tag parity error, Tag: %x Parity: %1x\n",
BCTAG_TAG_21064(LogoutFrame->BcTag),
BCTAG_TAGP_21064(LogoutFrame->BcTag) );
HalDisplayString( OutBuffer );
}
//
// Check for hard error.
//
if( BIUSTAT_HERR_21064(LogoutFrame->BiuStat) == 1 ){
sprintf( OutBuffer, "Hard error acknowledge: BIU CMD: %x PA: %16Lx\n",
BIUSTAT_CMD_21064(LogoutFrame->BiuStat),
LogoutFrame->BiuAddr.QuadPart );
HalDisplayString( OutBuffer );
}
//
// Check for soft error.
//
if( BIUSTAT_SERR_21064(LogoutFrame->BiuStat) == 1 ){
sprintf( OutBuffer, "Soft error acknowledge: BIU CMD: %x PA: %16Lx\n",
BIUSTAT_CMD_21064(LogoutFrame->BiuStat),
LogoutFrame->BiuAddr.QuadPart );
HalDisplayString( OutBuffer );
}
//
// Check for fill ECC errors.
//
if( BIUSTAT_FILLECC_21064(LogoutFrame->BiuStat) == 1 ){
sprintf( OutBuffer, "ECC error: %s\n",
(BIUSTAT_FILLIRD_21064(LogoutFrame->BiuStat) ?
"Icache Fill" : "Dcache Fill") );
HalDisplayString( OutBuffer );
sprintf( OutBuffer,
"PA: %16Lx Quadword: %x Longword0: %x Longword1: %x\n",
LogoutFrame->FillAddr.QuadPart,
BIUSTAT_FILLQW_21064(LogoutFrame->BiuStat),
FILLSYNDROME_LO_21064(LogoutFrame->FillSyndrome),
FILLSYNDROME_HI_21064(LogoutFrame->FillSyndrome) );
HalDisplayString( OutBuffer );
}
//
// Check for fill Parity errors.
//
if( BIUSTAT_FILLDPERR_21064(LogoutFrame->BiuStat) == 1 ){
sprintf( OutBuffer, "Parity error: %s\n",
(BIUSTAT_FILLIRD_21064(LogoutFrame->BiuStat) ?
"Icache Fill" : "Dcache Fill") );
HalDisplayString( OutBuffer );
sprintf( OutBuffer,
"PA: %16Lx Quadword: %x Longword0: %x Longword1: %x\n",
LogoutFrame->FillAddr.QuadPart,
BIUSTAT_FILLQW_21064(LogoutFrame->BiuStat),
FILLSYNDROME_LO_21064(LogoutFrame->FillSyndrome),
FILLSYNDROME_HI_21064(LogoutFrame->FillSyndrome) );
HalDisplayString( OutBuffer );
}
//
// Check for multiple hard errors.
//
if( BIUSTAT_FATAL1_21064(LogoutFrame->BiuStat) == 1 ){
HalDisplayString( "Multiple external/tag errors detected.\n" );
}
//
// Check for multiple fill errors.
//
if( BIUSTAT_FATAL2_21064(LogoutFrame->BiuStat) == 1 ){
HalDisplayString( "Multiple fill errors detected.\n" );
}
//
// return to caller
//
return;
}