NT4/private/ntos/nthals/halalpha/ev5mchk.c
2020-09-30 17:12:29 +02:00

678 lines
17 KiB
C
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*++
Copyright (c) 1994 Digital Equipment Corporation
Module Name:
ev5mchk.c
Abstract:
This module implements generalized machine check handling for
platforms based on the DECchip 21164 (EV5) microprocessor.
Author:
Joe Notarangelo 30-Jun-1994
Environment:
Kernel mode only.
Revision History:
--*/
#include "halp.h"
#include "axp21164.h"
#include "stdio.h"
//
// Pointer to the uncorrectable error frame. The variable is
// PUncorrectableError and is defined outside this file (in inithal.c,
// according to the original note here). HalMachineCheck tests it
// against NULL before filling in the frame.
//
extern PERROR_FRAME PUncorrectableError;
//
// Displays the contents of a 21164 logout frame. Defined later in this
// file.
//
VOID
HalpDisplayLogout21164(
IN PLOGOUT_FRAME_21164 LogoutFrame );
//
// Platform-specific machine check handler; not defined in this file.
// HalMachineCheck dismisses the machine check when this returns TRUE.
//
BOOLEAN
HalpPlatformMachineCheck(
IN PEXCEPTION_RECORD ExceptionRecord,
IN PKEXCEPTION_FRAME ExceptionFrame,
IN PKTRAP_FRAME TrapFrame
);
//
// Rewrites the MCES register from the current disable controls, with
// the two arguments selecting which error indicators to clear. Defined
// later in this file.
//
VOID
HalpUpdateMces(
IN BOOLEAN ClearMachineCheck,
IN BOOLEAN ClearCorrectableError
);
//
// System-wide controls for machine check reporting.
//
// These are written by HalpSetMachineCheckEnables and copied into the
// MCES image by HalpUpdateMces.
//
// The type is spelled out as int: a definition with no type specifier
// relies on the "implicit int" rule, which C99 removed. int is exactly
// the type the implicit form produced, so the objects are unchanged.
//
int ProcessorCorrectableDisable = FALSE;
int SystemCorrectableDisable = FALSE;
int MachineCheckDisable = FALSE;
//
// Error counts.
//
// CorrectableErrors is incremented by Halp21164CorrectedErrorInterrupt.
// RetryableErrors is incremented by HalMachineCheck for each retryable
// machine check.
//
ULONG CorrectableErrors = 0;
ULONG RetryableErrors = 0;
VOID
HalpSetMachineCheckEnables(
    IN BOOLEAN DisableMachineChecks,
    IN BOOLEAN DisableProcessorCorrectables,
    IN BOOLEAN DisableSystemCorrectables
    )
/*++

Routine Description:

    Records which classes of machine check error are to be suppressed
    and then rewrites the MCES register so the new settings take effect.

    N.B. - Passing DisableMachineChecks = TRUE makes the system ignore
           all machine checks. This is intended for debugging on broken
           hardware: with machine checks disabled, no machine check is
           raised no matter what error the system/processor detects.
           Consider the consequences.

Arguments:

    DisableMachineChecks - TRUE if all machine checks should be disabled
        and not reported (see the note above).

    DisableProcessorCorrectables - TRUE if processor correctable error
        reporting should be disabled.

    DisableSystemCorrectables - TRUE if system correctable error
        reporting should be disabled.

Return Value:

    None.

--*/
{
    //
    // Latch the requested settings in the file-level controls that
    // HalpUpdateMces reads.
    //

    MachineCheckDisable = DisableMachineChecks;
    SystemCorrectableDisable = DisableSystemCorrectables;
    ProcessorCorrectableDisable = DisableProcessorCorrectables;

    //
    // Rewrite MCES with the new disables; neither "clear" flag is
    // requested.
    //

    HalpUpdateMces( FALSE, FALSE );
}
VOID
HalpUpdateMces(
    IN BOOLEAN ClearMachineCheck,
    IN BOOLEAN ClearCorrectableError
    )
/*++

Routine Description:

    This function updates the state of the MCES internal processor
    register. The disable bits are always taken from the file-level
    controls (ProcessorCorrectableDisable, SystemCorrectableDisable,
    MachineCheckDisable); the arguments select which error indicators
    are cleared.

Arguments:

    ClearMachineCheck - Supplies a boolean that indicates if the machine
        check indicator in the MCES should be cleared.

    ClearCorrectableError - Supplies a boolean that indicates if the
        correctable error indicators (both system and processor) in the
        MCES should be cleared.

Return Value:

    None.

--*/
{
    //
    // Start from an all-zero image so that any part of MCES that is not
    // assigned below is written as zero rather than as whatever happened
    // to be on the stack.
    // NOTE(review): the MCES layout is declared outside this file;
    // confirm whether it carries bits beyond the six set here.
    //

    MCES Mces = {0};

    //
    // Indicator bits, driven by the caller's clear requests.
    //

    Mces.MachineCheck = ClearMachineCheck;
    Mces.SystemCorrectable = ClearCorrectableError;
    Mces.ProcessorCorrectable = ClearCorrectableError;

    //
    // Disable bits, taken from the system-wide controls.
    //

    Mces.DisableProcessorCorrectable = ProcessorCorrectableDisable;
    Mces.DisableSystemCorrectable = SystemCorrectableDisable;
    Mces.DisableMachineChecks = MachineCheckDisable;

    HalpWriteMces( Mces );
}
BOOLEAN
HalMachineCheck (
    IN PEXCEPTION_RECORD ExceptionRecord,
    IN PKEXCEPTION_FRAME ExceptionFrame,
    IN PKTRAP_FRAME TrapFrame
    )
/*++

Routine Description:

    This function fields machine checks for 21164-based machines.

    Retryable machine checks are counted and dismissed. For anything
    else the processor error status is copied into the uncorrectable
    error frame (when one is available), processor hard errors go
    straight to the fatal path, and all remaining errors are offered to
    the platform-specific handler before being treated as fatal.

Arguments:

    ExceptionRecord - Supplies a pointer to the exception record for the
        machine check. ExceptionInformation[0] is read as the machine
        check status and ExceptionInformation[1] as the pointer to the
        logout frame.

    ExceptionFrame - Supplies a pointer to the kernel exception frame.

    TrapFrame - Supplies a pointer to the kernel trap frame.

Return Value:

    A value of TRUE is returned if the machine check has been
    handled by the HAL. If it has been handled then execution may
    resume at the faulting address.

    N.B. - On the fatal path this routine calls KeBugCheckEx and is not
           expected to return at all.

--*/
{
    PLOGOUT_FRAME_21164 LogoutFrame;
    PMCHK_STATUS MachineCheckStatus;
    PICPERR_STAT_21164 icPerrStat;
    PDC_PERR_STAT_21164 dcPerrStat;
    PSC_STAT_21164 scStat;
    PEI_STAT_21164 eiStat;
    BOOLEAN UnhandledPlatformError = FALSE;
    PUNCORRECTABLE_ERROR uncorrerr = NULL;
    PPROCESSOR_EV5_UNCORRECTABLE ev5uncorr = NULL;

    //
    // Check for retryable errors. These are usually I-stream parity
    // errors, which may be retried following a cache flush (the cache
    // flush is handled by the PAL).
    //

    MachineCheckStatus =
        (PMCHK_STATUS)&ExceptionRecord->ExceptionInformation[0];

    //
    // Handle any retryable errors.
    //

    if( MachineCheckStatus->Retryable == 1 ){

        //
        // Log the error.
        //

        RetryableErrors += 1;

#if (DBG) || (HALDBG)
        if( (RetryableErrors % 32) == 0 ){
            DbgPrint( "HAL Retryable Errors = %d\n", RetryableErrors );
        }
#endif //DBG || HALDBG

        //
        // Acknowledge receipt of the retryable machine check.
        //

        HalpUpdateMces( TRUE, TRUE );

        return TRUE;
    }

    //
    // Capture the logout frame pointer.
    //

    LogoutFrame =
        (PLOGOUT_FRAME_21164)ExceptionRecord->ExceptionInformation[1];

    //
    // Check for any hard errors that cannot be dismissed.
    // They are:
    //      Tag parity error
    //      Tag control parity error
    //      Multiple external errors
    //      Fill ECC error
    //      Fill parity error
    //      Multiple fill errors
    //

    icPerrStat = (PICPERR_STAT_21164)&LogoutFrame->IcPerrStat;
    dcPerrStat = (PDC_PERR_STAT_21164)&LogoutFrame->DcPerrStat;
    scStat = (PSC_STAT_21164)&LogoutFrame->ScStat;
    eiStat = (PEI_STAT_21164)&LogoutFrame->EiStat;

    if(PUncorrectableError) {

        //
        // Fill in the processor specific uncorrectable error frame
        //

        uncorrerr = (PUNCORRECTABLE_ERROR)
                        &PUncorrectableError->UncorrectableFrame;

        //
        // First fill in some generic processor information
        // for the current (reporting) processor.
        //

        HalpGetProcessorInfo(&uncorrerr->ReportingProcessor);
        uncorrerr->Flags.ProcessorInformationValid = 1;

        ev5uncorr = (PPROCESSOR_EV5_UNCORRECTABLE)
                        uncorrerr->RawProcessorInformation;
    }

    //
    // Copy the raw error registers from the logout frame into the
    // EV5-specific part of the error frame, when there is one.
    //

    if(ev5uncorr){
        ev5uncorr->IcPerrStat = LogoutFrame->IcPerrStat.all;
        ev5uncorr->DcPerrStat = LogoutFrame->DcPerrStat.all;
        ev5uncorr->ScStat = LogoutFrame->ScStat.all;
        ev5uncorr->ScAddr = LogoutFrame->ScAddr.all;
        ev5uncorr->EiStat = LogoutFrame->EiStat.all;
        ev5uncorr->BcTagAddr = LogoutFrame->BcTagAddr.all;
        ev5uncorr->EiAddr = LogoutFrame->EiAddr.all;
        ev5uncorr->FillSyn = LogoutFrame->FillSyn.all;
        ev5uncorr->BcConfig = LogoutFrame->BcConfig.all;
        ev5uncorr->BcControl = LogoutFrame->BcControl.all;
    }

    //
    // SjBfix. The external parity error checking is disabled due to a
    //         bug in the Rattler chipset on Gamma, which causes parity
    //         errors on machine checks due to reads to PCI config space
    //         (fixed in pass 2). The UncEccErr/EiParErr test is therefore
    //         left commented out of the condition below.
    //

    if ( icPerrStat->Dpe == 1 || icPerrStat->Tpe == 1 ||
         icPerrStat->Tmr == 1 || dcPerrStat->Lock == 1 ||
         scStat->ScTperr == 1 || scStat->ScDperr == 1 ||
         eiStat->BcTperr == 1 || eiStat->BcTcperr == 1 ||
//       eiStat->UncEccErr == 1 || eiStat->EiParErr == 1 ||
         eiStat->SeoHrdErr == 1 || scStat->ScScndErr == 1 ){

        //
        // A serious, uncorrectable error has occurred; under no
        // circumstances can it be simply dismissed.
        //

        goto FatalError;
    }

    //
    // It is possible that the system has experienced a hard error and
    // that nonetheless the error is recoverable. This is a system-specific
    // decision - allow it to be handled as such.
    //
    // The flag is set before the call so that, if the platform code does
    // not handle the error, the fatal path below knows the platform code
    // was given the chance to begin the error output.
    //

    UnhandledPlatformError = TRUE;

    if( HalpPlatformMachineCheck(
            ExceptionRecord,
            ExceptionFrame,
            TrapFrame ) == TRUE ){

        //
        // The system-specific code has handled the error. Dismiss
        // the error and continue execution.
        //

        HalpUpdateMces( TRUE, TRUE );

        return TRUE;
    }

    //
    // The system has experienced a fatal error that cannot be corrected.
    // Print any possible relevant information and crash the system.
    //
    // N.B. - In the future some of these fatal errors could potentially
    //        be recovered. Example: a user process gets a fatal error on
    //        one of its pages - we kill the user process, mark the page
    //        as bad and continue executing.
    //

FatalError:

    //
    // uncorrerr is only non-NULL when PUncorrectableError was non-NULL
    // above, so the error string can only be recorded when an error
    // frame exists. Without this check the fatal path would dereference
    // a NULL pointer while reporting the machine check.
    //

    if( uncorrerr ){
        uncorrerr->Flags.ErrorStringValid = 1;
        sprintf(uncorrerr->ErrorString,"Uncorrectable Error From "
                "Processor Detected");
    }

    //
    // Begin the error output if this is a processor error. If this is
    // an unhandled platform error then that code is responsible for
    // beginning the error output.
    //

    if( UnhandledPlatformError == FALSE ){

        //
        // Acquire ownership of the display. This is done here in case we
        // take a machine check before the display has been taken away from
        // the HAL. When the HAL begins displaying strings after it has lost
        // the display ownership then the HAL will be careful not to scroll
        // information off of the screen.
        //

        HalAcquireDisplayOwnership(NULL);

        //
        // Display the dreaded banner.
        //

        HalDisplayString( "\nFatal system hardware error.\n\n" );
    }

    //
    // Display the EV5 logout frame.
    //

    HalpDisplayLogout21164( LogoutFrame );

    //
    // Bugcheck to dump the rest of the machine state, this will help
    // if the machine check is software-related.
    //

    KeBugCheckEx( DATA_BUS_ERROR,
                  (ULONG)MachineCheckStatus->Correctable,
                  (ULONG)MachineCheckStatus->Retryable,
                  0,
                  (ULONG)PUncorrectableError );
}
//
// Size in bytes of the scratch buffer used to format each display line.
//
#define MAX_ERROR_STRING 100

VOID
HalpDisplayLogout21164 (
    IN PLOGOUT_FRAME_21164 LogoutFrame
    )
/*++

Routine Description:

    This function displays the logout frame for a 21164: first a raw
    dump of the logged registers, then a short interpretation of the
    error bits that are set.

Arguments:

    LogoutFrame - Supplies a pointer to the logout frame generated
        by the 21164.

Return Value:

    None.

--*/
{
    UCHAR OutBuffer[ MAX_ERROR_STRING ];

    //
    // Raw register dump. Each line is formatted into OutBuffer and then
    // handed to HalDisplayString.
    //

    sprintf( OutBuffer, "ICSR : %016Lx ICPERR_STAT : %016Lx\n",
             LogoutFrame->Icsr.all, LogoutFrame->IcPerrStat.all );
    HalDisplayString( OutBuffer );

    sprintf( OutBuffer, "MM_STAT : %016Lx DC_PERR_STAT : %016Lx\n",
             LogoutFrame->MmStat.all,
             LogoutFrame->DcPerrStat.all );
    HalDisplayString( OutBuffer );

    sprintf( OutBuffer, "PS : %016Lx VA : %016Lx VA_FORM : %016Lx\n",
             LogoutFrame->Ps,
             LogoutFrame->Va,
             LogoutFrame->VaForm );
    HalDisplayString( OutBuffer );

    sprintf( OutBuffer, "ISR : %016Lx IPL : %016Lx INTID : %016Lx\n",
             LogoutFrame->Isr.all,
             LogoutFrame->Ipl,
             LogoutFrame->IntId );
    HalDisplayString( OutBuffer );

    sprintf( OutBuffer, "SC_STAT : %016Lx SC_CTL : %016Lx SC_ADDR : %016Lx\n",
             LogoutFrame->ScStat.all,
             LogoutFrame->ScCtl.all,
             LogoutFrame->ScAddr.all );
    HalDisplayString( OutBuffer );

    sprintf( OutBuffer, "EI_STAT : %016Lx EI_ADDR : %016Lx\n",
             LogoutFrame->EiStat.all, LogoutFrame->EiAddr.all );
    HalDisplayString( OutBuffer );

    sprintf( OutBuffer, "BC_TAG_ADDR : %016Lx FILL_SYN : %016Lx\n",
             LogoutFrame->BcTagAddr.all, LogoutFrame->FillSyn.all );
    HalDisplayString( OutBuffer );

    sprintf( OutBuffer, "BC_CONTROL : %016Lx BC_CONFIG : %016Lx\n",
             LogoutFrame->BcControl.all, LogoutFrame->BcConfig.all );
    HalDisplayString( OutBuffer );

    sprintf( OutBuffer, "EXC_ADDR : %016Lx PAL_BASE : %016Lx\n",
             LogoutFrame->ExcAddr, LogoutFrame->PalBase );
    HalDisplayString( OutBuffer );

    //
    // Print out interpretation of the error.
    //

    HalDisplayString( "\n" );

    //
    // Check for cache parity errors. Only the first matching case in
    // this chain is reported.
    //

    if ( LogoutFrame->IcPerrStat.Dpe == 1 ||
         LogoutFrame->IcPerrStat.Tpe == 1 ){

        //
        // Note: The excAddr may contain the address of the instruction
        //       that caused the parity error but it is not guaranteed.
        //
        // ExcAddr is formatted with the same %016Lx conversion used for
        // it in the register dump above, so the specifier matches the
        // argument and the full address is shown.
        //

        sprintf( OutBuffer, "Icache %s parity error, Addr: %016Lx\n",
                 LogoutFrame->IcPerrStat.Dpe ? "Data" : "Tag",
                 LogoutFrame->ExcAddr );
        HalDisplayString( OutBuffer );

    } else if ( LogoutFrame->DcPerrStat.Lock == 1 ){

        //
        // Va likewise uses the %016Lx conversion it is dumped with above.
        //

        sprintf( OutBuffer, "Dcache %s parity error, Addr: %016Lx\n",
                 (LogoutFrame->DcPerrStat.Dp0 || LogoutFrame->DcPerrStat.Dp1) ?
                     "Data" : "Tag",
                 LogoutFrame->Va );
        HalDisplayString( OutBuffer );

    } else if ( LogoutFrame->ScStat.ScTperr != 0 ) {

        sprintf( OutBuffer,
                 "Scache Tag parity error, Addr: %x Tag: %x Cmd: %x\n",
                 LogoutFrame->ScAddr.ScAddr,
                 LogoutFrame->ScStat.ScTperr,
                 LogoutFrame->ScStat.CboxCmd);
        HalDisplayString( OutBuffer );

    } else if ( LogoutFrame->ScStat.ScDperr != 0 ) {

        sprintf( OutBuffer,
                 "Scache Data parity error, Addr: %x Tag: %x Cmd: %x\n",
                 LogoutFrame->ScAddr.ScAddr,
                 LogoutFrame->ScStat.ScDperr,
                 LogoutFrame->ScStat.CboxCmd);
        HalDisplayString( OutBuffer );

    } else if ( LogoutFrame->EiStat.BcTperr == 1 ||
                LogoutFrame->EiStat.BcTcperr == 1 ){

        sprintf( OutBuffer,
                 "Bcache Tag Parity error, Addr: %x Tag: %x\n",
                 LogoutFrame->EiAddr.EiAddr,
                 LogoutFrame->BcTagAddr.Tag1);
        HalDisplayString( OutBuffer );
    }

    //
    // Check for timeout reset error. The message has no arguments, so
    // it is displayed directly.
    //

    if ( LogoutFrame->IcPerrStat.Tmr == 1 ){
        HalDisplayString( "Timeout Reset Error\n" );
    }

    //
    // Check for fill ECC errors.
    //

    if( LogoutFrame->EiStat.UncEccErr == 1 ){

        sprintf( OutBuffer, "Uncorrectable ECC error: %s\n",
                 LogoutFrame->EiStat.FilIrd ? "Icache Fill" : "Dcache Fill" );
        HalDisplayString( OutBuffer );

        //
        // NOTE(review): EiAddr.EiAddr is printed with %16Lx here but with
        // %x in the Bcache and address/command messages. The bit-field's
        // declared type is not visible in this file; confirm which
        // conversion is correct and make the three uses agree.
        //

        sprintf( OutBuffer,
                 "PA: %16Lx Longword0: %x Longword1: %x\n",
                 LogoutFrame->EiAddr.EiAddr,
                 LogoutFrame->FillSyn.Lo,
                 LogoutFrame->FillSyn.Hi );
        HalDisplayString( OutBuffer );
    }

    //
    // Check for address/command parity error
    //

    if( LogoutFrame->EiStat.EiParErr == 1 ){
        sprintf( OutBuffer, "Address/Command parity error, Addr=%x\n",
                 LogoutFrame->EiAddr.EiAddr );
        HalDisplayString( OutBuffer );
    }

    //
    // Check for multiple hard errors.
    //

    if ( LogoutFrame->ScStat.ScScndErr == 1 ){
        HalDisplayString( "Multiple Scache parity errors detected.\n" );
    }

    if( LogoutFrame->EiStat.SeoHrdErr == 1 ){
        HalDisplayString( "Multiple external/tag errors detected.\n" );
    }

    return;
}
BOOLEAN
Halp21164CorrectedErrorInterrupt (
    VOID
    )
/*++

Routine Description:

    Interrupt handler for the 21164 processor corrected error interrupt.
    The event is counted and then acknowledged in the MCES register.

Arguments:

    None.

Return Value:

    TRUE, always.

--*/
{
    //
    // Count the event. For now every corrected error is simply assumed
    // to be a correctable fill ECC error.
    //

    CorrectableErrors++;

    //
    // Periodic debug message; currently compiled out.
    //

#if 0 //jnfix
#if (DBG) || (HALDBG)
    if( (CorrectableErrors % 32) == 0 ){
        DbgPrint( "Correctable errors = %d\n", CorrectableErrors );
    }
#endif //DBG || HALDBG
#endif //0 jnfix

    //
    // Acknowledge the interrupt: request that the correctable error
    // indicators in MCES be cleared, but not the machine check one.
    //

    HalpUpdateMces( FALSE, TRUE );

    return TRUE;
}