536 lines
20 KiB
C++
536 lines
20 KiB
C++
|
/*---------------------------------------------------------------------------
|
||
|
File: MonitorRunning.cpp
|
||
|
|
||
|
Comments: This is the entry point for a thread which will periodically try to connect
|
||
|
to the agents that the monitor thinks are running, to see if they are really still running.
|
||
|
|
||
|
This will keep the monitor from getting into a state where it thinks agents
|
||
|
are still running, when they are not.
|
||
|
|
||
|
(c) Copyright 1999, Mission Critical Software, Inc., All Rights Reserved
|
||
|
Proprietary and confidential to Mission Critical Software, Inc.
|
||
|
|
||
|
REVISION LOG ENTRY
|
||
|
Revision By: Christy Boles
|
||
|
|
||
|
---------------------------------------------------------------------------
|
||
|
*/
|
||
|
#include "stdafx.h"
|
||
|
#include "DetDlg.h"
|
||
|
|
||
|
#include "Common.hpp"
|
||
|
#include "AgRpcUtl.h"
|
||
|
#include "Monitor.h"
|
||
|
#include "ServList.hpp"
|
||
|
|
||
|
#include "ResStr.h"
|
||
|
|
||
|
//#include "..\AgtSvc\AgSvc.h"
|
||
|
#include "AgSvc.h"
|
||
|
|
||
|
/*#import "\bin\McsEADCTAgent.tlb" no_namespace , named_guids
|
||
|
//#import "\bin\McsVarSetMin.tlb" no_namespace */
|
||
|
|
||
|
//#import "Engine.tlb" no_namespace , named_guids //already #imported via DetDlg.h
|
||
|
#import "VarSet.tlb" no_namespace rename("property", "aproperty")
|
||
|
|
||
|
|
||
|
//----------------------------------------------------------------------------
// Function:   TryConnectAgent
//
// Synopsis:   Creates an RPC binding to the server named by the node, asks
//             for the agent's interface pointer and then queries the status
//             of the node's job.  Depending on bSignalToShutdown the function
//             afterwards either signals the agent that it is OK to shut down
//             or records the outcome of the query on the node and asks the
//             list window to refresh the node's entry.
//
// Arguments:
//
//   node               the server node whose agent is contacted; its server
//                      name and job ID are read, and (when not signalling)
//                      its message text, status and time stamp are written
//
//   bSignalToShutdown  TRUE:  call raw_SignalOKToShutDown on the agent if a
//                             connection was obtained, skip the message /
//                             status / time stamp update and return 0
//                      FALSE: update the node and send DCT_UPDATE_ENTRY to
//                             the list window
//
//   dwMilliSeconds     value handed to raw_SetAutoShutDown after a successful
//                      status query; the caller is expected to query the
//                      agent again within this time
//
// Returns:    0 when bSignalToShutdown is TRUE.  Otherwise the value returned
//             by EaxBindCreate, or, if the binding was created, the value
//             returned by DoRpcQuery (or the failing CoInitialize HRESULT).
//             A failing raw_QueryJobStatus or an exception during the query
//             is reported through the node's query-failed state only; it does
//             not change the return value.
//
// Modifies:   node (see above)
//----------------------------------------------------------------------------

DWORD
   TryConnectAgent(
      TServerNode          * node,
      BOOL                   bSignalToShutdown,   // indicates whether we want to signal the agent to shut down
      DWORD                  dwMilliSeconds       // indicates the auto shut down timeout
                                                  // we should query the agent again by this time
   )
{
   DWORD                     rc;
   HRESULT                   hr;
   HANDLE                    hBinding = NULL;
   WCHAR                   * sBinding = NULL;
   WCHAR                     server[MAX_PATH];
   IUnknown                * pUnk = NULL;
   IVarSetPtr                pVarSet;
   IDCTAgentPtr              pAgent;
   _bstr_t                   jobID;
   BOOL                      bQueryFailed = TRUE;
   BOOL                      bFinished = FALSE;
   CString                   status;
   BOOL                      bCoInitialized = FALSE;

   // build the UNC form of the server name: "\\" + server
   // NOTE(review): UStrCpy is called without a length here; this assumes the
   // server name always fits into MAX_PATH - 2 characters -- confirm.
   server[0] = L'\\';
   server[1] = L'\\';
   UStrCpy(server+2,node->GetServer());

   rc = EaxBindCreate(server,&hBinding,&sBinding,TRUE);
   if ( ! rc )
   {
      hr = CoInitialize(NULL);
      if ( SUCCEEDED(hr) )
      {
         // every successful CoInitialize must be paired with CoUninitialize
         bCoInitialized = TRUE;
         rc = DoRpcQuery(hBinding,&pUnk);
      }
      else
      {
         rc = hr;
      }

      if ( ! rc && pUnk )
      {
         try
         {
            // we got an interface pointer to the agent: try to query it
            // (the smart pointer takes its own reference, so ours is released)
            pAgent = pUnk;
            pUnk->Release();
            pUnk = NULL;
            jobID = node->GetJobID();

            hr = pAgent->raw_QueryJobStatus(jobID,&pUnk);
            if ( SUCCEEDED(hr) )
            {
               // set the auto shut down for the agent so that in case we
               // lose the connection to it, it will shut down automatically
               // usually, we should call this function again by that time
               pAgent->raw_SetAutoShutDown(dwMilliSeconds);
               bQueryFailed = FALSE;
               pVarSet = pUnk;
               if ( pUnk )
               {
                  pUnk->Release();
                  pUnk = NULL;
               }

               // if no varset came back, the call below throws and the
               // query is treated as failed by the handler
               _bstr_t text = pVarSet->get(GET_BSTR(DCTVS_JobStatus));

               if ( !UStrICmp(text,GET_STRING(IDS_DCT_Status_Completed)) )
               {
                  bFinished = TRUE;
               }
               else if ( !UStrICmp(text,GET_STRING(IDS_DCT_Status_Completed_With_Errors)) )
               {
                  node->SetSeverity(2);
                  bFinished = TRUE;
               }
            }
         }
         catch ( ... )
         {
            // the DCOM connection didn't work
            // This means we can't tell whether the agent is running or not
            bQueryFailed = TRUE;

            // if the exception was raised while we still held a raw
            // reference, give it back so it is not leaked
            if ( pUnk )
            {
               pUnk->Release();
               pUnk = NULL;
            }
         }
      }
      else
      {
         if ( rc == E_NOTIMPL )
         {
            status.LoadString(IDS_CantMonitorOnNt351);
         }
         else
         {
            status.LoadString(IDS_CannotConnectToAgent);
         }
         bQueryFailed = TRUE;
      }
      EaxBindDestroy(&hBinding,&sBinding);
   }

   // if trying to signal the agent to shut down, we will do our best
   if ( bSignalToShutdown )
   {
      if ( pAgent )
         pAgent->raw_SignalOKToShutDown();
      rc = 0;
   }
   else
   {
      node->SetMessageText(status.GetBuffer(0));
      if ( bFinished )
      {
         node->SetFinished();
      }
      else if ( bQueryFailed )
      {
         node->SetQueryFailed(TRUE);
      }

      // update the server entry in the list window
      HWND                   listWnd;
      WCHAR                  sTime[32];
      gData.GetListWindow(&listWnd);
      node->SetTimeStamp(gTTime.FormatIsoLcl( gTTime.Now( NULL ), sTime ));
      SendMessage(listWnd,DCT_UPDATE_ENTRY,NULL,(LPARAM)node);
   }

   // The smart pointers are locals, so their destructors would only run
   // after CoUninitialize below.  Release the interfaces now, while COM is
   // still initialized on this thread.  Assigning NULL is used on purpose:
   // it is safe on an empty smart pointer.
   pVarSet = NULL;
   pAgent = NULL;

   if ( bCoInitialized )
      CoUninitialize();

   return rc;
}
|
||
|
|
||
|
// Alias for a pointer to a TServerNode.
typedef TServerNode * PSERVERNODE;
|
||
|
|
||
|
|
||
|
//----------------------------------------------------------------------------
|
||
|
// Function: IsFileReady
|
||
|
//
|
||
|
// Synopsis: This function checks if a file exists and no other
|
||
|
// process is trying to write to it
|
||
|
//
|
||
|
// Arguments:
|
||
|
//
|
||
|
// filename the name of file to be checked
|
||
|
//
|
||
|
// Returns: returns TRUE if the file is ready; otherwise, returns FALSE
|
||
|
//
|
||
|
// Modifies:
|
||
|
//----------------------------------------------------------------------------
|
||
|
|
||
|
BOOL IsFileReady(WCHAR* filename)
|
||
|
{
|
||
|
if (filename == NULL)
|
||
|
return FALSE;
|
||
|
|
||
|
HANDLE hResult = CreateFile((WCHAR*)filename,
|
||
|
GENERIC_READ,
|
||
|
FILE_SHARE_READ,
|
||
|
NULL,
|
||
|
OPEN_EXISTING,
|
||
|
FILE_ATTRIBUTE_NORMAL,
|
||
|
NULL);
|
||
|
|
||
|
if (hResult != INVALID_HANDLE_VALUE)
|
||
|
{
|
||
|
CloseHandle(hResult);
|
||
|
return TRUE;
|
||
|
}
|
||
|
else
|
||
|
return FALSE;
|
||
|
|
||
|
}
|
||
|
|
||
|
//----------------------------------------------------------------------------
|
||
|
// Function: MonitorRunningAgent
|
||
|
//
|
||
|
// Synopsis: This thread entry function is responsible for monitoring the agent represented
|
||
|
// by arg (will be casted into a TServerNode pointer).
|
||
|
// A brief monitoring logic is as follows:
|
||
|
// a. We set up a FindFirstChangeNotification (last write) to look for results
|
||
|
// on the remote machine
|
||
|
// b. Start the agent query interval to 1 minute.
|
||
|
// c. Use CreateFile to test whether results are present (using FILE_SHARE_READ to make
|
||
|
// sure the writing is done)
|
||
|
// This also makes sure we don't lose any last write before the notification is set up
|
||
|
// d. If result present, wait on notification for 1 minute (as we don't fully trust notification)
|
||
|
// If result not present, query agent to see if it is finished
|
||
|
// if finised, go to g
|
||
|
// if not finished, wait on notification for 1 minute
|
||
|
// e. If timeout:
|
||
|
// if query interval has been reached, query agent (in case results cannot be written)
|
||
|
// if finished, go to g
|
||
|
// if alive, double query interval (maxes out at 20 min), go to c
|
||
|
// if notification, go to c.
|
||
|
// g. pull result
|
||
|
//
|
||
|
// Arguments:
|
||
|
//
|
||
|
// arg this is the argument for thread entry point function; will be casted into
|
||
|
// a TServerNode pointer
|
||
|
//
|
||
|
// Returns: always return 0 as the status will be reflected in pNode
|
||
|
//
|
||
|
// Modifies:
|
||
|
//
|
||
|
//----------------------------------------------------------------------------
|
||
|
|
||
|
DWORD __stdcall
|
||
|
MonitorRunningAgent(void * arg)
|
||
|
{
|
||
|
DWORD rc = 0;
|
||
|
BOOL bDone = FALSE;
|
||
|
TServerNode* pNode = (TServerNode*) arg;
|
||
|
|
||
|
const DWORD dwMaxTimeout = 1200000; // 20 minutes
|
||
|
const DWORD dwConversionFactor = 10000; // 1 millisecond / 100 nanoseconds
|
||
|
const DWORD dwNotificationTimeout = 60000; // 1 minute
|
||
|
const DWORD dwRetryTimeout = 60000; // 1 minute
|
||
|
DWORD dwAgentQueryTimeout = 60000; // 1 minute
|
||
|
ULARGE_INTEGER uliAgentQueryTimeout;
|
||
|
uliAgentQueryTimeout.QuadPart = (ULONGLONG) dwAgentQueryTimeout * dwConversionFactor;
|
||
|
|
||
|
// sanity check, we should not pass in NULL in the first place
|
||
|
_ASSERT(pNode != NULL);
|
||
|
if (pNode == NULL)
|
||
|
return 0;
|
||
|
|
||
|
BOOL bAccntRefExpected = pNode->IsAccountReferenceResultExpected();
|
||
|
BOOL bJoinDomainWithRename = pNode->IsJoinDomainWithRename();
|
||
|
HANDLE hFindChange = INVALID_HANDLE_VALUE;
|
||
|
ULARGE_INTEGER uliPreviousTime;
|
||
|
ULARGE_INTEGER uliCurrentTime;
|
||
|
_bstr_t remoteResultPath, jobFilename;
|
||
|
_bstr_t remoteResultFilename, resultFilename;
|
||
|
_bstr_t remoteSecrefsFilename, secrefsFilename;
|
||
|
_bstr_t statusFilename;
|
||
|
WCHAR resultPath[MAX_PATH];
|
||
|
gData.GetResultDir(resultPath);
|
||
|
|
||
|
// the following variables are for retry logic in case that agent query fails
|
||
|
// for "Join Domain with Rename" case, we use 5 retries to make sure joining domain could
|
||
|
// finish (usually, it takes under one minute but depending on the network condition and
|
||
|
// CPU usage of computers involved, it could take longer than one minute). Allowing five
|
||
|
// retries should cover it pretty well
|
||
|
// for other purpose, we use 2 retries.
|
||
|
const DWORD dwMaxNumOfQueryRetries = (bJoinDomainWithRename) ? 5 : 2; // maximum number of retries
|
||
|
DWORD dwNumOfQueryRetries = 0; // number of retries so far
|
||
|
|
||
|
BOOL bResultReady = FALSE; // indicates whether the file is ready on the remote machine
|
||
|
|
||
|
try
|
||
|
{
|
||
|
// prepare the remote and local result file names (both .result and .secrefs files)
|
||
|
remoteResultPath = pNode->GetRemoteResultPath();
|
||
|
jobFilename = pNode->GetJobFile();
|
||
|
remoteResultFilename = remoteResultPath + jobFilename + L".result";
|
||
|
resultFilename = _bstr_t(resultPath) + jobFilename + L".result";
|
||
|
if (bAccntRefExpected)
|
||
|
{
|
||
|
remoteSecrefsFilename = remoteResultPath + jobFilename + L".secrefs";
|
||
|
secrefsFilename = _bstr_t(resultPath) + jobFilename + L".secrefs";
|
||
|
}
|
||
|
|
||
|
if (bJoinDomainWithRename)
|
||
|
statusFilename = remoteResultPath + pNode->GetJobID();
|
||
|
|
||
|
HANDLE hResult; // file handle to result file
|
||
|
|
||
|
// start monitoring
|
||
|
// the following are the ways to get out of the while loop
|
||
|
// a. results have shown up in the remote directory and either
|
||
|
// the agent has finished or we cannot query it
|
||
|
// b. results have not shown up and either we cannot query the agent
|
||
|
// after certain number of retries (dwMaxNumOfQueryRetries)
|
||
|
// or the agent has completed
|
||
|
GetSystemTimeAsFileTime((FILETIME*)&uliPreviousTime); // we need to get a starting time for the timeout
|
||
|
do
|
||
|
{
|
||
|
// listen to the central control as well: if we're signaled to be done, let's do so
|
||
|
gData.GetDone(&bDone);
|
||
|
if (bDone)
|
||
|
break;
|
||
|
|
||
|
// if someone else (detail dialog) has detected the status of the agent, we don't need to keep monitoring
|
||
|
if (!pNode->IsRunning())
|
||
|
{
|
||
|
// check whether we have results back
|
||
|
if (IsFileReady(remoteResultFilename)
|
||
|
&& (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
|
||
|
bResultReady = TRUE;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// if the notification has not been set up, we should try to set up
|
||
|
if (hFindChange == INVALID_HANDLE_VALUE)
|
||
|
{
|
||
|
hFindChange = FindFirstChangeNotification(remoteResultPath, FALSE, FILE_NOTIFY_CHANGE_LAST_WRITE);
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// let's check result files if we have not gotten results yet
|
||
|
//
|
||
|
if (bResultReady == FALSE)
|
||
|
{
|
||
|
// check whether the .result and .secrefs files are ready
|
||
|
if (IsFileReady(remoteResultFilename)
|
||
|
&& (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
|
||
|
bResultReady = TRUE;
|
||
|
}
|
||
|
|
||
|
// now query the agent status
|
||
|
if (bResultReady)
|
||
|
{
|
||
|
rc = TryConnectAgent(pNode, FALSE, dwAgentQueryTimeout + dwNotificationTimeout);
|
||
|
if (!pNode->IsRunning() || pNode->QueryFailed())
|
||
|
{
|
||
|
// if something is wrong or the agent is not running anymore
|
||
|
// let's get out of the loop
|
||
|
break;
|
||
|
}
|
||
|
dwNumOfQueryRetries = 0; // reset the number of retries so far to zero
|
||
|
}
|
||
|
else if (bJoinDomainWithRename)
|
||
|
{
|
||
|
// if it is the "join domain with rename" case, we want to take a look
|
||
|
// at status file as well
|
||
|
if (IsFileReady(statusFilename))
|
||
|
{
|
||
|
pNode->QueryStatusFromFile(statusFilename);
|
||
|
// just in case, we check result files again
|
||
|
if (IsFileReady(remoteResultFilename)
|
||
|
&& (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
|
||
|
bResultReady = TRUE;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// figure out the elapsed time to see whether you should query the agent
|
||
|
GetSystemTimeAsFileTime((FILETIME*)&uliCurrentTime);
|
||
|
BOOL bNeedToQueryAgent = FALSE;
|
||
|
// if somehow the time has been set back significantly or
|
||
|
// the timeout period has elapsed
|
||
|
// we should query the agent
|
||
|
// note: in the retry case, we use dwRetryTimeout instead of uliAgentQueryTimeout
|
||
|
// since if we do not want to wait too long before a retry
|
||
|
if (uliCurrentTime.QuadPart <= uliPreviousTime.QuadPart
|
||
|
|| (dwNumOfQueryRetries > 0
|
||
|
&& uliPreviousTime.QuadPart + dwRetryTimeout <= uliCurrentTime.QuadPart)
|
||
|
|| uliPreviousTime.QuadPart + uliAgentQueryTimeout.QuadPart <= uliCurrentTime.QuadPart)
|
||
|
{
|
||
|
bNeedToQueryAgent = TRUE;
|
||
|
}
|
||
|
|
||
|
if (bNeedToQueryAgent)
|
||
|
{
|
||
|
// reset the timeout for querying agent
|
||
|
|
||
|
// if not in the retry case, we double the timeout
|
||
|
// otherwise, we use the same timeout value
|
||
|
if (dwNumOfQueryRetries == 0)
|
||
|
{
|
||
|
dwAgentQueryTimeout += dwAgentQueryTimeout;
|
||
|
// if it hits the maximum timeout, it is set to the maximum value
|
||
|
if (dwAgentQueryTimeout > dwMaxTimeout)
|
||
|
dwAgentQueryTimeout = dwMaxTimeout;
|
||
|
uliAgentQueryTimeout.QuadPart = (ULONGLONG) dwAgentQueryTimeout * dwConversionFactor;
|
||
|
}
|
||
|
uliPreviousTime = uliCurrentTime;
|
||
|
|
||
|
rc = TryConnectAgent(pNode, FALSE, dwAgentQueryTimeout + dwNotificationTimeout);
|
||
|
|
||
|
// if it is the "join domain with rename" case and we are getting ERROR_ACCESS_DENIED
|
||
|
// or RPC_S_SERVER_UNAVAILABLE, we should check the status file
|
||
|
if (bJoinDomainWithRename
|
||
|
&& (rc == ERROR_ACCESS_DENIED || rc == RPC_S_SERVER_UNAVAILABLE))
|
||
|
{
|
||
|
pNode->QueryStatusFromFile(statusFilename);
|
||
|
}
|
||
|
|
||
|
if (pNode->QueryFailed())
|
||
|
{
|
||
|
if (dwNumOfQueryRetries < dwMaxNumOfQueryRetries)
|
||
|
{
|
||
|
// in retry mode, we need to use the original timeout value
|
||
|
dwNumOfQueryRetries++;
|
||
|
pNode->SetQueryFailed(FALSE);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// we have retried enough times, let's break out of the loop
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
else if (!pNode->IsRunning())
|
||
|
{
|
||
|
// if something is wrong or the agent is not running anymore
|
||
|
// let's get out of the loop
|
||
|
// but first check the result files again if they are not ready yet
|
||
|
if (!bResultReady && IsFileReady(remoteResultFilename)
|
||
|
&& (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
|
||
|
bResultReady = TRUE;
|
||
|
break;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// reset the number of query of retries to zero
|
||
|
dwNumOfQueryRetries = 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// wait for the notification or sleep for one minute
|
||
|
// this is to make agent monitoring thread as robust as possible
|
||
|
if (hFindChange != INVALID_HANDLE_VALUE)
|
||
|
{
|
||
|
// if the notification is set up, let's wait on it
|
||
|
WaitForSingleObject(hFindChange, dwNotificationTimeout);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// if the notification is not set up, let's sleep for one minute
|
||
|
Sleep(dwNotificationTimeout);
|
||
|
}
|
||
|
|
||
|
// find the next notification
|
||
|
if (hFindChange != INVALID_HANDLE_VALUE)
|
||
|
{
|
||
|
// this part is to make sure the code is robust
|
||
|
if (!FindNextChangeNotification(hFindChange))
|
||
|
{
|
||
|
FindCloseChangeNotification(hFindChange);
|
||
|
hFindChange = INVALID_HANDLE_VALUE;
|
||
|
}
|
||
|
}
|
||
|
} while (!bDone);
|
||
|
|
||
|
//
|
||
|
// pull the result
|
||
|
//
|
||
|
pNode->SetHasResult(FALSE);
|
||
|
|
||
|
if (bResultReady)
|
||
|
{
|
||
|
// make sure we copy all needed files over
|
||
|
if (CopyFile(remoteResultFilename,resultFilename,FALSE)
|
||
|
&& (!pNode->IsAccountReferenceResultExpected()
|
||
|
|| (pNode->IsAccountReferenceResultExpected()
|
||
|
&& CopyFile(remoteSecrefsFilename,secrefsFilename,FALSE))))
|
||
|
{
|
||
|
// mark that we have the result
|
||
|
pNode->SetHasResult(TRUE);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// we should always mark that we have tried to pull the result
|
||
|
// we do this after we tried to pull results so that the result monitoring thread
|
||
|
// can handle it correctly
|
||
|
pNode->SetResultPullingTried(TRUE);
|
||
|
|
||
|
// finally, we signal the agent to shut down
|
||
|
// however in the "join domain with rename" case, since we already lost contact
|
||
|
// with the agent, we should not attempt to call TryConnectAgent
|
||
|
if (!pNode->QueryFailed() && !bJoinDomainWithRename)
|
||
|
{
|
||
|
// tell the agent to shut down in 1 minute just in case
|
||
|
// note: by using TRUE here, the status will not be updated
|
||
|
TryConnectAgent(pNode, TRUE, 60000);
|
||
|
}
|
||
|
|
||
|
// if we cannot query the agent, we assume it has finished
|
||
|
if (pNode->QueryFailed())
|
||
|
{
|
||
|
if (bResultReady)
|
||
|
{
|
||
|
// if bResultReady is TRUE, we will clean the Agent_Status_QueryFailed bit
|
||
|
pNode->SetQueryFailed(FALSE);
|
||
|
}
|
||
|
pNode->SetFinished();
|
||
|
}
|
||
|
|
||
|
// one more update
|
||
|
HWND listWnd;
|
||
|
WCHAR sTime[32];
|
||
|
gData.GetListWindow(&listWnd);
|
||
|
pNode->SetTimeStamp(gTTime.FormatIsoLcl( gTTime.Now( NULL ), sTime ));
|
||
|
SendMessage(listWnd,DCT_UPDATE_ENTRY,NULL,(LPARAM)pNode);
|
||
|
}
|
||
|
catch (_com_error& e)
|
||
|
{
|
||
|
pNode->SetFailed();
|
||
|
pNode->SetOutOfResourceToMonitor(TRUE);
|
||
|
}
|
||
|
|
||
|
// clean up
|
||
|
if (hFindChange != INVALID_HANDLE_VALUE)
|
||
|
FindCloseChangeNotification(hFindChange);
|
||
|
|
||
|
pNode->SetDoneMonitoring(TRUE);
|
||
|
|
||
|
return 0;
|
||
|
}
|