Windows2003-3790/inetcore/urlmon/trans/datasnif.cxx
2020-09-30 16:53:55 +02:00

1365 lines
41 KiB
C++

//+---------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1992 - 1996.
//
// File: datasnif.cxx
//
// Contents: Stream Mime type checking (attempts to guess the MIME type
// of a buffer by simple pattern matching).
//
// Classes: CContentAnalyzer
//
// Functions: private:
// CContentAnalyzer::SampleData
// CContentAnalyzer::IsBMP
// CContentAnalyzer::GetDataFormat
// CContentAnalyzer::FormatAgreesWithData
// CContentAnalyzer::MatchDWordAtOffset
// CContentAnalyzer::FindAppFromExt
// CContentAnalyzer::CheckTextHeaders
// CContentAnalyzer::CheckBinaryHeaders
//
// public:
// CContentAnalyzer::FindMimeFromData
// ::FindMimeFromData
//
//
// History: 05-25-96 AdriaanC (Adriaan Canter) Created
// 07-16-96 AdriaanC (Adriaan Canter) Modified
// 08-06-96 AdriaanC (Adriaan Canter) Modified
// 08-14-96 AdriaanC (Adriaan Canter) Modified
//
//----------------------------------------------------------------------------
#include <trans.h>
#include "datasnif.hxx"
#include <shlwapip.h>
#ifdef UNIX
#include <mainwin.h>
#endif
PerfDbgTag(tagDataSniff, "Urlmon", "Log DataSniff", DEB_DATA);
// Max no. bytes to look at
#define SAMPLE_SIZE 256
// Registry Key for app/fileext associations
#define szApplicationRegistryKey "\\Shell\\Open\\Command"
#define szApplicationRegistryKey2 "\\Shell\\Connect To\\Command"
#define szMimeRegistryKey "MIME\\Database\\Content Type\\"
// Magic header words
#define AU_SUN_MAGIC 0x2e736e64
#define AU_SUN_INV_MAGIC 0x646e732e
#define AU_DEC_MAGIC 0x2e736400
#define AU_DEC_INV_MAGIC 0x0064732e
#define AIFF_MAGIC 0x464f524d
#define AIFF_INV_MAGIC 0x4d524f46
#define AIFF_MAGIC_MORE_1 'AIFF'
#define AIFF_MAGIC_MORE_2 'AIFC'
#define RIFF_MAGIC 0x52494646
#define AVI_MAGIC 0x41564920
#define WAV_MAGIC 0x57415645
#define JAVA_MAGIC 0xcafebabe
#define MPEG_MAGIC 0x000001b3
#define MPEG_MAGIC_2 0x000001ba
#define EMF_MAGIC_1 0x01000000
#define EMF_MAGIC_2 0x20454d46
#define WMF_MAGIC 0xd7cdc69a
#define JPEG_MAGIC_1 0xFF
#define JPEG_MAGIC_2 0xD8
// Magic header text
CHAR vszRichTextMagic[] = "{\\rtf";
CHAR vszPostscriptMagic[] = "%!";
CHAR vszBinHexMagic[] = "onverted with BinHex";
CHAR vszBase64Magic[] = "begin";
CHAR vszGif87Magic[] = "GIF87";
CHAR vszGif89Magic[] = "GIF89";
CHAR vszTiffMagic[] = "MM";
CHAR vszBmpMagic[] = "BM";
CHAR vszZipMagic[] = "PK";
CHAR vszExeMagic[] = "MZ";
CHAR vszPngMagic[] = "\211PNG\r\n\032\n";
CHAR vszCompressMagic[] = "\037\235";
CHAR vszGzipMagic[] = "\037\213";
CHAR vszXbmMagic1[] = "define";
CHAR vszXbmMagic2[] = "width";
CHAR vszXbmMagic3[] = "bits";
CHAR vszPdfMagic[] = "%PDF";
CHAR vszJGMagic[] = "JG";
CHAR vszMIDMagic[] = "MThd";
// null MIME type
WCHAR vwzNULL[] = L"(null)";
// 7 bit MIME Types
WCHAR vwzTextPlain[] = L"text/plain";
WCHAR vwzTextRichText[] = L"text/richtext";
WCHAR vwzImageXBitmap[] = L"image/x-xbitmap";
WCHAR vwzApplicationPostscript[] = L"application/postscript";
WCHAR vwzApplicationBase64[] = L"application/base64";
WCHAR vwzApplicationMacBinhex[] = L"application/macbinhex40";
WCHAR vwzApplicationPdf[] = L"application/pdf";
WCHAR vwzApplicationCDF[] = L"application/x-cdf";
WCHAR vwzApplicationNETCDF[] = L"application/x-netcdf";
WCHAR vwzmultipartmixedreplace[] = L"multipart/x-mixed-replace";
WCHAR vwzmultipartmixed[] = L"multipart/mixed";
WCHAR vwzTextScriptlet[] = L"text/scriptlet";
WCHAR vwzTextComponent[] = L"text/x-component";
WCHAR vwzTextXML[] = L"text/xml";
WCHAR vwzApplicationHTA[] = L"application/hta";
// 8 bit MIME types
WCHAR vwzAudioAiff[] = L"audio/x-aiff";
WCHAR vwzAudioBasic[] = L"audio/basic";
WCHAR vwzAudioWav[] = L"audio/wav";
WCHAR vwzAudioMID[] = L"audio/mid";
WCHAR vwzImageGif[] = L"image/gif";
WCHAR vwzImagePJpeg[] = L"image/pjpeg";
WCHAR vwzImageJpeg[] = L"image/jpeg";
WCHAR vwzImageTiff[] = L"image/tiff";
WCHAR vwzImagePng[] = L"image/x-png";
WCHAR vwzImageBmp[] = L"image/bmp";
WCHAR vwzImageJG[] = L"image/x-jg";
WCHAR vwzImageEmf[] = L"image/x-emf";
WCHAR vwzImageWmf[] = L"image/x-wmf";
WCHAR vwzVideoAvi[] = L"video/avi";
WCHAR vwzVideoMpeg[] = L"video/mpeg";
WCHAR vwzApplicationCompressed[] = L"application/x-compressed";
WCHAR vwzApplicationZipCompressed[] = L"application/x-zip-compressed";
WCHAR vwzApplicationGzipCompressed[] = L"application/x-gzip-compressed";
WCHAR vwzApplicationJava[] = L"application/java";
WCHAR vwzApplicationMSDownload[] = L"application/x-msdownload";
// 7 or 8 bit MIME types
WCHAR vwzTextHTML[] = L"text/html";
WCHAR vwzApplicationOctetStream[] = L"application/octet-stream";
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::SampleData
//
// Synopsis:
//
// Arguments: (void)
//
// Returns: (void)
//
// History: 5-25-96 AdriaanC (Adriaan Canter) Created
//
// Notes:
//
//----------------------------------------------------------------------------
void CContentAnalyzer::SampleData()
{
DEBUG_ENTER((DBG_TRANS,
None,
"CContentAnalyzer::SampleData",
"this=%#x",
this
));
BOOL fFoundFirstXBitMapTag = FALSE;
BOOL fFoundSecondXBitMapTag = FALSE;
BOOL fFoundAsciiChar = FALSE;
int nHTMLConfidence = 0;
unsigned char *p = (unsigned char*) _pBuf;
_cbNL = _cbCR = _cbFF = _cbText = _cbCtrl = _cbHigh = 0;
// Count incidence of character types.
for (int i = 0; i < _cbSample - 1; i++)
{
fFoundAsciiChar = FALSE;
if (*p == '\n') // new line
{
_cbNL++;
}
else if (*p == '\r') // carriage return
{
_cbCR++;
}
else if (*p == '\f') // form feed
{
_cbFF++;
}
else if (*p == '\t') // tab
{
_cbText++;
}
else if (*p < 32) // control character
{
_cbCtrl++;
}
else if (*p >= 32 && *p < 128) // regular text
{
_cbText++;
fFoundAsciiChar = TRUE;
}
else // extended text
{
_cbHigh++;
}
if (fFoundAsciiChar)
{
// check for html
if (*p == '<')
{
if (!StrCmpNIC((char*) p+1, "?XML", sizeof("?XML") - 1) &&
(
(*(p+5) == ':') ||
(*(p+5) == ' ') ||
(*(p+5) == '\t')) )
{
_fFoundXML = TRUE;
// don't break : for CDF
}
if (!StrCmpNIC((char*) p+1, "SCRIPTLET", sizeof("SCRIPTLET") - 1))
{
_fFoundTextScriptlet = TRUE;
break;
}
if (!StrCmpNIC((char*) p+1, "HTML", sizeof("HTML") - 1)
|| !StrCmpNIC((char*) p+1, "HEAD", sizeof("HEAD") - 1)
|| !StrCmpNIC((char*) p+1, "TITLE", sizeof("TITLE") - 1)
|| !StrCmpNIC((char*) p+1, "BODY", sizeof("BODY") - 1)
|| !StrCmpNIC((char*) p+1, "SCRIPT", sizeof("SCRIPT") - 1)
|| !StrCmpNIC((char*) p+1, "A HREF", sizeof("A HREF") - 1)
|| !StrCmpNIC((char*) p+1, "PRE", sizeof("PRE") - 1)
|| !StrCmpNIC((char*) p+1, "IMG", sizeof("IMG") - 1)
|| !StrCmpNIC((char*) p+1, "PLAINTEXT", sizeof("PLAINTEXT") - 1)
|| !StrCmpNIC((char*) p+1, "TABLE", sizeof("TABLE") - 1))
{
_fFoundHTML = TRUE;
break;
}
else if ( !StrCmpNIC((char*) p+1, "HR", sizeof("HR") - 1)
|| !StrCmpNIC((char*) p+1, "A", sizeof("A") - 1)
|| !StrCmpNIC((char*) p+1, "/A", sizeof("/A") - 1)
|| !StrCmpNIC((char*) p+1, "B", sizeof("B") - 1)
|| !StrCmpNIC((char*) p+1, "/B", sizeof("/B") - 1)
|| !StrCmpNIC((char*) p+1, "P", sizeof("P") - 1)
|| !StrCmpNIC((char*) p+1, "/P", sizeof("/P") - 1)
|| !StrCmpNIC((char*) p+1, "!--", sizeof("!--") - 1)
)
{
//
// In order for this branch to identify this is HTML
// We have to make sure:
// 1. some HTML control char exists
// 2. We've scanned the whole data block
// 3. 2/3 of the data should be text
//
nHTMLConfidence += 50;
if ( nHTMLConfidence >= 100
&& i == _cbSample - 1
&& _cbText >= ((_cbSample * 2) / 3)
)
{
_fFoundHTML = TRUE;
break;
}
}
if (!StrCmpNIC((char*) p+1, "CHANNEL", sizeof("CHANNEL") - 1))
{
_fFoundCDF = TRUE;
break;
}
}
else if (!StrCmpNIC((char*) p, "-->", sizeof("-->") - 1))
{
// comment begin
// I really want to make sure that most of the
// char are printable
// potential issue: International code page?
nHTMLConfidence += 50;
if ( (nHTMLConfidence >= 100)
&& (i == _cbSample - 1 )
&& (_cbText > (_cbSample * 2 /3) )
)
{
_fFoundHTML = TRUE;
break;
}
}
// check for xbitmap
else if (*p == '#')
{
if (!StrCmpNC((char*) p+1, vszXbmMagic1, sizeof(vszXbmMagic1) - 1))
fFoundFirstXBitMapTag = TRUE;
}
else if (*p == '_' && fFoundSecondXBitMapTag)
{
if (!StrCmpNC((char*) p+1, vszXbmMagic3, sizeof(vszXbmMagic3) - 1))
{
_fFoundXBitMap = TRUE;
break;
}
}
else if (*p == '_' && fFoundFirstXBitMapTag)
{
if (!StrCmpNC((char*) p+1, vszXbmMagic2, sizeof(vszXbmMagic2) - 1))
fFoundSecondXBitMapTag = TRUE;
}
// MacBinhex
else if (*p == 'c')
{
if (!StrCmpNC((char*) p+1, vszBinHexMagic, sizeof(vszBinHexMagic) - 1))
{
_fFoundMacBinhex = TRUE;
break;
}
}
}
p++;
}
DEBUG_LEAVE(0);
}
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::IsBMP
//
// Synopsis:
//
// Arguments: (void)
//
// Returns: BOOL
//
// History: 5-25-96 AdriaanC (Adriaan Canter) Created
//
// Notes:
//
//----------------------------------------------------------------------------
BOOL CContentAnalyzer::IsBMP()
{
DEBUG_ENTER((DBG_TRANS,
Bool,
"CContentAnalyzer::IsBMP",
"this=%#x",
this
));
BOOL bRetVal = TRUE;
BITMAPFILEHEADER UNALIGNED *pBMFileHdr;
if (_cbSample < 2)
{
bRetVal = FALSE;
}
// Check header
if (StrCmpNC(_pBuf, vszBmpMagic, sizeof(vszBmpMagic) - 1))
{
bRetVal = FALSE;
}
// Sample size needs to be big enough.
if (_cbSample < sizeof(BITMAPFILEHEADER))
{
bRetVal = FALSE;
}
pBMFileHdr = (BITMAPFILEHEADER*)(_pBuf);
#ifdef UNIX
/* Use 14 on Unix, because we want the size without the padding
* done on Unix. sizeof(BITMAPFILEHEADER) = 16 on Unix with padding
*/
#define UNIX_BITMAP_HEADER_SIZE 14
BITMAPFILEHEADER bmFileHeader;
if(MwReadBITMAPFILEHEADER((LPBYTE)_pBuf, UNIX_BITMAP_HEADER_SIZE, &bmFileHeader))
pBMFileHdr = &bmFileHeader;
#endif /* UNIX */
// The reserved fields must be set to 0
if (pBMFileHdr->bfReserved1!=0 || pBMFileHdr->bfReserved2!=0)
{
bRetVal = FALSE;
}
DEBUG_LEAVE(bRetVal);
return bRetVal;
}
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::GetDataFormat
//
// Synopsis:
//
// Arguments: (WCHAR* wzMimeType)
//
// Returns: BOOL dwDataFormat
//
// History: 7-21-96 AdriaanC (Adriaan Canter) Created
//
// Notes:
//
//----------------------------------------------------------------------------
DWORD CContentAnalyzer::GetDataFormat(LPCWSTR wzMimeType)
{
DEBUG_ENTER((DBG_TRANS,
Dword,
"CContentAnalyzer::GetDataFormat",
"this=%#x, %.80wq",
this, wzMimeType
));
CLIPFORMAT cfFormat;
DATAFORMAT dwDataFormat;
HRESULT hr;
if (!wzMimeType)
{
DEBUG_LEAVE(DATAFORMAT_AMBIGUOUS);
return DATAFORMAT_AMBIGUOUS;
}
if( !_wcsicmp(wzMimeType, vwzNULL) )
{
DEBUG_LEAVE(DATAFORMAT_AMBIGUOUS);
return DATAFORMAT_AMBIGUOUS;
}
hr = FindMediaTypeFormat(wzMimeType, &cfFormat, (DWORD *)&dwDataFormat);
if (hr == S_OK)
{
DEBUG_LEAVE(dwDataFormat);
return dwDataFormat;
}
else
{
DEBUG_LEAVE(DATAFORMAT_UNKNOWN);
return DATAFORMAT_UNKNOWN;
}
}
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::FormatAgreesWithData
//
// Synopsis:
//
// Arguments: (void)
//
// Returns: BOOL
//
// History: 8-14-96 AdriaanC (Adriaan Canter) Created
//
// Notes:
//
//----------------------------------------------------------------------------
BOOL CContentAnalyzer::FormatAgreesWithData(DWORD dwFormat)
{
DEBUG_ENTER((DBG_TRANS,
Bool,
"CContentAnalyzer::FormatAgreesWithData",
"this=%#x, %#x",
this, dwFormat
));
if (dwFormat == DATAFORMAT_TEXT && _fBinary == FALSE
|| dwFormat == DATAFORMAT_BINARY && _fBinary == TRUE
|| dwFormat == DATAFORMAT_TEXTORBINARY)
{
DEBUG_LEAVE(TRUE);
return TRUE;
}
DEBUG_LEAVE(FALSE);
return FALSE;
}
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::MatchDWordAtOffset
//
// Synopsis: Determines if a given magic word is found at
// the specified offset.
//
// Arguments: (DWORD magic, int offset)
//
// Returns: BOOL
//
// History: 5-25-96 AdriaanC (Adriaan Canter) Created
//
// Notes:
//
//----------------------------------------------------------------------------
BOOL CContentAnalyzer::MatchDWordAtOffset(DWORD magic, int offset)
{
DEBUG_ENTER((DBG_TRANS,
Bool,
"CContentAnalyzer::MatchDWordAtOffset",
"this=%#x, %#x, %d",
this, magic, offset
));
BOOL bRetVal = TRUE;
DWORD dwWord = 0;
unsigned char* p = (unsigned char*) _pBuf;
if (_cbSample < offset + (int) sizeof(DWORD))
{
DEBUG_LEAVE(FALSE);
return FALSE;
}
dwWord = (p[offset] << 24)
| (p[offset+1] << 16)
| (p[offset+2] << 8)
| p[offset+3];
if (magic != dwWord)
{
bRetVal = FALSE;
}
DEBUG_LEAVE(bRetVal);
return bRetVal;
}
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::FindAppFromExt
//
// Synopsis: Determines an associated application from
// a given file extension
//
// Arguments: (LPSTR pszExt, LPSTR pszCommand (command line))
//
// Returns: BOOL (Associated Application is found or not)
//
// History: 7-15-96 AdriaanC (Adriaan Canter) Created
//
// Notes:
//
//----------------------------------------------------------------------------
BOOL CContentAnalyzer::FindAppFromExt(LPSTR pszExt, LPSTR pszCommand, DWORD cbCommand)
{
DEBUG_ENTER((DBG_TRANS,
Bool,
"CContentAnalyzer::FindAppFromExt",
"this=%#x, %.80q, %.80q, %d",
this, pszExt, pszCommand, cbCommand
));
DWORD cbLen, dwType;
CHAR szRegPath[MAX_PATH];
BOOL fReturn = FALSE;
HKEY hMimeKey = NULL;
// BUGBUG - Is there a max registry path length?
cbLen = MAX_PATH;
// Should be a file extension
TransAssert((pszExt[0] == '.'));
// Open key on extension
if (RegOpenKeyEx(HKEY_CLASSES_ROOT, pszExt, 0,
KEY_QUERY_VALUE, &hMimeKey) == ERROR_SUCCESS)
{
// Find file type (txtfile, htmlfile, etc) .
// These currently utilize a null key.
if (RegQueryValueEx(hMimeKey, NULL, NULL, &dwType,
(LPBYTE)szRegPath, &cbLen) == ERROR_SUCCESS)
{
strncat(szRegPath, szApplicationRegistryKey, MAX_PATH - strlen(szRegPath) - 1);
HKEY hAppKey = NULL;
cbLen = cbCommand;
// szRegPath should now look similar to
// "txtfile\Shell\Open\Command". Open key on szRegPath
if (RegOpenKeyEx(HKEY_CLASSES_ROOT, szRegPath, 0,
KEY_QUERY_VALUE, &hAppKey) == ERROR_SUCCESS)
{
// Find the application command line - again, null key.
if (RegQueryValueEx(hMimeKey, NULL, NULL, &dwType,
(LPBYTE)pszCommand, &cbLen) == ERROR_SUCCESS)
{
// Success
fReturn = TRUE;
}
RegCloseKey(hAppKey);
}
else
{
// check "Shell\\Connect To\command" key - used by SmartTerm
// dynamic allocate szRegPath2 so that it won't take
// unnecessary stack space - after all, this is not a
// common case
CHAR* szRegPath2 = NULL;
HKEY hAppKey2 = NULL;
szRegPath2 = new CHAR[MAX_PATH];
if( szRegPath2 )
{
if (RegQueryValueEx(hMimeKey, NULL, NULL, &dwType,
(LPBYTE)szRegPath2, &cbLen) == ERROR_SUCCESS)
{
strncat(szRegPath2, szApplicationRegistryKey2,
MAX_PATH - strlen(szRegPath2) - 1);
}
else
{
// this should not happen at all
delete [] szRegPath2;
szRegPath2 = NULL;
}
}
if (szRegPath2 &&
RegOpenKeyEx(HKEY_CLASSES_ROOT, szRegPath2, 0, KEY_QUERY_VALUE, &hAppKey2) == ERROR_SUCCESS)
{
if (RegQueryValueEx(hMimeKey, NULL, NULL, &dwType,
(LPBYTE)pszCommand, &cbLen) == ERROR_SUCCESS)
{
// Success
fReturn = TRUE;
}
RegCloseKey(hAppKey2);
}
delete [] szRegPath2;
}
}
RegCloseKey(hMimeKey);
}
DEBUG_LEAVE(fReturn);
return fReturn;
}
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::CheckTextHeaders
//
// Synopsis:
//
//
// Arguments: void
//
// Returns: void
//
// History: 7-23-96 AdriaanC (Adriaan Canter) Created
//
// Notes:
//
//----------------------------------------------------------------------------
BOOL CContentAnalyzer::CheckTextHeaders()
{
DEBUG_ENTER((DBG_TRANS,
Bool,
"CContentAnalyzer::CheckTextHeaders",
"this=%#x",
this
));
BOOL bRet = TRUE;
// application/pdf (Acrobat)
if (!StrCmpNC(_pBuf, vszPdfMagic, sizeof(vszPdfMagic) - 1))
{
_wzMimeType = vwzApplicationPdf;
}
// application/Postscript
else if (!StrCmpNC(_pBuf, vszPostscriptMagic, sizeof(vszPostscriptMagic) - 1))
{
_wzMimeType = vwzApplicationPostscript;
}
// text/richtext
else if (!StrCmpNC(_pBuf, vszRichTextMagic, sizeof(vszRichTextMagic) - 1))
{
_wzMimeType = vwzTextRichText;
}
// application/base64
else if (!StrCmpNC(_pBuf, vszBase64Magic, sizeof(vszBase64Magic) - 1))
{
_wzMimeType = vwzApplicationBase64;
}
// No matches - assume plain text.
else
{
//_wzMimeType = vwzTextPlain;
bRet = FALSE;
}
DEBUG_LEAVE(bRet);
return bRet;
}
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::CheckBinaryHeaders
//
// Synopsis:
//
//
// Arguments: void
//
// Returns: void
//
// History: 7-23-96 AdriaanC (Adriaan Canter) Created
//
// Notes:
//
//----------------------------------------------------------------------------
BOOL CContentAnalyzer::CheckBinaryHeaders()
{
DEBUG_ENTER((DBG_TRANS,
Bool,
"CContentAnalyzer::CheckBinaryHeaders",
"this=%#x",
this
));
BOOL bRet = TRUE;
// image/gif
if (!StrCmpNIC(_pBuf, vszGif87Magic, sizeof(vszGif87Magic) - 1)
|| !StrCmpNIC(_pBuf, vszGif89Magic, sizeof(vszGif89Magic) - 1))
{
_wzMimeType = vwzImageGif;
}
// image/pjpeg
else if ((BYTE)_pBuf[0] == JPEG_MAGIC_1 && (BYTE)_pBuf[1] == JPEG_MAGIC_2)
{
_wzMimeType = vwzImagePJpeg;
}
// img/bmp
else if (IsBMP())
{
_wzMimeType = vwzImageBmp;
}
// audio/wav
else if (MatchDWordAtOffset(RIFF_MAGIC, 0)
&& MatchDWordAtOffset(WAV_MAGIC, 8))
{
_wzMimeType = vwzAudioWav;
}
// audio/basic (.au files)
else if (MatchDWordAtOffset(AU_DEC_MAGIC, 0)
|| MatchDWordAtOffset(AU_SUN_MAGIC, 0)
|| MatchDWordAtOffset(AU_DEC_INV_MAGIC, 0)
|| MatchDWordAtOffset(AU_SUN_INV_MAGIC, 0))
{
_wzMimeType = vwzAudioBasic;
}
// image/tiff
else if (!StrCmpC(_pBuf, vszTiffMagic)) // "MM" followed by a \0
{
_wzMimeType = vwzImageTiff;
}
// application/x-msdownload
else if (!StrCmpNC(_pBuf, vszExeMagic, sizeof(vszExeMagic) - 1))
{
_wzMimeType = vwzApplicationMSDownload;
}
// image/x-png
else if (!StrCmpNC(_pBuf, vszPngMagic, sizeof(vszPngMagic) - 1))
{
_wzMimeType = vwzImagePng;
}
// image/x-jg
else if (!StrCmpNC(_pBuf, vszJGMagic, sizeof(vszJGMagic) - 1)
&& (int) _pBuf[2] >= 3
&& (int) _pBuf[2] <= 31
&& _pBuf[4] == 0)
{
_wzMimeType = vwzImageJG;
}
// audio/x-aiff
else if (MatchDWordAtOffset(AIFF_INV_MAGIC, 0))
{
_wzMimeType = vwzAudioAiff;
}
else if (MatchDWordAtOffset(AIFF_MAGIC, 0) &&
( MatchDWordAtOffset(AIFF_MAGIC_MORE_1, 8) ||
MatchDWordAtOffset(AIFF_MAGIC_MORE_2, 8) ) )
{
//
// according to DaveMay, the correct AIFF format would be:
// 'FORM....AIFF' or 'FORM....AIFC'
// Only check for 'FORM' is incorrect because .sc2 has the
// same sig
//
_wzMimeType = vwzAudioAiff;
}
// video/avi (or video/x-msvedio)
else if (MatchDWordAtOffset(RIFF_MAGIC, 0)
&& MatchDWordAtOffset(AVI_MAGIC, 8))
{
_wzMimeType = vwzVideoAvi;
}
// video/mpeg
else if (MatchDWordAtOffset(MPEG_MAGIC, 0)
|| MatchDWordAtOffset(MPEG_MAGIC_2, 0) )
{
_wzMimeType = vwzVideoMpeg;
}
// image/x-emf
else if (MatchDWordAtOffset(EMF_MAGIC_1, 0)
&& MatchDWordAtOffset(EMF_MAGIC_2, 40))
{
_wzMimeType = vwzImageEmf;
}
// image/x-wmf
else if (MatchDWordAtOffset(WMF_MAGIC, 0))
{
_wzMimeType = vwzImageWmf;
}
// application/java
else if (MatchDWordAtOffset(JAVA_MAGIC, 0))
{
_wzMimeType = vwzApplicationJava;
}
// application/x-zip-compressed
else if (!StrCmpNC(_pBuf, vszZipMagic, sizeof(vszZipMagic) - 1))
{
_wzMimeType = vwzApplicationZipCompressed;
}
// application/x-compress
else if (!StrCmpNC(_pBuf, vszCompressMagic, sizeof(vszCompressMagic) - 1))
{
_wzMimeType = vwzApplicationCompressed;
}
// application/x-gzip
else if (!StrCmpNC(_pBuf, vszGzipMagic, sizeof(vszGzipMagic) - 1))
{
_wzMimeType = vwzApplicationGzipCompressed;
}
// application/x-zip-compressed
else if (!StrCmpNC(_pBuf, vszZipMagic, sizeof(vszZipMagic) - 1))
{
_wzMimeType = vwzApplicationZipCompressed;
}
// audio/mid
else if (!StrCmpC(_pBuf, vszMIDMagic))
{
_wzMimeType = vwzAudioMID;
}
// application/pdf (Acrobat)
else if (!StrCmpNC(_pBuf, vszPdfMagic, sizeof(vszPdfMagic) - 1))
{
_wzMimeType = vwzApplicationPdf;
}
// don't know what it is.
else
{
//_wzMimeType = vwzApplicationOctetStream;
bRet = FALSE;
}
DEBUG_LEAVE(bRet);
return bRet;
}
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::FindMimeFromData
//
// Synopsis: Attempts to guess MIME type from buffer
//
//
// Arguments: pBuf, cbSample, wzSuggestedMimeType
//
// Returns: LPCWSTR (the MIME type guessed)
//
// History: 5-25-96 AdriaanC (Adriaan Canter) Created
//
// Notes:
//
//----------------------------------------------------------------------------
LPCWSTR CContentAnalyzer::FindMimeFromData(LPCWSTR wzFileName, char* pBuf,
int cbSample, LPCWSTR wzSuggestedMimeType, DWORD grfFlags)
{
DEBUG_ENTER((DBG_TRANS,
Pointer,
"CContentAnalyzer::FindMimeFromData",
"this=%#x, %.80wq, %.80q, %d, %.80wq, %#x",
this, wzFileName, pBuf, cbSample, wzSuggestedMimeType, grfFlags
));
BOOL fSampledData = FALSE;
BOOL fFoundMimeTypeFromExt = FALSE;
CHAR* szFileExt = 0;
CHAR szFileName[MAX_PATH];
CHAR szMimeTypeFromExt[SZMIMESIZE_MAX];
CHAR szCommand[MAX_PATH];
CHAR cLastByte;
DWORD dwMimeLen = SZMIMESIZE_MAX;
DWORD dwExtMimeTypeDataFormat;
DWORD dwSuggestedMimeTypeDataFormat;
DWORD dwMimeTypeDataFormat;
DWORD cbCommand = MAX_PATH;
BOOL fExtensionChecked = FALSE;
_grfFlags = grfFlags;
// BUGBUG - we can use this information for DBCS.
// Remove any info appended to the suggested mime type
// such as charset information. This is identified by ';'
if (wzSuggestedMimeType)
{
WCHAR* wptr = wcsstr(wzSuggestedMimeType, L";");
if (wptr)
{
*wptr = L'\0';
}
}
// Check to see if the server is suggesting an unknown mime type
dwSuggestedMimeTypeDataFormat = GetDataFormat(wzSuggestedMimeType);
if (dwSuggestedMimeTypeDataFormat == DATAFORMAT_UNKNOWN)
{
// server push returns "multipart" content type
// this is not the real mimetype, so we have to sniff
// to find out the truth
if( wcsicmp(wzSuggestedMimeType, vwzmultipartmixed)
&& wcsicmp(wzSuggestedMimeType, vwzmultipartmixedreplace) )
{
// If so, return the suggested mime type.
_wzMimeType = (WCHAR*) wzSuggestedMimeType;
DEBUG_LEAVE(_wzMimeType);
return _wzMimeType;
}
}
/*****
// check if we got an extension and extension mime
// matches the suggested mime - only for text/plain
if ( wzSuggestedMimeType
&& wzFileName
&& !wcscmp(wzSuggestedMimeType,vwzTextPlain))
{
fExtensionChecked = TRUE;
fFoundMimeTypeFromExt = FindMimeFromExt(
wzFileName,
szFileName,
szMimeTypeFromExt,
&dwExtMimeTypeDataFormat,
&szFileExt
);
// If there is a mime type associated with the file
// extension then return it.
if ( fFoundMimeTypeFromExt
&& (dwExtMimeTypeDataFormat == dwSuggestedMimeTypeDataFormat)
&& !wcscmp(wzSuggestedMimeType,_wzMimeTypeFromExt)
)
{
// If so, return the suggested mime type.
_wzMimeType = (WCHAR*) wzSuggestedMimeType;
return _wzMimeType;
}
}
*****/
// Not enough data to tell anything
if (!pBuf || cbSample <= 0)
{
_wzMimeType = (WCHAR*) wzSuggestedMimeType;
DEBUG_LEAVE(_wzMimeType);
return _wzMimeType;
}
_pBuf = pBuf;
_cbSample = (cbSample <= SAMPLE_SIZE) ? cbSample : SAMPLE_SIZE;
// Save off last character. Null terminate the buffer.
cLastByte = _pBuf[_cbSample - 1];
_pBuf[_cbSample - 1] = '\0';
// Common cases first - check the server indicated mime type
// for text/html, image/gif or image/[p]jpeg.
if ( wzSuggestedMimeType
&& !StrCmpICW(wzSuggestedMimeType, vwzTextHTML))
{
// Sample the data. This routine also checks for the following
// mime types which require extended scanning through the buffer:
// text/html, image/x-xbitmap, application/macbinhex
SampleData();
fSampledData = TRUE;
if (_fFoundHTML)
{
_wzMimeType = vwzTextHTML;
goto exit;
}
}
// image/gif
else if (wzSuggestedMimeType
&& !wcsicmp(wzSuggestedMimeType, vwzImageGif))
{
if (!StrCmpNIC(_pBuf, vszGif87Magic, sizeof(vszGif87Magic) - 1)
|| !StrCmpNIC(_pBuf, vszGif89Magic, sizeof(vszGif89Magic) - 1))
{
_wzMimeType = vwzImageGif;
goto exit;
}
}
// image/jpeg or image/pjpeg
else if (wzSuggestedMimeType
&& (!wcsicmp(wzSuggestedMimeType, vwzImagePJpeg)
|| !wcsicmp(wzSuggestedMimeType, vwzImageJpeg)))
{
if ((BYTE)_pBuf[0] == JPEG_MAGIC_1 && (BYTE)_pBuf[1] == JPEG_MAGIC_2)
{
_wzMimeType = vwzImagePJpeg;
goto exit;
}
}
//
// ********************** BEGIN HACK *******************************
//
// we will remove this once tridents defined the unique signature
// for .hta and .htc format
//
// DanpoZ (98.08.12) - refer to IE5 SUPERHOT bug 35478
//
if (wzFileName )
{
CHAR* szExt;
CHAR szFile[MAX_PATH];
W2A(wzFileName, szFile, MAX_PATH);
if( grfFlags & FMFD_URLASFILENAME )
{
//
// remove teh security context '\1' and replace it with '\0'
// but only do this when we are using URL to replace the filename
//
CHAR* pch = StrChr(szFile, '\1');
if (pch)
{
*pch = '\0';
}
}
szExt = FindFileExtension(szFile);
if( szExt &&
( !StrCmpNIC(szExt, ".hta", sizeof(".hta") - 1) ||
!StrCmpNIC(szExt, ".htc", sizeof(".htc") - 1) ) )
{
fExtensionChecked = TRUE;
fFoundMimeTypeFromExt = FindMimeFromExt(
wzFileName,
szFileName,
szMimeTypeFromExt,
&dwExtMimeTypeDataFormat,
&szFileExt
);
// If there is a mime type associated with the file
// extension then return it.
if (fFoundMimeTypeFromExt)
{
_wzMimeType = _wzMimeTypeFromExt;
goto exit;
}
}
}
//
// ********************** END HACK *********************************
//
// One of the following is true:
// 1) The server indicated a common mime type (html, gif or jpeg),
// however, verification failed.
// 2) The server indicated an ambiguous mime type or
// a known, but uncommon mime type.
// If not done so already, sample the data.
if (!fSampledData)
{
SampleData();
fSampledData = TRUE;
}
// Return any mime type that was positively
// identified during the data sampling
if( _fFoundCDF )
{
_wzMimeType = vwzApplicationCDF;
goto exit;
}
else if( _fFoundXML)
{
_wzMimeType = vwzTextXML;
goto exit;
}
else if (_fFoundHTML)
{
_wzMimeType = vwzTextHTML;
goto exit;
}
else if (_fFoundXBitMap)
{
_wzMimeType = vwzImageXBitmap;
goto exit;
}
else if (_fFoundMacBinhex)
{
_wzMimeType = vwzApplicationMacBinhex;
goto exit;
}
else if( _fFoundTextScriptlet )
{
_wzMimeType = vwzTextScriptlet;
goto exit;
}
if( !_fFoundCDF
&& wzSuggestedMimeType
&& !wcsicmp(wzSuggestedMimeType, vwzApplicationNETCDF)
)
{
// only overwrite application/x-netcdf with aplication/x-cdf
_wzMimeType = vwzApplicationNETCDF;
goto exit;
}
// Decide if buffer is primarily text or binary. Conduct
// pattern matching to determine a mime type depending on the
// finding.
if (!_cbCtrl || _cbText + _cbFF >= 16 * (_cbCtrl + _cbHigh))
{
_fBinary = FALSE;
if( !CheckTextHeaders() )
{
if( !CheckBinaryHeaders() )
{
_wzMimeType = vwzTextPlain;
}
}
}
else
{
_fBinary = TRUE;
if( !CheckBinaryHeaders() )
{
if( !CheckTextHeaders() )
{
_wzMimeType = vwzApplicationOctetStream;
}
}
}
// Determine format of the mime type from data
dwMimeTypeDataFormat = GetDataFormat(_wzMimeType);
// If the format of the mime type found from examining the data
// is not ambiguous, then return this mime type.
if (dwMimeTypeDataFormat != DATAFORMAT_AMBIGUOUS)
{
goto exit;
}
// Examination of data is inconclusive.
else
{
// If the suggested mime type is not ambiguous and does
// not conflict with the data format then return it.
if (dwSuggestedMimeTypeDataFormat != DATAFORMAT_AMBIGUOUS
&& FormatAgreesWithData(dwSuggestedMimeTypeDataFormat))
{
_wzMimeType = (WCHAR*) wzSuggestedMimeType;
goto exit;
}
// Otherwise, attempt to obtain a mime type from any
// file extension. If none is found, but an application
// is registered for the file extension, return
// application/octet-stream.
// If there is a file extension, find any
// associated mime type.
if (wzFileName && !fExtensionChecked)
{
fExtensionChecked = TRUE;
fFoundMimeTypeFromExt = FindMimeFromExt(
wzFileName,
szFileName,
szMimeTypeFromExt,
&dwExtMimeTypeDataFormat,
&szFileExt
);
}
// If there is a mime type associated with the file
// extension then return it.
if (fFoundMimeTypeFromExt)
{
if (dwExtMimeTypeDataFormat == DATAFORMAT_UNKNOWN)
{
_wzMimeType = _wzMimeTypeFromExt;
goto exit;
}
else
{
goto exit;
}
}
// Otherwise, check to see if there is an associated application.
if (szFileExt && FindAppFromExt(szFileExt, szCommand, cbCommand))
{
// Found an associated application.
_wzMimeType = vwzApplicationOctetStream;
goto exit;
}
// No suggested mime type, no mime type from file extension
// and no registered application found. Fall through and return
// mime type found from the data
}
exit:
// Replace the null termination with
// the original character.
_pBuf[_cbSample - 1] = cLastByte;
DEBUG_LEAVE(_wzMimeType);
return _wzMimeType;
}
//+---------------------------------------------------------------------------
//
// Method: CContentAnalyzer::FindMimeFromExt
//
// Synopsis:
//
// Arguments: [wzFileName] --
// [szFileName] --
// [szMimeTypeFromExt] --
// [pdwExtMimeTypeDataFormat] --
//
// Returns:
//
// History: 5-25-96 AdriaanC (Adriaan Canter)
// 1-28-1997 JohannP (Johann Posch) made separate function
//
// Notes:
//
//----------------------------------------------------------------------------
BOOL CContentAnalyzer::FindMimeFromExt(
LPCWSTR wzFileName,
CHAR *szFileName,
CHAR *szMimeTypeFromExt,
DWORD *pdwExtMimeTypeDataFormat,
CHAR **ppszFileExt)
{
DEBUG_ENTER((DBG_TRANS,
Bool,
"CContentAnalyzer::FindMimeFromExt",
"this=%#x, %.80wq, %.80q, %.80q, %#x, %#x",
this, wzFileName, szFileName, szMimeTypeFromExt, pdwExtMimeTypeDataFormat, ppszFileExt
));
BOOL fFoundMimeTypeFromExt = FALSE;
UrlMkAssert((wzFileName && szFileName && pdwExtMimeTypeDataFormat));
DWORD dwMimeLen = SZMIMESIZE_MAX;
CHAR* szFileExt = 0;
// If there is a file extension, find any
// associated mime type.
W2A(wzFileName, szFileName, MAX_PATH);
szFileExt = FindFileExtension(szFileName);
if (szFileExt && GetMimeFromExt(szFileExt,
szMimeTypeFromExt, &dwMimeLen) == ERROR_SUCCESS)
{
fFoundMimeTypeFromExt = TRUE;
A2W(szMimeTypeFromExt, _wzMimeTypeFromExt, SZMIMESIZE_MAX);
*pdwExtMimeTypeDataFormat = GetDataFormat(_wzMimeTypeFromExt);
}
if (szFileExt && ppszFileExt)
{
*ppszFileExt = szFileExt;
}
DEBUG_LEAVE(fFoundMimeTypeFromExt);
return fFoundMimeTypeFromExt;
}