Windows2003-3790/inetcore/winhttp/v5/api/parseurl.cxx

/*++

Copyright (c) 1995  Microsoft Corporation

Module Name:

    parseurl.cxx

Abstract:

    Contains functions to parse the basic URLs - FTP, Gopher, HTTP.

    An URL parser simply acts as a macro: it must break out the protocol-specific
    information from the URL and initiate opening the identified resource: all
    this can be accomplished by calling the relevant Internet protocol APIs.

    Code in this module is based on RFC1738

    Contents:
        IsValidUrl
        DoesSchemeRequireSlashes
        ParseUrl
        CrackUrl
        EncodeUrlPath
        (HexCharToNumber)
        (NumberToHexChar)
        DecodeUrl
        DecodeUrlInSitu
        DecodeUrlStringInSitu
        GetUrlAddressInfo
        GetUrlAddress
        MapUrlSchemeName
        MapUrlScheme
        MapUrlSchemeToName

Author:

    Richard L Firth (rfirth) 26-Apr-1995

Environment:

    Win32(s) user-mode DLL

Revision History:

    26-Apr-1995
        Created

--*/

#include <wininetp.h>

//
// private manifests
//

#define RESERVED    SAFE

//
// private macros
//

//#define HEX_CHAR_TO_NUMBER(ch) \
//    ((ch <= '9') \
//        ? (ch - '0') \
//        : ((ch >= 'a') \
//            ? ((ch - 'a') + 10) \
//            : ((ch - 'A') + 10)))

#define NUMBER_TO_HEX_CHAR(n) \
    (((n) <= 9) ? ((char)(n) + '0') : (((char)(n) - 10) + 'A'))

#define IS_UNSAFE_URL_CHARACTER(Char, Scheme) \
    (((UCHAR)(Char) <= 0x20) || ((UCHAR)(Char) >= 0x7f) \
    || (SafetyList[(Char) - 0x21] & (UNSAFE | Scheme)))

#define IS_UNSAFE_URL_WIDECHARACTER(wChar, Scheme) \
    (((WCHAR)(wChar) <= 0x0020) || ((WCHAR)(wChar) >= 0x007f) \
    || (SafetyList[(wChar) - 0x0021] & (UNSAFE | Scheme)))

//
// private types
//

//
// private prototypes
//

PRIVATE
char
HexCharToNumber(
    IN char ch
    );

PRIVATE
char
NumberToHexChar(
    IN int Number
    );


//
// private data
//

//
// SafetyList - the list of characters above 0x20 and below 0x7f that are
// classified as safe, unsafe or scheme-specific. Safe characters do not need
// to be escaped for any URL scheme. Unsafe characters must be escaped for all
// URL schemes. Scheme-specific characters need only be escaped for the relevant
// scheme(s)
//

const
PRIVATE
UCHAR
SafetyList[] = {

    //
    // UNSAFE: 0x00..0x20
    //

    SAFE | HOSTNAME,                        // 0x21 (!)
    UNSAFE,                                 // 0x22 (")
    UNSAFE,                                 // 0x23 (#)
    SAFE | HOSTNAME,                        // 0x24 ($)
    UNSAFE,                                 // 0x25 (%)
    RESERVED | HOSTNAME,                    // 0x26 (&)
    SAFE | HOSTNAME,                        // 0x27 (')
    SAFE | HOSTNAME,                        // 0x28 (()
    SAFE | HOSTNAME,                        // 0x29 ())
    SAFE | HOSTNAME,                        // 0x2A (*)
    SCHEME_GOPHER | HOSTNAME,               // 0x2B (+)
    SAFE | HOSTNAME,                        // 0x2C (,)
    SAFE,                                   // 0x2D (-)
    SAFE,                                   // 0x2E (.)
    RESERVED | HOSTNAME,                    // 0x2F (/)
    SAFE,                                   // 0x30 (0)
    SAFE,                                   // 0x31 (1)
    SAFE,                                   // 0x32 (2)
    SAFE,                                   // 0x33 (3)
    SAFE,                                   // 0x34 (4)
    SAFE,                                   // 0x35 (5)
    SAFE,                                   // 0x36 (6)
    SAFE,                                   // 0x37 (7)
    SAFE,                                   // 0x38 (8)
    SAFE,                                   // 0x39 (9)
    RESERVED | HOSTNAME,                    // 0x3A (:)
    RESERVED | HOSTNAME,                    // 0x3B (;)
    UNSAFE,                                 // 0x3C (<)
    RESERVED | HOSTNAME,                    // 0x3D (=)
    UNSAFE,                                 // 0x3E (>)
    RESERVED | SCHEME_GOPHER | HOSTNAME,    // 0x3F (?)
    RESERVED | HOSTNAME,                    // 0x40 (@)
    SAFE,                                   // 0x41 (A)
    SAFE,                                   // 0x42 (B)
    SAFE,                                   // 0x43 (C)
    SAFE,                                   // 0x44 (D)
    SAFE,                                   // 0x45 (E)
    SAFE,                                   // 0x46 (F)
    SAFE,                                   // 0x47 (G)
    SAFE,                                   // 0x48 (H)
    SAFE,                                   // 0x49 (I)
    SAFE,                                   // 0x4A (J)
    SAFE,                                   // 0x4B (K)
    SAFE,                                   // 0x4C (L)
    SAFE,                                   // 0x4D (M)
    SAFE,                                   // 0x4E (N)
    SAFE,                                   // 0x4F (O)
    SAFE,                                   // 0x50 (P)
    SAFE,                                   // 0x51 (Q)
    SAFE,                                   // 0x42 (R)
    SAFE,                                   // 0x43 (S)
    SAFE,                                   // 0x44 (T)
    SAFE,                                   // 0x45 (U)
    SAFE,                                   // 0x46 (V)
    SAFE,                                   // 0x47 (W)
    SAFE,                                   // 0x48 (X)
    SAFE,                                   // 0x49 (Y)
    SAFE,                                   // 0x5A (Z)
    UNSAFE,                                 // 0x5B ([)
    UNSAFE,                                 // 0x5C (\)
    UNSAFE,                                 // 0x5D (])
    UNSAFE,                                 // 0x5E (^)
    SAFE,                                   // 0x5F (_)
    UNSAFE,                                 // 0x60 (`)
    SAFE,                                   // 0x61 (a)
    SAFE,                                   // 0x62 (b)
    SAFE,                                   // 0x63 (c)
    SAFE,                                   // 0x64 (d)
    SAFE,                                   // 0x65 (e)
    SAFE,                                   // 0x66 (f)
    SAFE,                                   // 0x67 (g)
    SAFE,                                   // 0x68 (h)
    SAFE,                                   // 0x69 (i)
    SAFE,                                   // 0x6A (j)
    SAFE,                                   // 0x6B (k)
    SAFE,                                   // 0x6C (l)
    SAFE,                                   // 0x6D (m)
    SAFE,                                   // 0x6E (n)
    SAFE,                                   // 0x6F (o)
    SAFE,                                   // 0x70 (p)
    SAFE,                                   // 0x71 (q)
    SAFE,                                   // 0x72 (r)
    SAFE,                                   // 0x73 (s)
    SAFE,                                   // 0x74 (t)
    SAFE,                                   // 0x75 (u)
    SAFE,                                   // 0x76 (v)
    SAFE,                                   // 0x77 (w)
    SAFE,                                   // 0x78 (x)
    SAFE,                                   // 0x79 (y)
    SAFE,                                   // 0x7A (z)
    UNSAFE,                                 // 0x7B ({)
    UNSAFE,                                 // 0x7C (|)
    UNSAFE,                                 // 0x7D (})
    UNSAFE                                  // 0x7E (~)

    //
    // UNSAFE: 0x7F..0xFF
    //

};

//
// UrlSchemeList - the list of schemes that we support
//

typedef struct {
    LPSTR SchemeName;
    DWORD SchemeLength;
    INTERNET_SCHEME SchemeType;
    DWORD SchemeFlags;
    BOOL NeedSlashes;
    DWORD OpenFlags;
} URL_SCHEME_INFO;


const
PRIVATE
URL_SCHEME_INFO
UrlSchemeList[] = {
    NULL,           0,  INTERNET_SCHEME_DEFAULT,    0,              FALSE,  0,
    "http",         4,  INTERNET_SCHEME_HTTP,       SCHEME_HTTP,    TRUE,   0,
    "https",        5,  INTERNET_SCHEME_HTTPS,      SCHEME_HTTP,    TRUE,   WINHTTP_FLAG_SECURE,
};

#define NUMBER_OF_URL_SCHEMES   ARRAY_ELEMENTS(UrlSchemeList)

BOOL ScanSchemes(LPTSTR pszToCheck, DWORD ccStr, PDWORD pwResult)
{
    for (DWORD i=0; i<NUMBER_OF_URL_SCHEMES; i++)
    {
        if ((UrlSchemeList[i].SchemeLength == ccStr)
            && (strnicmp(UrlSchemeList[i].SchemeName, pszToCheck, ccStr)==0))
        {
            *pwResult = i;
            return TRUE;
        }
    }
    return FALSE;
}

//
// functions
//


BOOL
IsValidUrl(
    IN LPCSTR lpszUrl
    )

/*++

Routine Description:

    Determines whether an URL has a valid format

Arguments:

    lpszUrl - pointer to URL to check.

    Assumes:    1. lpszUrl is non-NULL, non-empty string

Return Value:

    BOOL

--*/

{
    INET_ASSERT(lpszUrl != NULL);
    INET_ASSERT(*lpszUrl != '\0');

    while (*lpszUrl != '\0') {
        if (IS_UNSAFE_URL_CHARACTER(*lpszUrl, SCHEME_ANY)) {
            return FALSE;
        }
        ++lpszUrl;
    }
    return TRUE;
}


BOOL
DoesSchemeRequireSlashes(
    IN LPSTR lpszScheme,
    IN DWORD dwSchemeLength,
    IN BOOL bHasHostName
    )

/*++

Routine Description:

    Determines whether a protocol scheme requires slashes

Arguments:

    lpszScheme      - pointer to protocol scheme in question
                      (does not include ':' or slashes, just scheme name)

    dwUrlLength     - if not 0, string length of lpszScheme

Return Value:

    BOOL

--*/

{
    DWORD i;

    //
    // if dwSchemeLength is 0 then lpszUrl is ASCIIZ. Find its length
    //

    if (dwSchemeLength == 0) {
        dwSchemeLength = strlen(lpszScheme);
    }

    if (ScanSchemes(lpszScheme, dwSchemeLength, &i))
    {
        return UrlSchemeList[i].NeedSlashes;
    }
    return bHasHostName;
}


DWORD
CrackUrl(
    IN OUT LPSTR lpszUrl,
    IN DWORD dwUrlLength,
    IN BOOL bEscape,
    OUT LPINTERNET_SCHEME lpSchemeType OPTIONAL,
    OUT LPSTR* lpszSchemeName OPTIONAL,
    OUT LPDWORD lpdwSchemeNameLength OPTIONAL,
    OUT LPSTR* lpszHostName OPTIONAL,
    OUT LPDWORD lpdwHostNameLength OPTIONAL,
    OUT LPINTERNET_PORT lpServerPort OPTIONAL,
    OUT LPSTR* lpszUserName OPTIONAL,
    OUT LPDWORD lpdwUserNameLength OPTIONAL,
    OUT LPSTR* lpszPassword OPTIONAL,
    OUT LPDWORD lpdwPasswordLength OPTIONAL,
    OUT LPSTR* lpszUrlPath OPTIONAL,
    OUT LPDWORD lpdwUrlPathLength OPTIONAL,
    OUT LPSTR* lpszExtraInfo OPTIONAL,
    OUT LPDWORD lpdwExtraInfoLength OPTIONAL,
    OUT LPBOOL pHavePort
    )

/*++

Routine Description:

    Cracks an URL into its constituent parts

    Assumes: 1. If one of the optional lpsz fields is present (e.g. lpszUserName)
                then the accompanying lpdw field must also be supplied

Arguments:

    lpszUrl                 - pointer to URL to crack. This buffer WILL BE
                              OVERWRITTEN if it contains escape sequences that
                              we will convert back to ANSI characters

    dwUrlLength             - if not 0, string length of lpszUrl

    bEscape                 - TRUE if we are to escape the url-path

    lpSchemeType            - returned scheme type - e.g. INTERNET_SCHEME_HTTP

    lpszSchemeName          - returned scheme name

    lpdwSchemeNameLength    - length of scheme name

    lpszHostName            - returned host name

    lpdwHostNameLength      - length of host name buffer

    lpServerPort            - returned server port if present in the URL, else 0

    lpszUserName            - returned user name if present

    lpdwUserNameLength      - length of user name buffer

    lpszPassword            - returned password if present

    lpdwPasswordLength      - length of password buffer

    lpszUrlPath             - returned, canonicalized URL path

    lpdwUrlPathLength       - length of url-path buffer

    lpszExtraInfo           - returned search string or intra-page link if present

    lpdwExtraInfoLength     - length of extra info buffer

    pHavePort               - returned boolean indicating whether port was specified

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_UNRECOGNIZED_SCHEME

--*/

{
    DWORD error;
    DWORD schemeLength;
    INTERNET_SCHEME schemeType;

    //
    // if dwUrlLength is 0 then lpszUrl is ASCIIZ. Find its length
    //

    if (dwUrlLength == 0) {
        dwUrlLength = strlen(lpszUrl);
    }

    //
    // get parser based on the protocol name
    //

    for (schemeLength = 0; lpszUrl[schemeLength] != ':'; ++schemeLength) {
        if ((dwUrlLength == 0) || (lpszUrl[schemeLength] == '\0')) {

            //
            // no ':' in URL? Bogus (dude)
            //

            error = ERROR_WINHTTP_UNRECOGNIZED_SCHEME;
            goto quit;
        }
        --dwUrlLength;
    }

    DWORD i;
    int skip;
    BOOL isGeneric;
    BOOL needSlashes;
    BOOL haveSlashes;

    isGeneric = FALSE;
    needSlashes = FALSE;
    haveSlashes = FALSE;

    schemeType = INTERNET_SCHEME_UNKNOWN;

    if (ScanSchemes(lpszUrl, schemeLength, &i))
    {
        schemeType = UrlSchemeList[i].SchemeType;
        needSlashes = UrlSchemeList[i].NeedSlashes;
    }
    else
    {
        error = ERROR_WINHTTP_UNRECOGNIZED_SCHEME;
        goto quit;
    }

    skip = 1;       // skip ':'

    if ((dwUrlLength > 3) && (memcmp(&lpszUrl[schemeLength], "://", 3) == 0)) {
        skip = 3;   // skip "://"
        haveSlashes = TRUE;
    }

    //
    // If we don't have slashes, make sure we don't need them.
    // If we have slashes, make sure they are required.
    //

    if ((!haveSlashes && !needSlashes) || (haveSlashes && needSlashes)) {
        if (ARGUMENT_PRESENT(lpSchemeType)) {
            *lpSchemeType = schemeType;
        }
        if (ARGUMENT_PRESENT(lpszSchemeName)) {
            *lpszSchemeName = lpszUrl;
            *lpdwSchemeNameLength = schemeLength;
        }
        lpszUrl += schemeLength + skip;
        dwUrlLength -= skip;

        if (isGeneric) {
            if (ARGUMENT_PRESENT(lpszUserName)) {
                *lpszUserName = NULL;
                *lpdwUserNameLength = 0;
            }
            if (ARGUMENT_PRESENT(lpszPassword)) {
                *lpszPassword = NULL;
                *lpdwPasswordLength = 0;
            }
            if (ARGUMENT_PRESENT(lpszHostName)) {
                *lpszHostName = NULL;
                *lpdwHostNameLength = 0;
            }
            if (ARGUMENT_PRESENT(lpServerPort)) {
                *lpServerPort = 0;
            }
            error = ERROR_SUCCESS;
        } else {
            error = GetUrlAddress(&lpszUrl,
                                  &dwUrlLength,
                                  lpszUserName,
                                  lpdwUserNameLength,
                                  lpszPassword,
                                  lpdwPasswordLength,
                                  lpszHostName,
                                  lpdwHostNameLength,
                                  lpServerPort,
                                  pHavePort
                                  );
        }
        if (bEscape && (error == ERROR_SUCCESS)) {
            error = DecodeUrlInSitu(lpszUrl, &dwUrlLength);
        }
        if ((error == ERROR_SUCCESS) && ARGUMENT_PRESENT(lpszExtraInfo)) {
            *lpdwExtraInfoLength = 0;
            for (i = 0; i < (int)dwUrlLength; i++) {
                if (lpszUrl[i] == '?' || lpszUrl[i] == '#') {
                    *lpszExtraInfo = &lpszUrl[i];
                    *lpdwExtraInfoLength = dwUrlLength - i;
                    dwUrlLength -= *lpdwExtraInfoLength;
                }
            }
        }
        if ((error == ERROR_SUCCESS) && ARGUMENT_PRESENT(lpszUrlPath)) {
            *lpszUrlPath = lpszUrl;
            *lpdwUrlPathLength = dwUrlLength;
        }
    } else {
        error = ERROR_WINHTTP_UNRECOGNIZED_SCHEME;
    }

quit:

    return error;
}

#define DEFAULT_REALLOC_SIZE 1024

DWORD
EncodeUrlPath(
    IN DWORD Flags,
    IN DWORD SchemeFlags,
    IN LPSTR UrlPath,
    IN DWORD UrlPathLength,
    OUT LPSTR* pEncodedUrlPath,
    IN OUT LPDWORD EncodedUrlPathLength
    )

/*++

Routine Description:

    Encodes an URL-path. That is, escapes the string. Creates a new URL-path in
    which all the 'unsafe' and reserved characters for this scheme have been
    converted to escape sequences

Arguments:

    Flags                   - controlling expansion

    SchemeFlags             - which scheme we are encoding for -
                              SCHEME_HTTP, etc.

    UrlPath                 - pointer to the unescaped string

    UrlPathLength           - length of Url

    EncodedUrlPath          - pointer to buffer where encoded URL will be
                              written

    EncodedUrlPathLength    - IN: size of EncodedUrlPath
                              OUT: number of bytes written to EncodedUrlPath

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_INSUFFICIENT_BUFFER
                    UrlPathLength not large enough to store encoded URL path

--*/

{
    DWORD error;
    DWORD len;

    len = *EncodedUrlPathLength;
    LPSTR EncodedUrlPath = *pEncodedUrlPath;
    UCHAR ch;

    while(ch = (UCHAR)*UrlPath++)
    {
        //
        // check whether this character is safe. For now, we encode all unsafe
        // and scheme-specific characters the same way (i.e. irrespective of
        // scheme)
        //
        // We are allowing '/' to be copied unmodified
        //

        if (len < 3)
        {
            LPSTR pStr = (LPSTR)REALLOCATE_MEMORY(*pEncodedUrlPath, *EncodedUrlPathLength+DEFAULT_REALLOC_SIZE, LMEM_MOVEABLE);

            if (pStr)
            {
                EncodedUrlPath = pStr+*EncodedUrlPathLength-len;
                *pEncodedUrlPath = pStr;
                len += DEFAULT_REALLOC_SIZE;
                *EncodedUrlPathLength += DEFAULT_REALLOC_SIZE;
            }
            else
            {
                goto error;
            }
        }

        if (IS_UNSAFE_URL_CHARACTER(ch, SchemeFlags)
        && !((ch == '/') && (Flags & NO_ENCODE_PATH_SEP)))
        {
            *EncodedUrlPath++ = '%';
            //*EncodedUrlPath++ = NumberToHexChar((int)ch / 16);
            *EncodedUrlPath++ = NUMBER_TO_HEX_CHAR((int)ch / 16);
            //*EncodedUrlPath++ = NumberToHexChar((int)ch % 16);
            *EncodedUrlPath++ = NUMBER_TO_HEX_CHAR((int)ch % 16);
            len -= 2; // extra --len below
        }
        else
        {
            *EncodedUrlPath++ = (signed char)ch;
        }
        --len;
    }

    *EncodedUrlPath = '\0';
    *EncodedUrlPathLength -= len;
    error = ERROR_SUCCESS;

quit:
    return error;

error:
    error = ERROR_INSUFFICIENT_BUFFER;
    goto quit;
}


PRIVATE
char
HexCharToNumber(
    IN char ch
    )

/*++

Routine Description:

    Converts an ANSI character in the range '0'..'9' 'A'..'F' 'a'..'f' to its
    corresponding hexadecimal value (0..f)

Arguments:

    ch  - character to convert

Return Value:

    char
        hexadecimal value of ch, as an 8-bit (signed) character value

--*/

{
    return (ch <= '9') ? (ch - '0')
                       : ((ch >= 'a') ? ((ch - 'a') + 10) : ((ch - 'A') + 10));
}


PRIVATE
char
NumberToHexChar(
    IN int Number
    )

/*++

Routine Description:

    Converts a number in the range 0..15 to its ASCII character hex representation
    ('0'..'F')

Arguments:

    Number  - to convert

Return Value:

    char
        character in above range

--*/

{
    return (Number <= 9) ? (char)('0' + Number) : (char)('A' + (Number - 10));
}


DWORD
DecodeUrl(
    IN LPSTR Url,
    IN DWORD UrlLength,
    OUT LPSTR DecodedString,
    IN OUT LPDWORD DecodedLength
    )

/*++

Routine Description:

    Converts an URL string with embedded escape sequences (%xx) to a counted
    string

    It is safe to pass the same pointer for the string to convert, and the
    buffer for the converted results: if the current character is not escaped,
    it just gets overwritten, else the input pointer is moved ahead 2 characters
    further than the output pointer, which is benign

Arguments:

    Url             - pointer to URL string to convert

    UrlLength       - number of characters in UrlString

    DecodedString   - pointer to buffer that receives converted string

    DecodedLength   - IN: number of characters in buffer
                      OUT: number of characters converted

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL
                    UrlString couldn't be converted

                  ERROR_INSUFFICIENT_BUFFER
                    ConvertedString isn't large enough to hold all the converted
                    UrlString

--*/

{
    DWORD bufferRemaining;

    bufferRemaining = *DecodedLength;
    while (UrlLength && bufferRemaining) {

        char ch;

        if (*Url == '%') {

            //
            // BUGBUG - would %00 ever appear in an URL?
            //

            ++Url;
            if (isxdigit(*Url)) {
                ch = HexCharToNumber(*Url++) << 4;
                if (isxdigit(*Url)) {
                    ch |= HexCharToNumber(*Url++);
                } else {
                    return ERROR_WINHTTP_INVALID_URL;
                }
            } else {
                return ERROR_WINHTTP_INVALID_URL;
            }
            UrlLength -= 3;
        } else {
            ch = *Url++;
            --UrlLength;
        }
        *DecodedString++ = ch;
        --bufferRemaining;
    }
    if (UrlLength == 0) {
        *DecodedLength -= bufferRemaining;
        return ERROR_SUCCESS;
    } else {
        return ERROR_INSUFFICIENT_BUFFER;
    }
}


DWORD
DecodeUrlInSitu(
    IN LPSTR BufferAddress,
    IN OUT LPDWORD BufferLength
    )

/*++

Routine Description:

    Decodes an URL string, if it contains escape sequences. The conversion is
    done in place, since we know that a string containing escapes is longer than
    the string with escape sequences (3 bytes) converted to characters (1 byte)

Arguments:

    BufferAddress   - pointer to the string to convert

    BufferLength    - IN: number of characters to convert
                      OUT: length of converted string

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL
                  ERROR_INSUFFICIENT_BUFFER

--*/

{
    DWORD stringLength;

    stringLength = *BufferLength;
    if (memchr(BufferAddress, '%', stringLength)) {
        return DecodeUrl(BufferAddress,
                         stringLength,
                         BufferAddress,
                         BufferLength
                         );
    } else {

        //
        // no escape character in the string, just return success
        //

        return ERROR_SUCCESS;
    }
}


DWORD
DecodeUrlStringInSitu(
    IN LPSTR BufferAddress,
    IN OUT LPDWORD BufferLength
    )

/*++

Routine Description:

    Performs DecodeUrlInSitu() on a string and zero terminates it

    Assumes: 1. Even if no decoding is performed, *BufferLength is large enough
                to fit an extra '\0' character

Arguments:

    BufferAddress   - pointer to the string to convert

    BufferLength    - IN: number of characters to convert
                      OUT: length of converted string, excluding '\0'

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL
                  ERROR_INSUFFICIENT_BUFFER

--*/

{
    DWORD error;

    error = DecodeUrlInSitu(BufferAddress, BufferLength);
    if (error == ERROR_SUCCESS) {
        BufferAddress[*BufferLength] = '\0';
    }
    return error;
}


DWORD
GetUrlAddressInfo(
    IN OUT LPSTR* Url,
    IN OUT LPDWORD UrlLength,
    OUT LPSTR* PartOne,
    OUT LPDWORD PartOneLength,
    OUT LPBOOL PartOneEscape,
    OUT LPSTR* PartTwo,
    OUT LPDWORD PartTwoLength,
    OUT LPBOOL PartTwoEscape
    )

/*++

Routine Description:

    Given a string of the form foo:bar, splits them into 2 counted strings about
    the ':' character. The address string may or may not contain a ':'.

    This function is intended to split into substrings the host:port and
    username:password strings commonly used in Internet address specifications
    and by association, in URLs

Arguments:

    Url             - pointer to pointer to string containing URL. On output
                      this is advanced past the address parts

    UrlLength       - pointer to length of URL in UrlString. On output this is
                      reduced by the number of characters parsed

    PartOne         - pointer which will receive first part of address string

    PartOneLength   - pointer which will receive length of first part of address
                      string

    PartOneEscape   - TRUE on output if PartOne contains escape sequences

    PartTwo         - pointer which will receive second part of address string

    PartTwoLength   - pointer which will receive length of second part of address
                      string

    PartOneEscape   - TRUE on output if PartTwo contains escape sequences

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL

--*/

{
    LPSTR pString;
    LPSTR pColon;
    DWORD partLength;
    LPBOOL partEscape;
    DWORD length;

    //
    // parse out <host>[:<port>] or <name>[:<password>] (i.e. <part1>[:<part2>]
    //

    pString = *Url;
    pColon = NULL;
    partLength = 0;
    *PartOne = pString;
    *PartOneLength = 0;
    *PartOneEscape = FALSE;
    *PartTwoEscape = FALSE;
    partEscape = PartOneEscape;
    length = *UrlLength;
    while ((*pString != '/') && (*pString != '\0') && (length != 0)) {
        if (*pString == '%') {

            //
            // if there is a % in the string then it *must* (RFC 1738) be the
            // start of an escape sequence. This function just reports the
            // address of the substrings and their lengths; calling functions
            // must handle the escape sequences (i.e. it is their responsibility
            // to decide where to put the results)
            //

            *partEscape = TRUE;
        }
        if (*pString == ':') {
            if (pColon != NULL) {

                //
                // we don't expect more than 1 ':'
                //

                return ERROR_WINHTTP_INVALID_URL;
            }
            pColon = pString;
            *PartOneLength = partLength;
            if (partLength == 0) {
                *PartOne = NULL;
            }
            partLength = 0;
            partEscape = PartTwoEscape;
        } else {
            ++partLength;
        }
        ++pString;
        --length;
    }

    //
    // we either ended on the host (or user) name or the port number (or
    // password), one of which we don't know the length of
    //

    if (pColon == NULL) {
        *PartOneLength = partLength;
        *PartTwo = NULL;
        *PartTwoLength = 0;
        *PartTwoEscape = FALSE;
    } else {
        *PartTwoLength = partLength;
        *PartTwo = pColon + 1;

        //
        // in both the <user>:<password> and <host>:<port> cases, we cannot have
        // the second part without the first, although both parts being zero
        // length is OK (host name will be sorted out elsewhere, but (for now,
        // at least) I am allowing <>:<> for username:password, since I don't
        // see it expressly disallowed in the RFC. I may be revisiting this code
        // later...)
        //
        // N.B.: ftp://ftp.microsoft.com uses http://:0/-http-gw-internal-/menu.gif

//      if ((*PartOneLength == 0) && (partLength != 0)) {
//          return ERROR_WINHTTP_INVALID_URL;
//      }
    }

    //
    // update the URL pointer and length remaining
    //

    *Url = pString;
    *UrlLength = length;

    return ERROR_SUCCESS;
}


DWORD
GetUrlAddress(
    IN OUT LPSTR* lpszUrl,
    OUT LPDWORD lpdwUrlLength,
    OUT LPSTR* lpszUserName OPTIONAL,
    OUT LPDWORD lpdwUserNameLength OPTIONAL,
    OUT LPSTR* lpszPassword OPTIONAL,
    OUT LPDWORD lpdwPasswordLength OPTIONAL,
    OUT LPSTR* lpszHostName OPTIONAL,
    OUT LPDWORD lpdwHostNameLength OPTIONAL,
    OUT LPINTERNET_PORT lpPort OPTIONAL,
    OUT LPBOOL pHavePort
    )

/*++

Routine Description:

    This function extracts any and all parts of the address information for a
    generic URL. If any of the address parts contain escaped characters (%nn)
    then they are converted in situ

    The generic addressing format (RFC 1738) is:

        <user>:<password>@<host>:<port>

    The addressing information cannot contain a password without a user name,
    or a port without a host name
    NB: ftp://ftp.microsoft.com uses URL's that have a port without a host name!
    (e.g. http://:0/-http-gw-internal-/menu.gif)

    Although only the lpszUrl and lpdwUrlLength fields are required, the address
    parts will be checked for presence and completeness

    Assumes: 1. If one of the optional lpsz fields is present (e.g. lpszUserName)
                then the accompanying lpdw field must also be supplied

Arguments:

    lpszUrl             - IN: pointer to the URL to parse
                          OUT: URL remaining after address information

                          N.B. The url-path is NOT canonicalized (unescaped)
                          because it may contain protocol-specific information
                          which must be parsed out by the protocol-specific
                          parser

    lpdwUrlLength       - returned length of the remainder of the URL after the
                          address information

    lpszUserName        - returned pointer to the user name
                          This parameter can be omitted by those protocol parsers
                          that do not require or expect user names in the URL

    lpdwUserNameLength  - returned length of the user name part
                          This parameter can be omitted by those protocol parsers
                          that do not require or expect user names in the URL

    lpszPassword        - returned pointer to the password
                          This parameter can be omitted by those protocol parsers
                          that do not require or expect user passwords in the URL

    lpdwPasswordLength  - returned length of the password
                          This parameter can be omitted by those protocol parsers
                          that do not require or expect user passwords in the URL

    lpszHostName        - returned pointer to the host name
                          This parameter can be omitted by those protocol parsers
                          that do not require the host name info

    lpdwHostNameLength  - returned length of the host name
                          This parameter can be omitted by those protocol parsers
                          that do not require the host name info

    lpPort              - returned value of the port field
                          This parameter can be omitted by those protocol parsers
                          that do not require or expect user port number

    pHavePort           - returned boolean indicating whether a port was specified
                          in the URL or not.  This value is not returned if the
                          lpPort parameter is omitted.

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL
                    We could not parse some part of the address info, or we
                    found address info where the protocol parser didn't expect
                    any

                  ERROR_INSUFFICIENT_BUFFER
                    We could not convert an escaped string

--*/

{
    LPSTR pAt;
    DWORD urlLength;
    LPSTR pUrl;
    BOOL part1Escape;
    BOOL part2Escape;
    char portNumber[INTERNET_MAX_PORT_NUMBER_LENGTH + 1];
    DWORD portNumberLength;
    LPSTR pPortNumber;
    DWORD error;
    LPSTR hostName;
    DWORD hostNameLength;

    pUrl = *lpszUrl;
    urlLength = strlen(pUrl);

    //
    // check to see if there is an '@' separating user name & password. If we
    // see a '/' or get to the end of the string before we see the '@' then
    // there is no username:password part
    //

    pAt = NULL;
    for (DWORD i = 0; i < urlLength; ++i) {
        if (pUrl[i] == '/') {
            break;
        } else if (pUrl[i] == '@') {
            pAt = &pUrl[i];
            break;
        }
    }

    if (pAt != NULL) {

        DWORD addressPartLength;
        LPSTR userName;
        DWORD userNameLength;
        LPSTR password;
        DWORD passwordLength;

        addressPartLength = (DWORD) (pAt - pUrl);
        urlLength -= addressPartLength;
        error = GetUrlAddressInfo(&pUrl,
                                  &addressPartLength,
                                  &userName,
                                  &userNameLength,
                                  &part1Escape,
                                  &password,
                                  &passwordLength,
                                  &part2Escape
                                  );
        if (error != ERROR_SUCCESS) {
            return error;
        }

        //
        // ensure there is no address information unparsed before the '@'
        //

        INET_ASSERT(addressPartLength == 0);
        INET_ASSERT(pUrl == pAt);

        if (ARGUMENT_PRESENT(lpszUserName)) {

            INET_ASSERT(ARGUMENT_PRESENT(lpdwUserNameLength));

            //
            // convert the user name in situ
            //

            if (part1Escape) {

                INET_ASSERT(userName != NULL);
                INET_ASSERT(userNameLength != 0);

                error = DecodeUrlInSitu(userName, &userNameLength);
                if (error != ERROR_SUCCESS) {
                    return error;
                }
            }
            *lpszUserName = userName;
            *lpdwUserNameLength = userNameLength;
        }

        if (ARGUMENT_PRESENT(lpszPassword)) {

            //
            // convert the password in situ
            //

            if (part2Escape) {

                INET_ASSERT(userName != NULL);
                INET_ASSERT(userNameLength != 0);
                INET_ASSERT(password != NULL);
                INET_ASSERT(passwordLength != 0);

                error = DecodeUrlInSitu(password, &passwordLength);
                if (error != ERROR_SUCCESS) {
                    return error;
                }
            }
            *lpszPassword = password;
            *lpdwPasswordLength = passwordLength;
        }

        //
        // the URL pointer now points at the host:port fields (remember that
        // ExtractAddressParts() must have bumped pUrl up to the end of the
        // password field (if present) which ends at pAt)
        //

        ++pUrl;

        //
        // similarly, bump urlLength to account for the '@'
        //

        --urlLength;
    } else {

        //
        // no '@' therefore no username or password
        //

        if (ARGUMENT_PRESENT(lpszUserName)) {

            INET_ASSERT(ARGUMENT_PRESENT(lpdwUserNameLength));

            *lpszUserName = NULL;
            *lpdwUserNameLength = 0;
        }
        if (ARGUMENT_PRESENT(lpszPassword)) {

            INET_ASSERT(ARGUMENT_PRESENT(lpdwPasswordLength));

            *lpszPassword = NULL;
            *lpdwPasswordLength = 0;
        }
    }

    //
    // now get the host name and the optional port
    //

    pPortNumber = portNumber;
    portNumberLength = sizeof(portNumber);
    error = GetUrlAddressInfo(&pUrl,
                              &urlLength,
                              &hostName,
                              &hostNameLength,
                              &part1Escape,
                              &pPortNumber,
                              &portNumberLength,
                              &part2Escape
                              );
    if (error != ERROR_SUCCESS) {
        return error;
    }

    //
    // the URL address information MUST contain the host name
    //

//  if ((hostName == NULL) || (hostNameLength == 0)) {
//      return ERROR_WINHTTP_INVALID_URL;
//  }

    if (ARGUMENT_PRESENT(lpszHostName)) {

        INET_ASSERT(ARGUMENT_PRESENT(lpdwHostNameLength));

        //
        // if the host name contains escaped characters, convert them in situ
        //

        if (part1Escape) {
            error = DecodeUrlInSitu(hostName, &hostNameLength);
            if (error != ERROR_SUCCESS) {
                return error;
            }
        }
        *lpszHostName = hostName;
        *lpdwHostNameLength = hostNameLength;
    }

    //
    // if there is a port field, convert it if there are escaped characters,
    // check it for valid numeric characters, and convert it to a number
    //

    if (ARGUMENT_PRESENT(lpPort)) {
        if (portNumberLength != 0) {

            DWORD i;
            DWORD port;

            INET_ASSERT(pPortNumber != NULL);

            if (part2Escape) {
                error = DecodeUrlInSitu(pPortNumber, &portNumberLength);
                if (error != ERROR_SUCCESS) {
                    return error;
                }
            }

            //
            // ensure all characters in the port number buffer are numeric, and
            // calculate the port number at the same time
            //

            for (i = 0, port = 0; i < portNumberLength; ++i) {
                if (!isdigit(*pPortNumber)) {
                    return ERROR_WINHTTP_INVALID_URL;
                }
                port = port * 10 + (int)(*pPortNumber++ - '0');
                // We won't allow ports larger than 65535 ((2^16)-1)
                // We have to check this every time to make sure that someone
                // doesn't try to overflow a DWORD.
                if (port > 65535)
                {
                    return ERROR_WINHTTP_INVALID_URL;
                }
            }
            *lpPort = (INTERNET_PORT)port;
            if (ARGUMENT_PRESENT(pHavePort)) {
                *pHavePort = TRUE;
            }
        } else {
            *lpPort = INTERNET_INVALID_PORT_NUMBER;
            if (ARGUMENT_PRESENT(pHavePort)) {
                *pHavePort = FALSE;
            }
        }
    }

    //
    // update the URL pointer and the length of the url-path
    //

    *lpszUrl = pUrl;
    *lpdwUrlLength = urlLength;

    return ERROR_SUCCESS;
}


INTERNET_SCHEME
MapUrlSchemeName(
    IN LPSTR lpszSchemeName,
    IN DWORD dwSchemeNameLength
    )

/*++

Routine Description:

    Maps a scheme name/length to a scheme name type

Arguments:

    lpszSchemeName      - pointer to name of scheme to map

    dwSchemeNameLength  - length of scheme (if -1, lpszSchemeName is ASCIZ)

Return Value:

    INTERNET_SCHEME

--*/

{
    if (dwSchemeNameLength == (DWORD)-1) {
        dwSchemeNameLength = (DWORD)lstrlen(lpszSchemeName);
    }

    DWORD i;
    if (ScanSchemes(lpszSchemeName, dwSchemeNameLength, &i))
    {
        return UrlSchemeList[i].SchemeType;
    }
    return INTERNET_SCHEME_UNKNOWN;
}


LPSTR
MapUrlScheme(
    IN INTERNET_SCHEME Scheme,
    OUT LPDWORD lpdwSchemeNameLength
    )

/*++

Routine Description:

    Maps the enumerated scheme name type to the name

Arguments:

    Scheme                  - enumerated scheme type to map

    lpdwSchemeNameLength    - pointer to returned length of scheme name

Return Value:

    LPSTR   - pointer to scheme name or NULL

--*/

{
    if ((Scheme >= INTERNET_SCHEME_FIRST)
    && (Scheme <= INTERNET_SCHEME_LAST))
    {
        *lpdwSchemeNameLength = UrlSchemeList[Scheme].SchemeLength;
        return UrlSchemeList[Scheme].SchemeName;
    }
    *lpdwSchemeNameLength = 0;
    return NULL;
}


LPSTR
MapUrlSchemeToName(
    IN INTERNET_SCHEME Scheme
    )

/*++

Routine Description:

    Maps the enumerated scheme name type to the name

Arguments:

    Scheme  - enumerated scheme type to map

Return Value:

    LPSTR   - pointer to scheme name or NULL

--*/

{
    if ((Scheme >= INTERNET_SCHEME_FIRST)
    && (Scheme <= INTERNET_SCHEME_LAST)) {
        return UrlSchemeList[Scheme].SchemeName;
    }
    return NULL;
}

/*
 * ConvertUnicodeToMultiByte:
 *
 * dwFlags: WINHTTP_FLAG_NULL_CODEPAGE-> assumes correctly encoded string packaged into UTF8, no escaping done.
            WINHTTP_FLAG_VALID_HOSTNAME-> only for server name
                only the previous flag valid for server name passed in here.
                if both of these are not specified, then

            if dwCodePage is not INVALID, it'll be used to convert unicode string to ANSI.
            else UTF8 will be used.

            if ESCAPE && ESCAPE_PERCENT is specified, the ANSI url will be escaped (incl. %) else it will be escaped w/o
            escaping %s.
 */

DWORD
ConvertUnicodeToMultiByte(
    LPCWSTR lpszObjectName,
    DWORD dwCodePage,
    MEMORYPACKET* pmp,
    DWORD dwFlags)
{
    DWORD dwError = ERROR_SUCCESS;
    LPSTR pStr;
    WCHAR wc;
    LPCWSTR pwStr;
    BOOL bStrip0s = TRUE;
    DWORD dwUnicodeUrlSize;

//determine size of string and/or safe characters
    if ((dwFlags & WINHTTP_FLAG_NULL_CODEPAGE) ||
        (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME))
    {
        if (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME)
        {
            for (pwStr = lpszObjectName; wc = *pwStr; ++pwStr)
            {
                if (IS_UNSAFE_URL_WIDECHARACTER(wc, HOSTNAME))
                {
                    dwError = ERROR_WINHTTP_INVALID_URL;
                    goto done;
                }
            }
            pmp->dwAlloc = dwUnicodeUrlSize = (DWORD)(pwStr-lpszObjectName+1);
        }
        else
        {
            pmp->dwAlloc = dwUnicodeUrlSize = lstrlenW(lpszObjectName)+1;
        }
    }
    else
    {
        DWORD dwUnsafeChars = 0;

        // optimization to check for unsafe characters, and optimize the common case.
        // calculate the length, and while parsing the string, check if there are unsafeChars
        for(pwStr = lpszObjectName; wc = *pwStr; ++pwStr)
        {
            if (IS_UNSAFE_URL_WIDECHARACTER(wc, 0))
                ++dwUnsafeChars;
        }
        dwUnicodeUrlSize = (DWORD)(pwStr-lpszObjectName+1);

        if (dwUnsafeChars == 0)
        {
            pmp->dwAlloc = dwUnicodeUrlSize;
        }
        else
        {
            bStrip0s = FALSE;
        }
    }

//convert to MBCS
    if (bStrip0s)
    {
        INET_ASSERT(pmp->dwAlloc);

        pmp->psStr = (LPSTR)ALLOCATE_FIXED_MEMORY(pmp->dwAlloc);

        if (!pmp->psStr)
        {
            pmp->dwAlloc = 0;
            dwError = ERROR_NOT_ENOUGH_MEMORY;
            goto done;
        }
        pmp->dwSize = pmp->dwAlloc-1;

        for (pStr = pmp->psStr; wc = *lpszObjectName; ++lpszObjectName)
        {
            *(pStr)++ = (CHAR)wc;
        }
        *pStr = '\0';
    }
    else
    {
        // convert with WideCharToMultiByte()
        pmp->dwAlloc = WideCharToMultiByte(dwCodePage, 0, lpszObjectName, dwUnicodeUrlSize, NULL, 0, NULL, NULL);
        if (pmp->dwAlloc)
        {
            pmp->psStr = (LPSTR)ALLOCATE_FIXED_MEMORY(pmp->dwAlloc);

            if (!pmp->psStr)
            {
                pmp->dwAlloc = 0;
                dwError = ERROR_NOT_ENOUGH_MEMORY;
                goto done;
            }
            pmp->dwSize = WideCharToMultiByte(dwCodePage, 0, lpszObjectName, dwUnicodeUrlSize, pmp->psStr, pmp->dwAlloc, NULL, NULL);

            if (!pmp->dwSize)
            {
                dwError = GetLastError();
                goto done;
            }
            else
                pmp->dwSize -= 1;
        }
        else
        {
            dwError = GetLastError();
            goto done;
        }
    }

//escaping
    if (dwFlags & WINHTTP_FLAG_DEFAULT_ESCAPE)
    {
        INET_ASSERT (! (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME));

        static CHAR* hexArray = "0123456789ABCDEF";
        UCHAR ch;
        DWORD dwUnsafeChars = 0;
        DWORD dwNewAlloc;
        LPSTR pDest, pNewStr;

        for(pStr = pmp->psStr; ch = *pStr; pStr = CharNextExA((WORD)dwCodePage, pStr, 0))
        {
            if (IS_UNSAFE_URL_CHARACTER(ch, SCHEME_HTTP))
                ++dwUnsafeChars;
            else if(ch == '?')
                break;
        }

        if (dwUnsafeChars == 0)
            goto done;


        dwNewAlloc = pmp->dwAlloc + dwUnsafeChars*2;
        pNewStr = pDest = (LPSTR)ALLOCATE_FIXED_MEMORY(dwNewAlloc);

        if (!pDest)
        {
            dwError = ERROR_NOT_ENOUGH_MEMORY;
            goto done;
        }

        BOOL bEscapePercent = (dwFlags & WINHTTP_FLAG_ESCAPE_PERCENT) ? TRUE : FALSE;

        BOOL bHitQuery = FALSE;
        LPSTR pNext;
        BOOL bLead;
        for (pStr = pmp->psStr; ch = *pStr;)
        {
            pNext = CharNextExA((WORD)dwCodePage, pStr, 0);
            bLead = TRUE;
            do
            {
                ch = *pStr;
                if (IS_UNSAFE_URL_CHARACTER(ch, SCHEME_HTTP)
                    && (!bLead || (ch != '%') || bEscapePercent) )
                {
                    *pDest++ = '%';
                    *pDest++ = hexArray[ch>>4];
                    *pDest++ = hexArray[ch & 0x0f];
                }
                else
                {
                    *pDest++ = ch;
                    if ((ch == '?') && bLead)
                    {
                        bHitQuery = TRUE;

                        ++pStr;
                        INET_ASSERT(pStr == pNext);

                        break;
                    }
                }
                bLead = FALSE;
            }
            while (++pStr != pNext);

            if (bHitQuery)
                break;
        }

        if (bHitQuery)
        {
            for ( ; ch = *pStr; pStr++)
            {
                *pDest++ = ch;
            }
        }
        *pDest = '\0';

        FREE_FIXED_MEMORY(pmp->psStr);
        pmp->psStr = pNewStr;
        pmp->dwSize = (DWORD)(pDest-pNewStr);
        pmp->dwAlloc = dwNewAlloc;
    }

done:
    if (pmp->psStr)
        pmp->dwAlloc = (pmp->dwAlloc > MP_MAX_STACK_USE) ? pmp->dwAlloc : MP_MAX_STACK_USE+1;// to force FREE in ~MEMORYPACKET

    return dwError;
}