xbox-kernel/private/ntos/xnet/tcp/pcb.c
2020-09-30 17:17:25 +02:00

1018 lines
24 KiB
C

/*++
Copyright (c) 2000 Microsoft Corporation
Module Name:
pcb.c
Abstract:
Functions for dealing with PCBs (protocol control blocks)
Revision History:
06/01/2000 davidx
Created it.
--*/
#include "precomp.h"
//
// Global PCB bookkeeping
//  PcbList  - list head walked by the PCB list macros used in this file
//             (IsPcbListEmpty, LOOP_THRU_PCB_LIST)
//  PcbCount - number of PCBs currently allocated; incremented by
//             PcbCreate and decremented by PcbClose
//
LIST_ENTRY PcbList;
ULONG PcbCount;
//
// Default, minimum and maximum send and receive buffer sizes (in bytes).
// Because the TCP window scaling option is not implemented, the maximum
// receive buffer size must fit in 16 bits (65535).
//
// New PCBs start out with the default sizes (see PcbCreate) and
// PcbUpdateBufferSize clamps requested sizes to cfgMaxSendRecvBufsize.
// NOTE(review): cfgMinSendRecvBufsize is not referenced by the code
// visible in this part of the file - confirm where it is consumed.
//
UINT cfgDefaultSendBufsize = 16*1024;
UINT cfgDefaultRecvBufsize = 16*1024;
UINT cfgMinSendRecvBufsize = 1;
UINT cfgMaxSendRecvBufsize = 0xffff;
//
// Maximum number of sockets: PcbCreate fails once PcbCount
// has reached this value.
//
UINT cfgMaxSockets = 64;
PCB*
PcbCreate(
    INT type,
    INT protocol,
    BYTE allocFlag
    )
/*++

Routine Description:

    Allocate a protocol control block for a new socket and
    fill in its default settings

Arguments:

    type - Socket type: SOCK_STREAM, SOCK_DGRAM, or SOCK_RAW
    protocol - Protocol number
    allocFlag - PCBFLAG_NETPOOL to allocate from the private
        network pool, 0 to allocate from the system pool

Return Value:

    Pointer to the new PCB structure
    NULL if the socket limit has been reached or
    if the allocation failed

--*/
{
    PCB* newPcb;
    INT isStream = (type == SOCK_STREAM);

    // Enforce the configured limit on the number of sockets
    if (PcbCount >= cfgMaxSockets) {
        WARNING_("Too many sockets");
        return NULL;
    }

    // Stream sockets are backed by the TCB structure, everything
    // else by a plain PCB. The pool is selected by the caller.
    if (allocFlag & PCBFLAG_NETPOOL) {
        if (isStream) {
            newPcb = (PCB*) XnetAlloc0(sizeof(TCB), PTAG_TCB);
        } else {
            newPcb = (PCB*) XnetAlloc0(sizeof(PCB), PTAG_PCB);
        }
    } else {
        if (isStream) {
            newPcb = (PCB*) SysAlloc0(sizeof(TCB), PTAG_TCB);
        } else {
            newPcb = (PCB*) SysAlloc0(sizeof(PCB), PTAG_PCB);
        }
    }

    if (newPcb == NULL) return NULL;

    // Identity of the socket. The allocation flag is remembered
    // in the PCB flags so that PcbClose can free the memory
    // back to the right pool.
    newPcb->magicCookie = ACTIVE_PCB_COOKIE;
    newPcb->flags = allocFlag;
    newPcb->type = (BYTE) type;
    newPcb->protocol = (BYTE) protocol;

    // Default buffer limits and IP header parameters
    newPcb->maxSendBufsize = cfgDefaultSendBufsize;
    newPcb->maxRecvBufsize = cfgDefaultRecvBufsize;
    newPcb->ipTtl = cfgDefaultTtl;
    newPcb->ipTos = cfgDefaultTos;
    newPcb->mcastTtl = 1;

    // Empty buffer queues and a non-signalled wait event
    InitializeListHead(&newPcb->recvbuf);
    InitializeListHead(&newPcb->sendbuf);
    KeInitializeEvent(GetPcbWaitEvent(newPcb), NotificationEvent, FALSE);

    // TCP-specific initialization
    if (isStream) {
        TcbInit((TCB*) newPcb);
    }

    InterlockedIncrement(&PcbCount);
    return newPcb;
}
NTSTATUS
PcbClose(
    PCB* pcb,
    BOOL force
    )
/*++

Routine Description:

    Close a PCB structure and, unless a TCP connection is still
    being closed gracefully, release its memory

Arguments:

    pcb - Points to the PCB structure to be disposed of
    force - Forceful close, always succeed

Return Value:

    Status code (NETERR_OK on every path)

--*/
{
    KIRQL irql = RaiseToDpc();

    if (IsTcb(pcb)) {
        if (!TcbClose((TCB*) pcb, force)) {
            // The TCP connection is being closed gracefully.
            // The PCB structure stays in the global list until
            // the connection is really gone; it is only marked
            // as closed and shut down in both directions.
            pcb->magicCookie = CLOSED_PCB_COOKIE;
            pcb->flags |= PCBFLAG_BOTH_SHUTDOWN;

            LowerFromDpc(irql);
            return NETERR_OK;
        }
    }

    // Invalidate the cookie as a simple protection against
    // bad apps that keep using a closed socket handle.
    pcb->magicCookie = CLOSED_PCB_COOKIE;

    // Unlink the PCB if it is on a list
    if (!IsListNull(&pcb->links)) {
        RemoveEntryList(&pcb->links);
    }

    // Release everything that hangs off the PCB
    PcbCleanup(pcb, FALSE);
    LowerFromDpc(irql);

    // Give the memory back to the pool it was allocated from
    if (pcb->flags & PCBFLAG_NETPOOL) {
        XnetFree(pcb);
    } else {
        SysFree(pcb);
    }

    InterlockedDecrement(&PcbCount);
    return NETERR_OK;
}
VOID
PcbCleanup(
    PCB* pcb,
    BOOL revivable
    )
/*++

Routine Description:

    Release the resources attached to a PCB: pending overlapped
    requests, send and receive buffers, the cached route and,
    for a non-revivable cleanup, the bind/multicast/IP option data

Arguments:

    pcb - Points to the protocol control block
    revivable - Only significant for a TCP socket;
        if TRUE we leave the socket in a revivable state

Return Value:

    NONE

--*/
{
    NTSTATUS status;

    RUNS_AT_DISPATCH_LEVEL

    // Pending overlapped requests are completed with the error
    // recorded in the PCB, or with NETERR_CANCELLED when the
    // recorded status is a success code.
    if (NT_SUCCESS(pcb->errStatus)) {
        status = NETERR_CANCELLED;
    } else {
        status = pcb->errStatus;
    }

    PcbClearOverlappedRecvs(pcb, status);
    PcbClearOverlappedSends(pcb, status);

    // Drop everything queued for receive
    PcbFlushRecvBuffers(pcb);

    // Drop everything queued for send
    for (;;) {
        SENDBUF* sendbuf;

        if (IsPcbSendBufEmpty(pcb)) break;

        sendbuf = (SENDBUF*) RemoveHeadList(&pcb->sendbuf);
        SendbufRelease(sendbuf);
    }
    pcb->sendbufSize = 0;

    // Let go of the cached route, if any
    if (pcb->rte) {
        IpReleaseCachedRTE(pcb->rte);
        pcb->rte = NULL;
    }

    if (!revivable) {
        // Final cleanup: forget the bound interface and free
        // the multicast data and IP options as well
        if (pcb->bindIfp)
            pcb->bindIfp = NULL;

        PcbCleanupMcastData(pcb);
        PcbFreeIpOpts(pcb);
    } else {
        pcb->flags |= PCBFLAG_REVIVABLE;
    }
}
VOID
PcbCloseAll()
/*++

Routine Description:

    Forcefully close all sockets: delete every PCB on the
    global PCB list and then every entry on the dead TCB list

Arguments:

    NONE

Return Value:

    NONE

Note:

    Nothing at all is done (the dead TCB list included) when the
    global PCB list head is null, i.e. IsListNull(&PcbList).

--*/
{
    PCB* pcb;

    if (!IsListNull(&PcbList)) {
        // Keep deleting the head entry until the list is drained
        while (!IsPcbListEmpty()) {
            pcb = (PCB*) PcbList.Flink;
            PcbDelete(pcb);
        }

        if (!IsListNull(&DeadTcbList)) {
            // Same treatment for the dead TCB list
            while (!IsListEmpty(&DeadTcbList)) {
                pcb = (PCB*) DeadTcbList.Flink;
                PcbDelete(pcb);
            }
        }
    }
}
PCB*
PcbFindMatch(
    IPADDR toaddr,
    IPPORT toport,
    IPADDR fromaddr,
    IPPORT fromport,
    BYTE type,
    BYTE protocol
    )
/*++

Routine Description:

    Find the matching socket for a received datagram

Arguments:

    toaddr, toport - The destination address of the datagram
    fromaddr, fromport - The sender's address
    type - Specifies the socket type (SOCK_DGRAM or SOCK_RAW)
    protocol - Specifies the protocol number

Return Value:

    Points to the best socket to receive the datagram:
    an exact match if there is one, otherwise the first
    candidate with the fewest wildcard fields
    NULL if no matching socket is found

--*/
{
    PCB* cand;
    PCB* best;
    UINT nwild, bestNwild;

    RUNS_AT_DISPATCH_LEVEL

    best = NULL;
    bestNwild = 4;

    LOOP_THRU_PCB_LIST(cand)

        // Socket type and local port must match exactly.
        // A socket protocol of 0 accepts any protocol.
        if (cand->type != type)
            continue;
        if ((cand->protocol != protocol) && (cand->protocol != 0))
            continue;
        if (cand->srcport != toport)
            continue;

        // Each of the local address, remote address and remote
        // port must either match or be unset (0) in the socket.
        // Count how many of them are unset.
        nwild = 0;

        if (cand->srcaddr != toaddr) {
            if (cand->srcaddr != 0) continue;
            nwild++;
        }

        if (cand->dstaddr != fromaddr) {
            if (cand->dstaddr != 0) continue;
            nwild++;
        }

        if (cand->dstport != fromport) {
            if (cand->dstport != 0) continue;
            nwild++;
        }

        // An exact match cannot be improved upon
        if (nwild == 0) return cand;

        // Otherwise keep the candidate with the fewest wildcards
        if (nwild < bestNwild) {
            best = cand;
            bestNwild = nwild;
        }

    END_PCB_LIST_LOOP

    return best;
}
NTSTATUS
PcbUpdateBufferSize(
    PCB* pcb,
    INT sendBufsize,
    INT recvBufsize
    )
/*++

Routine Description:

    Update the send and receive buffer size limits of a socket

Arguments:

    pcb - Points to the PCB structure
    sendBufsize, recvBufsize -
        Specifies the new send and receive buffer sizes.
        Both are clamped to cfgMaxSendRecvBufsize; a send size
        of 0 or less becomes 1 and a receive size of less
        than 0 becomes 0.

Return Value:

    Status code:
        NETERR_OK - the limits were updated
        NETERR_PARAM - attempt to shrink the receive buffer of a
            TCP socket that is not in the idle state (the send
            buffer limit is still updated in this case)

--*/
{
    KIRQL irql;
    NTSTATUS status = NETERR_OK;
    INT limit = (INT) cfgMaxSendRecvBufsize;

    // Clamp the send buffer size.
    // NOTE: we never set actual send buffer size to 0
    // because we don't support the no-buffering option.
    if (sendBufsize > limit) {
        sendBufsize = limit;
    } else if (sendBufsize <= 0) {
        sendBufsize = 1;
    }

    // Clamp the receive buffer size
    if (recvBufsize > limit) {
        recvBufsize = limit;
    } else if (recvBufsize <= 0) {
        recvBufsize = 0;
    }

    irql = RaiseToDpc();

    if (!IsDgramPcb(pcb)) {
        TCB* tcb = (TCB*) pcb;

        // The send limit is always taken. Data already in the send
        // buffer is left untouched even if it exceeds the new limit.
        tcb->maxSendBufsize = sendBufsize;

        // Once the TCP socket has left the idle state the app
        // may grow the receive buffer but not shrink it.
        if (IsTcpIdleState(tcb) || recvBufsize >= (INT) tcb->maxRecvBufsize) {
            // NOTE: the peer is not told about the new receive
            // window right away. The new window information is
            // sent in the next outgoing ACK segment.
            tcb->maxRecvBufsize = recvBufsize;
        } else {
            status = NETERR_PARAM;
        }
    } else {
        // Datagram sockets simply take the new limits. Data that
        // is already buffered beyond the limits is left alone.
        pcb->maxSendBufsize = sendBufsize;
        pcb->maxRecvBufsize = recvBufsize;
    }

    // NOTE: even if the send buffer has just opened up, a pending
    // overlapped send request is not started from here. It will
    // be started by the normal process.

    LowerFromDpc(irql);
    return status;
}
//
// Temporary port numbers lie between 1024 and 4999 (inclusive).
// Port number 5000 is deliberately left out as a safety precaution.
//
#define MIN_TEMP_PORT 1024
#define MAX_TEMP_PORT 4999
#define TEMP_PORT_COUNT (MAX_TEMP_PORT-MIN_TEMP_PORT+1)

//
// Return the next temporary port number, converted with HTONS.
// Ports are handed out sequentially, wrapping from MAX_TEMP_PORT
// back to MIN_TEMP_PORT; the very first call yields MIN_TEMP_PORT.
//
INLINE IPPORT GenerateTempPort() {
    static USHORT lastTempPort = MAX_TEMP_PORT;

    lastTempPort++;
    if (lastTempPort > MAX_TEMP_PORT) {
        lastTempPort = MIN_TEMP_PORT;
    }

    return HTONS(lastTempPort);
}
NTSTATUS
PcbBind(
PCB* pcb,
IPADDR srcaddr,
IPPORT srcport
)
/*++
Routine Description:
Bind a socket to the specified local address.
The whole operation runs between RaiseToDpc and LowerFromDpc.
Arguments:
pcb - Points to the protocol control block; must not be bound yet
srcaddr, srcport - Specifies the local socket address.
A srcaddr of 0 is the wildcard address.
A srcport of 0 on a non-raw socket requests a temporary port
(see GenerateTempPort); raw sockets always get port 0.
Return Value:
Status code:
NETERR_OK - the socket is now bound
NETERR_ADDRINUSE - the address/port conflicts with another bound
socket (for temporary ports: every attempted port conflicted)
NETERR(WSAEADDRNOTAVAIL) - srcaddr is not the address of any interface
--*/
{
NTSTATUS status;
// Number of ports we're willing to try. An explicitly requested
// port gets exactly one attempt.
INT retries = 1;
KIRQL irql = RaiseToDpc();
ASSERT(!IsPcbBound(pcb));
if (pcb->type == SOCK_RAW) {
// Port number is meaningless for raw sockets
// and we always set it to 0.
srcport = 0;
} else if (srcport == 0) {
// The caller left the port choice to us: take the next temporary
// port and allow as many attempts as there can be sockets
// (but no more than there are temporary ports).
retries = min(cfgMaxSockets, TEMP_PORT_COUNT);
srcport = GenerateTempPort();
}
// Each iteration checks one candidate port against all bound PCBs
while (TRUE) {
PCB* cur;
status = NETERR_OK;
LOOP_THRU_PCB_LIST(cur)
// Skip ourselves and sockets that aren't bound
if (cur == pcb || !IsPcbBound(cur)) continue;
// NOTE: Different raw sockets can bind to
// the same address/protocol pair. This is
// to follow win2k behavior.
if ((pcb->type != SOCK_RAW) &&
(cur->protocol == pcb->protocol) &&
(cur->srcport == srcport) && // conflicting port number?
(cur->bindSrcAddr == srcaddr || // conflicting address? (wildcard on
cur->bindSrcAddr == 0 || // either side conflicts with anything)
srcaddr == 0) &&
(cur->exclusiveAddr || // address reuse disallowed? (only the
pcb->exclusiveAddr || // reuseAddr setting of the socket being
!pcb->reuseAddr)) { // bound is consulted here)
status = NETERR_ADDRINUSE;
// NOTE(review): this break is expected to leave only the PCB
// list walk - it relies on LOOP_THRU_PCB_LIST expanding to a loop.
break;
}
END_PCB_LIST_LOOP
if (status == NETERR_OK) {
// If we're binding to a specific IP address,
// look up the interface with the matching address.
if (srcaddr != 0) {
IfInfo* ifp;
ifp = IfFindInterface(srcaddr);
if (ifp == NULL) {
status = NETERR(WSAEADDRNOTAVAIL);
// Leaves the while loop; no other port is tried
break;
}
ASSERT(pcb->bindIfp == NULL);
CACHE_IFP_REFERENCE(pcb->bindIfp, ifp);
}
// Record the binding. bindSrcAddr keeps the address the socket
// was bound to; srcaddr may later be replaced by a specific
// address when a wildcard-bound socket is connected
// (see PcbConnectDgram).
pcb->srcaddr = pcb->bindSrcAddr = srcaddr;
pcb->srcport = srcport;
pcb->flags |= PCBFLAG_BOUND;
} else {
// The selected port is not available.
// Check to see if we should try another one.
if (--retries) {
srcport = GenerateTempPort();
continue;
}
// Out of attempts: fall through with NETERR_ADDRINUSE
}
break;
}
LowerFromDpc(irql);
return status;
}
NTSTATUS
PcbConnectDgram(
    PCB* pcb,
    IPADDR dstaddr,
    IPPORT dstport
    )
/*++

Routine Description:

    Connect a datagram socket to the specified foreign address,
    or disconnect it when the foreign address is 0

Arguments:

    pcb - Points to the protocol control block
    dstaddr, dstport - Specifies the foreign socket address

Return Value:

    Status code:
        NETERR_OK - success (including "nothing to change")
        NETERR(WSAEADDRNOTAVAIL) - exactly one of address and port is 0
            on a non-raw socket
        NETERR(WSAEACCES) - broadcast destination on a socket that is
            not allowed to send broadcasts
        NETERR_UNREACHABLE - no route to the destination
        otherwise - error from the implicit PcbBind

Note:

    If the socket was connected and the new connection attempt
    fails, the socket is left disconnected.

--*/
{
    KIRQL irql;
    NTSTATUS status;

    if (pcb->type == SOCK_RAW) {
        // Port number for raw sockets is meaningless
        dstport = 0;
    } else if ((dstaddr == 0) != (dstport == 0)) {
        // Address and port must be both set or both clear
        return NETERR(WSAEADDRNOTAVAIL);
    }

    // Same destination as before: nothing to do
    if (dstaddr == pcb->dstaddr && dstport == pcb->dstport)
        return NETERR_OK;

    // Broadcast destinations require the broadcast option
    if (IS_BCAST_IPADDR(dstaddr) && !pcb->broadcast)
        return NETERR(WSAEACCES);

    irql = RaiseToDpc();

    // A socket that is currently connected is disconnected first:
    // forget the peer, go back to the bound source address and
    // drop the cached route.
    if (IsPcbConnected(pcb)) {
        pcb->flags &= ~PCBFLAG_CONNECTED;
        pcb->dstaddr = 0;
        pcb->dstport = 0;
        pcb->srcaddr = pcb->bindSrcAddr;

        if (pcb->rte) {
            IpReleaseCachedRTE(pcb->rte);
            pcb->rte = NULL;
        }
    }

    // Received packets that are queued up but not yet
    // processed are discarded
    PcbFlushRecvBuffers(pcb);

    if (dstaddr != 0) {
        // An unbound socket is bound to the wildcard
        // address and a temporary port
        if (!IsPcbBound(pcb)) {
            status = PcbBind(pcb, 0, 0);
            if (!NT_SUCCESS(status)) goto exit;
        }

        if (!IS_MCAST_IPADDR(dstaddr)) {
            // Cache a route to the destination. If the socket is
            // bound to the wildcard source address, a specific
            // source address is picked based on that route.
            ASSERT(pcb->rte == NULL);
            pcb->rte = IpFindRTE(dstaddr, NULL);
            if (!pcb->rte) {
                status = NETERR_UNREACHABLE;
                goto exit;
            }
            RteAddRef(pcb->rte);

            if (pcb->bindSrcAddr == 0) {
                pcb->srcaddr = PcbGetDefaultSrcAddr(pcb->rte);
            }
        }

        pcb->flags |= PCBFLAG_CONNECTED;
        pcb->dstaddr = dstaddr;
        pcb->dstport = dstport;
    }

    status = NETERR_OK;

exit:
    LowerFromDpc(irql);
    return status;
}
VOID
PcbSetupIpHeader(
    PCB* pcb,
    Packet* pkt,
    IpAddrPair* addrpair
    )
/*++

Routine Description:

    Prepend an IP header (and the socket's IP options, if any)
    to an outgoing packet, using the settings stored in the PCB

Arguments:

    pcb - Points to a connected PCB
    pkt - Points to the outgoing packet; its data pointer is moved
        back and its data length increased by the header length
    addrpair - Specifies the source and destination address pair

Return Value:

    NONE

Note:

    This function really belongs in ..\ip\ipsend.c.
    The code is duplicated here to save a function call
    (with a large number of parameters).

--*/
{
    UINT optBytes = ROUNDUP4(pcb->ipoptlen);
    UINT iphdrlen = IPHDRLEN + optBytes;
    UINT ttl;
    IpHeader* iphdr;

    // Make room for the header in front of the current data
    pkt->data -= iphdrlen;
    iphdr = (IpHeader*) pkt->data;
    SETPKTIPHDR(pkt, iphdr);
    pkt->datalen += iphdrlen;

    // IP options follow the fixed header. The option area is
    // rounded up to a multiple of 4 bytes and zero padded.
    if (pcb->ipoptlen) {
        ZeroMem(iphdr+1, optBytes);
        CopyMem(iphdr+1, pcb->ipopts, pcb->ipoptlen);
    }

    // TTL is 1 for broadcasts, the multicast TTL for multicast
    // destinations and the regular TTL for everything else
    if (IS_BCAST_IPADDR(addrpair->dstaddr)) {
        ttl = 1;
    } else if (IS_MCAST_IPADDR(addrpair->dstaddr)) {
        ttl = pcb->mcastTtl;
    } else {
        ttl = pcb->ipTtl;
    }

    FILL_IPHEADER(
        iphdr,
        iphdrlen,
        pcb->ipTos,
        pkt->datalen,
        pcb->ipDontFrag ? HTONS(DONT_FRAGMENT) : 0,
        ttl,
        pcb->protocol,
        addrpair->srcaddr,
        addrpair->dstaddr);
}
NTSTATUS
PcbWaitForEvent(
    PCB* pcb,
    INT eventMask,
    UINT timeout
    )
/*++

Routine Description:

    Block the current thread until the specified PCB event is signalled

Arguments:

    pcb - Points to the protocol control block
    eventMask - Flag bit to indicate which event to block on
    timeout - Specifies the wait timeout (in milliseconds, 0 means forever)

Return Value:

    Status code:
        NETERR_OK if one of the requested events was already available;
        the status of the wait if the wait itself failed;
        otherwise the error status of the PCB

--*/
{
    NTSTATUS status = NETERR_OK;
    INT readyMask;

    //
    // See whether the event is already available or the connection
    // has been reset. If neither is the case, this call also arms
    // the PCB for the wait below.
    //
    readyMask = PcbCheckSelectEvents(pcb, eventMask, 1);

    if (!(readyMask & PCBEVENT_CONNRESET)) {
        // Something we were asked about is ready: no need to wait
        if (readyMask != 0) return NETERR_OK;

        status = WaitKernelEventObject(GetPcbWaitEvent(pcb), timeout);
        PcbClearSelectEvents(pcb);
    }

    // A successful wait (or a reset detected up front) is
    // reported with the error status of the PCB
    if (NT_SUCCESS(status)) {
        return PcbGetErrStatus(pcb);
    }
    return status;
}
INT
PcbCheckSelectEvents(
PCB* pcb,
INT eventMasks,
INT setwait
)
/*++
Routine Description:
Check if the specified socket events are available
and optionally set up the socket to wait for them.
The check runs between RaiseToDpc and LowerFromDpc.
Arguments:
pcb - Points to the protocol control block
eventMasks - Specifies the socket events the caller is interested in
(PCBEVENT_* flag bits)
setwait - Whether to set up the socket to wait if
none of the specified events are available:
nonzero - record eventMasks in the PCB and clear its wait event
0 - only check
In addition, the one-shot connect event is only marked as
reported when setwait >= 0; a negative value leaves it pending.
Return Value:
Set of event flags that are already available, restricted to
eventMasks; or exactly PCBEVENT_CONNRESET if a TCP socket has a
failure status recorded (see the exception noted below)
--*/
{
NTSTATUS status;
INT readyMasks;
KIRQL irql = RaiseToDpc();
// Check to see if the specified event is already available.
// Since our checks are trivial, it's faster to check
// everything instead of trying to check selectively
// based on the flags specified by the caller.
if (IsTcb(pcb)) {
// If the connection was reset, return reset status
status = PcbGetErrStatus(pcb);
if (!NT_SUCCESS(status)) {
// Note: If we already told the app that the socket was connected
// and then the socket got reset, we don't need to put the socket
// in the exceptfds again to tell the app the connection has failed.
// That is the one case that falls through to the regular checks:
// the caller asks for PCBEVENT_CONNRESET only and the connect
// event has already been reported.
if (eventMasks != PCBEVENT_CONNRESET || !pcb->connectSelected) {
LowerFromDpc(irql);
return PCBEVENT_CONNRESET;
}
}
// Readable if there is buffered data or a FIN was received
// (a FIN also signals close); acceptable if there is a pending
// connection request.
readyMasks = (IsTcpRecvBufEmpty(pcb) ? 0 : PCBEVENT_READ) |
(TcbHasPendingConnReq((TCB*) pcb) ? PCBEVENT_ACCEPT : 0) |
(IsFINReceived(pcb) ? PCBEVENT_READ|PCBEVENT_CLOSE : 0);
if (IsPcbConnected(pcb)) {
if (!IsPcbSendBufFull(pcb)) readyMasks |= PCBEVENT_WRITE;
// NOTE: we only signal the connect event exactly once
if ((eventMasks & PCBEVENT_CONNECT) && !pcb->connectSelected) {
readyMasks |= PCBEVENT_CONNECT;
// A negative setwait reports the event without consuming it
if (setwait >= 0) pcb->connectSelected = 1;
}
}
} else {
// Datagram/raw socket: readable if anything is queued,
// writable unless the send buffer is full
readyMasks = (IsDgramRecvBufEmpty(pcb) ? 0 : PCBEVENT_READ) |
(IsPcbSendBufFull(pcb) ? 0 : PCBEVENT_WRITE);
}
// Only report what the caller asked about
if ((readyMasks &= eventMasks) == 0 && setwait) {
// Indicate that we're interested in the specified event
// and prepare to wait
pcb->eventFlags = eventMasks;
KeClearEvent(GetPcbWaitEvent(pcb));
}
LowerFromDpc(irql);
return readyMasks;
}
VOID
PcbCompleteOverlappedSendRecv(
    PcbOverlappedReq* req,
    NTSTATUS status
    )
/*++

Routine Description:

    Complete an overlapped send/receive request: detach it from
    its PCB, store the completion status, signal the completion
    event and release the request if it is owned by us

Arguments:

    req - Points to the overlapped send/receive request
    status - Specifies the completion status

Return Value:

    NONE

--*/
{
    PCB* pcb = req->pcb;

    // If the wait event is our internal per-PCB event,
    // then this is a special case for implementing the
    // blocking recv call: the request structure is the one
    // passed in by the caller (see PcbQueueOverlappedRecv)
    // and not an allocation of ours.
    //
    // This is decided BEFORE the event is signalled. Once the
    // event is set the waiting caller is free to continue and
    // to reuse the storage of its request, so a caller-owned
    // request must not be touched after that point.
    int callerOwned = (req->overlappedEvent == GetPcbWaitEvent(pcb));

    // NOTE: we can have at most 1 overlapped send/receive request
    if ((RECVREQ*) req == pcb->overlappedRecvs) {
        pcb->overlappedRecvs = NULL;
    } else if ((SENDREQ*) req == pcb->overlappedSends) {
        pcb->overlappedSends = NULL;
    }

    req->overlapped->_iostatus = status;

    if (callerOwned) {
        // Same event as req->overlappedEvent, reached through
        // the PCB. Nothing to dereference or free.
        SetKernelEvent(GetPcbWaitEvent(pcb));
        return;
    }

    // The request is our own copy and stays valid until we free
    // it: signal the event, drop the reference on the event
    // object and release the request.
    SetKernelEvent(req->overlappedEvent);
    ObDereferenceObject(req->overlappedEvent);
    SysFree(req);
}
NTSTATUS
PcbQueueOverlappedRecv(
    PCB* pcb,
    RECVREQ* recvreq
    )
/*++

Routine Description:

    Queue up an overlapped receive request

Arguments:

    pcb - Points to the protocol control block
    recvreq - Points to the receive request

Return Value:

    Status code:
        NETERR_PENDING - the request was queued up
        NETERR_WOULDBLOCK - a receive request is already outstanding
            or the request has no completion event
        NETERR_MEMORY - out of memory

--*/
{
    RECVREQ* queued;

    // Only 1 outstanding overlapped receive request is supported
    if (HasOverlappedRecv(pcb))
        return NETERR_WOULDBLOCK;

    // Without an event there is nothing to signal on completion
    if (!recvreq->overlappedEvent)
        return NETERR_WOULDBLOCK;

    if (recvreq->overlappedEvent != GetPcbWaitEvent(pcb)) {
        // Regular overlapped request: queue a private copy
        queued = (RECVREQ*) SysAlloc(sizeof(RECVREQ), PTAG_RREQ);
        if (!queued) return NETERR_MEMORY;

        *queued = *recvreq;
    } else {
        // The wait event is our internal per-PCB event. This is
        // the special case used to implement the blocking recv call.
        //
        // The allocation is avoided by queuing the RECVREQ structure
        // passed in by the caller (winsock layer) directly. This
        // works because the caller waits for the recv to complete
        // after this function returns.
        queued = recvreq;
    }

    pcb->overlappedRecvs = queued;
    queued->pcb = pcb;

    // Reset the overlapped state and mark the request as pending
    queued->overlapped->_ioxfercnt = 0;
    queued->overlapped->_ioflags = 0;
    queued->overlapped->_ioreq = (UINT_PTR) queued;
    queued->overlapped->_iostatus = (DWORD) NETERR_PENDING;

    return NETERR_PENDING;
}
NTSTATUS
PcbQueueOverlappedSend(
    PCB* pcb,
    SENDREQ* sendreq
    )
/*++

Routine Description:

    Queue up an overlapped send request

Arguments:

    pcb - Points to the protocol control block
    sendreq - Points to the overlapped send request

Return Value:

    Status code:
        NETERR_PENDING - the request was successfully queued up
        NETERR_OK - if the send buffer has opened up and
            there is no need to queue up the request
        NETERR_WOULDBLOCK - a send request is already outstanding
        NETERR_MEMORY - out of memory

--*/
{
    NTSTATUS status;
    SENDREQ* copy;
    VOID* tail;
    UINT bytes;
    KIRQL irql = RaiseToDpc();

    // Only 1 outstanding overlapped send request is supported
    if (HasOverlappedSend(pcb)) {
        status = NETERR_WOULDBLOCK;
        goto done;
    }

    // The send buffer opened up just as
    // we were raising to DPC level
    if (!IsPcbSendBufFull(pcb)) {
        status = NETERR_OK;
        goto done;
    }

    // The copy is a single allocation laid out as:
    //  SENDREQ | destination address (optional) | WSABUF array
    bytes = sizeof(SENDREQ) + sizeof(WSABUF) * sendreq->bufcnt;
    if (sendreq->toaddr) {
        bytes += sizeof(*sendreq->toaddr);
    }

    copy = (SENDREQ*) SysAlloc(bytes, PTAG_RREQ);
    if (!copy) {
        // Out of memory
        status = NETERR_MEMORY;
        goto done;
    }

    pcb->overlappedSends = copy;
    *copy = *sendreq;

    // Re-point the embedded pointers at the trailing
    // storage and copy what they refer to
    tail = copy + 1;
    if (sendreq->toaddr) {
        copy->toaddr = (struct sockaddr_in*) tail;
        *copy->toaddr = *sendreq->toaddr;
        tail = copy->toaddr + 1;
    }

    copy->bufs = (WSABUF*) tail;
    CopyMem(tail, sendreq->bufs, sizeof(WSABUF) * sendreq->bufcnt);

    copy->pcb = pcb;

    // Reset the overlapped state and mark the request as pending
    copy->overlapped->_ioxfercnt = 0;
    copy->overlapped->_ioflags = 0;
    copy->overlapped->_ioreq = (UINT_PTR) copy;

    status = NETERR_PENDING;
    copy->overlapped->_iostatus = status;

done:
    LowerFromDpc(irql);
    return status;
}