xbox-kernel/private/ntos/xnet/tcp/tcprecv.c

/*++

Copyright (c) 2000 Microsoft Corporation

Module Name:

    tcprecv.c

Abstract:

    TCP input processing functions

Revision History:

    05/31/2000 davidx
        Created it.

--*/

#include "precomp.h"


//
// Account for datalen bytes that were just accepted from the peer:
// advance rcv_nxt and recompute the receive window. Calling this
// with datalen == 0 only re-evaluates the window (e.g. after the
// application has drained data from the receive buffer).
//
INLINE VOID
TcbSlideRecvWindow(
    TCB* tcb,
    UINT datalen
    )
{
    // Window size we would have if the right edge were moved to
    // cover all of the receive buffer space that is free right now.
    UINT freeSpace = tcb->maxRecvBufsize - tcb->recvbufSize;

    // The caller may never accept more than the current window.
    ASSERT(datalen <= tcb->rcv_wnd);
    tcb->rcv_nxt += datalen;

    if ((datalen + freeSpace) - tcb->rcv_wnd < tcb->rcv_swsthresh) {
        // Receive side silly window avoidance: the right edge would
        // move by less than the threshold, so leave it where it is
        // and just shrink the window by what was consumed.
        tcb->rcv_wnd -= datalen;
    } else {
        // Enough space has been freed: move the right edge.
        tcb->rcv_wnd = freeSpace;
    }
}


//
// Move buffered data from the head of the TCB receive queue into the
// user's receive request until either the request buffer is full or
// the queue is empty. Fully consumed receive buffers are unlinked and
// freed; a partially consumed one has its offset/length adjusted.
// If any buffer touched carried PSH or URG, recvreq->buflen is forced
// to 0 so the request looks complete to the caller.
// Returns the number of bytes copied. The receive queue must not be
// empty on entry.
//
PRIVATE UINT
TcbCopyRecvData(
    TCB* tcb,
    RECVREQ* recvreq
    )
{
    UINT copied = 0;
    BYTE seenflags = 0;

    for (;;) {
        RECVBUF* head = (RECVBUF*) tcb->recvbuf.Flink;
        UINT chunk;

        ASSERT(SEQ_LE(head->seqnext, tcb->rcv_nxt));

        chunk = min(head->datalen, recvreq->buflen);
        CopyMem(recvreq->buf, (BYTE*) head + head->dataoffset, chunk);

        // Advance the user buffer ...
        recvreq->buf += chunk;
        recvreq->buflen -= chunk;

        // ... and the connection's accounting
        tcb->recvbufSize -= chunk;
        copied += chunk;

        // Remember the flags before the buffer is possibly freed
        seenflags |= head->tcpflags;

        if (chunk != head->datalen) {
            // Only part of this buffer fit: keep the remainder queued
            head->dataoffset = (WORD) (head->dataoffset + chunk);
            head->datalen -= chunk;
        } else {
            RemoveEntryList(&head->links);
            XnetFree(head);
        }

        if (recvreq->buflen == 0 || IsTcpRecvBufEmpty(tcb))
            break;
    }

    if (seenflags & (TCP_PSH|TCP_URG))
        recvreq->buflen = 0;

    return copied;
}


NTSTATUS
TcbRecv(
    TCB* tcb,
    RECVREQ* recvreq
    )

/*++

Routine Description:

    Service a user receive request on a TCP socket. The work is done
    at DPC level.

    - If data is already buffered, copy as much as fits and return.
    - If nothing is buffered and the peer's FIN has been received,
      succeed with zero bytes.
    - Otherwise report a pending connection error, or queue the
      request as an overlapped receive.

Arguments:

    tcb - Points to the TCP control block
    recvreq - Points to the receive request information;
        *recvreq->bytesRecv receives the number of bytes copied

Return Value:

    Status code

--*/

{
    NTSTATUS status;
    KIRQL irql = RaiseToDpc();

    // Data can only come from the connection peer
    SetRecvReqFromAddr(recvreq, tcb->dstaddr, tcb->dstport);
    *recvreq->bytesRecv = 0;

    if (IsTcpRecvBufEmpty(tcb)) {
        if (IsFINReceived(tcb)) {
            // The peer closed its side gracefully: zero bytes, success
            status = NETERR_OK;
        } else {
            status = PcbGetErrStatus(tcb);
            if (NT_SUCCESS(status)) {
                status = PcbQueueOverlappedRecv((PCB*) tcb, recvreq);
            }
        }
    } else {
        UINT wndBefore;

        // Hand over whatever is already sitting in the receive buffer
        (*recvreq->bytesRecv) += TcbCopyRecvData(tcb, recvreq);
        status = NETERR_OK;

        // Draining the buffer may reopen a window that was
        // (nearly) closed. If the window just crossed the silly
        // window threshold, tell the peer with a gratuitous ACK.
        wndBefore = tcb->rcv_wnd;
        TcbSlideRecvWindow(tcb, 0);
        if (wndBefore < tcb->rcv_swsthresh &&
            tcb->rcv_wnd >= tcb->rcv_swsthresh) {
            TcbEmitACK(tcb);
        }
    }

    LowerFromDpc(irql);
    return status;
}


PRIVATE VOID
TcbUpdateRTO(
    TCB* tcb
    )

/*++

Routine Description:

    Fold a new round-trip time sample into the smoothed estimators of
    a TCP connection and recompute the retransmission timeout.

    The sample is the number of ticks elapsed since tcb->rtt_tick,
    which is cleared here. The callers in this file only invoke this
    function when rtt_tick is non-zero.

Arguments:

    tcb - Points to the TCP control block

Return Value:

    NONE

--*/

{
    INT elapsed, err;

    // Scaled fixed-point form of:
    //  err = sample - srtt
    //  srtt = srtt + err / 8
    //  rttvar = rttvar + (|err| - rttvar) / 4
    //  RTO = srtt + 4 * rttvar
    // srtt_8 holds srtt scaled by (1 << SRTT_SHIFT) and
    // rttvar_4 holds rttvar scaled by (1 << RTTVAR_SHIFT).

    elapsed = TcpTickCount - tcb->rtt_tick;
    tcb->rtt_tick = 0;

    if (tcb->srtt_8 == 0) {
        // First sample for this connection: seed the estimators
        // directly from the unsmoothed measurement.
        if (elapsed != 0) {
            tcb->srtt_8 = elapsed << SRTT_SHIFT;
            tcb->rttvar_4 = elapsed << (RTTVAR_SHIFT-1);
        } else {
            tcb->rttvar_4 = 1;
            tcb->srtt_8 = 1;
        }
    } else {
        err = elapsed - (tcb->srtt_8 >> SRTT_SHIFT);
        tcb->srtt_8 += err;
        if (tcb->srtt_8 <= 0)
            tcb->srtt_8 = 1;

        if (err < 0) err = -err;
        err -= (tcb->rttvar_4 >> RTTVAR_SHIFT);
        tcb->rttvar_4 += err;
        if (tcb->rttvar_4 <= 0)
            tcb->rttvar_4 = 1;
    }

    // Never go below the configured minimum timeout
    tcb->RTO = (tcb->srtt_8 >> SRTT_SHIFT) + tcb->rttvar_4;
    if (tcb->RTO < cfgMinRexmitTimeout)
        tcb->RTO = cfgMinRexmitTimeout;
}


PRIVATE BOOL
TcbUpdatePersistFlag(
    TCB* tcb
    )

/*++

Routine Description:

    Called after the send window information has been updated while
    the connection is probing a closed window. If the first queued
    send buffer now fits inside the send window, persist mode ends:
    a buffer that was already used for probing is sent again with its
    retry count reset, otherwise the transmit timer is cleared.

    The send buffer queue must not be empty.

Arguments:

    tcb - Points to the TCP control block

Return Value:

    FALSE if we got out of persist mode
    TRUE otherwise

--*/

{
    TCPSENDBUF* first = TcbFirstSendbuf(tcb);
    ASSERT(!IsPcbSendBufEmpty(tcb));

    // Window is still too small for the first segment: keep probing
    if (first->datalen > tcb->snd_wnd)
        return TRUE;

    tcb->persistFlag = 0;
    if (!first->retries) {
        tcb->xmitTimer = 0;
    } else {
        TRACE_("Retransmit persisting segment...");
        first->retries = 0;
        TcbSendSegment(tcb, first);
    }
    return FALSE;
}


PRIVATE VOID
TcbUpdateSndUna(
    TCB* tcb,
    TCPSEQ ack
    )

/*++

Routine Description:

    Update the snd.una variable for a TCP connection and do the
    follow-up work for an incoming acknowledgement:

    - release send buffers that are now fully acknowledged
    - adjust the congestion window
    - resume a blocked sender (overlapped send or write event)
    - restart or clear the retransmission timer
    - keep window probing alive, or count duplicate ACKs
    - finally try to transmit any pending data

    The caller (TcbProcessDataAck) only calls this with
    snd_una <= ack <= snd_nxt.

Arguments:

    tcb - Points to the TCP connection block
    ack - The acknowledged sequence number

Return Value:

    NONE

--*/

{
    TCPSENDBUF* sendbuf;
    UINT ackedSends = 0;    // number of send buffers released by this ACK

    if (SEQ_GT(ack, tcb->snd_una)) {
        tcb->snd_una = ack;

        // Update round-trip time measurements, but only if a
        // measurement is in progress (rtt_tick != 0) and this ACK
        // covers the sequence number being timed.
        if (tcb->rtt_tick && SEQ_GT(ack, tcb->rtt_seq)) {
            TcbUpdateRTO(tcb);
        }

        // Complete fully acknowledged send user requests.
        // The walk stops at the first buffer that is either not
        // completely covered by the ACK or has a zero retries count.
        // NOTE(review): retries != 0 presumably means "has been
        // transmitted at least once" - confirm against TcbSendSegment.

        LOOP_THRU_TCB_SENDBUF(tcb, sendbuf)

            if (sendbuf->retries && SEQ_GE(ack, sendbuf->seqnext)) {
                RemoveEntryList(&sendbuf->links);
                tcb->sendbufSize -= sendbuf->datalen;
                ackedSends++;
                SendbufRelease(sendbuf);
            } else {
                break;
            }

        END_TCB_SENDBUF_LOOP
    }

    // Update congestion window.
    // Note that this runs on every call, including for ACKs that
    // did not advance snd_una.
    if (tcb->fastRexmitFlag) {
        if (ackedSends) {
            //
            // Getting out of fast retransmit / fast recovery mode:
            //  "deflate" the congestion window
            //
            tcb->snd_cwnd = tcb->snd_ssthresh;
        } else {
            //
            // In fast recovery mode:
            //  increment the congestion window by SMSS
            //  for every duplicate ACK received.
            //
            TcbIncrementCwnd(tcb, tcb->snd_mss);
        }
    } else {
        if (tcb->snd_cwnd < tcb->snd_ssthresh) {
            //
            // Slow-start mode:
            //  increment the congestion window by SMSS
            //
            TcbIncrementCwnd(tcb, tcb->snd_mss);
        } else {
            //
            // Congestion avoidance mode:
            //  increment the congestion window by ~SMSS per RTT
            //
            // NOTE: Overflow is not possible here because snd_mss
            // and snd_cwnd are 16-bit numbers. Also snd_cwnd is never 0.
            //
            UINT inc = tcb->snd_mss * tcb->snd_mss / tcb->snd_cwnd;
            TcbIncrementCwnd(tcb, max(1, inc));
        }
    }

    if (ackedSends) {
        // Send buffer space was freed: let a waiting sender continue.
        if (!IsPcbSendBufFull(tcb)) {
            if (HasOverlappedSend(tcb)) {
                // Start the queued overlapped send and complete it
                // with whatever status TcbSend returns.
                SENDREQ* sendreq = PcbGetOverlappedSend((PCB*) tcb);
                NTSTATUS status = TcbSend(tcb, sendreq);

                sendreq->overlapped->_ioxfercnt = sendreq->sendtotal;
                PcbCompleteOverlappedSend(sendreq, status);
            } else {
                PcbSignalEvent(tcb, PCBEVENT_WRITE);
            }
        }

        // New data was acknowledged, so leave persist mode and
        // fast retransmit mode.
        tcb->persistFlag = 0;
        TcbStopFastRexmitMode(tcb);

        // Reset retransmission timer
        if (tcb->snd_una == tcb->snd_nxt)
            // Everything sent so far has been acknowledged
            tcb->xmitTimer = 0;
        else {
            // Time the oldest outstanding buffer from its first
            // transmission; if it is already overdue, handle the
            // timeout right away.
            sendbuf = TcbFirstSendbuf(tcb);
            if (sendbuf->firstSendTime + tcb->RTO <= TcpTickCount) {
                TcbXmitTimeout(tcb);
            } else {
                tcb->xmitTimer = sendbuf->firstSendTime + tcb->RTO - TcpTickCount;
            }
        }

    } else if (tcb->persistFlag) {

        // If we're probing the send window and the ack was for
        // the probe segment, then make sure we continue to probe
        // without timing out: back the retry count off by one when
        // it is about to reach cfgMaxXmitRetries.

        sendbuf = TcbFirstSendbuf(tcb);
        if (SEQ_GE(ack, sendbuf->seq) &&
            sendbuf->retries >= cfgMaxXmitRetries-1) {
            sendbuf->retries--;
        }

    } else {
        // Nothing new was acknowledged and we are not probing:
        // count this as a duplicate ACK if data is outstanding and
        // fast retransmit is not already in progress.
        if (tcb->snd_una != tcb->snd_nxt &&
            !tcb->fastRexmitFlag &&
            ++tcb->dupacks >= 4) {
            //
            // Too many duplicate ACK received:
            //  do fast retransmit / recovery
            //
            TcbDoFastRexmit(tcb);
        }
    }

    // If the send window has opened up and
    // we have pending data to send, try to do it now.
    // TcbUpdatePersistFlag is only consulted while in persist mode.
    if (!tcb->persistFlag || !TcbUpdatePersistFlag(tcb)) {
        while (TcbHasPendingSend(tcb) && TcbStartOutput(tcb))
            NULL;
    }
}


//
// Check if an acknowledged sequence number is valid for a TCP connection:
// it must acknowledge something new (ack > snd_una) and must not go
// beyond what has been sent (ack <= snd_nxt). Comparisons are done
// with the SEQ_* sequence-number macros.
// NOTE: both arguments are evaluated twice.
//
#define IsValidACK(_tcb, _ack) \
        (SEQ_GT((_ack), (_tcb)->snd_una) && \
         SEQ_LE((_ack), (_tcb)->snd_nxt))

//
// Request that an ACK be sent out immediately in response to an
// incoming segment. This does not test anything: it adds 0x20000 to
// delayedAcks. TcpReceivePacket emits an ACK once delayedAcks reaches
// (rcv_mss << 1).
// NOTE(review): 0x20000 is presumably chosen to exceed that threshold
// for any possible rcv_mss - confirm.
//
#define NeedSendACKNow(_tcb) ((_tcb)->delayedAcks += 0x20000)

//
// Save the send window information from the connection peer:
//  snd_wnd - the window advertised by the peer
//  snd_wl1 - sequence number of the segment used for this update
//  snd_wl2 - acknowledgement number of the segment used for this update
//
// The body is wrapped in do { ... } while (0) so that the macro
// behaves like a single statement and can be used safely in an
// unbraced if/else. Callers must terminate it with a semicolon.
// NOTE: _tcb is evaluated three times.
//
#define TcbUpdateSndWnd(_tcb, _wnd, _seq, _ack) \
        do { \
            (_tcb)->snd_wnd = (_wnd); \
            (_tcb)->snd_wl1 = (_seq); \
            (_tcb)->snd_wl2 = (_ack); \
        } while (0)


PRIVATE BOOL
TcbValidateSeqs(
    TCB* tcb,
    TCPSEQ oldseq0,
    TCPSEQ oldseq1,
    TCPSEQ* newseq0,
    TCPSEQ* newseq1
    )

/*++

Routine Description:

    Clip the sequence range of a received segment to the current
    receive window [rcv_nxt, rcv_nxt + rcv_wnd) and report whether
    any part of the segment is acceptable.

Arguments:

    tcb - Points to TCP control block
    oldseq0 - Starting sequence number from the received segment
    oldseq1 - Ending sequence number from the received segment (oldseq0+seglen)
    newseq0 - Returns the effective starting sequence number inside the receive window
    newseq1 - Returns the effective ending sequence number

Return Value:

    TRUE if the clipped range is non-empty, or if the segment itself
    is empty (oldseq0 == oldseq1) and its clipped range collapses to
    a single point. FALSE otherwise. *newseq0 and *newseq1 are
    written in both cases.

--*/

{
    TCPSEQ wndEnd = tcb->rcv_nxt + tcb->rcv_wnd;
    TCPSEQ lo = oldseq0;
    TCPSEQ hi = oldseq1;

    // Drop anything before the left edge of the window ...
    if (SEQ_LT(lo, tcb->rcv_nxt))
        lo = tcb->rcv_nxt;

    // ... and anything past the right edge
    if (SEQ_GT(hi, wndEnd))
        hi = wndEnd;

    *newseq0 = lo;
    *newseq1 = hi;

    if (SEQ_LT(lo, hi))
        return TRUE;

    // A zero-length segment is still acceptable when clipping
    // leaves an empty (not inverted) range
    return (lo == hi && oldseq0 == oldseq1);
}


#if DBG

PRIVATE BOOL
TcbVerifyRecvBuf(
    TCB* tcb
    )

/*++

Routine Description:

    Debug-only consistency check of the receive buffer list: every
    buffer must end at or before the start of the buffer that
    follows it (no overlap between neighbours).

Arguments:

    tcb - Points to the TCP control block

Return Value:

    TRUE if successful, FALSE if there is an error

--*/

{
    RECVBUF* prev = (RECVBUF*) tcb->recvbuf.Flink;
    RECVBUF* next;

    // An empty list is trivially consistent
    if (prev == TcbRecvbufNil(tcb)) return TRUE;

    for (next = TcbRecvbufFlink(prev);
         next != TcbRecvbufNil(tcb);
         prev = next, next = TcbRecvbufFlink(next)) {

        // seqnext - datalen is the starting sequence number of a buffer
        if (SEQ_GT(prev->seqnext, next->seqnext-next->datalen))
            return FALSE;
    }

    return TRUE;
}

#endif // DBG


PRIVATE UINT
TcbCorrectMisorderedSegments(
    TCB* tcb,
    TCPSEQ seq,
    RECVBUF* recvbuf
    )

/*++

Routine Description:

    Called for a segment whose starting sequence number matches
    rcv_nxt while out-of-order segments are already buffered behind
    it. The new buffer is linked into the receive list in front of
    the buffered out-of-order data (after trimming any overlap with
    its successor), and the length of the resulting contiguous run
    is computed.

Arguments:

    tcb - Points to TCP control block
    seq - Specifies the starting sequence number for this segment
    recvbuf - Points to the received data buffer

Return Value:

    Number of continuous bytes that can be acknowledged

--*/

{
    RECVBUF* prev = TcbLastRecvbuf(tcb);
    RECVBUF* next;
    RECVBUF* tail;
    UINT run;

    // Walk backwards from the entry before the last one until we
    // reach the buffer that ends exactly at seq, or the list head.
    for (;;) {
        prev = TcbRecvbufBlink(prev);
        if (prev == TcbRecvbufNil(tcb) || prev->seqnext == seq)
            break;
    }

    next = TcbRecvbufFlink(prev);
    if (SEQ_GT(recvbuf->seqnext, next->seqnext - next->datalen)) {
        UINT overlap;

        // Weird case: the tail of this segment overlaps the start
        // of the buffered out-of-order data. Chop the overlapping
        // bytes off the end of the new buffer.
        TRACE_("TcbCorrectMisorderedSegments: overlapping segment");

        overlap = recvbuf->seqnext - (next->seqnext - next->datalen);
        ASSERT(overlap < recvbuf->datalen);

        recvbuf->seqnext -= overlap;
        recvbuf->datalen -= overlap;
    }

    // Link recvbuf between prev and next
    recvbuf->links.Blink = (LIST_ENTRY*) prev;
    recvbuf->links.Flink = (LIST_ENTRY*) next;
    next->links.Blink = (LIST_ENTRY*) recvbuf;
    prev->links.Flink = (LIST_ENTRY*) recvbuf;

    // Extend the run across every following buffer that starts
    // exactly where the previous one ends.
    run = recvbuf->datalen;
    tail = recvbuf;
    while (next != TcbRecvbufNil(tcb) &&
           tail->seqnext == next->seqnext - next->datalen) {
        run += next->datalen;
        tail = next;
        next = TcbRecvbufFlink(next);
    }

    // If we filled up a hole, emit an ACK immediately
    if (run > recvbuf->datalen) {
        NeedSendACKNow(tcb);
    }

    // ASSERT(TcbVerifyRecvBuf(tcb));
    return run;
}


PRIVATE VOID
TcbBufferMisorderedSegment(
    TCB* tcb,
    UINT seq,
    RECVBUF* recvbuf
    )


/*++

Routine Description:

    Called for a segment that arrived out of order. The buffer is
    linked into the receive list at the position given by its
    starting sequence number. Data that overlaps the following
    buffer is trimmed from the end of the new buffer; if nothing is
    left after trimming, the new buffer is freed instead.

Arguments:

    tcb - Points to TCP control block
    seq - Specifies the starting sequence number for this segment
    recvbuf - Points to the received data buffer

Return Value:

    NONE

--*/

{
    RECVBUF* prev = TcbLastRecvbuf(tcb);
    RECVBUF* next;

    if (prev == TcbRecvbufNil(tcb) || SEQ_GE(seq, prev->seqnext)) {
        // Empty list, or the segment starts at/after the end of
        // everything buffered so far: it simply goes last.
        InsertTailList(&tcb->recvbuf, &recvbuf->links);
        return;
    }

    // Walk backwards until we reach a buffer that ends at or
    // before seq, or the list head.
    for (;;) {
        prev = TcbRecvbufBlink(prev);
        if (prev == TcbRecvbufNil(tcb) || !SEQ_LT(seq, prev->seqnext))
            break;
    }

    // recvbuf belongs between prev and next

    next = TcbRecvbufFlink(prev);
    if (SEQ_GT(recvbuf->seqnext, next->seqnext - next->datalen)) {
        UINT overlap;

        // The tail of this segment overlaps data that is
        // already buffered in the following entry.
        TRACE_("TcbBufferMisorderedSegment: overlapping segment");

        overlap = recvbuf->seqnext - (next->seqnext - next->datalen);
        if (overlap >= recvbuf->datalen) {
            // Nothing new in this segment
            XnetFree(recvbuf);
            return;
        }

        recvbuf->seqnext -= overlap;
        recvbuf->datalen -= overlap;
    }

    recvbuf->links.Blink = (LIST_ENTRY*) prev;
    recvbuf->links.Flink = (LIST_ENTRY*) next;
    next->links.Blink = (LIST_ENTRY*) recvbuf;
    prev->links.Flink = (LIST_ENTRY*) recvbuf;

    // ASSERT(TcbVerifyRecvBuf(tcb));
}


PRIVATE UINT
TcbCompletePendingRecvReqFast(
    TCB* tcb,
    Packet* pkt,
    BYTE tcpflags
    )

/*++

Routine Description:

    This function is called when we received data from a TCP connection
    and there is a pending overlapped receive request
    and there is currently no data in the receive buffer.
    In this case we can satisfy the request right away without
    allocating memory to store data in the receive buffer.

    The request is completed when its buffer has been filled, or when
    the segment carries the PSH or URG flag. Otherwise it stays
    pending with its buffer pointer and remaining length advanced.

Arguments:

    tcb - Points to the TCP control block
    pkt - Points to the received packet
    tcpflags - TCP segment flags

Return Value:

    Number of bytes used up in this call

--*/

{
    UINT copycnt;
    RECVREQ* recvreq = PcbGetOverlappedRecv(tcb);

    // Copy as much of the segment as fits into the user's buffer
    copycnt = min(pkt->datalen, recvreq->buflen);
    CopyMem(recvreq->buf, pkt->data, copycnt);
    recvreq->overlapped->_ioxfercnt += copycnt;
    recvreq->buf += copycnt;
    recvreq->buflen -= copycnt;

    // NOTE: the flag mask must be parenthesized. '&' binds tighter
    // than '|', so "tcpflags & TCP_PSH|TCP_URG" would be evaluated as
    // "(tcpflags & TCP_PSH) | TCP_URG", which is always non-zero and
    // would complete the request on every segment.
    if ((recvreq->buflen == 0) || (tcpflags & (TCP_PSH|TCP_URG))) {
        PcbCompleteOverlappedRecv(recvreq, NETERR_OK);
    }

    // The copied bytes never occupy the receive buffer,
    // so only the window bookkeeping needs to be updated.
    TcbSlideRecvWindow(tcb, copycnt);
    return copycnt;
}


PRIVATE TCB*
TcbProcessDataAck(
    TCB* tcb,
    Packet* pkt,
    TcpHeader* tcphdr
    )

/*++

Routine Description:

    Process a received TCP data segment.
    And we assume the connection is in a synchronized state.

    The acknowledgement (if any) is processed first, then the
    segment data is clipped to the receive window and either handed
    directly to a pending overlapped receive, queued in sequence, or
    buffered as an out-of-order segment.

Arguments:

    tcb - Points to the TCP connection block
    pkt - Points to the received packet; pkt->data / pkt->datalen
        describe the segment payload and may be adjusted here
    tcphdr - Points to the TCP segment header information

Return Value:

    NULL if the caller must not touch the TCB any further (the TCB was
    deleted or reset, or the connection moved from CLOSING to
    TIME_WAIT); otherwise, just return the input tcb parameter

--*/

{
    TCPSEQ seq, ack, seq0, seq1;
    RECVBUF* recvbuf;

    seq = SEG_SEQ(tcphdr);

    // Process ACK if it's present
    if (ISTCPSEG(tcphdr, ACK)) {
        ack = SEG_ACK(tcphdr);
        if (SEQ_LT(ack, tcb->snd_una)) {
            // ACK is an old duplicate, ignore it
            // (the segment data is still processed below)
        } else if (SEQ_GT(ack, tcb->snd_nxt)) {
            // ACKing something that hasn't been sent.
            // Emit an ACK, drop the segment
            NeedSendACKNow(tcb);
            return tcb;
        } else {
            // update the send window information, but only from a
            // segment that is newer than the one used for the last
            // update (later seq, or same seq with same/later ack)
            if (SEQ_LT(tcb->snd_wl1, seq) ||
                tcb->snd_wl1 == seq && SEQ_LE(tcb->snd_wl2, ack)) {
                TcbUpdateSndWnd(tcb, SEG_WIN(tcphdr), seq, ack);
            }

            TcbUpdateSndUna(tcb, ack);

            // If we already sent FIN, check to see if FIN has been acknowledged
            // (an empty send buffer means everything, including the
            // FIN, has been acknowledged)
            if (IsFINSent(tcb) && IsPcbSendBufEmpty(tcb)) {
                switch (tcb->tcpstate) {
                case TCPST_FIN_WAIT_1:
                    TcbSetState(tcb, TCPST_FIN_WAIT_2, "FinAck");
                    break;

                case TCPST_CLOSING:
                    // NOTE(review): NULL is returned here although only
                    // a state change and timer start are visible in
                    // this function - confirm the caller is expected to
                    // stop processing (no ACK, no FIN handling).
                    TcbSetState(tcb, TCPST_TIME_WAIT, "FinAck");
                    TcbSetTimeWaitTimer(tcb, 2*cfgMSL);
                    return NULL;

                case TCPST_LAST_ACK:
                    TcbDelete(tcb);
                    return NULL;
                }
            }
        }
    }

    // If we already receive FIN from the peer,
    // we'll just ignore incoming data.
    // Segments without data need no further processing either.
    if (IsFINReceived(tcb) || pkt->datalen == 0) return tcb;

    // Count the received bytes towards the delayed-ACK threshold.
    // This uses the segment length before any window clipping.
    tcb->delayedAcks += pkt->datalen;

    // Quick check to see if the data from incoming segment
    // fits entirely within the current receive window.
    if (seq != tcb->rcv_nxt || pkt->datalen > tcb->rcv_wnd) {
        // If the receive sequence number is outside of
        // the current receive window, send an ACK right away.
        if (!TcbValidateSeqs(tcb, seq, seq+pkt->datalen, &seq0, &seq1)) {
            TRACE_("Sequence number out-of-range: %u %u %d",
                   seq, tcb->rcv_nxt, tcb->rcv_wnd);
            NeedSendACKNow(tcb);
            return tcb;
        }

        // Keep only the part of the payload that lies inside the window
        pkt->data += (seq0 - seq);
        pkt->datalen = (seq1 - seq0);
        ASSERT((INT) pkt->datalen > 0);
        seq = seq0;
    }

    // Acceptable data arrived after the receive side was shut down:
    // reset the connection.
    if (IsPcbRecvShutdown(tcb)) {
        TcbResetPeer(tcb);
        TcbReset(tcb, NETERR_CONNRESET);
        return NULL;
    }

    // Special fast path: there is a pending overlapped receive request
    // and there is no data in the receive buffer.
    if (HasOverlappedRecv(tcb) &&
        seq == tcb->rcv_nxt &&
        IsListEmpty(&tcb->recvbuf)) {
        UINT copied = TcbCompletePendingRecvReqFast(tcb, pkt, tcphdr->flags);
        if (copied == pkt->datalen) return tcb;

        // The request could not take everything:
        // buffer the remainder below.
        pkt->data += copied;
        pkt->datalen -= copied;
        seq += copied;
    }

    // Make a copy of the incoming segment and buffer it up.
    // The payload is stored right behind the RECVBUF header.
    recvbuf = (RECVBUF*) XnetAlloc(sizeof(RECVBUF) + pkt->datalen, PTAG_RBUF);
    if (!recvbuf) {
        // The data is dropped without advancing rcv_nxt.
        WARNING_("Received TCP data discarded because of no memory");
        WARNING_("  local = %d, remote = %s:%d",
            NTOHS(tcb->srcport),
            IPADDRSTR(tcb->dstaddr),
            NTOHS(tcb->dstport));
        return tcb;
    }

    recvbuf->seqnext = seq + pkt->datalen;
    recvbuf->dataoffset = sizeof(RECVBUF);
    recvbuf->tcpflags = tcphdr->flags;
    recvbuf->datalen = pkt->datalen;
    CopyMem(recvbuf+1, pkt->data, pkt->datalen);

    // Fast case: the receive segment is in sequence
    if (seq == tcb->rcv_nxt) {
        UINT datarun;   // contiguous bytes that become available
        RECVBUF* lastbuf = TcbLastRecvbuf(tcb);

        if (lastbuf == TcbRecvbufNil(tcb) || lastbuf->seqnext == seq) {
            // We haven't buffered any out-of-order segments behind this one.
            datarun = pkt->datalen;
            InsertTailList(&tcb->recvbuf, &recvbuf->links);
        } else {
            // Figure out if this segment fills out holes
            // in the receive buffer.
            ASSERT(SEQ_LT(seq, lastbuf->seqnext));
            datarun = TcbCorrectMisorderedSegments(tcb, seq, recvbuf);
        }

        // recvbufSize is only increased here, i.e. for data that
        // is (or has just become) in sequence.
        tcb->recvbufSize += datarun;
        TcbSlideRecvWindow(tcb, datarun);

        if (HasOverlappedRecv(tcb)) {
            // If we have a pending overlapped receive request, satisfy it now
            RECVREQ* recvreq = PcbGetOverlappedRecv(tcb);

            recvreq->overlapped->_ioxfercnt += TcbCopyRecvData(tcb, recvreq);
            if (recvreq->buflen == 0) {
                PcbCompleteOverlappedRecv(recvreq, NETERR_OK);
            }

            // Re-evaluate the window now that buffer space was freed
            TcbSlideRecvWindow(tcb, 0);
        } else {
            PcbSignalEvent(tcb, PCBEVENT_READ);
        }
    } else {
        // Slow case: a segment arrived out of order.
        // Buffer up this segment for later processing.
        TRACE_("Out-of-order segment: %u %u", SEG_SEQ(tcphdr), tcb->rcv_nxt);
        TcpStats.misordered++;

        NeedSendACKNow(tcb);
        TcbBufferMisorderedSegment(tcb, seq, recvbuf);
    }

    return tcb;
}


PRIVATE VOID
TcbProcessFIN(
    TCB* tcb,
    TcpHeader* tcphdr,
    TCPSEQ seqfin
    )

/*++

Routine Description:

    Process an incoming TCP FIN segment.

    The FIN is only accepted when it is the next expected sequence
    number. In that case rcv_nxt is advanced past the FIN, an ACK is
    requested, the close event is signalled and the connection state
    is advanced.

Arguments:

    tcb - Points to the TCP control block
    tcphdr - Points to the TCP segment header (currently unused)
    seqfin - Sequence number for the FIN

Return Value:

    NONE

--*/

{
    // Ignore a FIN that is not in sequence
    if (seqfin != tcb->rcv_nxt) return;

    tcb->rcv_nxt = seqfin+1;
    NeedSendACKNow(tcb);
    PcbSignalEvent(tcb, PCBEVENT_CLOSE);

    switch (tcb->tcpstate) {
    case TCPST_SYN_RECEIVED:
    case TCPST_ESTABLISHED:
        // No more data will arrive: complete pending receives
        PcbClearOverlappedRecvs(tcb, NETERR_OK);
        PcbSignalEvent(tcb, PCBEVENT_READ);
        TcbSetState(tcb, TCPST_CLOSE_WAIT, "FinRecv");
        break;

    case TCPST_FIN_WAIT_1:
        // An empty send buffer is how this file recognizes that our
        // FIN has been acknowledged (see the FinAck handling in
        // TcbProcessDataAck). If the send buffer is NOT empty, our FIN
        // is still outstanding: this is a simultaneous close and we
        // must wait for the peer's ACK in the CLOSING state.
        if (!IsPcbSendBufEmpty(tcb)) {
            TcbSetState(tcb, TCPST_CLOSING, "FinRecv");
            break;
        }

        // If our FIN has been acknowledged,
        // fall through and change to TIME-WAIT state.

    case TCPST_FIN_WAIT_2:
        TcbSetState(tcb, TCPST_TIME_WAIT, "FinRecv");

        // Fall through

    case TCPST_TIME_WAIT:
        // (Re)start the 2*MSL time-wait timer
        TcbSetTimeWaitTimer(tcb, 2*cfgMSL);
        break;
    }
}


//
// Structure for storing TCP option parameters
// from an incoming SYN segment.
// Filled in by TcpParseOptions.
//
typedef struct _TcpOptions {
    // Maximum segment size from the peer's MSS option.
    // TcpParseOptions stores TCP_DEFAULT_MSS here when the option is
    // absent or its value is below TCP_MINIMUM_MSS.
    UINT mss;
    // other TCP option parameters ...
} TcpOptions;

PRIVATE BOOL
TcpParseOptions(
    Packet* pkt,
    TcpHeader* tcphdr,
    TcpOptions* opts
    )

/*++

Routine Description:

    Walk the option area that follows the fixed TCP header of an
    incoming SYN segment and extract the options we understand
    (currently only the maximum segment size). Unknown options are
    skipped using their length field.

Arguments:

    pkt - Points to the received packet
    tcphdr - Points to the TCP segment header
    opts - Returns the parsed option information

Return Value:

    TRUE if successful, FALSE if the TCP options are not well-formed

--*/

#define TCP_MINIMUM_MSS (MAXIPHDRLEN+MAXTCPHDRLEN+8-IPHDRLEN-TCPHDRLEN)

{
    const BYTE* cursor;
    UINT remaining;

    // Data carried by the initial SYN segment is ignored;
    // the sender is expected to retransmit it.
    if (ISTCPSEG(tcphdr, SYN) && pkt->datalen > 0) {
        WARNING_("Ignoring data in SYN segment.");
    }

    // Start out with the default in case no MSS option is present
    opts->mss = TCP_DEFAULT_MSS;

    // The option bytes occupy the header space beyond the fixed part
    remaining = GETTCPHDRLEN(tcphdr) - TCPHDRLEN;
    cursor = (const BYTE*) (tcphdr+1);

    while (remaining != 0) {
        BYTE kind = *cursor;
        BYTE len;

        // End-of-list terminates parsing
        if (kind == TCPOPT_EOL) break;

        // No-op is a single byte without a length field
        if (kind == TCPOPT_NOP) {
            cursor++;
            remaining--;
            continue;
        }

        // Every other option has a length byte that covers the kind
        // and length bytes themselves and must fit in what is left
        if (remaining < 2)
            return FALSE;
        len = cursor[1];
        if (len < 2 || len > remaining)
            return FALSE;

        if (kind == TCPOPT_MAX_SEGSIZE) {
            if (len != 4) return FALSE;

            // 16-bit value in network byte order
            opts->mss = ((UINT) cursor[2] << 8) | cursor[3];
            if (opts->mss < TCP_MINIMUM_MSS)
                opts->mss = TCP_DEFAULT_MSS;
        } else {
            TRACE_("TCP option ignored: %d", kind);
        }

        cursor += len;
        remaining -= len;
    }
    return TRUE;
}


//
// Save the relevant information in an incoming connection request:
// the peer's initial sequence number, the next sequence number we
// expect from the peer (ISN + 1), and the MSS from the parsed options.
//
// The body is wrapped in do { ... } while (0) so that the macro
// behaves like a single statement and can be used safely in an
// unbraced if/else. Callers must terminate it with a semicolon.
// NOTE: _tcb is evaluated several times.
//
#define TcpSaveConnReqParams(_tcb, _tcphdr, _opts) \
        do { \
            (_tcb)->rcv_isn = SEG_SEQ(_tcphdr); \
            (_tcb)->rcv_nxt = (_tcb)->rcv_isn+1; \
            (_tcb)->snd_mss = (_opts)->mss; \
        } while (0)

PRIVATE BOOL
TcbAcceptConnReqPassive(
    TCB* tcb,
    Packet* pkt,
    TcpHeader* tcphdr
    )

/*++

Routine Description:

    Handle a SYN that arrived on a listening (passively opened)
    socket: clone a child TCB from the listener, record the peer's
    connection parameters, start the connection on the child and
    add the child to the global PCB list.

    Failure to clone or to connect the child is not reported to the
    caller; the request is silently dropped in those cases.

Arguments:

    tcb - TCP control block of the listening socket
    pkt - Points to the incoming packet
    tcphdr - Points to the TCP segment header

Return Value:

    FALSE if the incoming connection request is bad and
    the caller should send out a RST in response; TRUE otherwise

--*/

{
    TcpOptions opts;
    TCB* child;
    NTSTATUS status;

    // Malformed options are the only reason to answer with a RST
    if (!TcpParseOptions(pkt, tcphdr, &opts)) return FALSE;

    child = TcbCloneChild(tcb);
    if (child) {
        TcbSetState(child, TCPST_SYN_RECEIVED, "Connection request");
        TcpSaveConnReqParams(child, tcphdr, &opts);

        status = TcbConnect(child, GETPKTIPHDR(pkt)->srcaddr, tcphdr->srcport, TRUE);
        if (NT_SUCCESS(status)) {
            InsertHeadList(&PcbList, &child->links);
        } else {
            TcbDelete(child);
        }
    }

    return TRUE;
}


PRIVATE VOID
TcbConnectionEstablished(
    TCB* tcb,
    TcpHeader* tcphdr
    )

/*++

Routine Description:

    Complete the connection handshake: consume the acknowledgement
    of our SYN, release the SYN send buffer, stop the handshake
    timers, initialize the send-side parameters and move the
    connection to the established state.

Arguments:

    tcb - Points to the TCP control block
    tcphdr - Points to the incoming segment that acknowledges our SYN

Return Value:

    NONE

--*/

{
    TCPSENDBUF* synbuf;
    TCPSEQ synack;

    TRACE_("Connection established: %s:%d ", IPADDRSTR(tcb->srcaddr), NTOHS(tcb->srcport));
    TRACE_("to %s:%d", IPADDRSTR(tcb->dstaddr), NTOHS(tcb->dstport));

    //
    // Our SYN must be the first send buffer, and the incoming
    // segment must acknowledge exactly that SYN
    //
    synbuf = TcbFirstSendbuf(tcb);
    synack = SEG_ACK(tcphdr);
    ASSERT(ISTCPSEG(tcphdr, ACK));
    ASSERT(synack == tcb->snd_nxt);
    ASSERT(synbuf->tcpflags & TCP_SYN);

    // Record the acknowledgement and the peer's initial window
    tcb->snd_una = synack;
    TcbUpdateSndWnd(tcb, SEG_WIN(tcphdr), SEG_SEQ(tcphdr), synack);

    // Use the handshake as a round-trip sample if one is being timed
    if (tcb->rtt_tick) {
        TcbUpdateRTO(tcb);
    }

    // The SYN is done: drop its buffer and stop the handshake timers
    RemoveEntryList(&synbuf->links);
    SendbufRelease(synbuf);
    tcb->synTimer = 0;
    tcb->xmitTimer = 0;

    // NOTE: Outgoing IP datagrams are never fragmented, so snd_mss
    // must not exceed rcv_mss, which is expected to reflect the
    // first-hop interface MTU minus the TCP and IP headers.
    if (tcb->snd_mss > tcb->rcv_mss)
        tcb->snd_mss = tcb->rcv_mss;

    // Initial congestion control and silly-window parameters
    tcb->snd_cwnd = 2*tcb->snd_mss;
    tcb->snd_ssthresh = max(tcb->snd_wnd, tcb->snd_cwnd);
    tcb->rcv_swsthresh = min(tcb->maxRecvBufsize>>1, tcb->snd_mss);

    TcbSetState(tcb, TCPST_ESTABLISHED, "Connected");
    tcb->flags |= PCBFLAG_CONNECTED;
    PcbSignalEvent(tcb, PCBEVENT_CONNECT);

    if (IsPendingConnReqTcb(tcb)) {
        // Tell the listening parent that a connection is ready
        // to be accepted
        PcbSignalEvent(tcb->parent, PCBEVENT_ACCEPT);
    }
}


PRIVATE BOOL
TcbAcceptConnReqActive(
    TCB* tcb,
    Packet* pkt,
    TcpHeader* tcphdr
    )

/*++

Routine Description:

    Handle a SYN that arrived on an actively opened socket.
    A SYN carrying an ACK completes the handshake; a bare SYN is
    a simultaneous open and is answered with our own SYN.

Arguments:

    tcb - TCP control block
    pkt - Points to the incoming packet
    tcphdr - Points to the TCP segment header

Return Value:

    FALSE if the incoming connection request is bad and
    the caller should send out a RST in response; TRUE otherwise

--*/

{
    NTSTATUS status;
    TcpOptions opts;

    // Reject malformed options, otherwise remember the peer's
    // initial sequence number and MSS
    if (!TcpParseOptions(pkt, tcphdr, &opts)) return FALSE;
    TcpSaveConnReqParams(tcb, tcphdr, &opts);

    if (!ISTCPSEG(tcphdr, ACK)) {
        // Both sides sent a SYN at the same time
        TcbSetState(tcb, TCPST_SYN_RECEIVED, "Simultaneous open");
        status = TcbEmitSYN(tcb, TRUE);
    } else {
        // SYN+ACK: we are connected; acknowledge the peer's SYN
        TcbConnectionEstablished(tcb, tcphdr);
        NeedSendACKNow(tcb);
        status = NETERR_OK;
    }

    return NT_SUCCESS(status);
}


VOID
TcpReceivePacket(
    Packet* pkt
    )

/*++

Routine Description:

    Receive a TCP segment: validate the header and checksum, find the
    socket it is addressed to and process it according to the
    connection state.

    The packet is always completed here, through one of three exits:
      rcvdone - segment was processed; an ACK is emitted if enough
                delayed-ACK credit has accumulated
      sendrst - segment is answered with a RST (unless it is a RST itself)
      discard - segment is dropped silently

Arguments:

    pkt - Points to the received TCP segment; on entry pkt->data and
        pkt->datalen cover the TCP header plus payload

Return Value:

    NONE

--*/

{
    IpHeader* iphdr;
    PseudoHeader pseudohdr;
    TcpHeader* tcphdr;
    UINT tcphdrlen;
    UINT checksum;
    TCB* tcb;
    BOOL isAck;
    TCPSEQ ack;

    // Verify TCP segment header
    // (nothing but pkt is used on the discard path, so jumping there
    // before iphdr/tcphdr are set is fine)
    if (pkt->datalen < TCPHDRLEN) goto discard;

    // Build the pseudo header needed for the TCP checksum
    iphdr = GETPKTIPHDR(pkt);
    pseudohdr.srcaddr = iphdr->srcaddr;
    pseudohdr.dstaddr = iphdr->dstaddr;
    pseudohdr.zero = 0;
    pseudohdr.protocol = IPPROTOCOL_TCP;
    pseudohdr.length = (WORD) HTONS(pkt->datalen);

    // The header length field must cover at least the fixed header
    // and must not exceed the received data
    tcphdr = GETPKTDATA(pkt, TcpHeader);
    tcphdrlen = GETTCPHDRLEN(tcphdr);
    if (tcphdrlen < TCPHDRLEN || tcphdrlen > pkt->datalen)
        goto discard;

    // Verify checksum (pseudo header + TCP header + payload)
    checksum = tcpipxsum(0, &pseudohdr, sizeof(pseudohdr));
    if (tcpipxsum(checksum, tcphdr, pkt->datalen) != 0xffff)
        goto discard;

    // From here on pkt->data / pkt->datalen describe the payload only
    pkt->data += tcphdrlen;
    pkt->datalen -= tcphdrlen;

    // Find the socket that the segment is addressed to
    tcb = (TCB*) PcbFindMatch(
                    pseudohdr.dstaddr,
                    tcphdr->dstport,
                    pseudohdr.srcaddr,
                    tcphdr->srcport,
                    SOCK_STREAM,
                    IPPROTOCOL_TCP);

    if (!tcb) goto sendrst;

    if (IsTcpSyncState(tcb) && (tcphdr->flags & TCP_CONTROLS) == 0) {
        // Quick check for special case:
        //  we're in a synchronized state and
        //  the segment has no control flags.
        tcb = TcbProcessDataAck(tcb, pkt, tcphdr);

        // The slow path below also ends up here via "goto rcvdone".
        // tcb is NULL when the TCB must not be touched any more.
rcvdone:
        // NOTE: We return the interface driver's buffer first
        // before trying to send out the ACK segment.

        XnetCompletePacket(pkt, NETERR_OK);
        if (tcb && tcb->delayedAcks >= (tcb->rcv_mss << 1)) {
            TcbEmitACK(tcb);
        }
        return;
    }

    // If the connection is closed, send RST
    if (tcb->tcpstate == TCPST_CLOSED) goto sendrst;

    // Discard packets with broadcast/multicast destination address.
    // We assume that packets with broadcast/multicast source address
    // are already discarded by the IP layer.
    if (IfBcastAddr(pkt->recvifp, pseudohdr.dstaddr) ||
        IS_MCAST_IPADDR(pseudohdr.dstaddr))
        goto discard;

    isAck = ISTCPSEG(tcphdr, ACK);

    switch (tcb->tcpstate) {
    case TCPST_LISTEN:

        // Ignore RST segment in listen state
        if (ISTCPSEG(tcphdr, RST)) goto discard;

        // If ACK is on, send RST
        if (isAck) goto sendrst;

        // If there is no SYN, discard the segment
        if (!ISTCPSEG(tcphdr, SYN)) goto discard;

        // Process an incoming connection request on a listening socket
        if (!TcbAcceptConnReqPassive(tcb, pkt, tcphdr)) goto sendrst;
        break;

    case TCPST_SYN_SENT:

        // If ACK is on and the acknowledgement number is bad, send RST
        if (isAck) {
            ack = SEG_ACK(tcphdr);
            if (!IsValidACK(tcb, ack)) goto sendrst;
        }

        if (ISTCPSEG(tcphdr, RST)) {
            // If RST is on and ACK is good, reset the connection.
            // Otherwise, discard the RST segment.
            if (isAck) { TcbReset(tcb, NETERR_CONNRESET); }
            goto discard;
        }

        // If there is no SYN, just discard the segment
        if (!ISTCPSEG(tcphdr, SYN)) goto discard;

        // SYN+ACK completes our active open;
        // a bare SYN is a simultaneous active open
        if (!TcbAcceptConnReqActive(tcb, pkt, tcphdr)) goto sendrst;
        break;

    default: {
        // All remaining states. SYN and FIN each occupy one
        // sequence number, which SEG_LEN is expected to account for.
        TCPSEQ oldseq0 = SEG_SEQ(tcphdr);
        TCPSEQ oldseq1 = oldseq0 + SEG_LEN(tcphdr, pkt->datalen);
        TCPSEQ seq0, seq1;

        // Make sure the sequence number is correct.
        // If not and the incoming segment is not RST, we'll emit an ACK.
        if (!TcbValidateSeqs(tcb, oldseq0, oldseq1, &seq0, &seq1)) {
            if (!ISTCPSEG(tcphdr, RST)) {
                NeedSendACKNow(tcb);
            }
            break;
        }

        // If RST is on, then we'll reset the connection:
        //  - if the socket corresponds to a pending connection
        //    request, then we'll just close it and destroy the TCB.
        // tcb is not used again after TcbReset on this path.
        if (ISTCPSEG(tcphdr, RST)) {
            TcbReset(tcb, NETERR_CONNRESET);
            goto discard;
        }

        // If SYN is set, there is an error.
        // We send out a RST as well as reset the connection,
        // but only when the SYN itself lies inside the receive window
        // (its sequence number was not clipped); otherwise the
        // segment is just dropped.
        if (ISTCPSEG(tcphdr, SYN)) {
            if (oldseq0 == seq0) {
                TcbReset(tcb, NETERR_CONNRESET);
                goto sendrst;
            }
            goto discard;
        }

        // Every segment past this point must carry an ACK
        if (!ISTCPSEG(tcphdr, ACK)) goto discard;

        if (tcb->tcpstate == TCPST_SYN_RECEIVED) {
            ack = SEG_ACK(tcphdr);
            if (!IsValidACK(tcb, ack)) {
                // The acknowledgement number is bad, emit an RST
                goto sendrst;
            }

            // NOTE: The ACK for our SYN will be processed
            // inside the TcbProcessDataAck call below.
            TcbConnectionEstablished(tcb, tcphdr);
        }

        // Process the data and acknowledgement information
        // and continue to process the FIN flag if necessary.
        // The FIN occupies the last sequence number of the segment.
        tcb = TcbProcessDataAck(tcb, pkt, tcphdr);
        if (tcb && ISTCPSEG(tcphdr, FIN)) {
            TcbProcessFIN(tcb, tcphdr, seq1-1);
        }
        }
        break;
    }
    goto rcvdone;

sendrst:

    // If the received segment is a RST, do nothing
    // (fall through to discard)
    if ((tcphdr->flags & TCP_RST) == 0) {
        TCPSEQ seq;
        BYTE flags;
        IpAddrPair addrpair;
        IfInfo* ifp;

        if (ISTCPSEG(tcphdr, ACK)) {
            // The offending segment has an ACK: the RST takes its
            // sequence number from that acknowledgement number
            ack = 0;
            seq = SEG_ACK(tcphdr);
            flags = 0;
        } else {
            // No ACK: send sequence number 0 and acknowledge
            // everything the offending segment occupied
            ack = SEG_SEQ(tcphdr) + SEG_LEN(tcphdr, pkt->datalen);
            seq = 0;
            flags  = TCP_ACK;
        }

        // Reply goes back to where the segment came from. Everything
        // needed from the packet is copied before it is completed.
        addrpair.dstaddr = iphdr->srcaddr;
        addrpair.srcaddr = iphdr->dstaddr;
        addrpair.dstport = tcphdr->srcport;
        addrpair.srcport = tcphdr->dstport;
        ifp = pkt->recvifp;

        // NOTE: We return the interface driver's buffer first
        // before trying to send out the RST segment.

        XnetCompletePacket(pkt, NETERR_DISCARDED);
        TcbEmitRST(ifp, &addrpair, seq, ack, flags);
        return;
    }

discard:
    XnetCompletePacket(pkt, NETERR_DISCARDED);
}