[net-next-2.6.git] / net / ipv4 / tcp_timer.c

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/module.h>
#include <net/tcp.h>

/* Tunables read by the timer code below.  The sysctl_ prefix suggests they
 * are exported through sysctl, but that registration is not in this file.
 */

/* Fallback SYN retransmit limit: tcp_write_timeout() uses it when the
 * socket's own icsk_syn_retries is zero. */
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 
/* Not referenced anywhere in this file. */
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 
/* Not read directly here; presumably consumed by keepalive_time_when(),
 * which is defined elsewhere -- confirm. */
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
/* Keepalive probe limit used by tcp_keepalive_timer() when the socket's
 * own tp->keepalive_probes is zero. */
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
/* Not read directly here; presumably consumed by keepalive_intvl_when(),
 * which is defined elsewhere -- confirm. */
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
/* Retransmit count at which tcp_write_timeout() calls dst_negative_advice()
 * (>=) and tcp_retransmit_timer() calls __sk_dst_reset() (>). */
int sysctl_tcp_retries1 = TCP_RETR1;
/* Retransmit limit for non-orphaned sockets outside the SYN states
 * (tcp_write_timeout()) and probe limit in tcp_probe_timer(). */
int sysctl_tcp_retries2 = TCP_RETR2;
/* Base retry count for orphaned sockets, read by tcp_orphan_retries().
 * No initializer, so it starts out as zero. */
int sysctl_tcp_orphan_retries;

/* Timer callbacks defined further down in this file; declared up front so
 * that tcp_init_xmit_timers() can refer to them.
 */
static void tcp_write_timer(unsigned long data);
static void tcp_delack_timer(unsigned long data);
static void tcp_keepalive_timer(unsigned long data);

/* Hook this file's three timer callbacks (write, delayed-ACK, keepalive)
 * up to @sk.  All of the work is delegated to inet_csk_init_xmit_timers().
 */
void tcp_init_xmit_timers(struct sock *sk)
{
	inet_csk_init_xmit_timers(sk, tcp_write_timer, tcp_delack_timer,
				  tcp_keepalive_timer);
}

EXPORT_SYMBOL(tcp_init_xmit_timers);

/* Give up on a connection after a timeout.
 *
 * The error stored on the socket is the recorded soft error when there is
 * one and ETIMEDOUT otherwise.  The socket's error-report callback is
 * invoked, the socket is passed to tcp_done(), and the abort-on-timeout
 * MIB counter is incremented.
 */
static void tcp_write_err(struct sock *sk)
{
	int err = sk->sk_err_soft;

	if (!err)
		err = ETIMEDOUT;

	sk->sk_err = err;
	sk->sk_error_report(sk);

	tcp_done(sk);
	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
}

/* Keep orphaned sockets from consuming all of our resources.
 *
 * This deliberately violates the TCP specs; it is needed to resist DoS
 * attacks.  The callers in this file invoke it from the retransmission
 * timeout and zero-window probe paths, for sockets flagged SOCK_DEAD.
 *
 * The criterion has not been confirmed experimentally and may change.
 * The socket is killed when either
 *   1. the (weighted) orphan count reaches the administratively
 *      configured limit, or
 *   2. this socket has more than SOCK_MIN_SNDBUF queued for write and
 *      the global TCP memory allocation is above sysctl_tcp_mem[2].
 *
 * @do_reset: non-zero if the caller wants a reset sent when the socket is
 *            killed.  Zero also doubles the orphan weight below, and may
 *            still be overridden to "send a reset" in the cases noted in
 *            the body.
 *
 * Returns 1 if the socket was killed, 0 if it was left alone.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int weight = atomic_read(&tcp_orphan_count);
	int exhausted;

	/* Penalize a peer that has kept the window shut, or to which we
	 * have sent nothing, for a long time.
	 */
	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
		weight <<= 1;

	/* A dubious ICMP has been seen: penalize once more. */
	if (sk->sk_err_soft)
		weight <<= 1;

	exhausted = (weight >= sysctl_tcp_max_orphans);
	if (!exhausted)
		exhausted = sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
			    atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2];

	if (!exhausted)
		return 0;

	if (net_ratelimit())
		printk(KERN_INFO "Out of socket memory\n");

	/* Exceptional cases in which the connection needs a reset anyway:
	 *   1. the last segment was sent recently;
	 *   2. the window is closed and nothing is in flight.
	 */
	if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN)
		do_reset = 1;
	else if (!tp->snd_wnd && !tp->packets_out)
		do_reset = 1;

	if (do_reset)
		tcp_send_active_reset(sk, GFP_ATOMIC);

	tcp_done(sk);
	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
	return 1;
}

/* Work out the maximum number of retries allowed on an orphaned socket.
 *
 * @alive: non-zero when the caller still considers the connection live.
 *
 * Starts from sysctl_tcp_orphan_retries (which may be zero):
 *  - alive: a zero setting is replaced by 8, a safe default that
 *    corresponds to >100 seconds with the minimal RTO of 200msec;
 *  - not alive: a recorded soft error (an ICMP told us something is
 *    wrong) forces the result to zero.
 */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
	const int configured = sysctl_tcp_orphan_retries;

	if (alive)
		return configured == 0 ? 8 : configured;

	return sk->sk_err_soft ? 0 : configured;
}

/* A write timeout has occurred: decide whether the connection must die.
 *
 * Returns 1 if the socket was aborted (through tcp_out_of_resources() or
 * tcp_write_err()), 0 if the caller may go on retransmitting.
 */
static int tcp_write_timeout(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const int in_handshake = (1 << sk->sk_state) &
				 (TCPF_SYN_SENT | TCPF_SYN_RECV);
	int limit;

	if (in_handshake) {
		if (icsk->icsk_retransmits)
			dst_negative_advice(&sk->sk_dst_cache);

		/* The per-socket SYN retry count wins when it is set. */
		if (icsk->icsk_syn_retries)
			limit = icsk->icsk_syn_retries;
		else
			limit = sysctl_tcp_syn_retries;
	} else {
		/* NOTE: draft-ietf-tcpimpl-pmtud-01.txt requires PMTU black
		 * hole detection, and this would be the place for it.  It is
		 * deliberately not implemented; the original author called
		 * it disgusting and unworkable, and quoted the draft itself:
		 *
		 *   "The one security concern raised by this memo is that
		 *   ICMP black holes are often caused by over-zealous
		 *   security administrators who block all ICMP messages.
		 *   It is vitally important that those who design and deploy
		 *   security systems understand the impact of strict
		 *   filtering on upper-layer protocols.  The safest web site
		 *   in the world is worthless if most TCP implementations
		 *   cannot transfer data from it.  It would be far nicer to
		 *   have all of the black holes fixed rather than fixing all
		 *   of the TCP implementations."
		 *
		 * Golden words :-).
		 */
		if (icsk->icsk_retransmits >= sysctl_tcp_retries1)
			dst_negative_advice(&sk->sk_dst_cache);

		limit = sysctl_tcp_retries2;

		if (sock_flag(sk, SOCK_DEAD)) {
			/* Orphan: use the orphan retry budget, and give the
			 * resource check a chance to kill the socket first.
			 */
			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
			int want_reset;

			limit = tcp_orphan_retries(sk, alive);
			want_reset = alive || icsk->icsk_retransmits < limit;

			if (tcp_out_of_resources(sk, want_reset))
				return 1;
		}
	}

	if (icsk->icsk_retransmits < limit)
		return 0;

	/* It has gone just too far. */
	tcp_write_err(sk);
	return 1;
}

static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		icsk->icsk_ack.blocked = 1;
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
		goto out_unlock;
	}

	sk_stream_mem_reclaim(sk);

	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
		goto out;

	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
		goto out;
	}
	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;

	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
		struct sk_buff *skb;

		NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);

		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk->sk_backlog_rcv(sk, skb);

		tp->ucopy.memory = 0;
	}

	if (inet_csk_ack_scheduled(sk)) {
		if (!icsk->icsk_ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			icsk->icsk_ack.pingpong = 0;
			icsk->icsk_ack.ato      = TCP_ATO_MIN;
		}
		tcp_send_ack(sk);
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
	}
	TCP_CHECK_TIMER(sk);

out:
	if (tcp_memory_pressure)
		sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

static void tcp_probe_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int max_probes;

	if (tp->packets_out || !sk->sk_send_head) {
		tp->probes_out = 0;
		return;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 * this behaviour in Solaris down as a bug fix. [AC]
	 *
	 * Let me to explain. probes_out is zeroed by incoming ACKs
	 * even if they advertise zero window. Hence, connection is killed only
	 * if we received no ACKs for normal connection timeout. It is not killed
	 * only because window stays zero for some time, window may be zero
	 * until armageddon and even later. We are in full accordance
	 * with RFCs, only probe timer combines both retransmission timeout
	 * and probe timeout in one bottle.				--ANK
	 */
	max_probes = sysctl_tcp_retries2;

	if (sock_flag(sk, SOCK_DEAD)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
 
		max_probes = tcp_orphan_retries(sk, alive);

		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
			return;
	}

	if (tp->probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}

/*
 *	The TCP retransmit timer.
 *
 *	Run from tcp_write_timer() when the pending event is
 *	ICSK_TIME_RETRANS.  Three outcomes are possible:
 *	  - the connection is aborted (tcp_write_err(), or inside
 *	    tcp_write_timeout()) and nothing is re-armed;
 *	  - the retransmit returned a positive value: the timer is re-armed
 *	    for a short interval without backing off;
 *	  - otherwise the RTO is doubled (capped at TCP_RTO_MAX) and the
 *	    timer is re-armed with it.
 */

static void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* Nothing in flight, so there is nothing to retransmit. */
	if (!tp->packets_out)
		goto out;

	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));

	/* Zero send window on a socket that is neither orphaned nor in a
	 * SYN state.
	 */
	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
		/* Receiver dastardly shrinks window. Our retransmits
		 * become zero probes, but we should not timeout this
		 * connection. If the socket is an orphan, time it out,
		 * we cannot allow such beasts to hang infinitely.
		 */
#ifdef TCP_DEBUG
		if (net_ratelimit()) {
			struct inet_sock *inet = inet_sk(sk);
			printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
			       NIPQUAD(inet->daddr), htons(inet->dport),
			       inet->num, tp->snd_una, tp->snd_nxt);
		}
#endif
		/* Give up only when rcv_tstamp is more than TCP_RTO_MAX old
		 * (presumably the time something was last received from the
		 * peer -- confirm where rcv_tstamp is updated).
		 */
		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
			tcp_write_err(sk);
			goto out;
		}
		tcp_enter_loss(sk, 0);
		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
		__sk_dst_reset(sk);
		/* Jumping straight to the re-arm skips the icsk_backoff and
		 * icsk_retransmits increments, but the RTO is still doubled.
		 */
		goto out_reset_timer;
	}

	/* Non-zero means the socket has already been aborted. */
	if (tcp_write_timeout(sk))
		goto out;

	/* First timeout for this loss episode: account it in the MIB
	 * counter matching the current congestion-avoidance state.
	 */
	if (icsk->icsk_retransmits == 0) {
		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
			if (tp->rx_opt.sack_ok) {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
			} else {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
			}
		} else if (tp->ca_state == TCP_CA_Loss) {
			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
		} else {
			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
		}
	}

	if (tcp_use_frto(sk)) {
		tcp_enter_frto(sk);
	} else {
		tcp_enter_loss(sk, 0);
	}

	/* Retransmit the skb at the head of the write queue. */
	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
		/* Retransmission failed because of local congestion,
		 * do not backoff.
		 */
		/* Still record that a retransmit is outstanding, so the
		 * first-timeout accounting above is not repeated.
		 */
		if (!icsk->icsk_retransmits)
			icsk->icsk_retransmits = 1;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
					  TCP_RTO_MAX);
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	icsk->icsk_backoff++;
	icsk->icsk_retransmits++;

out_reset_timer:
	/* Double the RTO, capped at TCP_RTO_MAX, and re-arm with it. */
	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
	/* Past tcp_retries1 retransmits, reset the socket's dst as well. */
	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
		__sk_dst_reset(sk);

out:;
}

/* Retransmit/probe timer callback.
 *
 * @data: the struct sock pointer, passed as an unsigned long.
 *
 * If the socket is owned by user context the timer is simply re-armed
 * HZ/20 ticks ahead.  Otherwise the pending event is cleared and handed
 * to tcp_retransmit_timer() or tcp_probe_timer().  Every path ends by
 * unlocking the socket and calling sock_put().
 */
static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct inet_connection_sock *icsk = inet_csk(sk);
	int pending;

	bh_lock_sock(sk);

	if (sock_owned_by_user(sk)) {
		/* Socket is busy: try again later. */
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
		goto out_unlock;
	}

	if (sk->sk_state == TCP_CLOSE)
		goto out;
	if (!icsk->icsk_pending)
		goto out;

	if (time_after(icsk->icsk_timeout, jiffies)) {
		/* Fired early: re-arm for the recorded deadline. */
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
		goto out;
	}

	/* Consume the event before dispatching; the handlers may arm a
	 * new one.
	 */
	pending = icsk->icsk_pending;
	icsk->icsk_pending = 0;

	if (pending == ICSK_TIME_RETRANS)
		tcp_retransmit_timer(sk);
	else if (pending == ICSK_TIME_PROBE0)
		tcp_probe_timer(sk);
	TCP_CHECK_TIMER(sk);

out:
	sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 *	Timer for listening sockets
 *
 *	Called from tcp_keepalive_timer() when the socket is in TCP_LISTEN.
 *	All of the work is delegated to inet_csk_reqsk_queue_prune() with
 *	TCP's interval and timeout constants.
 */

static void tcp_synack_timer(struct sock *sk)
{
	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}

/* Start or stop the keepalive timer to match a new keepalive setting.
 *
 * @val: non-zero to enable keepalive, zero to disable it.
 *
 * Nothing is done for sockets in CLOSE or LISTEN.  Disabling deletes the
 * timer; enabling arms it only when SOCK_KEEPOPEN is not already set.
 * The flag itself is not modified here -- presumably the caller updates
 * it afterwards (confirm against the call site).
 */
void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
		return;

	if (!val) {
		inet_csk_delete_keepalive_timer(sk);
		return;
	}

	if (!sock_flag(sk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
}


/* Keepalive timer callback; @data is the struct sock pointer passed as an
 * unsigned long.
 *
 * The one timer serves three purposes, chosen by socket state:
 *   - TCP_LISTEN: run tcp_synack_timer();
 *   - orphaned (SOCK_DEAD) TCP_FIN_WAIT2: move to time-wait for the
 *     remaining FIN timeout, or send a reset and tcp_done() the socket;
 *   - otherwise, when SOCK_KEEPOPEN is set: keepalive probing.
 *
 * Only the "resched" path re-arms the timer.  Every path ends by
 * unlocking the socket and calling sock_put().
 */
static void tcp_keepalive_timer (unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */ 
		inet_csk_reset_keepalive_timer (sk, HZ/20);
		goto out;
	}

	if (sk->sk_state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}

	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
		if (tp->linger2 >= 0) {
			/* Part of the FIN timeout that exceeds TCP_TIMEWAIT_LEN. */
			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		/* Negative linger2, or no time left: reset and destroy. */
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}

	/* Keepalive not enabled, or socket already closed: do not re-arm. */
	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
		goto out;

	/* Re-arm interval used if we jump straight to resched below. */
	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || sk->sk_send_head)
		goto resched;

	/* From here on "elapsed" is first the time since rcv_tstamp, then
	 * it is overwritten with the delay until the next run.
	 */
	elapsed = tcp_time_stamp - tp->rcv_tstamp;

	if (elapsed >= keepalive_time_when(tp)) {
		/* Probe limit: the per-socket keepalive_probes when non-zero,
		 * otherwise sysctl_tcp_keepalive_probes.
		 */
		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
		     (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
			/* Too many probes sent: reset the peer and abort.
			 * The timer is not re-armed.
			 */
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		/* A result <= 0 is counted as a probe sent. */
		if (tcp_write_wakeup(sk) <= 0) {
			tp->probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
		elapsed = keepalive_time_when(tp) - elapsed;
	}

	TCP_CHECK_TIMER(sk);
	sk_stream_mem_reclaim(sk);

resched:
	inet_csk_reset_keepalive_timer (sk, elapsed);
	goto out;

death:	
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
Commit	Line	Data
1da177e4 LT	1	/*
	2	* INET An implementation of the TCP/IP protocol suite for the LINUX
	3	* operating system. INET is implemented using the BSD Socket
	4	* interface as the means of communication with the user level.
	5	*
	6	* Implementation of the Transmission Control Protocol(TCP).
	7	*
	8	* Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
	9	*
02c30a84	10	* Authors: Ross Biro
1da177e4 LT	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
	12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
	13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
	14	* Florian La Roche, <flla@stud.uni-sb.de>
	15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
	16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
	17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
	18	* Matthew Dillon, <dillon@apollo.west.oic.com>
	19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
	20	* Jorge Cwik, <jorge@laser.satlink.net>
	21	*/
	22
	23	#include <linux/module.h>
	24	#include <net/tcp.h>
	25
	26	int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	27	int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	28	int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	29	int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	30	int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
	31	int sysctl_tcp_retries1 = TCP_RETR1;
	32	int sysctl_tcp_retries2 = TCP_RETR2;
	33	int sysctl_tcp_orphan_retries;
	34
	35	static void tcp_write_timer(unsigned long);
	36	static void tcp_delack_timer(unsigned long);
	37	static void tcp_keepalive_timer (unsigned long data);
	38
463c84b9 ACM	39	void tcp_init_xmit_timers(struct sock *sk)
	40	{
	41	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
	42	&tcp_keepalive_timer);
	43	}
	44
3f421baa ACM	45	EXPORT_SYMBOL(tcp_init_xmit_timers);
3f421baa ACM	46
1da177e4 LT	47	static void tcp_write_err(struct sock *sk)
	48	{
	49	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
	50	sk->sk_error_report(sk);
	51
	52	tcp_done(sk);
	53	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
	54	}
	55
	56	/* Do not allow orphaned sockets to eat all our resources.
	57	* This is direct violation of TCP specs, but it is required
	58	* to prevent DoS attacks. It is called when a retransmission timeout
	59	* or zero probe timeout occurs on orphaned socket.
	60	*
	61	* Criterium is still not confirmed experimentally and may change.
	62	* We kill the socket, if:
	63	* 1. If number of orphaned sockets exceeds an administratively configured
	64	* limit.
	65	* 2. If we have strong memory pressure.
	66	*/
	67	static int tcp_out_of_resources(struct sock *sk, int do_reset)
	68	{
	69	struct tcp_sock *tp = tcp_sk(sk);
	70	int orphans = atomic_read(&tcp_orphan_count);
	71
	72	/* If peer does not open window for long time, or did not transmit
	73	* anything for long time, penalize it. */
	74	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX \|\| !do_reset)
	75	orphans <<= 1;
	76
	77	/* If some dubious ICMP arrived, penalize even more. */
	78	if (sk->sk_err_soft)
	79	orphans <<= 1;
	80
	81	if (orphans >= sysctl_tcp_max_orphans \|\|
	82	(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
	83	atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
	84	if (net_ratelimit())
	85	printk(KERN_INFO "Out of socket memory\n");
	86
	87	/* Catch exceptional cases, when connection requires reset.
	88	* 1. Last segment was sent recently. */
	89	if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN \|\|
	90	/* 2. Window is closed. */
	91	(!tp->snd_wnd && !tp->packets_out))
	92	do_reset = 1;
	93	if (do_reset)
	94	tcp_send_active_reset(sk, GFP_ATOMIC);
	95	tcp_done(sk);
	96	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
	97	return 1;
	98	}
	99	return 0;
	100	}
	101
	102	/* Calculate maximal number or retries on an orphaned socket. */
	103	static int tcp_orphan_retries(struct sock *sk, int alive)
	104	{
	105	int retries = sysctl_tcp_orphan_retries; /* May be zero. */
	106
	107	/* We know from an ICMP that something is wrong. */
	108	if (sk->sk_err_soft && !alive)
	109	retries = 0;
	110
111	/* However, if socket sent something recently, select some safe
112	* number of retries. 8 corresponds to >100 seconds with minimal
113	* RTO of 200msec. */
114	if (retries == 0 && alive)
115	retries = 8;
116	return retries;
117	}
118
119	/* A write timeout has occurred. Process the after effects. */
120	static int tcp_write_timeout(struct sock *sk)
121	{
463c84b9	122	const struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 LT	123	int retry_until;
	124
	125	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV)) {
463c84b9	126	if (icsk->icsk_retransmits)
1da177e4	127	dst_negative_advice(&sk->sk_dst_cache);
463c84b9	128	retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
1da177e4	129	} else {
463c84b9	130	if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
1da177e4 LT	131	/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
	132	hole detection. :-(
	133
	134	It is place to make it. It is not made. I do not want
	135	to make it. It is disguisting. It does not work in any
	136	case. Let me to cite the same draft, which requires for
	137	us to implement this:
	138
	139	"The one security concern raised by this memo is that ICMP black holes
	140	are often caused by over-zealous security administrators who block
	141	all ICMP messages. It is vitally important that those who design and
	142	deploy security systems understand the impact of strict filtering on
	143	upper-layer protocols. The safest web site in the world is worthless
	144	if most TCP implementations cannot transfer data from it. It would
	145	be far nicer to have all of the black holes fixed rather than fixing
	146	all of the TCP implementations."
	147
	148	Golden words :-).
	149	*/
	150
	151	dst_negative_advice(&sk->sk_dst_cache);
	152	}
	153
	154	retry_until = sysctl_tcp_retries2;
	155	if (sock_flag(sk, SOCK_DEAD)) {
463c84b9	156	const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
1da177e4 LT	157
	158	retry_until = tcp_orphan_retries(sk, alive);
	159
463c84b9	160	if (tcp_out_of_resources(sk, alive \|\| icsk->icsk_retransmits < retry_until))
1da177e4 LT	161	return 1;
	162	}
	163	}
	164
463c84b9	165	if (icsk->icsk_retransmits >= retry_until) {
1da177e4 LT	166	/* Has it gone just too far? */
	167	tcp_write_err(sk);
	168	return 1;
	169	}
	170	return 0;
	171	}
	172
	173	static void tcp_delack_timer(unsigned long data)
	174	{
	175	struct sock *sk = (struct sock*)data;
	176	struct tcp_sock *tp = tcp_sk(sk);
463c84b9	177	struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 LT	178
	179	bh_lock_sock(sk);
	180	if (sock_owned_by_user(sk)) {
	181	/* Try again later. */
463c84b9	182	icsk->icsk_ack.blocked = 1;
1da177e4	183	NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
463c84b9	184	sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
1da177e4 LT	185	goto out_unlock;
	186	}
	187
	188	sk_stream_mem_reclaim(sk);
	189
463c84b9	190	if (sk->sk_state == TCP_CLOSE \|\| !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
1da177e4 LT	191	goto out;
1da177e4 LT	192
463c84b9 ACM	193	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
463c84b9 ACM	194	sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
1da177e4 LT	195	goto out;
1da177e4 LT	196	}
463c84b9	197	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
1da177e4	198
b03efcfb	199	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1da177e4 LT	200	struct sk_buff *skb;
1da177e4 LT	201
b03efcfb	202	NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
1da177e4 LT	203
	204	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
	205	sk->sk_backlog_rcv(sk, skb);
	206
	207	tp->ucopy.memory = 0;
	208	}
	209
463c84b9 ACM	210	if (inet_csk_ack_scheduled(sk)) {
463c84b9 ACM	211	if (!icsk->icsk_ack.pingpong) {
1da177e4	212	/* Delayed ACK missed: inflate ATO. */
463c84b9	213	icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
1da177e4 LT	214	} else {
	215	/* Delayed ACK missed: leave pingpong mode and
	216	* deflate ATO.
	217	*/
463c84b9 ACM	218	icsk->icsk_ack.pingpong = 0;
463c84b9 ACM	219	icsk->icsk_ack.ato = TCP_ATO_MIN;
1da177e4 LT	220	}
	221	tcp_send_ack(sk);
	222	NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
	223	}
	224	TCP_CHECK_TIMER(sk);
	225
	226	out:
	227	if (tcp_memory_pressure)
	228	sk_stream_mem_reclaim(sk);
	229	out_unlock:
	230	bh_unlock_sock(sk);
	231	sock_put(sk);
	232	}
	233
	234	static void tcp_probe_timer(struct sock *sk)
	235	{
	236	struct tcp_sock *tp = tcp_sk(sk);
	237	int max_probes;
	238
	239	if (tp->packets_out \|\| !sk->sk_send_head) {
	240	tp->probes_out = 0;
	241	return;
	242	}
	243
	244	/* *WARNING* RFC 1122 forbids this
	245	*
	246	* It doesn't AFAIK, because we kill the retransmit timer -AK
	247	*
	248	* FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	249	* this behaviour in Solaris down as a bug fix. [AC]
	250	*
	251	* Let me to explain. probes_out is zeroed by incoming ACKs
	252	* even if they advertise zero window. Hence, connection is killed only
	253	* if we received no ACKs for normal connection timeout. It is not killed
	254	* only because window stays zero for some time, window may be zero
	255	* until armageddon and even later. We are in full accordance
	256	* with RFCs, only probe timer combines both retransmission timeout
	257	* and probe timeout in one bottle. --ANK
	258	*/
	259	max_probes = sysctl_tcp_retries2;
	260
	261	if (sock_flag(sk, SOCK_DEAD)) {
463c84b9 ACM	262	const struct inet_connection_sock *icsk = inet_csk(sk);
463c84b9 ACM	263	const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
1da177e4 LT	264
	265	max_probes = tcp_orphan_retries(sk, alive);
	266
	267	if (tcp_out_of_resources(sk, alive \|\| tp->probes_out <= max_probes))
	268	return;
	269	}
	270
	271	if (tp->probes_out > max_probes) {
	272	tcp_write_err(sk);
	273	} else {
	274	/* Only send another probe if we didn't close things up. */
	275	tcp_send_probe0(sk);
	276	}
	277	}
	278
	279	/*
	280	* The TCP retransmit timer.
	281	*/
	282
	283	static void tcp_retransmit_timer(struct sock *sk)
	284	{
	285	struct tcp_sock *tp = tcp_sk(sk);
463c84b9	286	struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 LT	287
	288	if (!tp->packets_out)
	289	goto out;
	290
	291	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
	292
	293	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	294	!((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))) {
	295	/* Receiver dastardly shrinks window. Our retransmits
	296	* become zero probes, but we should not timeout this
	297	* connection. If the socket is an orphan, time it out,
	298	* we cannot allow such beasts to hang infinitely.
	299	*/
	300	#ifdef TCP_DEBUG
	301	if (net_ratelimit()) {
	302	struct inet_sock *inet = inet_sk(sk);
	303	printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
	304	NIPQUAD(inet->daddr), htons(inet->dport),
	305	inet->num, tp->snd_una, tp->snd_nxt);
	306	}
	307	#endif
	308	if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
	309	tcp_write_err(sk);
	310	goto out;
	311	}
	312	tcp_enter_loss(sk, 0);
	313	tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
	314	__sk_dst_reset(sk);
	315	goto out_reset_timer;
	316	}
	317
	318	if (tcp_write_timeout(sk))
	319	goto out;
	320
463c84b9	321	if (icsk->icsk_retransmits == 0) {
1da177e4 LT	322	if (tp->ca_state == TCP_CA_Disorder \|\| tp->ca_state == TCP_CA_Recovery) {
	323	if (tp->rx_opt.sack_ok) {
	324	if (tp->ca_state == TCP_CA_Recovery)
	325	NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
	326	else
	327	NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
	328	} else {
	329	if (tp->ca_state == TCP_CA_Recovery)
	330	NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
	331	else
	332	NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
	333	}
	334	} else if (tp->ca_state == TCP_CA_Loss) {
	335	NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
	336	} else {
	337	NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
	338	}
	339	}
	340
	341	if (tcp_use_frto(sk)) {
	342	tcp_enter_frto(sk);
	343	} else {
	344	tcp_enter_loss(sk, 0);
	345	}
	346
	347	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
	348	/* Retransmission failed because of local congestion,
	349	* do not backoff.
	350	*/
463c84b9 ACM	351	if (!icsk->icsk_retransmits)
	352	icsk->icsk_retransmits = 1;
	353	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3f421baa ACM	354	min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
3f421baa ACM	355	TCP_RTO_MAX);
1da177e4 LT	356	goto out;
	357	}
	358
	359	/* Increase the timeout each time we retransmit. Note that
	360	* we do not increase the rtt estimate. rto is initialized
	361	* from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
	362	* that doubling rto each time is the least we can get away with.
	363	* In KA9Q, Karn uses this for the first few times, and then
	364	* goes to quadratic. netBSD doubles, but only goes up to *64,
	365	* and clamps at 1 to 64 sec afterwards. Note that 120 sec is
	366	* defined in the protocol as the maximum possible RTT. I guess
	367	* we'll have to use something other than TCP to talk to the
	368	* University of Mars.
	369	*
	370	* PAWS allows us longer timeouts and large windows, so once
	371	* implemented ftp to mars will work nicely. We will have to fix
	372	* the 120 second clamps though!
	373	*/
463c84b9 ACM	374	icsk->icsk_backoff++;
463c84b9 ACM	375	icsk->icsk_retransmits++;
1da177e4 LT	376
1da177e4 LT	377	out_reset_timer:
463c84b9	378	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
3f421baa	379	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
463c84b9	380	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
1da177e4 LT	381	__sk_dst_reset(sk);
	382
	383	out:;
	384	}
	385
	386	static void tcp_write_timer(unsigned long data)
	387	{
	388	struct sock *sk = (struct sock*)data;
463c84b9	389	struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 LT	390	int event;
	391
	392	bh_lock_sock(sk);
	393	if (sock_owned_by_user(sk)) {
	394	/* Try again later */
463c84b9	395	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
1da177e4 LT	396	goto out_unlock;
	397	}
	398
463c84b9	399	if (sk->sk_state == TCP_CLOSE \|\| !icsk->icsk_pending)
1da177e4 LT	400	goto out;
1da177e4 LT	401
463c84b9 ACM	402	if (time_after(icsk->icsk_timeout, jiffies)) {
463c84b9 ACM	403	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
1da177e4 LT	404	goto out;
	405	}
	406
463c84b9 ACM	407	event = icsk->icsk_pending;
463c84b9 ACM	408	icsk->icsk_pending = 0;
1da177e4 LT	409
1da177e4 LT	410	switch (event) {
463c84b9	411	case ICSK_TIME_RETRANS:
1da177e4 LT	412	tcp_retransmit_timer(sk);
1da177e4 LT	413	break;
463c84b9	414	case ICSK_TIME_PROBE0:
1da177e4 LT	415	tcp_probe_timer(sk);
	416	break;
	417	}
	418	TCP_CHECK_TIMER(sk);
	419
	420	out:
	421	sk_stream_mem_reclaim(sk);
	422	out_unlock:
	423	bh_unlock_sock(sk);
	424	sock_put(sk);
	425	}
	426
295f7324 ACM	427	/*
	428	* Timer for listening sockets
	429	*/
	430
	431	static void tcp_synack_timer(struct sock *sk)
	432	{
a019d6fe ACM	433	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
a019d6fe ACM	434	TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1da177e4 LT	435	}
1da177e4 LT	436
1da177e4 LT	437	void tcp_set_keepalive(struct sock *sk, int val)
	438	{
	439	if ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN))
	440	return;
	441
	442	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
463c84b9	443	inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
1da177e4	444	else if (!val)
463c84b9	445	inet_csk_delete_keepalive_timer(sk);
1da177e4 LT	446	}
	447
	448
	449	static void tcp_keepalive_timer (unsigned long data)
	450	{
	451	struct sock *sk = (struct sock *) data;
	452	struct tcp_sock *tp = tcp_sk(sk);
	453	__u32 elapsed;
	454
	455	/* Only process if socket is not in use. */
	456	bh_lock_sock(sk);
	457	if (sock_owned_by_user(sk)) {
	458	/* Try again later. */
463c84b9	459	inet_csk_reset_keepalive_timer (sk, HZ/20);
1da177e4 LT	460	goto out;
	461	}
	462
	463	if (sk->sk_state == TCP_LISTEN) {
	464	tcp_synack_timer(sk);
	465	goto out;
	466	}
	467
	468	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
	469	if (tp->linger2 >= 0) {
463c84b9	470	const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
1da177e4 LT	471
	472	if (tmo > 0) {
	473	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
	474	goto out;
	475	}
	476	}
	477	tcp_send_active_reset(sk, GFP_ATOMIC);
	478	goto death;
	479	}
	480
	481	if (!sock_flag(sk, SOCK_KEEPOPEN) \|\| sk->sk_state == TCP_CLOSE)
	482	goto out;
	483
	484	elapsed = keepalive_time_when(tp);
	485
	486	/* It is alive without keepalive 8) */
	487	if (tp->packets_out \|\| sk->sk_send_head)
	488	goto resched;
	489
	490	elapsed = tcp_time_stamp - tp->rcv_tstamp;
	491
	492	if (elapsed >= keepalive_time_when(tp)) {
	493	if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) \|\|
	494	(tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
	495	tcp_send_active_reset(sk, GFP_ATOMIC);
	496	tcp_write_err(sk);
	497	goto out;
	498	}
	499	if (tcp_write_wakeup(sk) <= 0) {
	500	tp->probes_out++;
	501	elapsed = keepalive_intvl_when(tp);
	502	} else {
	503	/* If keepalive was lost due to local congestion,
	504	* try harder.
	505	*/
	506	elapsed = TCP_RESOURCE_PROBE_INTERVAL;
	507	}
	508	} else {
	509	/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
	510	elapsed = keepalive_time_when(tp) - elapsed;
	511	}
	512
	513	TCP_CHECK_TIMER(sk);
	514	sk_stream_mem_reclaim(sk);
	515
	516	resched:
463c84b9	517	inet_csk_reset_keepalive_timer (sk, elapsed);
1da177e4 LT	518	goto out;
	519
	520	death:
	521	tcp_done(sk);
	522
	523	out:
	524	bh_unlock_sock(sk);
	525	sock_put(sk);
	526	}