patch-2.3.15 linux/net/ipv4/tcp_timer.c
- Lines: 626
- Date: Mon Aug 23 10:01:02 1999
- Orig file: v2.3.14/linux/net/ipv4/tcp_timer.c
- Orig date: Sat Jul 3 17:57:23 1999
diff -u --recursive --new-file v2.3.14/linux/net/ipv4/tcp_timer.c linux/net/ipv4/tcp_timer.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_timer.c,v 1.65 1999/07/02 11:26:35 davem Exp $
+ * Version: $Id: tcp_timer.c,v 1.66 1999/08/20 11:06:10 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -28,9 +28,9 @@
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
+
static void tcp_sltimer_handler(unsigned long);
static void tcp_syn_recv_timer(unsigned long);
-static void tcp_keepalive(unsigned long data);
static void tcp_twkill(unsigned long);
struct timer_list tcp_slow_timer = {
@@ -42,7 +42,6 @@
struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
{ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */
- {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */
{ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */
};
@@ -77,6 +76,7 @@
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ spin_lock_bh(&sk->timer_lock);
switch (what) {
case TIME_RETRANS:
/* When setting the transmit timer the probe timer
@@ -84,16 +84,26 @@
* The delayed ack timer can be set if we are changing the
* retransmit timer when removing acked frames.
*/
- if(tp->probe_timer.prev)
- del_timer(&tp->probe_timer);
+ if(tp->probe_timer.prev && del_timer(&tp->probe_timer))
+ __sock_put(sk);
+ if (!tp->retransmit_timer.prev || !del_timer(&tp->retransmit_timer))
+ sock_hold(sk);
+ if (when > 120*HZ) {
+ printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk));
+ when = 120*HZ;
+ }
mod_timer(&tp->retransmit_timer, jiffies+when);
break;
case TIME_DACK:
+ if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer))
+ sock_hold(sk);
mod_timer(&tp->delack_timer, jiffies+when);
break;
case TIME_PROBE0:
+ if (!tp->probe_timer.prev || !del_timer(&tp->probe_timer))
+ sock_hold(sk);
mod_timer(&tp->probe_timer, jiffies+when);
break;
@@ -104,40 +114,44 @@
default:
printk(KERN_DEBUG "bug: unknown timer value\n");
};
+ spin_unlock_bh(&sk->timer_lock);
}
void tcp_clear_xmit_timers(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- if(tp->retransmit_timer.prev)
- del_timer(&tp->retransmit_timer);
- if(tp->delack_timer.prev)
- del_timer(&tp->delack_timer);
- if(tp->probe_timer.prev)
- del_timer(&tp->probe_timer);
+ spin_lock_bh(&sk->timer_lock);
+ if(tp->retransmit_timer.prev && del_timer(&tp->retransmit_timer))
+ __sock_put(sk);
+ if(tp->delack_timer.prev && del_timer(&tp->delack_timer))
+ __sock_put(sk);
+ if(tp->probe_timer.prev && del_timer(&tp->probe_timer))
+ __sock_put(sk);
+ if(sk->timer.prev && del_timer(&sk->timer))
+ __sock_put(sk);
+ spin_unlock_bh(&sk->timer_lock);
}
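
The sock_hold()/__sock_put() pairs added in the two hunks above enforce a single invariant: every pending timer owns exactly one reference on its socket, so the socket cannot be freed while a timer may still fire. A minimal userland sketch of that discipline (hypothetical names and C11 atomics, not the kernel API) could look like this:

#include <stdatomic.h>
#include <stdbool.h>

struct model_sock {
	atomic_int refcnt;
	bool timer_pending;        /* stands in for xxx_timer.prev != NULL */
};

static void hold(struct model_sock *sk) { atomic_fetch_add(&sk->refcnt, 1); }
static void put(struct model_sock *sk)  { atomic_fetch_sub(&sk->refcnt, 1); }

/* Arm or re-arm: if a pending timer is cancelled its reference is
 * recycled for the new arming; otherwise a fresh one is taken.
 * Mirrors "if (!prev || !del_timer()) sock_hold()" above. */
static void model_reset_timer(struct model_sock *sk)
{
	if (!sk->timer_pending)
		hold(sk);
	sk->timer_pending = true;  /* mod_timer() in the real code */
}

/* Disarm: drop the reference only if a pending timer was actually
 * cancelled.  Mirrors "if (prev && del_timer()) __sock_put()". */
static void model_clear_timer(struct model_sock *sk)
{
	if (sk->timer_pending) {
		sk->timer_pending = false;
		put(sk);
	}
}

The corresponding handler-side rule, visible in the timer functions later in this patch, is that each firing consumes the reference it was armed with via a final sock_put().
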
-static int tcp_write_err(struct sock *sk, int force)
+static void tcp_write_err(struct sock *sk, int force)
{
sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
sk->error_report(sk);
-
+
tcp_clear_xmit_timers(sk);
-
- /* Time wait the socket. */
- if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
- tcp_time_wait(sk);
- } else {
- /* Clean up time. */
- tcp_set_state(sk, TCP_CLOSE);
- return 0;
- }
- return 1;
+
+ /* Do not time-wait the socket. It has timed out and has, hence,
+ * been idle for 120*HZ. The "force" argument is ignored; delete
+ * it eventually.
+ */
+
+ /* Clean up time. */
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_done(sk);
}
/* A write timeout has occurred. Process the after effects. */
-static int tcp_write_timeout(struct sock *sk)
+static void tcp_write_timeout(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -145,6 +159,26 @@
if ((sk->state == TCP_ESTABLISHED &&
tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
(sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
+ /* NOTE: draft-ietf-tcpimpl-pmtud-01.txt requires PMTU black
+ hole detection. :-(
+
+ This is the place to implement it. It is not implemented. I do
+ not want to implement it. It is disgusting. It does not work in
+ any case. Let me cite the very draft that requires us to
+ implement it:
+
+ "The one security concern raised by this memo is that ICMP black holes
+ are often caused by over-zealous security administrators who block
+ all ICMP messages. It is vitally important that those who design and
+ deploy security systems understand the impact of strict filtering on
+ upper-layer protocols. The safest web site in the world is worthless
+ if most TCP implementations cannot transfer data from it. It would
+ be far nicer to have all of the black holes fixed rather than fixing
+ all of the TCP implementations."
+
+ Golden words :-).
+ */
+
dst_negative_advice(&sk->dst_cache);
}
@@ -152,14 +186,10 @@
if(tp->retransmits > sysctl_tcp_syn_retries && sk->state==TCP_SYN_SENT) {
tcp_write_err(sk, 1);
/* Don't FIN, we got nothing back */
- return 0;
+ } else if (tp->retransmits > sysctl_tcp_retries2) {
+ /* Has it gone just too far? */
+ tcp_write_err(sk, 0);
}
-
- /* Has it gone just too far? */
- if (tp->retransmits > sysctl_tcp_retries2)
- return tcp_write_err(sk, 0);
-
- return 1;
}
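
Condensed, the policy in tcp_write_timeout() now reads: after more than sysctl_tcp_retries1 retransmits, ask routing for negative advice (a different path may still help); abandon an unanswered SYN after sysctl_tcp_syn_retries attempts; kill anything else after sysctl_tcp_retries2. A hypothetical sketch of that ladder, with thresholds passed in since all three are sysctl-tunable (the real function renegotiates the route on the ESTABLISHED path only every TCP_QUICK_TRIES attempts):

static void renegotiate_route(void) { /* dst_negative_advice() */ }

static int write_timed_out(int syn_sent, int retransmits,
			   int retries1, int retries2, int syn_retries)
{
	if (retransmits > retries1)
		renegotiate_route();  /* a different route may still help */

	if (syn_sent && retransmits > syn_retries)
		return 1;             /* SYN never answered: give up, no FIN */
	if (retransmits > retries2)
		return 1;             /* data retransmits exhausted */
	return 0;                     /* keep retrying */
}
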
void tcp_delack_timer(unsigned long data)
@@ -167,15 +197,20 @@
struct sock *sk = (struct sock*)data;
bh_lock_sock(sk);
+ if (sk->lock.users) {
+ /* Try again later. */
+ tcp_reset_xmit_timer(sk, TIME_DACK, HZ/5);
+ goto out_unlock;
+ }
+
if(!sk->zapped &&
sk->tp_pinfo.af_tcp.delayed_acks &&
- sk->state != TCP_CLOSE) {
- if (!sk->lock.users)
- tcp_send_ack(sk);
- else
- tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10);
- }
+ sk->state != TCP_CLOSE)
+ tcp_send_ack(sk);
+
+out_unlock:
bh_unlock_sock(sk);
+ sock_put(sk);
}
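
The reworked tcp_delack_timer() shows the handler shape that the rest of this patch repeats: if a process context holds the socket lock, do no work now and re-arm for a short retry; on every exit path, drop the reference this firing consumed. A hypothetical, self-contained sketch of that shape (stub functions stand in for the kernel primitives):

#include <stdbool.h>

#define MODEL_HZ 100                   /* a typical HZ of the era */

struct busy_sock {
	bool owned_by_user;            /* sk->lock.users != 0 */
};

/* Hypothetical stubs for the primitives the real handler uses. */
static void rearm_timer(struct busy_sock *sk, long delay) { (void)sk; (void)delay; }
static void send_ack(struct busy_sock *sk)                { (void)sk; }
static void drop_ref(struct busy_sock *sk)                { (void)sk; }

static void delack_handler(struct busy_sock *sk)
{
	/* bh_lock_sock() is taken here in the real handler */
	if (sk->owned_by_user) {
		rearm_timer(sk, MODEL_HZ / 5);  /* busy: try again shortly */
		goto out;
	}
	send_ack(sk);                           /* the tcp_send_ack() path */
out:
	/* bh_unlock_sock() is released here */
	drop_ref(sk);      /* every firing consumes one socket reference */
}
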
void tcp_probe_timer(unsigned long data)
@@ -183,79 +218,50 @@
struct sock *sk = (struct sock*)data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- if(sk->zapped)
- return;
-
+ if(sk->zapped)
+ goto out;
+
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later. */
tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
- bh_unlock_sock(sk);
- return;
+ goto out_unlock;
}
- /* *WARNING* RFC 1122 forbids this
+ /* *WARNING* RFC 1122 forbids this
+ *
* It doesn't AFAIK, because we kill the retransmit timer -AK
+ *
* FIXME: We ought not to do it; Solaris 2.5 actually has fixing
* this behaviour down as a bug fix. [AC]
+ *
+ * Let me explain. probes_out is zeroed by incoming ACKs even
+ * if they advertise a zero window. Hence, the connection is killed
+ * only if we received no ACKs at all for the normal connection timeout.
+ * It is not killed merely because the window stays zero for some time;
+ * the window may stay zero until armageddon and even later. We are in
+ * full accordance with the RFCs; our probe timer merely combines the
+ * retransmission timeout and the probe timeout in one bottle. --ANK
*/
if (tp->probes_out > sysctl_tcp_retries2) {
- if(sk->err_soft)
- sk->err = sk->err_soft;
- else
- sk->err = ETIMEDOUT;
- sk->error_report(sk);
-
- if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
- /* Time wait the socket. */
- tcp_time_wait(sk);
- } else {
- /* Clean up time. */
- tcp_set_state(sk, TCP_CLOSE);
- }
+ tcp_write_err(sk, 0);
} else {
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
}
+out_unlock:
bh_unlock_sock(sk);
+out:
+ sock_put(sk);
}
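
The point of the ANK comment can be stated as an invariant: probes_out counts consecutive *unanswered* zero-window probes, and any incoming ACK resets it, so a peer that keeps ACKing is never killed no matter how long its window stays zero. A toy model of that accounting, assuming the semantics described in the comment:

struct probe_model {
	int probes_out;    /* consecutive unanswered zero-window probes */
};

/* Any ACK, even one advertising a zero window, proves the peer is
 * alive and resets the counter. */
static void probe_model_ack(struct probe_model *pm)
{
	pm->probes_out = 0;
}

/* On each probe-timer firing: give up only after retries2 probes in
 * a row drew no ACK at all; otherwise send another probe. */
static int probe_model_timeout(struct probe_model *pm, int retries2)
{
	if (pm->probes_out > retries2)
		return -1;     /* tcp_write_err(): connection is dead */
	pm->probes_out++;      /* accounted as the probe goes out */
	return 0;              /* tcp_send_probe0() */
}
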
-static __inline__ int tcp_keepopen_proc(struct sock *sk)
-{
- int res = 0;
-
- if ((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) {
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
-
- if (elapsed >= sysctl_tcp_keepalive_time) {
- if (tp->probes_out > sysctl_tcp_keepalive_probes) {
- if(sk->err_soft)
- sk->err = sk->err_soft;
- else
- sk->err = ETIMEDOUT;
-
- tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- if (!sk->dead)
- sk->state_change(sk);
- } else {
- tp->probes_out++;
- tp->pending = TIME_KEEPOPEN;
- tcp_write_wakeup(sk);
- res = 1;
- }
- }
- }
- return res;
-}
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
int tcp_tw_death_row_slot = 0;
static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] =
{ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
-extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
static void tcp_twkill(unsigned long data)
{
@@ -263,17 +269,20 @@
int killed = 0;
/* The death-row tw chains are only ever touched
- * in BH context so no locking is needed.
+ * in BH context so no BH disabling (for now) is needed.
*/
+ spin_lock(&tw_death_lock);
tw = tcp_tw_death_row[tcp_tw_death_row_slot];
tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
tcp_tw_death_row_slot =
((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
+ spin_unlock(&tw_death_lock);
while(tw != NULL) {
struct tcp_tw_bucket *next = tw->next_death;
tcp_timewait_kill(tw);
+ tcp_tw_put(tw);
killed++;
tw = next;
}
@@ -288,17 +297,20 @@
*/
void tcp_tw_schedule(struct tcp_tw_bucket *tw)
{
- int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
- struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot];
+ struct tcp_tw_bucket **tpp;
+ int slot;
- SOCKHASH_LOCK_WRITE_BH();
+ spin_lock(&tw_death_lock);
+ slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
+ tpp = &tcp_tw_death_row[slot];
if((tw->next_death = *tpp) != NULL)
(*tpp)->pprev_death = &tw->next_death;
*tpp = tw;
tw->pprev_death = tpp;
tw->death_slot = slot;
- SOCKHASH_UNLOCK_WRITE_BH();
+ atomic_inc(&tw->refcnt);
+ spin_unlock(&tw_death_lock);
tcp_inc_slow_timer(TCP_SLT_TWKILL);
}
@@ -309,11 +321,14 @@
struct tcp_tw_bucket **tpp;
int slot;
- SOCKHASH_LOCK_WRITE_BH();
- if(tw->next_death)
- tw->next_death->pprev_death = tw->pprev_death;
- *tw->pprev_death = tw->next_death;
- tw->pprev_death = NULL;
+ spin_lock(&tw_death_lock);
+ if (tw->pprev_death) {
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+ } else
+ atomic_inc(&tw->refcnt);
slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
tpp = &tcp_tw_death_row[slot];
@@ -323,7 +338,7 @@
tw->pprev_death = tpp;
tw->death_slot = slot;
- SOCKHASH_UNLOCK_WRITE_BH();
+ spin_unlock(&tw_death_lock);
/* Timer was incremented when we first entered the table. */
}
@@ -331,91 +346,28 @@
/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
- SOCKHASH_LOCK_WRITE_BH();
- if(tw->next_death)
- tw->next_death->pprev_death = tw->pprev_death;
- *tw->pprev_death = tw->next_death;
- tw->pprev_death = NULL;
- SOCKHASH_UNLOCK_WRITE_BH();
+ spin_lock(&tw_death_lock);
+ if (tw->pprev_death) {
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+ tcp_tw_put(tw);
+ }
+ spin_unlock(&tw_death_lock);
tcp_dec_slow_timer(TCP_SLT_TWKILL);
}
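
The three hunks above all manipulate the same structure: a wheel of TCP_TWKILL_SLOTS bucket lists, where tcp_twkill() reaps one slot per period, now guarded by tw_death_lock and with a reference held for every queued bucket. A simplified single-threaded model (spinlock and atomic refcounting omitted for brevity; names hypothetical):

#include <stddef.h>

#define SLOTS 8                     /* TCP_TWKILL_SLOTS in the real code */

struct tw_bucket {
	struct tw_bucket *next_death;
	struct tw_bucket **pprev_death; /* NULL means "not on any slot" */
	int refcnt;
	int death_slot;
};

static struct tw_bucket *death_row[SLOTS];
static int current_slot;            /* advanced by the reaper each period */

/* Schedule: link at the head of the slot one step behind the cursor,
 * so the bucket survives almost a full revolution before the reaper
 * (tcp_twkill) reaches it.  The wheel takes one reference. */
static void tw_schedule(struct tw_bucket *tw)
{
	int slot = (current_slot - 1) & (SLOTS - 1);
	struct tw_bucket **tpp = &death_row[slot];

	if ((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;
	tw->death_slot = slot;
	tw->refcnt++;               /* atomic_inc(&tw->refcnt) */
}

/* Early kill: unlink if still queued and return the wheel's reference;
 * pprev_death doubles as the "am I queued?" flag, exactly as above. */
static void tw_deschedule(struct tw_bucket *tw)
{
	if (tw->pprev_death) {
		if (tw->next_death)
			tw->next_death->pprev_death = tw->pprev_death;
		*tw->pprev_death = tw->next_death;
		tw->pprev_death = NULL;
		tw->refcnt--;       /* tcp_tw_put(tw) */
	}
}
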
-/*
- * Check all sockets for keepalive timer
- * Called every 75 seconds
- * This timer is started by af_inet init routine and is constantly
- * running.
- *
- * It might be better to maintain a count of sockets that need it using
- * setsockopt/tcp_destroy_sk and only set the timer when needed.
- */
/*
- * don't send over 5 keepopens at a time to avoid burstiness
- * on big servers [AC]
- */
-#define MAX_KA_PROBES 5
-
-int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;
-
-/* Keepopen's are only valid for "established" TCP's, nicely our listener
- * hash gets rid of most of the useless testing, so we run through a couple
- * of the established hash chains each clock tick. -DaveM
- *
- * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes
- * going off for them, so we only need check the first half of the established
- * hash table, even less testing under heavy load.
- *
- * I _really_ would rather do this by adding a new timer_struct to struct sock,
- * and this way only those who set the keepalive option will get the overhead.
- * The idea is you set it for 2 hours when the sock is first connected, when it
- * does fire off (if at all, most sockets die earlier) you check for the keepalive
- * option and also if the sock has been idle long enough to start probing.
- */
-static void tcp_keepalive(unsigned long data)
-{
- static int chain_start = 0;
- int count = 0;
- int i;
-
- SOCKHASH_LOCK_READ_BH();
- for(i = chain_start; i < (chain_start + ((tcp_ehash_size >> 1) >> 2)); i++) {
- struct sock *sk;
-
- sk = tcp_ehash[i];
- while(sk) {
- struct sock *next = sk->next;
-
- bh_lock_sock(sk);
- if (sk->keepopen && !sk->lock.users) {
- SOCKHASH_UNLOCK_READ_BH();
- count += tcp_keepopen_proc(sk);
- SOCKHASH_LOCK_READ_BH();
- }
- bh_unlock_sock(sk);
- if(count == sysctl_tcp_max_ka_probes)
- goto out;
- sk = next;
- }
- }
-out:
- SOCKHASH_UNLOCK_READ_BH();
- chain_start = ((chain_start + ((tcp_ehash_size >> 1)>>2)) &
- ((tcp_ehash_size >> 1) - 1));
-}
-
-/*
- * The TCP retransmit timer. This lacks a few small details.
+ * The TCP retransmit timer.
*
* 1. An initial rtt timeout on the probe0 should cause what we can
* of the first write queue buffer to be split and sent.
- * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
+ * 2. On a 'major timeout' as defined by RFC1122 we do not report
* ETIMEDOUT if we know an additional 'soft' error caused this.
- * tcp_err should save a 'soft error' for us.
- * [Unless someone has broken it then it does, except for one 2.0
- * broken case of a send when the route/device is directly unreachable,
- * and we error but should retry! - FIXME] [AC]
+ * tcp_err saves a 'soft error' for us.
*/
void tcp_retransmit_timer(unsigned long data)
@@ -424,17 +376,14 @@
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* We are reset. We will send no more retransmits. */
- if(sk->zapped) {
- tcp_clear_xmit_timer(sk, TIME_RETRANS);
- return;
- }
+ if(sk->zapped)
+ goto out;
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later */
tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);
- bh_unlock_sock(sk);
- return;
+ goto out_unlock;
}
/* Clear delay ack timer. */
@@ -501,7 +450,10 @@
tcp_write_timeout(sk);
+out_unlock:
bh_unlock_sock(sk);
+out:
+ sock_put(sk);
}
/*
@@ -516,7 +468,7 @@
for(req = tp->syn_wait_queue; req; ) {
struct open_request *next = req->dl_next;
- if (! req->sk) {
+ if (!req->sk && (long)(now - req->expires) >= 0) {
tcp_synq_unlink(tp, req, prev);
if(req->retrans >= sysctl_tcp_retries1) {
(*req->class->destructor)(req);
@@ -552,7 +504,7 @@
unsigned long now = jiffies;
int i;
- SOCKHASH_LOCK_READ_BH();
+ read_lock(&tcp_lhash_lock);
for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
sk = tcp_listening_hash[i];
while(sk) {
@@ -566,7 +518,7 @@
sk = sk->next;
}
}
- SOCKHASH_UNLOCK_READ_BH();
+ read_unlock(&tcp_lhash_lock);
}
void tcp_sltimer_handler(unsigned long data)
@@ -613,4 +565,85 @@
tcp_slow_timer.expires = when;
add_timer(&tcp_slow_timer);
}
+}
+
+void tcp_delete_keepalive_timer (struct sock *sk)
+{
+ spin_lock_bh(&sk->timer_lock);
+ if (sk->timer.prev && del_timer (&sk->timer))
+ __sock_put(sk);
+ spin_unlock_bh(&sk->timer_lock);
+}
+
+void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
+{
+ spin_lock_bh(&sk->timer_lock);
+ if(!sk->timer.prev || !del_timer(&sk->timer))
+ sock_hold(sk);
+ mod_timer(&sk->timer, jiffies+len);
+ spin_unlock_bh(&sk->timer_lock);
+}
+
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+ if (val && !sk->keepopen)
+ tcp_reset_keepalive_timer(sk, sysctl_tcp_keepalive_time);
+ else if (!val)
+ tcp_delete_keepalive_timer(sk);
+}
+
+
+void tcp_keepalive_timer (unsigned long data)
+{
+ struct sock *sk = (struct sock *) data;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ __u32 elapsed;
+
+ /* Only process if socket is not in use. */
+ bh_lock_sock(sk);
+ if (sk->lock.users) {
+ /* Try again later. */
+ tcp_reset_keepalive_timer (sk, HZ/20);
+ goto out;
+ }
+
+ if (sk->state == TCP_FIN_WAIT2 && sk->dead)
+ goto death;
+
+ if (!sk->keepopen)
+ goto out;
+
+ elapsed = sysctl_tcp_keepalive_time;
+ if (!((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)))
+ goto resched;
+
+ elapsed = tcp_time_stamp - tp->rcv_tstamp;
+
+ if (elapsed >= sysctl_tcp_keepalive_time) {
+ if (tp->probes_out > sysctl_tcp_keepalive_probes) {
+ tcp_write_err(sk, 1);
+ goto out;
+ }
+ tp->probes_out++;
+ tp->pending = TIME_KEEPOPEN;
+ tcp_write_wakeup(sk);
+ /* Randomize to avoid synchronization */
+ elapsed = (TCP_KEEPALIVE_PERIOD>>1) + (net_random()%TCP_KEEPALIVE_PERIOD);
+ } else {
+ /* I.e., fire at tp->rcv_tstamp + sysctl_tcp_keepalive_time. */
+ elapsed = sysctl_tcp_keepalive_time - elapsed;
+ }
+
+resched:
+ tcp_reset_keepalive_timer (sk, elapsed);
+ goto out;
+
+death:
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_clear_xmit_timers(sk);
+ tcp_done(sk);
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
}
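
The rescheduling arithmetic at the end of tcp_keepalive_timer() is worth spelling out: while the connection has not yet been idle for the keepalive time, sleep exactly the remainder; once probing, randomize around the probe period so herds of idle sockets do not fire in lockstep. A hedged sketch of the computation, where keepalive_time and period stand in for sysctl_tcp_keepalive_time and TCP_KEEPALIVE_PERIOD, and rand_jitter for net_random() % TCP_KEEPALIVE_PERIOD:

static unsigned long next_keepalive_delay(unsigned long elapsed,
					  unsigned long keepalive_time,
					  unsigned long period,
					  unsigned long rand_jitter)
{
	if (elapsed >= keepalive_time)
		/* A probe was just sent: randomize around the probe
		 * period so many idle sockets do not synchronize. */
		return (period >> 1) + rand_jitter;

	/* Not idle long enough yet: sleep exactly until the moment
	 * rcv_tstamp + keepalive_time arrives. */
	return keepalive_time - elapsed;
}
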