patch-2.4.4 linux/net/ipv4/netfilter/ip_conntrack_core.c
Next file: linux/net/ipv4/netfilter/ip_conntrack_ftp.c
Previous file: linux/net/ipv4/netfilter/Makefile
Back to the patch index
Back to the overall index
- Lines: 561
- Date: Fri Apr 27 14:15:01 2001
- Orig file: v2.4.3/linux/net/ipv4/netfilter/ip_conntrack_core.c
- Orig date: Mon Jan 22 13:30:21 2001
diff -u --recursive --new-file v2.4.3/linux/net/ipv4/netfilter/ip_conntrack_core.c linux/net/ipv4/netfilter/ip_conntrack_core.c
@@ -23,6 +23,8 @@
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
+/* For ERR_PTR(). Yeah, I know... --RR */
+#include <linux/fs.h>
/* This rwlock protects the main hash table, protocol/helper/expected
registrations, conntrack timers*/
@@ -152,7 +154,9 @@
clean_from_lists(struct ip_conntrack *ct)
{
MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
- /* Remove from both hash lists */
+ /* Remove from both hash lists: must not NULL out next ptrs,
+ otherwise we'll look unconfirmed. Fortunately, LIST_DELETE
+ doesn't do this. --RR */
LIST_DELETE(&ip_conntrack_hash
[hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)],
&ct->tuplehash[IP_CT_DIR_ORIGINAL]);
@@ -172,24 +176,6 @@
{
struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
- /* Unconfirmed connections haven't been cleaned up by the
- timer: hence they cannot be simply deleted here. */
- if (!(ct->status & IPS_CONFIRMED)) {
- WRITE_LOCK(&ip_conntrack_lock);
- /* Race check: they can't get a reference if noone has
- one and we have the write lock. */
- if (atomic_read(&ct->ct_general.use) == 0) {
- clean_from_lists(ct);
- WRITE_UNLOCK(&ip_conntrack_lock);
- } else {
- /* Either a last-minute confirmation (ie. ct
- now has timer attached), or a last-minute
- new skb has reference (still unconfirmed). */
- WRITE_UNLOCK(&ip_conntrack_lock);
- return;
- }
- }
-
IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
IP_NF_ASSERT(!timer_pending(&ct->timeout));
@@ -207,7 +193,6 @@
struct ip_conntrack *ct = (void *)ul_conntrack;
WRITE_LOCK(&ip_conntrack_lock);
- IP_NF_ASSERT(ct->status & IPS_CONFIRMED);
clean_from_lists(ct);
WRITE_UNLOCK(&ip_conntrack_lock);
ip_conntrack_put(ct);
@@ -253,24 +238,85 @@
return h;
}
-/* Confirm a connection */
-void
-ip_conntrack_confirm(struct ip_conntrack *ct)
+static inline struct ip_conntrack *
+__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
{
+ struct ip_conntrack *ct
+ = (struct ip_conntrack *)nfct->master;
+
+ /* ctinfo is the index of the nfct inside the conntrack */
+ *ctinfo = nfct - ct->infos;
+ IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
+ return ct;
+}
+
+/* Return conntrack and conntrack_info for skb, or NULL if skb->nfct is unset */
+struct ip_conntrack *
+ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
+{
+ if (skb->nfct)
+ return __ip_conntrack_get(skb->nfct, ctinfo);
+ return NULL;
+}
+
+/* Confirm a connection given skb->nfct; places it in hash table */
+int
+__ip_conntrack_confirm(struct nf_ct_info *nfct)
+{
+ unsigned int hash, repl_hash;
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = __ip_conntrack_get(nfct, &ctinfo);
+
+ /* ipt_REJECT uses ip_conntrack_attach to attach related
+ ICMP/TCP RST packets in other direction. Actual packet
+ which created connection will be IP_CT_NEW or for an
+ expected connection, IP_CT_RELATED. */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ /* We're not in hash table, and we refuse to set up related
+ connections for unconfirmed conns. But packet copies and
+ REJECT will give spurious warnings here. */
+ /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
+
+ /* No external references means no one else could have
+ confirmed us. */
+ IP_NF_ASSERT(!is_confirmed(ct));
DEBUGP("Confirming conntrack %p\n", ct);
+
WRITE_LOCK(&ip_conntrack_lock);
- /* Race check */
- if (!(ct->status & IPS_CONFIRMED)) {
- IP_NF_ASSERT(!timer_pending(&ct->timeout));
- set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ /* See if there's one in the list already, including reverse:
+ NAT could have grabbed it without realizing, since we're
+ not in the hash. If there is, we lost race. */
+ if (!LIST_FIND(&ip_conntrack_hash[hash],
+ conntrack_tuple_cmp,
+ struct ip_conntrack_tuple_hash *,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
+ && !LIST_FIND(&ip_conntrack_hash[repl_hash],
+ conntrack_tuple_cmp,
+ struct ip_conntrack_tuple_hash *,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+ list_prepend(&ip_conntrack_hash[hash],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+ list_prepend(&ip_conntrack_hash[repl_hash],
+ &ct->tuplehash[IP_CT_DIR_REPLY]);
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
wierd delay cases. */
ct->timeout.expires += jiffies;
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ return NF_ACCEPT;
}
+
WRITE_UNLOCK(&ip_conntrack_lock);
+ return NF_DROP;
}
/* Returns true if a connection correspondings to the tuple (required
@@ -374,30 +420,16 @@
*ctinfo += IP_CT_IS_REPLY;
}
- /* REJECT target does this commonly, so allow locally
- generated ICMP errors --RR */
- if (!(h->ctrack->status & IPS_CONFIRMED)
- && hooknum != NF_IP_LOCAL_OUT) {
- DEBUGP("icmp_error_track: unconfirmed\n");
- ip_conntrack_put(h->ctrack);
- return NULL;
- }
-
/* Update skb to refer to this connection */
skb->nfct = &h->ctrack->infos[*ctinfo];
return h->ctrack;
}
-/* There's a small race here where we may free a just-replied to
+/* There's a small race here where we may free a just-assured
connection. Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
- /* Unconfirmed connections either really fresh or transitory
- anyway */
- if (!(i->ctrack->status & IPS_ASSURED)
- && (i->ctrack->status & IPS_CONFIRMED))
- return 1;
- return 0;
+ return !(i->ctrack->status & IPS_ASSURED);
}
static int early_drop(struct list_head *chain)
@@ -436,10 +468,9 @@
return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}
-/* Allocate a new conntrack; we set everything up, then grab write
- lock and see if we lost a race. If we lost it we return 0,
- indicating the controlling code should look again. */
-static int
+/* Allocate a new conntrack: we return ERR_PTR(-ENOMEM) if we failed
+ due to stress, or NULL if the packet really is unclassifiable. */
+static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
struct ip_conntrack_protocol *protocol,
struct sk_buff *skb)
@@ -448,8 +479,6 @@
struct ip_conntrack_tuple repl_tuple;
size_t hash, repl_hash;
struct ip_conntrack_expect *expected;
- enum ip_conntrack_info ctinfo;
- unsigned long extra_jiffies;
int i;
static unsigned int drop_next = 0;
@@ -457,30 +486,31 @@
if (ip_conntrack_max &&
atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
- if (net_ratelimit())
- printk(KERN_WARNING "ip_conntrack: maximum limit of"
- " %d entries exceeded\n", ip_conntrack_max);
-
/* Try dropping from random chain, or else from the
chain about to put into (in case they're trying to
bomb one hash chain). */
if (drop_next >= ip_conntrack_htable_size)
drop_next = 0;
if (!early_drop(&ip_conntrack_hash[drop_next++])
- && !early_drop(&ip_conntrack_hash[hash]))
- return 1;
+ && !early_drop(&ip_conntrack_hash[hash])) {
+ if (net_ratelimit())
+ printk(KERN_WARNING
+ "ip_conntrack: table full, dropping"
+ " packet.\n");
+ return ERR_PTR(-ENOMEM);
+ }
}
if (!invert_tuple(&repl_tuple, tuple, protocol)) {
DEBUGP("Can't invert tuple.\n");
- return 1;
+ return NULL;
}
repl_hash = hash_conntrack(&repl_tuple);
conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
if (!conntrack) {
DEBUGP("Can't allocate conntrack.\n");
- return 1;
+ return ERR_PTR(-ENOMEM);
}
memset(conntrack, 0, sizeof(struct ip_conntrack));
@@ -493,32 +523,33 @@
for (i=0; i < IP_CT_NUMBER; i++)
conntrack->infos[i].master = &conntrack->ct_general;
- extra_jiffies = protocol->new(conntrack, skb->nh.iph, skb->len);
- if (!extra_jiffies) {
+ if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
kmem_cache_free(ip_conntrack_cachep, conntrack);
- return 1;
+ return NULL;
}
/* Don't set timer yet: wait for confirmation */
init_timer(&conntrack->timeout);
conntrack->timeout.data = (unsigned long)conntrack;
conntrack->timeout.function = death_by_timeout;
- conntrack->timeout.expires = extra_jiffies;
- /* Sew in at head of hash list. */
+ /* Mark clearly that it's not in the hash table. */
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list.next = NULL;
+
+ /* Write lock required for deletion of expected. Without
+ this, a read-lock would do. */
WRITE_LOCK(&ip_conntrack_lock);
- /* Check noone else beat us in the race... */
- if (__ip_conntrack_find(tuple, NULL)) {
- WRITE_UNLOCK(&ip_conntrack_lock);
- kmem_cache_free(ip_conntrack_cachep, conntrack);
- return 0;
- }
conntrack->helper = LIST_FIND(&helpers, helper_cmp,
struct ip_conntrack_helper *,
&repl_tuple);
/* Need finding and deleting of expected ONLY if we win race */
expected = LIST_FIND(&expect_list, expect_cmp,
struct ip_conntrack_expect *, tuple);
- if (expected) {
+ /* If master is not in hash table yet (ie. packet hasn't left
+ this machine yet), how can other end know about expected?
+ Hence these are not the droids you are looking for (if
+ master ct never got confirmed, we'd hold a reference to it
+ and weird things would happen to future packets). */
+ if (expected && is_confirmed(expected->expectant)) {
/* Welcome, Mr. Bond. We've been expecting you... */
conntrack->status = IPS_EXPECTED;
conntrack->master.master = &expected->expectant->ct_general;
@@ -526,23 +557,13 @@
LIST_DELETE(&expect_list, expected);
expected->expectant = NULL;
nf_conntrack_get(&conntrack->master);
- ctinfo = IP_CT_RELATED;
- } else {
- ctinfo = IP_CT_NEW;
}
- list_prepend(&ip_conntrack_hash[hash],
- &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]);
- list_prepend(&ip_conntrack_hash[repl_hash],
- &conntrack->tuplehash[IP_CT_DIR_REPLY]);
atomic_inc(&ip_conntrack_count);
WRITE_UNLOCK(&ip_conntrack_lock);
- /* Update skb to refer to this connection */
- skb->nfct = &conntrack->infos[ctinfo];
if (expected && expected->expectfn)
expected->expectfn(conntrack);
-
- return 1;
+ return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
@@ -561,38 +582,18 @@
if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto))
return NULL;
- /* Loop around search/insert race */
- do {
- /* look for tuple match */
- h = ip_conntrack_find_get(&tuple, NULL);
- if (!h && init_conntrack(&tuple, proto, skb))
+ /* look for tuple match */
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (!h) {
+ h = init_conntrack(&tuple, proto, skb);
+ if (!h)
return NULL;
- } while (!h);
+ if (IS_ERR(h))
+ return (void *)h;
+ }
/* It exists; we have (non-exclusive) reference. */
if (DIRECTION(h) == IP_CT_DIR_REPLY) {
- /* Reply on unconfirmed connection => unclassifiable */
- if (!(h->ctrack->status & IPS_CONFIRMED)) {
- /* Exception: local TCP RSTs (generated by
- REJECT target). */
- if (hooknum == NF_IP_LOCAL_OUT
- && h->tuple.dst.protonum == IPPROTO_TCP) {
- const struct tcphdr *tcph
- = (const struct tcphdr *)
- ((u_int32_t *)skb->nh.iph
- + skb->nh.iph->ihl);
- if (tcph->rst) {
- *ctinfo = IP_CT_ESTABLISHED
- + IP_CT_IS_REPLY;
- *set_reply = 0;
- goto set_skb;
- }
- }
- DEBUGP("Reply on unconfirmed connection\n");
- ip_conntrack_put(h->ctrack);
- return NULL;
- }
-
*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
/* Please set reply bit if this packet OK */
*set_reply = 1;
@@ -613,28 +614,10 @@
}
*set_reply = 0;
}
- set_skb:
skb->nfct = &h->ctrack->infos[*ctinfo];
return h->ctrack;
}
-/* Return conntrack and conntrack_info a given skb */
-inline struct ip_conntrack *
-ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
-{
- if (skb->nfct) {
- struct ip_conntrack *ct
- = (struct ip_conntrack *)skb->nfct->master;
-
- /* ctinfo is the index of the nfct inside the conntrack */
- *ctinfo = skb->nfct - ct->infos;
- IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
- return ct;
- }
- return NULL;
-}
-
-
/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
struct sk_buff **pskb,
@@ -689,6 +672,10 @@
/* Not valid part of a connection */
return NF_ACCEPT;
+ if (IS_ERR(ct))
+ /* Too stressed to deal. */
+ return NF_DROP;
+
IP_NF_ASSERT((*pskb)->nfct);
ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
@@ -783,23 +770,18 @@
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
const struct ip_conntrack_tuple *newreply)
{
- unsigned int newindex = hash_conntrack(newreply);
-
WRITE_LOCK(&ip_conntrack_lock);
if (__ip_conntrack_find(newreply, conntrack)) {
WRITE_UNLOCK(&ip_conntrack_lock);
return 0;
}
+ /* Should be unconfirmed, so not in hash table yet */
+ IP_NF_ASSERT(!is_confirmed(conntrack));
+
DEBUGP("Altering reply tuple of %p to ", conntrack);
DUMP_TUPLE(newreply);
- LIST_DELETE(&ip_conntrack_hash
- [hash_conntrack(&conntrack->tuplehash[IP_CT_DIR_REPLY]
- .tuple)],
- &conntrack->tuplehash[IP_CT_DIR_REPLY]);
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
- list_prepend(&ip_conntrack_hash[newindex],
- &conntrack->tuplehash[IP_CT_DIR_REPLY]);
conntrack->helper = LIST_FIND(&helpers, helper_cmp,
struct ip_conntrack_helper *,
newreply);
@@ -861,8 +843,8 @@
IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
WRITE_LOCK(&ip_conntrack_lock);
- /* Timer may not be active yet */
- if (!(ct->status & IPS_CONFIRMED))
+ /* If not in hash table, timer will not be active yet */
+ if (!is_confirmed(ct))
ct->timeout.expires = extra_jiffies;
else {
/* Need del_timer for race avoidance (may already be dying). */
@@ -894,7 +876,12 @@
if (!skb) {
if (sk) sock_put(sk);
return skb;
+ } else if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
+ kfree_skb(skb);
+ if (sk) sock_put(sk);
+ return NULL;
}
+
if (sk) {
skb_set_owner_w(skb, sk);
sock_put(sk);
@@ -909,6 +896,26 @@
return skb;
}
+/* Used by ipt_REJECT. */
+static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = __ip_conntrack_get(nfct, &ctinfo);
+
+ /* This ICMP is in reverse direction to the packet which
+ caused it */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+ ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
+ else
+ ctinfo = IP_CT_RELATED;
+
+ /* Attach new skbuff, and increment count */
+ nskb->nfct = &ct->infos[ctinfo];
+ atomic_inc(&ct->ct_general.use);
+}
+
static inline int
do_kill(const struct ip_conntrack_tuple_hash *i,
int (*kill)(const struct ip_conntrack *i, void *data),
@@ -948,20 +955,6 @@
/* Time to push up daises... */
if (del_timer(&h->ctrack->timeout))
death_by_timeout((unsigned long)h->ctrack);
- else if (!(h->ctrack->status & IPS_CONFIRMED)) {
- /* Unconfirmed connection. Clean from lists,
- mark confirmed so it gets cleaned as soon
- as skb freed. */
- WRITE_LOCK(&ip_conntrack_lock);
- /* Lock protects race against another setting
- of confirmed bit. set_bit isolates this
- bit from the others. */
- if (!(h->ctrack->status & IPS_CONFIRMED)) {
- clean_from_lists(h->ctrack);
- set_bit(IPS_CONFIRMED_BIT, &h->ctrack->status);
- }
- WRITE_UNLOCK(&ip_conntrack_lock);
- }
/* ... else the timer will get him soon. */
ip_conntrack_put(h->ctrack);
@@ -986,7 +979,7 @@
return -ENOPROTOOPT;
}
- if (*len != sizeof(struct sockaddr_in)) {
+ if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
*len, sizeof(struct sockaddr_in));
return -EINVAL;
@@ -1057,6 +1050,12 @@
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(ip_conntrack_sysctl_header);
#endif
+ ip_ct_attach = NULL;
+ /* This makes sure all current packets have passed through
+ netfilter framework. Roll on, two-stage module
+ delete... */
+ br_write_lock_bh(BR_NETPROTO_LOCK);
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
i_see_dead_people:
ip_ct_selective_cleanup(kill_all, NULL);
@@ -1070,16 +1069,27 @@
nf_unregister_sockopt(&so_getorigdst);
}
+static int hashsize = 0;
+MODULE_PARM(hashsize, "i");
+
int __init ip_conntrack_init(void)
{
unsigned int i;
int ret;
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
- * machine has 256 buckets. 1GB machine has 8192 buckets. */
- ip_conntrack_htable_size
- = (((num_physpages << PAGE_SHIFT) / 16384)
- / sizeof(struct list_head));
+ * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
+ if (hashsize) {
+ ip_conntrack_htable_size = hashsize;
+ } else {
+ ip_conntrack_htable_size
+ = (((num_physpages << PAGE_SHIFT) / 16384)
+ / sizeof(struct list_head));
+ if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+ ip_conntrack_htable_size = 8192;
+ if (ip_conntrack_htable_size < 16)
+ ip_conntrack_htable_size = 16;
+ }
ip_conntrack_max = 8 * ip_conntrack_htable_size;
printk("ip_conntrack (%u buckets, %d max)\n",
@@ -1131,5 +1141,7 @@
}
#endif /*CONFIG_SYSCTL*/
+ /* For use by ipt_REJECT */
+ ip_ct_attach = ip_conntrack_attach;
return ret;
}
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)