aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter/nf_conntrack_core.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter/nf_conntrack_core.c')
-rw-r--r--net/netfilter/nf_conntrack_core.c432
1 files changed, 230 insertions, 202 deletions
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index e27fd17c6743..db2312eeb2a4 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -12,6 +12,8 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
@@ -52,6 +54,7 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
+#include <net/netns/hash.h>
#define NF_CONNTRACK_VERSION "0.5.0"
@@ -66,6 +69,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_hash);
+
+static __read_mostly struct kmem_cache *nf_conntrack_cachep;
+static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
+static __read_mostly seqcount_t nf_conntrack_generation;
static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
@@ -105,7 +114,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
spin_lock_nested(&nf_conntrack_locks[h1],
SINGLE_DEPTH_NESTING);
}
- if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
nf_conntrack_double_unlock(h1, h2);
return true;
}
@@ -139,43 +148,43 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
-unsigned int nf_conntrack_hash_rnd __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
-static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
+ const struct net *net)
{
unsigned int n;
+ u32 seed;
+
+ get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
/* The direction must be ignored, so we hash everything up to the
* destination ports (which is a multiple of 4) and treat the last
* three bytes manually.
*/
+ seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^
+ return jhash2((u32 *)tuple, n, seed ^
(((__force __u16)tuple->dst.u.all << 16) |
tuple->dst.protonum));
}
-static u32 __hash_bucket(u32 hash, unsigned int size)
-{
- return reciprocal_scale(hash, size);
-}
-
-static u32 hash_bucket(u32 hash, const struct net *net)
+static u32 scale_hash(u32 hash)
{
- return __hash_bucket(hash, net->ct.htable_size);
+ return reciprocal_scale(hash, nf_conntrack_htable_size);
}
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- unsigned int size)
+static u32 __hash_conntrack(const struct net *net,
+ const struct nf_conntrack_tuple *tuple,
+ unsigned int size)
{
- return __hash_bucket(hash_conntrack_raw(tuple), size);
+ return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
}
-static inline u_int32_t hash_conntrack(const struct net *net,
- const struct nf_conntrack_tuple *tuple)
+static u32 hash_conntrack(const struct net *net,
+ const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, net->ct.htable_size);
+ return scale_hash(hash_conntrack_raw(tuple, net));
}
bool
@@ -356,7 +365,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
}
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
- if (l4proto && l4proto->destroy)
+ if (l4proto->destroy)
l4proto->destroy(ct);
rcu_read_unlock();
@@ -391,7 +400,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
reply_hash = hash_conntrack(net,
@@ -443,7 +452,8 @@ static void death_by_timeout(unsigned long ul_conntrack)
static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const struct nf_conntrack_zone *zone,
+ const struct net *net)
{
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -452,7 +462,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
*/
return nf_ct_tuple_equal(tuple, &h->tuple) &&
nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
- nf_ct_is_confirmed(ct);
+ nf_ct_is_confirmed(ct) &&
+ net_eq(net, nf_ct_net(ct));
}
/*
@@ -465,21 +476,23 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
- unsigned int bucket = hash_bucket(hash, net);
+ unsigned int bucket, sequence;
- /* Disable BHs the entire time since we normally need to disable them
- * at least once for the stats anyway.
- */
- local_bh_disable();
begin:
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
- if (nf_ct_key_equal(h, tuple, zone)) {
- NF_CT_STAT_INC(net, found);
- local_bh_enable();
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ bucket = scale_hash(hash);
+ ct_hash = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
+ if (nf_ct_key_equal(h, tuple, zone, net)) {
+ NF_CT_STAT_INC_ATOMIC(net, found);
return h;
}
- NF_CT_STAT_INC(net, searched);
+ NF_CT_STAT_INC_ATOMIC(net, searched);
}
/*
* if the nulls value we got at the end of this lookup is
@@ -487,10 +500,9 @@ begin:
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(n) != bucket) {
- NF_CT_STAT_INC(net, search_restart);
+ NF_CT_STAT_INC_ATOMIC(net, search_restart);
goto begin;
}
- local_bh_enable();
return NULL;
}
@@ -512,7 +524,7 @@ begin:
!atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
- if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {
+ if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
nf_ct_put(ct);
goto begin;
}
@@ -528,7 +540,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple));
+ hash_conntrack_raw(tuple, net));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -536,12 +548,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
unsigned int hash,
unsigned int reply_hash)
{
- struct net *net = nf_ct_net(ct);
-
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &net->ct.hash[hash]);
+ &nf_conntrack_hash[hash]);
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
- &net->ct.hash[reply_hash]);
+ &nf_conntrack_hash[reply_hash]);
}
int
@@ -558,7 +568,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
reply_hash = hash_conntrack(net,
@@ -566,17 +576,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* See if there's one in the list already, including reverse */
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
goto out;
add_timer(&ct->timeout);
@@ -597,6 +604,62 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
+static inline void nf_ct_acct_update(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int len)
+{
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ }
+}
+
+static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ const struct nf_conn *loser_ct)
+{
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(loser_ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+ unsigned int bytes;
+
+ /* u32 should be fine since we must have seen one packet. */
+ bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
+ nf_ct_acct_update(ct, ctinfo, bytes);
+ }
+}
+
+/* Resolve race on insertion if this protocol allows this. */
+static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_tuple_hash *h)
+{
+ /* This is the conntrack entry already in hashes that won race. */
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct nf_conntrack_l4proto *l4proto;
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto->allow_clash &&
+ !nf_ct_is_dying(ct) &&
+ atomic_inc_not_zero(&ct->ct_general.use)) {
+ nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
+ nf_conntrack_put(skb->nfct);
+ /* Assign conntrack already in hashes to this skbuff. Don't
+ * modify skb->nfctinfo to ensure consistent stateful filtering.
+ */
+ skb->nfct = &ct->ct_general;
+ return NF_ACCEPT;
+ }
+ NF_CT_STAT_INC(net, drop);
+ return NF_DROP;
+}
+
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
@@ -611,6 +674,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
enum ip_conntrack_info ctinfo;
struct net *net;
unsigned int sequence;
+ int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
@@ -626,10 +690,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)
local_bh_disable();
do {
- sequence = read_seqcount_begin(&net->ct.generation);
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
/* reuse the hash saved before */
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
- hash = hash_bucket(hash, net);
+ hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -653,23 +717,22 @@ __nf_conntrack_confirm(struct sk_buff *skb)
*/
nf_ct_del_from_dying_or_unconfirmed_list(ct);
- if (unlikely(nf_ct_is_dying(ct)))
- goto out;
+ if (unlikely(nf_ct_is_dying(ct))) {
+ nf_ct_add_to_dying_list(ct);
+ goto dying;
+ }
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
- if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- &h->tuple) &&
- nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
- NF_CT_DIRECTION(h)))
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
goto out;
/* Timer relative to confirmation time, not original
@@ -708,10 +771,12 @@ __nf_conntrack_confirm(struct sk_buff *skb)
out:
nf_ct_add_to_dying_list(ct);
+ ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+dying:
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
- return NF_DROP;
+ return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -724,29 +789,31 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
struct net *net = nf_ct_net(ignored_conntrack);
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_head *ct_hash;
+ unsigned int hash, sequence;
struct hlist_nulls_node *n;
struct nf_conn *ct;
- unsigned int hash;
zone = nf_ct_zone(ignored_conntrack);
- hash = hash_conntrack(net, tuple);
- /* Disable BHs the entire time since we need to disable them at
- * least once for the stats anyway.
- */
- rcu_read_lock_bh();
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+ rcu_read_lock();
+ do {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ hash = hash_conntrack(net, tuple);
+ ct_hash = nf_conntrack_hash;
+ } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+ hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (ct != ignored_conntrack &&
- nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) {
- NF_CT_STAT_INC(net, found);
- rcu_read_unlock_bh();
+ nf_ct_key_equal(h, tuple, zone, net)) {
+ NF_CT_STAT_INC_ATOMIC(net, found);
+ rcu_read_unlock();
return 1;
}
- NF_CT_STAT_INC(net, searched);
+ NF_CT_STAT_INC_ATOMIC(net, searched);
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return 0;
}
@@ -760,71 +827,63 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
{
/* Use oldest entry, which is roughly LRU */
struct nf_conntrack_tuple_hash *h;
- struct nf_conn *ct = NULL, *tmp;
+ struct nf_conn *tmp;
struct hlist_nulls_node *n;
- unsigned int i = 0, cnt = 0;
- int dropped = 0;
- unsigned int hash, sequence;
+ unsigned int i, hash, sequence;
+ struct nf_conn *ct = NULL;
spinlock_t *lockp;
+ bool ret = false;
+
+ i = 0;
local_bh_disable();
restart:
- sequence = read_seqcount_begin(&net->ct.generation);
- hash = hash_bucket(_hash, net);
- for (; i < net->ct.htable_size; i++) {
+ sequence = read_seqcount_begin(&nf_conntrack_generation);
+ for (; i < NF_CT_EVICTION_RANGE; i++) {
+ hash = scale_hash(_hash++);
lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
nf_conntrack_lock(lockp);
- if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
spin_unlock(lockp);
goto restart;
}
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
- hnnode) {
+ hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
+ hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
- !nf_ct_is_dying(tmp) &&
- atomic_inc_not_zero(&tmp->ct_general.use)) {
+
+ if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
+ !net_eq(nf_ct_net(tmp), net) ||
+ nf_ct_is_dying(tmp))
+ continue;
+
+ if (atomic_inc_not_zero(&tmp->ct_general.use)) {
ct = tmp;
break;
}
- cnt++;
}
- hash = (hash + 1) % net->ct.htable_size;
spin_unlock(lockp);
-
- if (ct || cnt >= NF_CT_EVICTION_RANGE)
+ if (ct)
break;
-
}
+
local_bh_enable();
if (!ct)
- return dropped;
+ return false;
- if (del_timer(&ct->timeout)) {
+ /* kill only if in same netns -- might have moved due to
+ * SLAB_DESTROY_BY_RCU rules
+ */
+ if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) {
if (nf_ct_delete(ct, 0, 0)) {
- dropped = 1;
NF_CT_STAT_INC_ATOMIC(net, early_drop);
+ ret = true;
}
}
- nf_ct_put(ct);
- return dropped;
-}
-void init_nf_conntrack_hash_rnd(void)
-{
- unsigned int rand;
-
- /*
- * Why not initialize nf_conntrack_rnd in a "init()" function ?
- * Because there isn't enough entropy when system initializing,
- * and we initialize it as late as possible.
- */
- do {
- get_random_bytes(&rand, sizeof(rand));
- } while (!rand);
- cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+ nf_ct_put(ct);
+ return ret;
}
static struct nf_conn *
@@ -836,12 +895,6 @@ __nf_conntrack_alloc(struct net *net,
{
struct nf_conn *ct;
- if (unlikely(!nf_conntrack_hash_rnd)) {
- init_nf_conntrack_hash_rnd();
- /* recompute the hash as nf_conntrack_hash_rnd is initialized */
- hash = hash_conntrack_raw(orig);
- }
-
/* We don't want any race condition at early drop stage */
atomic_inc(&net->ct.count);
@@ -858,7 +911,7 @@ __nf_conntrack_alloc(struct net *net,
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
- ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
+ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
if (ct == NULL)
goto out;
@@ -885,7 +938,7 @@ __nf_conntrack_alloc(struct net *net,
atomic_set(&ct->ct_general.use, 0);
return ct;
out_free:
- kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ kmem_cache_free(nf_conntrack_cachep, ct);
out:
atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
@@ -912,7 +965,7 @@ void nf_conntrack_free(struct nf_conn *ct)
nf_ct_ext_destroy(ct);
nf_ct_ext_free(ct);
- kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ kmem_cache_free(nf_conntrack_cachep, ct);
smp_mb__before_atomic();
atomic_dec(&net->ct.count);
}
@@ -966,7 +1019,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
if (!l4proto->new(ct, skb, dataoff, timeouts)) {
nf_conntrack_free(ct);
- pr_debug("init conntrack: can't track with proto module\n");
+ pr_debug("can't track with proto module\n");
return NULL;
}
@@ -988,7 +1041,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
spin_lock(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
- pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+ pr_debug("expectation arrives ct=%p exp=%p\n",
ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
@@ -1053,13 +1106,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, net, &tuple, l3proto,
l4proto)) {
- pr_debug("resolve_normal_ct: Can't get tuple\n");
+ pr_debug("Can't get tuple\n");
return NULL;
}
/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
- hash = hash_conntrack_raw(&tuple);
+ hash = hash_conntrack_raw(&tuple, net);
h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
@@ -1079,14 +1132,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
} else {
/* Once we've had two way comms, always ESTABLISHED. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
+ pr_debug("normal packet for %p\n", ct);
*ctinfo = IP_CT_ESTABLISHED;
} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: related packet for %p\n",
- ct);
+ pr_debug("related packet for %p\n", ct);
*ctinfo = IP_CT_RELATED;
} else {
- pr_debug("nf_conntrack_in: new packet for %p\n", ct);
+ pr_debug("new packet for %p\n", ct);
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
@@ -1269,17 +1321,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
}
acct:
- if (do_acct) {
- struct nf_conn_acct *acct;
-
- acct = nf_conn_acct_find(ct);
- if (acct) {
- struct nf_conn_counter *counter = acct->counter;
-
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
- }
- }
+ if (do_acct)
+ nf_ct_acct_update(ct, ctinfo, skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -1288,18 +1331,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
const struct sk_buff *skb,
int do_acct)
{
- if (do_acct) {
- struct nf_conn_acct *acct;
-
- acct = nf_conn_acct_find(ct);
- if (acct) {
- struct nf_conn_counter *counter = acct->counter;
-
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len - skb_network_offset(skb),
- &counter[CTINFO2DIR(ctinfo)].bytes);
- }
- }
+ if (do_acct)
+ nf_ct_acct_update(ct, ctinfo, skb->len);
if (del_timer(&ct->timeout)) {
ct->timeout.function((unsigned long)ct);
@@ -1395,16 +1428,17 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
int cpu;
spinlock_t *lockp;
- for (; *bucket < net->ct.htable_size; (*bucket)++) {
+ for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
nf_conntrack_lock(lockp);
- if (*bucket < net->ct.htable_size) {
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+ if (*bucket < nf_conntrack_htable_size) {
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
+ if (net_eq(nf_ct_net(ct), net) &&
+ iter(ct, data))
goto found;
}
}
@@ -1442,6 +1476,9 @@ void nf_ct_iterate_cleanup(struct net *net,
might_sleep();
+ if (atomic_read(&net->ct.count) == 0)
+ return;
+
while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
if (del_timer(&ct->timeout))
@@ -1493,6 +1530,8 @@ void nf_conntrack_cleanup_end(void)
while (untrack_refs() > 0)
schedule();
+ nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+
#ifdef CONFIG_NF_CONNTRACK_ZONES
nf_ct_extend_unregister(&nf_ct_zone_extend);
#endif
@@ -1543,15 +1582,12 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
nf_conntrack_proto_pernet_fini(net);
nf_conntrack_helper_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_tstamp_pernet_fini(net);
nf_conntrack_acct_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
- kfree(net->ct.slabname);
free_percpu(net->ct.stat);
free_percpu(net->ct.pcpu_lists);
}
@@ -1606,7 +1642,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
local_bh_disable();
nf_conntrack_all_lock();
- write_seqcount_begin(&init_net.ct.generation);
+ write_seqcount_begin(&nf_conntrack_generation);
/* Lookups in the old hash might happen in parallel, which means we
* might get false negatives during connection lookup. New connections
@@ -1614,26 +1650,28 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
* though since that required taking the locks.
*/
- for (i = 0; i < init_net.ct.htable_size; i++) {
- while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
- h = hlist_nulls_entry(init_net.ct.hash[i].first,
- struct nf_conntrack_tuple_hash, hnnode);
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
+ while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+ h = hlist_nulls_entry(nf_conntrack_hash[i].first,
+ struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
- bucket = __hash_conntrack(&h->tuple, hashsize);
+ bucket = __hash_conntrack(nf_ct_net(ct),
+ &h->tuple, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
- old_size = init_net.ct.htable_size;
- old_hash = init_net.ct.hash;
+ old_size = nf_conntrack_htable_size;
+ old_hash = nf_conntrack_hash;
- init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
- init_net.ct.hash = hash;
+ nf_conntrack_hash = hash;
+ nf_conntrack_htable_size = hashsize;
- write_seqcount_end(&init_net.ct.generation);
+ write_seqcount_end(&nf_conntrack_generation);
nf_conntrack_all_unlock();
local_bh_enable();
+ synchronize_net();
nf_ct_free_hashtable(old_hash, old_size);
return 0;
}
@@ -1654,7 +1692,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
int nf_conntrack_init_start(void)
{
int max_factor = 8;
- int i, ret, cpu;
+ int ret = -ENOMEM;
+ int i, cpu;
+
+ seqcount_init(&nf_conntrack_generation);
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_conntrack_locks[i]);
@@ -1681,8 +1722,19 @@ int nf_conntrack_init_start(void)
* entries. */
max_factor = 4;
}
+
+ nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
+ if (!nf_conntrack_hash)
+ return -ENOMEM;
+
nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+ nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!nf_conntrack_cachep)
+ goto err_cachep;
+
printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
@@ -1759,6 +1811,9 @@ err_tstamp:
err_acct:
nf_conntrack_expect_fini();
err_expect:
+ kmem_cache_destroy(nf_conntrack_cachep);
+err_cachep:
+ nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
return ret;
}
@@ -1778,12 +1833,10 @@ void nf_conntrack_init_end(void)
int nf_conntrack_init_net(struct net *net)
{
- static atomic64_t unique_id;
int ret = -ENOMEM;
int cpu;
atomic_set(&net->ct.count, 0);
- seqcount_init(&net->ct.generation);
net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
if (!net->ct.pcpu_lists)
@@ -1801,25 +1854,6 @@ int nf_conntrack_init_net(struct net *net)
if (!net->ct.stat)
goto err_pcpu_lists;
- net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%llu",
- (u64)atomic64_inc_return(&unique_id));
- if (!net->ct.slabname)
- goto err_slabname;
-
- net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
- sizeof(struct nf_conn), 0,
- SLAB_DESTROY_BY_RCU, NULL);
- if (!net->ct.nf_conntrack_cachep) {
- printk(KERN_ERR "Unable to create nf_conn slab cache\n");
- goto err_cache;
- }
-
- net->ct.htable_size = nf_conntrack_htable_size;
- net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
- if (!net->ct.hash) {
- printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
- goto err_hash;
- }
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
goto err_expect;
@@ -1851,12 +1885,6 @@ err_tstamp:
err_acct:
nf_conntrack_expect_pernet_fini(net);
err_expect:
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
-err_hash:
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
-err_cache:
- kfree(net->ct.slabname);
-err_slabname:
free_percpu(net->ct.stat);
err_pcpu_lists:
free_percpu(net->ct.pcpu_lists);