aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter/nf_conntrack_core.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter/nf_conntrack_core.c')
-rw-r--r--net/netfilter/nf_conntrack_core.c192
1 files changed, 104 insertions, 88 deletions
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 96ba19fc8155..94e18fb9690d 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -21,7 +21,6 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
-#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
@@ -66,22 +65,19 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
struct conntrack_gc_work {
struct delayed_work dwork;
- u32 last_bucket;
+ u32 next_bucket;
bool exiting;
bool early_drop;
- long next_gc_run;
};
static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
-/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
-#define GC_MAX_BUCKETS_DIV 128u
-/* upper bound of full table scan */
-#define GC_MAX_SCAN_JIFFIES (16u * HZ)
-/* desired ratio of entries found to be expired */
-#define GC_EVICT_RATIO 50u
+#define GC_SCAN_INTERVAL (120u * HZ)
+#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
+
+#define MAX_CHAINLEN 64u
static struct conntrack_gc_work conntrack_gc_work;
@@ -149,7 +145,15 @@ static void nf_conntrack_all_lock(void)
spin_lock(&nf_conntrack_locks_all_lock);
- nf_conntrack_locks_all = true;
+ /* For nf_contrack_locks_all, only the latest time when another
+ * CPU will see an update is controlled, by the "release" of the
+ * spin_lock below.
+ * The earliest time is not controlled, an thus KCSAN could detect
+ * a race when nf_conntract_lock() reads the variable.
+ * WRITE_ONCE() is used to ensure the compiler will not
+ * optimize the write.
+ */
+ WRITE_ONCE(nf_conntrack_locks_all, true);
for (i = 0; i < CONNTRACK_LOCKS; i++) {
spin_lock(&nf_conntrack_locks[i]);
@@ -181,25 +185,31 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
-static unsigned int nf_conntrack_hash_rnd __read_mostly;
+static siphash_key_t nf_conntrack_hash_rnd __read_mostly;
static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
const struct net *net)
{
- unsigned int n;
- u32 seed;
+ struct {
+ struct nf_conntrack_man src;
+ union nf_inet_addr dst_addr;
+ u32 net_mix;
+ u16 dport;
+ u16 proto;
+ } __aligned(SIPHASH_ALIGNMENT) combined;
get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
- /* The direction must be ignored, so we hash everything up to the
- * destination ports (which is a multiple of 4) and treat the last
- * three bytes manually.
- */
- seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
- n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, seed ^
- (((__force __u16)tuple->dst.u.all << 16) |
- tuple->dst.protonum));
+ memset(&combined, 0, sizeof(combined));
+
+ /* The direction must be ignored, so handle usable members manually. */
+ combined.src = tuple->src;
+ combined.dst_addr = tuple->dst.u3;
+ combined.net_mix = net_hash_mix(net);
+ combined.dport = (__force __u16)tuple->dst.u.all;
+ combined.proto = tuple->dst.protonum;
+
+ return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd);
}
static u32 scale_hash(u32 hash)
@@ -662,8 +672,13 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
return false;
tstamp = nf_conn_tstamp_find(ct);
- if (tstamp && tstamp->stop == 0)
+ if (tstamp) {
+ s32 timeout = ct->timeout - nfct_time_stamp;
+
tstamp->stop = ktime_get_real_ns();
+ if (timeout < 0)
+ tstamp->stop -= jiffies_to_nsecs(-timeout);
+ }
if (nf_conntrack_event_report(IPCT_DESTROY, ct,
portid, report) < 0) {
@@ -827,7 +842,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
+ unsigned int chainlen = 0;
unsigned int sequence;
+ int err = -EEXIST;
zone = nf_ct_zone(ct);
@@ -841,15 +858,24 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* See if there's one in the list already, including reverse */
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (chainlen++ > MAX_CHAINLEN)
+ goto chaintoolong;
+ }
+
+ chainlen = 0;
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
+ if (chainlen++ > MAX_CHAINLEN)
+ goto chaintoolong;
+ }
smp_wmb();
/* The caller holds a reference to this object */
@@ -859,11 +885,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
NF_CT_STAT_INC(net, insert);
local_bh_enable();
return 0;
-
+chaintoolong:
+ NF_CT_STAT_INC(net, chaintoolong);
+ err = -ENOSPC;
out:
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
- return -EEXIST;
+ return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -1076,6 +1104,7 @@ int
__nf_conntrack_confirm(struct sk_buff *skb)
{
const struct nf_conntrack_zone *zone;
+ unsigned int chainlen = 0, sequence;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@@ -1083,7 +1112,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
- unsigned int sequence;
int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
@@ -1143,15 +1171,28 @@ __nf_conntrack_confirm(struct sk_buff *skb)
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
+ if (chainlen++ > MAX_CHAINLEN)
+ goto chaintoolong;
+ }
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ chainlen = 0;
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
+ if (chainlen++ > MAX_CHAINLEN) {
+chaintoolong:
+ nf_ct_add_to_dying_list(ct);
+ NF_CT_STAT_INC(net, chaintoolong);
+ NF_CT_STAT_INC(net, insert_failed);
+ ret = NF_DROP;
+ goto dying;
+ }
+ }
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
@@ -1350,17 +1391,13 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
- unsigned int i, goal, buckets = 0, expired_count = 0;
- unsigned int nf_conntrack_max95 = 0;
+ unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
+ unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ unsigned long next_run = GC_SCAN_INTERVAL;
struct conntrack_gc_work *gc_work;
- unsigned int ratio, scanned = 0;
- unsigned long next_run;
-
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
- goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
- i = gc_work->last_bucket;
+ i = gc_work->next_bucket;
if (gc_work->early_drop)
nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
@@ -1368,15 +1405,15 @@ static void gc_worker(struct work_struct *work)
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
- unsigned int hashsz;
struct nf_conn *tmp;
- i++;
rcu_read_lock();
nf_conntrack_get_ht(&ct_hash, &hashsz);
- if (i >= hashsz)
- i = 0;
+ if (i >= hashsz) {
+ rcu_read_unlock();
+ break;
+ }
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
struct nf_conntrack_net *cnet;
@@ -1384,7 +1421,6 @@ static void gc_worker(struct work_struct *work)
tmp = nf_ct_tuplehash_to_ctrack(h);
- scanned++;
if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
nf_ct_offload_timeout(tmp);
continue;
@@ -1392,7 +1428,6 @@ static void gc_worker(struct work_struct *work)
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
- expired_count++;
continue;
}
@@ -1425,7 +1460,14 @@ static void gc_worker(struct work_struct *work)
*/
rcu_read_unlock();
cond_resched();
- } while (++buckets < goal);
+ i++;
+
+ if (time_after(jiffies, end_time) && i < hashsz) {
+ gc_work->next_bucket = i;
+ next_run = 0;
+ break;
+ }
+ } while (i < hashsz);
if (gc_work->exiting)
return;
@@ -1436,40 +1478,17 @@ static void gc_worker(struct work_struct *work)
*
* This worker is only here to reap expired entries when system went
* idle after a busy period.
- *
- * The heuristics below are supposed to balance conflicting goals:
- *
- * 1. Minimize time until we notice a stale entry
- * 2. Maximize scan intervals to not waste cycles
- *
- * Normally, expire ratio will be close to 0.
- *
- * As soon as a sizeable fraction of the entries have expired
- * increase scan frequency.
*/
- ratio = scanned ? expired_count * 100 / scanned : 0;
- if (ratio > GC_EVICT_RATIO) {
- gc_work->next_gc_run = min_interval;
- } else {
- unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
-
- BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
-
- gc_work->next_gc_run += min_interval;
- if (gc_work->next_gc_run > max)
- gc_work->next_gc_run = max;
+ if (next_run) {
+ gc_work->early_drop = false;
+ gc_work->next_bucket = 0;
}
-
- next_run = gc_work->next_gc_run;
- gc_work->last_bucket = i;
- gc_work->early_drop = false;
queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
- gc_work->next_gc_run = HZ;
gc_work->exiting = false;
}
@@ -2457,7 +2476,6 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_conntrack_proto_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
@@ -2609,26 +2627,24 @@ int nf_conntrack_init_start(void)
spin_lock_init(&nf_conntrack_locks[i]);
if (!nf_conntrack_htable_size) {
- /* Idea from tcp.c: use 1/16384 of memory.
- * On i386: 32MB machine has 512 buckets.
- * >= 1GB machines have 16384 buckets.
- * >= 4GB machines have 65536 buckets.
- */
nf_conntrack_htable_size
= (((nr_pages << PAGE_SHIFT) / 16384)
/ sizeof(struct hlist_head));
- if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
- nf_conntrack_htable_size = 65536;
+ if (BITS_PER_LONG >= 64 &&
+ nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
+ nf_conntrack_htable_size = 262144;
else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
- nf_conntrack_htable_size = 16384;
- if (nf_conntrack_htable_size < 32)
- nf_conntrack_htable_size = 32;
-
- /* Use a max. factor of four by default to get the same max as
- * with the old struct list_heads. When a table size is given
- * we use the old value of 8 to avoid reducing the max.
- * entries. */
- max_factor = 4;
+ nf_conntrack_htable_size = 65536;
+
+ if (nf_conntrack_htable_size < 1024)
+ nf_conntrack_htable_size = 1024;
+ /* Use a max. factor of one by default to keep the average
+ * hash chain length at 2 entries. Each entry has to be added
+ * twice (once for original direction, once for reply).
+ * When a table size is given we use the old value of 8 to
+ * avoid implicit reduction of the max entries setting.
+ */
+ max_factor = 1;
}
nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);