// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. */ #ifdef COMPAT_CANNOT_DEPRECIATE_BH_RCU /* We normally alias all non-_bh functions to the _bh ones in the compat layer, * but that's not appropriate here, where we actually do want non-_bh ones. */ #undef synchronize_rcu #define synchronize_rcu old_synchronize_rcu #undef call_rcu #define call_rcu old_call_rcu #undef rcu_barrier #define rcu_barrier old_rcu_barrier #endif #include "ratelimiter.h" #include #include #include #include static struct kmem_cache *entry_cache; static hsiphash_key_t key; static spinlock_t table_lock = __SPIN_LOCK_UNLOCKED("ratelimiter_table_lock"); static DEFINE_MUTEX(init_lock); static u64 init_refcnt; /* Protected by init_lock, hence not atomic. */ static atomic_t total_entries = ATOMIC_INIT(0); static unsigned int max_entries, table_size; static void wg_ratelimiter_gc_entries(struct work_struct *); static DECLARE_DEFERRABLE_WORK(gc_work, wg_ratelimiter_gc_entries); static struct hlist_head *table_v4; #if IS_ENABLED(CONFIG_IPV6) static struct hlist_head *table_v6; #endif struct ratelimiter_entry { u64 last_time_ns, tokens, ip; void *net; spinlock_t lock; struct hlist_node hash; struct rcu_head rcu; }; enum { PACKETS_PER_SECOND = 20, PACKETS_BURSTABLE = 5, PACKET_COST = NSEC_PER_SEC / PACKETS_PER_SECOND, TOKEN_MAX = PACKET_COST * PACKETS_BURSTABLE }; static void entry_free(struct rcu_head *rcu) { kmem_cache_free(entry_cache, container_of(rcu, struct ratelimiter_entry, rcu)); atomic_dec(&total_entries); } static void entry_uninit(struct ratelimiter_entry *entry) { hlist_del_rcu(&entry->hash); call_rcu(&entry->rcu, entry_free); } /* Calling this function with a NULL work uninits all entries. */ static void wg_ratelimiter_gc_entries(struct work_struct *work) { const u64 now = ktime_get_coarse_boottime_ns(); struct ratelimiter_entry *entry; struct hlist_node *temp; unsigned int i; for (i = 0; i < table_size; ++i) { spin_lock(&table_lock); hlist_for_each_entry_safe(entry, temp, &table_v4[i], hash) { if (unlikely(!work) || now - entry->last_time_ns > NSEC_PER_SEC) entry_uninit(entry); } #if IS_ENABLED(CONFIG_IPV6) hlist_for_each_entry_safe(entry, temp, &table_v6[i], hash) { if (unlikely(!work) || now - entry->last_time_ns > NSEC_PER_SEC) entry_uninit(entry); } #endif spin_unlock(&table_lock); if (likely(work)) cond_resched(); } if (likely(work)) queue_delayed_work(system_power_efficient_wq, &gc_work, HZ); } bool wg_ratelimiter_allow(struct sk_buff *skb, struct net *net) { /* We only take the bottom half of the net pointer, so that we can hash * 3 words in the end. This way, siphash's len param fits into the final * u32, and we don't incur an extra round. */ const u32 net_word = (unsigned long)net; struct ratelimiter_entry *entry; struct hlist_head *bucket; u64 ip; if (skb->protocol == htons(ETH_P_IP)) { ip = (u64 __force)ip_hdr(skb)->saddr; bucket = &table_v4[hsiphash_2u32(net_word, ip, &key) & (table_size - 1)]; } #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { /* Only use 64 bits, so as to ratelimit the whole /64. */ memcpy(&ip, &ipv6_hdr(skb)->saddr, sizeof(ip)); bucket = &table_v6[hsiphash_3u32(net_word, ip >> 32, ip, &key) & (table_size - 1)]; } #endif else return false; rcu_read_lock(); hlist_for_each_entry_rcu(entry, bucket, hash) { if (entry->net == net && entry->ip == ip) { u64 now, tokens; bool ret; /* Quasi-inspired by nft_limit.c, but this is actually a * slightly different algorithm. Namely, we incorporate * the burst as part of the maximum tokens, rather than * as part of the rate. */ spin_lock(&entry->lock); now = ktime_get_coarse_boottime_ns(); tokens = min_t(u64, TOKEN_MAX, entry->tokens + now - entry->last_time_ns); entry->last_time_ns = now; ret = tokens >= PACKET_COST; entry->tokens = ret ? tokens - PACKET_COST : tokens; spin_unlock(&entry->lock); rcu_read_unlock(); return ret; } } rcu_read_unlock(); if (atomic_inc_return(&total_entries) > max_entries) goto err_oom; entry = kmem_cache_alloc(entry_cache, GFP_KERNEL); if (unlikely(!entry)) goto err_oom; entry->net = net; entry->ip = ip; INIT_HLIST_NODE(&entry->hash); spin_lock_init(&entry->lock); entry->last_time_ns = ktime_get_coarse_boottime_ns(); entry->tokens = TOKEN_MAX - PACKET_COST; spin_lock(&table_lock); hlist_add_head_rcu(&entry->hash, bucket); spin_unlock(&table_lock); return true; err_oom: atomic_dec(&total_entries); return false; } int wg_ratelimiter_init(void) { mutex_lock(&init_lock); if (++init_refcnt != 1) goto out; entry_cache = KMEM_CACHE(ratelimiter_entry, 0); if (!entry_cache) goto err; /* xt_hashlimit.c uses a slightly different algorithm for ratelimiting, * but what it shares in common is that it uses a massive hashtable. So, * we borrow their wisdom about good table sizes on different systems * dependent on RAM. This calculation here comes from there. */ table_size = (totalram_pages() > (1U << 30) / PAGE_SIZE) ? 8192 : max_t(unsigned long, 16, roundup_pow_of_two( (totalram_pages() << PAGE_SHIFT) / (1U << 14) / sizeof(struct hlist_head))); max_entries = table_size * 8; table_v4 = kvzalloc(table_size * sizeof(*table_v4), GFP_KERNEL); if (unlikely(!table_v4)) goto err_kmemcache; #if IS_ENABLED(CONFIG_IPV6) table_v6 = kvzalloc(table_size * sizeof(*table_v6), GFP_KERNEL); if (unlikely(!table_v6)) { kvfree(table_v4); goto err_kmemcache; } #endif queue_delayed_work(system_power_efficient_wq, &gc_work, HZ); get_random_bytes(&key, sizeof(key)); out: mutex_unlock(&init_lock); return 0; err_kmemcache: kmem_cache_destroy(entry_cache); err: --init_refcnt; mutex_unlock(&init_lock); return -ENOMEM; } void wg_ratelimiter_uninit(void) { mutex_lock(&init_lock); if (!init_refcnt || --init_refcnt) goto out; cancel_delayed_work_sync(&gc_work); wg_ratelimiter_gc_entries(NULL); rcu_barrier(); kvfree(table_v4); #if IS_ENABLED(CONFIG_IPV6) kvfree(table_v6); #endif kmem_cache_destroy(entry_cache); out: mutex_unlock(&init_lock); } #include "selftest/ratelimiter.c"