aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/uapi/linux/bpf.h9
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/bpf_lru_list.c695
-rw-r--r--kernel/bpf/bpf_lru_list.h84
-rw-r--r--kernel/bpf/hashtab.c438
-rw-r--r--kernel/bpf/syscall.c8
-rw-r--r--samples/bpf/Makefile2
-rw-r--r--samples/bpf/map_perf_test_kern.c39
-rw-r--r--samples/bpf/map_perf_test_user.c32
-rw-r--r--samples/bpf/test_lru_dist.c538
-rw-r--r--tools/testing/selftests/bpf/Makefile6
-rw-r--r--tools/testing/selftests/bpf/test_lru_map.c583
12 files changed, 2386 insertions, 50 deletions
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e2f38e0091b6..7d9b2832c280 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -85,6 +85,8 @@ enum bpf_map_type {
BPF_MAP_TYPE_PERCPU_ARRAY,
BPF_MAP_TYPE_STACK_TRACE,
BPF_MAP_TYPE_CGROUP_ARRAY,
+ BPF_MAP_TYPE_LRU_HASH,
+ BPF_MAP_TYPE_LRU_PERCPU_HASH,
};
enum bpf_prog_type {
@@ -106,6 +108,13 @@ enum bpf_prog_type {
#define BPF_EXIST 2 /* update existing element */
#define BPF_F_NO_PREALLOC (1U << 0)
+/* Instead of having one common LRU list in the
+ * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
+ * which can scale and perform better.
+ * Note, the LRU nodes (including free nodes) cannot be moved
+ * across different LRU lists.
+ */
+#define BPF_F_NO_COMMON_LRU (1U << 1)
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index eed911d091da..c4d89d6e2058 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
new file mode 100644
index 000000000000..bfebff010ba9
--- /dev/null
+++ b/kernel/bpf/bpf_lru_list.c
@@ -0,0 +1,695 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include "bpf_lru_list.h"
+
+#define LOCAL_FREE_TARGET (128)
+#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
+
+#define PERCPU_FREE_TARGET (16)
+#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
+
+/* Helpers to get the local list index */
+#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET)
+#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
+#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
+#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
+
+static int get_next_cpu(int cpu)
+{
+ cpu = cpumask_next(cpu, cpu_possible_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpu_possible_mask);
+ return cpu;
+}
+
+/* Local list helpers */
+static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
+{
+ return &loc_l->lists[LOCAL_FREE_LIST_IDX];
+}
+
+static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
+{
+ return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
+}
+
+/* bpf_lru_node helpers */
+static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
+{
+ return node->ref;
+}
+
+static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
+ enum bpf_lru_list_type type)
+{
+ if (type < NR_BPF_LRU_LIST_COUNT)
+ l->counts[type]++;
+}
+
+static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
+ enum bpf_lru_list_type type)
+{
+ if (type < NR_BPF_LRU_LIST_COUNT)
+ l->counts[type]--;
+}
+
+static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+{
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+ return;
+
+ /* If the removing node is the next_inactive_rotation candidate,
+ * move the next_inactive_rotation pointer also.
+ */
+ if (&node->list == l->next_inactive_rotation)
+ l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+ bpf_lru_list_count_dec(l, node->type);
+
+ node->type = tgt_free_type;
+ list_move(&node->list, free_list);
+}
+
+/* Move nodes from local list to the LRU list */
+static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ enum bpf_lru_list_type tgt_type)
+{
+ if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) ||
+ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+ return;
+
+ bpf_lru_list_count_inc(l, tgt_type);
+ node->type = tgt_type;
+ node->ref = 0;
+ list_move(&node->list, &l->lists[tgt_type]);
+}
+
+/* Move nodes between or within active and inactive list (like
+ * active to inactive, inactive to active or tail of active back to
+ * the head of active).
+ */
+static void __bpf_lru_node_move(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ enum bpf_lru_list_type tgt_type)
+{
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
+ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+ return;
+
+ if (node->type != tgt_type) {
+ bpf_lru_list_count_dec(l, node->type);
+ bpf_lru_list_count_inc(l, tgt_type);
+ node->type = tgt_type;
+ }
+ node->ref = 0;
+
+ /* If the moving node is the next_inactive_rotation candidate,
+ * move the next_inactive_rotation pointer also.
+ */
+ if (&node->list == l->next_inactive_rotation)
+ l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+ list_move(&node->list, &l->lists[tgt_type]);
+}
+
+static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
+{
+ return l->counts[BPF_LRU_LIST_T_INACTIVE] <
+ l->counts[BPF_LRU_LIST_T_ACTIVE];
+}
+
+/* Rotate the active list:
+ * 1. Start from tail
+ * 2. If the node has the ref bit set, it will be rotated
+ * back to the head of active list with the ref bit cleared.
+ * Give this node one more chance to survive in the active list.
+ * 3. If the ref bit is not set, move it to the head of the
+ * inactive list.
+ * 4. It will at most scan nr_scans nodes
+ */
+static void __bpf_lru_list_rotate_active(struct bpf_lru *lru,
+ struct bpf_lru_list *l)
+{
+ struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE];
+ struct bpf_lru_node *node, *tmp_node, *first_node;
+ unsigned int i = 0;
+
+ first_node = list_first_entry(active, struct bpf_lru_node, list);
+ list_for_each_entry_safe_reverse(node, tmp_node, active, list) {
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ else
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
+
+ if (++i == lru->nr_scans || node == first_node)
+ break;
+ }
+}
+
+/* Rotate the inactive list. It starts from the next_inactive_rotation
+ * 1. If the node has ref bit set, it will be moved to the head
+ * of active list with the ref bit cleared.
+ * 2. If the node does not have ref bit set, it will leave it
+ * at its current location (i.e. do nothing) so that it can
+ * be considered during the next inactive_shrink.
+ * 3. It will at most scan nr_scans nodes
+ */
+static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru,
+ struct bpf_lru_list *l)
+{
+ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ struct list_head *cur, *next, *last;
+ struct bpf_lru_node *node;
+ unsigned int i = 0;
+
+ if (list_empty(inactive))
+ return;
+
+ last = l->next_inactive_rotation->next;
+ if (last == inactive)
+ last = last->next;
+
+ cur = l->next_inactive_rotation;
+ while (i < lru->nr_scans) {
+ if (cur == inactive) {
+ cur = cur->prev;
+ continue;
+ }
+
+ node = list_entry(cur, struct bpf_lru_node, list);
+ next = cur->prev;
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ if (cur == last)
+ break;
+ cur = next;
+ i++;
+ }
+
+ l->next_inactive_rotation = next;
+}
+
+/* Shrink the inactive list. It starts from the tail of the
+ * inactive list and only move the nodes without the ref bit
+ * set to the designated free list.
+ */
+static unsigned int
+__bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
+ struct bpf_lru_list *l,
+ unsigned int tgt_nshrink,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+{
+ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ struct bpf_lru_node *node, *tmp_node, *first_node;
+ unsigned int nshrinked = 0;
+ unsigned int i = 0;
+
+ first_node = list_first_entry(inactive, struct bpf_lru_node, list);
+ list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
+ if (bpf_lru_node_is_ref(node)) {
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ } else if (lru->del_from_htab(lru->del_arg, node)) {
+ __bpf_lru_node_move_to_free(l, node, free_list,
+ tgt_free_type);
+ if (++nshrinked == tgt_nshrink)
+ break;
+ }
+
+ if (++i == lru->nr_scans)
+ break;
+ }
+
+ return nshrinked;
+}
+
+/* 1. Rotate the active list (if needed)
+ * 2. Always rotate the inactive list
+ */
+static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
+{
+ if (bpf_lru_list_inactive_low(l))
+ __bpf_lru_list_rotate_active(lru, l);
+
+ __bpf_lru_list_rotate_inactive(lru, l);
+}
+
+/* Calls __bpf_lru_list_shrink_inactive() to shrink some
+ * ref-bit-cleared nodes and move them to the designated
+ * free list.
+ *
+ * If it cannot get a free node after calling
+ * __bpf_lru_list_shrink_inactive(). It will just remove
+ * one node from either inactive or active list without
+ * honoring the ref-bit. It prefers inactive list to active
+ * list in this situation.
+ */
+static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
+ struct bpf_lru_list *l,
+ unsigned int tgt_nshrink,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+
+{
+ struct bpf_lru_node *node, *tmp_node;
+ struct list_head *force_shrink_list;
+ unsigned int nshrinked;
+
+ nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink,
+ free_list, tgt_free_type);
+ if (nshrinked)
+ return nshrinked;
+
+ /* Do a force shrink by ignoring the reference bit */
+ if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE]))
+ force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ else
+ force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE];
+
+ list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
+ list) {
+ if (lru->del_from_htab(lru->del_arg, node)) {
+ __bpf_lru_node_move_to_free(l, node, free_list,
+ tgt_free_type);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* Flush the nodes from the local pending list to the LRU list */
+static void __local_list_flush(struct bpf_lru_list *l,
+ struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_node *node, *tmp_node;
+
+ list_for_each_entry_safe_reverse(node, tmp_node,
+ local_pending_list(loc_l), list) {
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
+ else
+ __bpf_lru_node_move_in(l, node,
+ BPF_LRU_LIST_T_INACTIVE);
+ }
+}
+
+static void bpf_lru_list_push_free(struct bpf_lru_list *l,
+ struct bpf_lru_node *node)
+{
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+ return;
+
+ raw_spin_lock_irqsave(&l->lock, flags);
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+ raw_spin_unlock_irqrestore(&l->lock, flags);
+}
+
+static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
+ struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_list *l = &lru->common_lru.lru_list;
+ struct bpf_lru_node *node, *tmp_node;
+ unsigned int nfree = 0;
+
+ raw_spin_lock(&l->lock);
+
+ __local_list_flush(l, loc_l);
+
+ __bpf_lru_list_rotate(lru, l);
+
+ list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
+ list) {
+ __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
+ BPF_LRU_LOCAL_LIST_T_FREE);
+ if (++nfree == LOCAL_FREE_TARGET)
+ break;
+ }
+
+ if (nfree < LOCAL_FREE_TARGET)
+ __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
+ local_free_list(loc_l),
+ BPF_LRU_LOCAL_LIST_T_FREE);
+
+ raw_spin_unlock(&l->lock);
+}
+
+static void __local_list_add_pending(struct bpf_lru *lru,
+ struct bpf_lru_locallist *loc_l,
+ int cpu,
+ struct bpf_lru_node *node,
+ u32 hash)
+{
+ *(u32 *)((void *)node + lru->hash_offset) = hash;
+ node->cpu = cpu;
+ node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
+ node->ref = 0;
+ list_add(&node->list, local_pending_list(loc_l));
+}
+
+struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_node *node;
+
+ node = list_first_entry_or_null(local_free_list(loc_l),
+ struct bpf_lru_node,
+ list);
+ if (node)
+ list_del(&node->list);
+
+ return node;
+}
+
+struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru,
+ struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_node *node;
+ bool force = false;
+
+ignore_ref:
+ /* Get from the tail (i.e. older element) of the pending list. */
+ list_for_each_entry_reverse(node, local_pending_list(loc_l),
+ list) {
+ if ((!bpf_lru_node_is_ref(node) || force) &&
+ lru->del_from_htab(lru->del_arg, node)) {
+ list_del(&node->list);
+ return node;
+ }
+ }
+
+ if (!force) {
+ force = true;
+ goto ignore_ref;
+ }
+
+ return NULL;
+}
+
+static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
+ u32 hash)
+{
+ struct list_head *free_list;
+ struct bpf_lru_node *node = NULL;
+ struct bpf_lru_list *l;
+ unsigned long flags;
+ int cpu = raw_smp_processor_id();
+
+ l = per_cpu_ptr(lru->percpu_lru, cpu);
+
+ raw_spin_lock_irqsave(&l->lock, flags);
+
+ __bpf_lru_list_rotate(lru, l);
+
+ free_list = &l->lists[BPF_LRU_LIST_T_FREE];
+ if (list_empty(free_list))
+ __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list,
+ BPF_LRU_LIST_T_FREE);
+
+ if (!list_empty(free_list)) {
+ node = list_first_entry(free_list, struct bpf_lru_node, list);
+ *(u32 *)((void *)node + lru->hash_offset) = hash;
+ node->ref = 0;
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
+ }
+
+ raw_spin_unlock_irqrestore(&l->lock, flags);
+
+ return node;
+}
+
+static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
+ u32 hash)
+{
+ struct bpf_lru_locallist *loc_l, *steal_loc_l;
+ struct bpf_common_lru *clru = &lru->common_lru;
+ struct bpf_lru_node *node;
+ int steal, first_steal;
+ unsigned long flags;
+ int cpu = raw_smp_processor_id();
+
+ loc_l = per_cpu_ptr(clru->local_list, cpu);
+
+ raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+ node = __local_list_pop_free(loc_l);
+ if (!node) {
+ bpf_lru_list_pop_free_to_local(lru, loc_l);
+ node = __local_list_pop_free(loc_l);
+ }
+
+ if (node)
+ __local_list_add_pending(lru, loc_l, cpu, node, hash);
+
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+
+ if (node)
+ return node;
+
+ /* No free nodes found from the local free list and
+ * the global LRU list.
+ *
+ * Steal from the local free/pending list of the
+ * current CPU and remote CPU in RR. It starts
+ * with the loc_l->next_steal CPU.
+ */
+
+ first_steal = loc_l->next_steal;
+ steal = first_steal;
+ do {
+ steal_loc_l = per_cpu_ptr(clru->local_list, steal);
+
+ raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
+
+ node = __local_list_pop_free(steal_loc_l);
+ if (!node)
+ node = __local_list_pop_pending(lru, steal_loc_l);
+
+ raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+
+ steal = get_next_cpu(steal);
+ } while (!node && steal != first_steal);
+
+ loc_l->next_steal = steal;
+
+ if (node) {
+ raw_spin_lock_irqsave(&loc_l->lock, flags);
+ __local_list_add_pending(lru, loc_l, cpu, node, hash);
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ }
+
+ return node;
+}
+
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
+{
+ if (lru->percpu)
+ return bpf_percpu_lru_pop_free(lru, hash);
+ else
+ return bpf_common_lru_pop_free(lru, hash);
+}
+
+static void bpf_common_lru_push_free(struct bpf_lru *lru,
+ struct bpf_lru_node *node)
+{
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) ||
+ WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE))
+ return;
+
+ if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) {
+ struct bpf_lru_locallist *loc_l;
+
+ loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
+
+ raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+ if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ goto check_lru_list;
+ }
+
+ node->type = BPF_LRU_LOCAL_LIST_T_FREE;
+ node->ref = 0;
+ list_move(&node->list, local_free_list(loc_l));
+
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ return;
+ }
+
+check_lru_list:
+ bpf_lru_list_push_free(&lru->common_lru.lru_list, node);
+}
+
+static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
+ struct bpf_lru_node *node)
+{
+ struct bpf_lru_list *l;
+ unsigned long flags;
+
+ l = per_cpu_ptr(lru->percpu_lru, node->cpu);
+
+ raw_spin_lock_irqsave(&l->lock, flags);
+
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+
+ raw_spin_unlock_irqrestore(&l->lock, flags);
+}
+
+void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
+{
+ if (lru->percpu)
+ bpf_percpu_lru_push_free(lru, node);
+ else
+ bpf_common_lru_push_free(lru, node);
+}
+
+void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems)
+{
+ struct bpf_lru_list *l = &lru->common_lru.lru_list;
+ u32 i;
+
+ for (i = 0; i < nr_elems; i++) {
+ struct bpf_lru_node *node;
+
+ node = (struct bpf_lru_node *)(buf + node_offset);
+ node->type = BPF_LRU_LIST_T_FREE;
+ node->ref = 0;
+ list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
+ buf += elem_size;
+ }
+}
+
+void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems)
+{
+ u32 i, pcpu_entries;
+ int cpu;
+ struct bpf_lru_list *l;
+
+ pcpu_entries = nr_elems / num_possible_cpus();
+
+ i = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_lru_node *node;
+
+ l = per_cpu_ptr(lru->percpu_lru, cpu);
+again:
+ node = (struct bpf_lru_node *)(buf + node_offset);
+ node->cpu = cpu;
+ node->type = BPF_LRU_LIST_T_FREE;
+ node->ref = 0;
+ list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
+ i++;
+ buf += elem_size;
+ if (i == nr_elems)
+ break;
+ if (i % pcpu_entries)
+ goto again;
+ }
+}
+
+void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems)
+{
+ if (lru->percpu)
+ bpf_percpu_lru_populate(lru, buf, node_offset, elem_size,
+ nr_elems);
+ else
+ bpf_common_lru_populate(lru, buf, node_offset, elem_size,
+ nr_elems);
+}
+
+static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
+{
+ int i;
+
+ for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
+ INIT_LIST_HEAD(&loc_l->lists[i]);
+
+ loc_l->next_steal = cpu;
+
+ raw_spin_lock_init(&loc_l->lock);
+}
+
+static void bpf_lru_list_init(struct bpf_lru_list *l)
+{
+ int i;
+
+ for (i = 0; i < NR_BPF_LRU_LIST_T; i++)
+ INIT_LIST_HEAD(&l->lists[i]);
+
+ for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++)
+ l->counts[i] = 0;
+
+ l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+
+ raw_spin_lock_init(&l->lock);
+}
+
+int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+ del_from_htab_func del_from_htab, void *del_arg)
+{
+ int cpu;
+
+ if (percpu) {
+ lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
+ if (!lru->percpu_lru)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_lru_list *l;
+
+ l = per_cpu_ptr(lru->percpu_lru, cpu);
+ bpf_lru_list_init(l);
+ }
+ lru->nr_scans = PERCPU_NR_SCANS;
+ } else {
+ struct bpf_common_lru *clru = &lru->common_lru;
+
+ clru->local_list = alloc_percpu(struct bpf_lru_locallist);
+ if (!clru->local_list)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_lru_locallist *loc_l;
+
+ loc_l = per_cpu_ptr(clru->local_list, cpu);
+ bpf_lru_locallist_init(loc_l, cpu);
+ }
+
+ bpf_lru_list_init(&clru->lru_list);
+ lru->nr_scans = LOCAL_NR_SCANS;
+ }
+
+ lru->percpu = percpu;
+ lru->del_from_htab = del_from_htab;
+ lru->del_arg = del_arg;
+ lru->hash_offset = hash_offset;
+
+ return 0;
+}
+
+void bpf_lru_destroy(struct bpf_lru *lru)
+{
+ if (lru->percpu)
+ free_percpu(lru->percpu_lru);
+ else
+ free_percpu(lru->common_lru.local_list);
+}
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
new file mode 100644
index 000000000000..5c35a98d02bf
--- /dev/null
+++ b/kernel/bpf/bpf_lru_list.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __BPF_LRU_LIST_H_
+#define __BPF_LRU_LIST_H_
+
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+#define NR_BPF_LRU_LIST_T (3)
+#define NR_BPF_LRU_LIST_COUNT (2)
+#define NR_BPF_LRU_LOCAL_LIST_T (2)
+#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
+
+enum bpf_lru_list_type {
+ BPF_LRU_LIST_T_ACTIVE,
+ BPF_LRU_LIST_T_INACTIVE,
+ BPF_LRU_LIST_T_FREE,
+ BPF_LRU_LOCAL_LIST_T_FREE,
+ BPF_LRU_LOCAL_LIST_T_PENDING,
+};
+
+struct bpf_lru_node {
+ struct list_head list;
+ u16 cpu;
+ u8 type;
+ u8 ref;
+};
+
+struct bpf_lru_list {
+ struct list_head lists[NR_BPF_LRU_LIST_T];
+ unsigned int counts[NR_BPF_LRU_LIST_COUNT];
+ /* The next inacitve list rotation starts from here */
+ struct list_head *next_inactive_rotation;
+
+ raw_spinlock_t lock ____cacheline_aligned_in_smp;
+};
+
+struct bpf_lru_locallist {
+ struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
+ u16 next_steal;
+ raw_spinlock_t lock;
+};
+
+struct bpf_common_lru {
+ struct bpf_lru_list lru_list;
+ struct bpf_lru_locallist __percpu *local_list;
+};
+
+typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node);
+
+struct bpf_lru {
+ union {
+ struct bpf_common_lru common_lru;
+ struct bpf_lru_list __percpu *percpu_lru;
+ };
+ del_from_htab_func del_from_htab;
+ void *del_arg;
+ unsigned int hash_offset;
+ unsigned int nr_scans;
+ bool percpu;
+};
+
+static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
+{
+ /* ref is an approximation on access frequency. It does not
+ * have to be very accurate. Hence, no protection is used.
+ */
+ node->ref = 1;
+}
+
+int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+ del_from_htab_func del_from_htab, void *delete_arg);
+void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems);
+void bpf_lru_destroy(struct bpf_lru *lru);
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
+void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
+void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
+
+#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ad1bc67aff1b..34debc1a9641 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -15,6 +15,7 @@
#include <linux/filter.h>
#include <linux/vmalloc.h>
#include "percpu_freelist.h"
+#include "bpf_lru_list.h"
struct bucket {
struct hlist_head head;
@@ -25,7 +26,10 @@ struct bpf_htab {
struct bpf_map map;
struct bucket *buckets;
void *elems;
- struct pcpu_freelist freelist;
+ union {
+ struct pcpu_freelist freelist;
+ struct bpf_lru lru;
+ };
void __percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
@@ -48,11 +52,26 @@ struct htab_elem {
union {
struct rcu_head rcu;
enum extra_elem_state state;
+ struct bpf_lru_node lru_node;
};
u32 hash;
char key[0] __aligned(8);
};
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
+
+static bool htab_is_lru(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
+ htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
+static bool htab_is_percpu(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
@@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab)
{
int i;
- if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ if (!htab_is_percpu(htab))
goto free_elems;
for (i = 0; i < htab->map.max_entries; i++) {
@@ -87,7 +106,22 @@ free_elems:
vfree(htab->elems);
}
-static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
+ u32 hash)
+{
+ struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
+ struct htab_elem *l;
+
+ if (node) {
+ l = container_of(node, struct htab_elem, lru_node);
+ memcpy(l->key, key, htab->map.key_size);
+ return l;
+ }
+
+ return NULL;
+}
+
+static int prealloc_init(struct bpf_htab *htab)
{
int err = -ENOMEM, i;
@@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
if (!htab->elems)
return -ENOMEM;
- if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ if (!htab_is_percpu(htab))
goto skip_percpu_elems;
for (i = 0; i < htab->map.max_entries; i++) {
@@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
}
skip_percpu_elems:
- err = pcpu_freelist_init(&htab->freelist);
+ if (htab_is_lru(htab))
+ err = bpf_lru_init(&htab->lru,
+ htab->map.map_flags & BPF_F_NO_COMMON_LRU,
+ offsetof(struct htab_elem, hash) -
+ offsetof(struct htab_elem, lru_node),
+ htab_lru_map_delete_node,
+ htab);
+ else
+ err = pcpu_freelist_init(&htab->freelist);
+
if (err)
goto free_elems;
- pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
- htab->map.max_entries);
+ if (htab_is_lru(htab))
+ bpf_lru_populate(&htab->lru, htab->elems,
+ offsetof(struct htab_elem, lru_node),
+ htab->elem_size, htab->map.max_entries);
+ else
+ pcpu_freelist_populate(&htab->freelist, htab->elems,
+ htab->elem_size, htab->map.max_entries);
+
return 0;
free_elems:
@@ -123,6 +172,16 @@ free_elems:
return err;
}
+static void prealloc_destroy(struct bpf_htab *htab)
+{
+ htab_free_elems(htab);
+
+ if (htab_is_lru(htab))
+ bpf_lru_destroy(&htab->lru);
+ else
+ pcpu_freelist_destroy(&htab->freelist);
+}
+
static int alloc_extra_elems(struct bpf_htab *htab)
{
void __percpu *pptr;
@@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab)
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
- bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
+ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ /* percpu_lru means each cpu has its own LRU list.
+ * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+ * the map's value itself is percpu. percpu_lru has
+ * nothing to do with the map's value.
+ */
+ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
int err, i;
u64 cost;
- if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+ if (lru && !capable(CAP_SYS_ADMIN))
+ /* LRU implementation is much complicated than other
+ * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ */
+ return ERR_PTR(-EPERM);
+
+ if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
/* reserved bits should not be used */
return ERR_PTR(-EINVAL);
+ if (!lru && percpu_lru)
+ return ERR_PTR(-EINVAL);
+
+ if (lru && !prealloc)
+ return ERR_PTR(-ENOTSUPP);
+
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.value_size == 0)
goto free_htab;
+ if (percpu_lru) {
+ /* ensure each CPU's lru list has >=1 elements.
+ * since we are at it, make each lru list has the same
+ * number of elements.
+ */
+ htab->map.max_entries = roundup(attr->max_entries,
+ num_possible_cpus());
+ if (htab->map.max_entries < attr->max_entries)
+ htab->map.max_entries = rounddown(attr->max_entries,
+ num_possible_cpus());
+ }
+
/* hash table size must be power of 2 */
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
@@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock);
}
- if (!percpu) {
+ if (!percpu && !lru) {
+ /* lru itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
err = alloc_extra_elems(htab);
if (err)
goto free_buckets;
}
- if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
- err = prealloc_elems_and_freelist(htab);
+ if (prealloc) {
+ err = prealloc_init(htab);
if (err)
goto free_extra_elems;
}
@@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return l->key + round_up(map->key_size, 8);
+ }
+
+ return NULL;
+}
+
+/* It is called from the bpf_lru_list when the LRU needs to delete
+ * older elements from the htab.
+ */
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
+{
+ struct bpf_htab *htab = (struct bpf_htab *)arg;
+ struct htab_elem *l, *tgt_l;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+
+ tgt_l = container_of(node, struct htab_elem, lru_node);
+ b = __select_bucket(htab, tgt_l->hash);
+ head = &b->head;
+
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l == tgt_l) {
+ hlist_del_rcu(&l->hash_node);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ return l == tgt_l;
+}
+
/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
@@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
}
}
+static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
+ void *value, bool onallcpus)
+{
+ if (!onallcpus) {
+ /* copy true value_size bytes */
+ memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ } else {
+ u32 size = round_up(htab->map.value_size, 8);
+ int off = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+ value + off, size);
+ off += size;
+ }
+ }
+}
+
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
@@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
}
}
- if (!onallcpus) {
- /* copy true value_size bytes */
- memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
- } else {
- int off = 0, cpu;
+ pcpu_copy_value(htab, pptr, value, onallcpus);
- for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
- off += size;
- }
- }
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
} else {
@@ -571,6 +715,70 @@ err:
return ret;
}
+static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old = NULL;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* For LRU, we need to alloc before taking bucket's
+ * spinlock because getting free nodes from LRU may need
+ * to remove older elements from htab and this removal
+ * operation will need a bucket lock.
+ */
+ l_new = prealloc_lru_pop(htab, key, hash);
+ if (!l_new)
+ return -ENOMEM;
+ memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ bpf_lru_node_set_ref(&l_new->lru_node);
+ hlist_del_rcu(&l_old->hash_node);
+ }
+ ret = 0;
+
+err:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ if (ret)
+ bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ else if (l_old)
+ bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+
+ return ret;
+}
+
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags,
bool onallcpus)
@@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
goto err;
if (l_old) {
- void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
- u32 size = htab->map.value_size;
-
/* per-cpu hash map can update value in-place */
- if (!onallcpus) {
- memcpy(this_cpu_ptr(pptr), value, size);
- } else {
- int off = 0, cpu;
-
- size = round_up(size, 8);
- for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
- off += size;
- }
- }
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
hash, true, onallcpus, false);
@@ -637,12 +832,84 @@ err:
return ret;
}
+static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new = NULL, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* For LRU, we need to alloc before taking bucket's
+ * spinlock because LRU's elem alloc may need
+ * to remove older elem from htab and this removal
+ * operation will need a bucket lock.
+ */
+ if (map_flags != BPF_EXIST) {
+ l_new = prealloc_lru_pop(htab, key, hash);
+ if (!l_new)
+ return -ENOMEM;
+ }
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ if (l_old) {
+ bpf_lru_node_set_ref(&l_old->lru_node);
+
+ /* per-cpu hash map can update value in-place */
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
+ } else {
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
+ value, onallcpus);
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ l_new = NULL;
+ }
+ ret = 0;
+err:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ if (l_new)
+ bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ return ret;
+}
+
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}
+static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
+ false);
+}
+
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
@@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
}
+static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct bucket *b;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ ret = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ if (l)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ return ret;
+}
+
static void delete_all_elements(struct bpf_htab *htab)
{
int i;
@@ -708,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map)
* not have executed. Wait for them.
*/
rcu_barrier();
- if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+ if (htab->map.map_flags & BPF_F_NO_PREALLOC)
delete_all_elements(htab);
- } else {
- htab_free_elems(htab);
- pcpu_freelist_destroy(&htab->freelist);
- }
+ else
+ prealloc_destroy(htab);
+
free_percpu(htab->extra_elems);
kvfree(htab->buckets);
kfree(htab);
@@ -733,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = {
.type = BPF_MAP_TYPE_HASH,
};
+static const struct bpf_map_ops htab_lru_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_update_elem = htab_lru_map_update_elem,
+ .map_delete_elem = htab_lru_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_lru_type __read_mostly = {
+ .ops = &htab_lru_ops,
+ .type = BPF_MAP_TYPE_LRU_HASH,
+};
+
/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -744,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+ }
+
+ return NULL;
+}
+
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l;
void __percpu *pptr;
int ret = -ENOENT;
@@ -761,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
l = __htab_map_lookup_elem(map, key);
if (!l)
goto out;
+ if (htab_is_lru(htab))
+ bpf_lru_node_set_ref(&l->lru_node);
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off,
@@ -776,10 +1104,16 @@ out:
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
int ret;
rcu_read_lock();
- ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+ if (htab_is_lru(htab))
+ ret = __htab_lru_percpu_map_update_elem(map, key, value,
+ map_flags, true);
+ else
+ ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
+ true);
rcu_read_unlock();
return ret;
@@ -799,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = {
.type = BPF_MAP_TYPE_PERCPU_HASH,
};
+static const struct bpf_map_ops htab_lru_percpu_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+ .map_update_elem = htab_lru_percpu_map_update_elem,
+ .map_delete_elem = htab_lru_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = {
+ .ops = &htab_lru_percpu_ops,
+ .type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
+};
+
static int __init register_htab_map(void)
{
bpf_register_map_type(&htab_type);
bpf_register_map_type(&htab_percpu_type);
+ bpf_register_map_type(&htab_lru_type);
+ bpf_register_map_type(&htab_lru_percpu_type);
return 0;
}
late_initcall(register_htab_map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 233e3ac836a6..ce1b7de7d72c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -292,6 +292,7 @@ static int map_lookup_elem(union bpf_attr *attr)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else
@@ -302,7 +303,8 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -366,6 +368,7 @@ static int map_update_elem(union bpf_attr *attr)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else
@@ -385,7 +388,8 @@ static int map_update_elem(union bpf_attr *attr)
*/
preempt_disable();
__this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ac87f9c068ae..f394ac616ed8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -2,6 +2,7 @@
obj- := dummy.o
# List of programs to build
+hostprogs-y := test_lru_dist
hostprogs-y += sock_example
hostprogs-y += fds_example
hostprogs-y += sockex1
@@ -28,6 +29,7 @@ hostprogs-y += trace_event
hostprogs-y += sampleip
hostprogs-y += tc_l2_redirect
+test_lru_dist-objs := test_lru_dist.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
fds_example-objs := bpf_load.o libbpf.o fds_example.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index 311538e5a701..7ee1574c8ccf 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -19,6 +19,21 @@ struct bpf_map_def SEC("maps") hash_map = {
.max_entries = MAX_ENTRIES,
};
+struct bpf_map_def SEC("maps") lru_hash_map = {
+ .type = BPF_MAP_TYPE_LRU_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") percpu_lru_hash_map = {
+ .type = BPF_MAP_TYPE_LRU_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 10000,
+ .map_flags = BPF_F_NO_COMMON_LRU,
+};
+
struct bpf_map_def SEC("maps") percpu_hash_map = {
.type = BPF_MAP_TYPE_PERCPU_HASH,
.key_size = sizeof(u32),
@@ -53,6 +68,7 @@ int stress_hmap(struct pt_regs *ctx)
value = bpf_map_lookup_elem(&hash_map, &key);
if (value)
bpf_map_delete_elem(&hash_map, &key);
+
return 0;
}
@@ -96,5 +112,28 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx)
bpf_map_delete_elem(&percpu_hash_map_alloc, &key);
return 0;
}
+
+SEC("kprobe/sys_getpid")
+int stress_lru_hmap_alloc(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_prandom_u32();
+ long val = 1;
+
+ bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY);
+
+ return 0;
+}
+
+SEC("kprobe/sys_getppid")
+int stress_percpu_lru_hmap_alloc(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_prandom_u32();
+ long val = 1;
+
+ bpf_map_update_elem(&percpu_lru_hash_map, &key, &val, BPF_ANY);
+
+ return 0;
+}
+
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index 3147377e8fd3..9505b4d112f4 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -35,6 +35,8 @@ static __u64 time_get_ns(void)
#define PERCPU_HASH_PREALLOC (1 << 1)
#define HASH_KMALLOC (1 << 2)
#define PERCPU_HASH_KMALLOC (1 << 3)
+#define LRU_HASH_PREALLOC (1 << 4)
+#define PERCPU_LRU_HASH_PREALLOC (1 << 5)
static int test_flags = ~0;
@@ -50,6 +52,30 @@ static void test_hash_prealloc(int cpu)
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
+static void test_lru_hash_prealloc(int cpu)
+{
+ __u64 start_time;
+ int i;
+
+ start_time = time_get_ns();
+ for (i = 0; i < MAX_CNT; i++)
+ syscall(__NR_getpid);
+ printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n",
+ cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+}
+
+static void test_percpu_lru_hash_prealloc(int cpu)
+{
+ __u64 start_time;
+ int i;
+
+ start_time = time_get_ns();
+ for (i = 0; i < MAX_CNT; i++)
+ syscall(__NR_getppid);
+ printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n",
+ cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+}
+
static void test_percpu_hash_prealloc(int cpu)
{
__u64 start_time;
@@ -105,6 +131,12 @@ static void loop(int cpu)
if (test_flags & PERCPU_HASH_KMALLOC)
test_percpu_hash_kmalloc(cpu);
+
+ if (test_flags & LRU_HASH_PREALLOC)
+ test_lru_hash_prealloc(cpu);
+
+ if (test_flags & PERCPU_LRU_HASH_PREALLOC)
+ test_percpu_lru_hash_prealloc(cpu);
}
static void run_perf_test(int tasks)
diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c
new file mode 100644
index 000000000000..2859977b7f37
--- /dev/null
+++ b/samples/bpf/test_lru_dist.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include <linux/types.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <sched.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <time.h>
+#include "libbpf.h"
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) );})
+
+static int nr_cpus;
+static unsigned long long *dist_keys;
+static unsigned int dist_key_counts;
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+ list->next = list;
+ list->prev = list;
+}
+
+static inline int list_empty(const struct list_head *head)
+{
+ return head->next == head;
+}
+
+static inline void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head, head->next);
+}
+
+static inline void __list_del(struct list_head *prev, struct list_head *next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+static inline void __list_del_entry(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+}
+
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+ __list_del_entry(list);
+ list_add(list, head);
+}
+
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+#define list_last_entry(ptr, type, member) \
+ list_entry((ptr)->prev, type, member)
+
+struct pfect_lru_node {
+ struct list_head list;
+ unsigned long long key;
+};
+
+struct pfect_lru {
+ struct list_head list;
+ struct pfect_lru_node *free_nodes;
+ unsigned int cur_size;
+ unsigned int lru_size;
+ unsigned int nr_unique;
+ unsigned int nr_misses;
+ unsigned int total;
+ int map_fd;
+};
+
+static void pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size,
+ unsigned int nr_possible_elems)
+{
+ lru->map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
+ sizeof(unsigned long long),
+ sizeof(struct pfect_lru_node *),
+ nr_possible_elems, 0);
+ assert(lru->map_fd != -1);
+
+ lru->free_nodes = malloc(lru_size * sizeof(struct pfect_lru_node));
+ assert(lru->free_nodes);
+
+ INIT_LIST_HEAD(&lru->list);
+ lru->cur_size = 0;
+ lru->lru_size = lru_size;
+ lru->nr_unique = lru->nr_misses = lru->total = 0;
+}
+
+static void pfect_lru_destroy(struct pfect_lru *lru)
+{
+ close(lru->map_fd);
+ free(lru->free_nodes);
+}
+
+static int pfect_lru_lookup_or_insert(struct pfect_lru *lru,
+ unsigned long long key)
+{
+ struct pfect_lru_node *node = NULL;
+ int seen = 0;
+
+ lru->total++;
+ if (!bpf_lookup_elem(lru->map_fd, &key, &node)) {
+ if (node) {
+ list_move(&node->list, &lru->list);
+ return 1;
+ }
+ seen = 1;
+ }
+
+ if (lru->cur_size < lru->lru_size) {
+ node = &lru->free_nodes[lru->cur_size++];
+ INIT_LIST_HEAD(&node->list);
+ } else {
+ struct pfect_lru_node *null_node = NULL;
+
+ node = list_last_entry(&lru->list,
+ struct pfect_lru_node,
+ list);
+ bpf_update_elem(lru->map_fd, &node->key, &null_node, BPF_EXIST);
+ }
+
+ node->key = key;
+ list_move(&node->list, &lru->list);
+
+ lru->nr_misses++;
+ if (seen) {
+ assert(!bpf_update_elem(lru->map_fd, &key, &node, BPF_EXIST));
+ } else {
+ lru->nr_unique++;
+ assert(!bpf_update_elem(lru->map_fd, &key, &node, BPF_NOEXIST));
+ }
+
+ return seen;
+}
+
+static unsigned int read_keys(const char *dist_file,
+ unsigned long long **keys)
+{
+ struct stat fst;
+ unsigned long long *retkeys;
+ unsigned int counts = 0;
+ int dist_fd;
+ char *b, *l;
+ int i;
+
+ dist_fd = open(dist_file, 0);
+ assert(dist_fd != -1);
+
+ assert(fstat(dist_fd, &fst) == 0);
+ b = malloc(fst.st_size);
+ assert(b);
+
+ assert(read(dist_fd, b, fst.st_size) == fst.st_size);
+ close(dist_fd);
+ for (i = 0; i < fst.st_size; i++) {
+ if (b[i] == '\n')
+ counts++;
+ }
+ counts++; /* in case the last line has no \n */
+
+ retkeys = malloc(counts * sizeof(unsigned long long));
+ assert(retkeys);
+
+ counts = 0;
+ for (l = strtok(b, "\n"); l; l = strtok(NULL, "\n"))
+ retkeys[counts++] = strtoull(l, NULL, 10);
+ free(b);
+
+ *keys = retkeys;
+
+ return counts;
+}
+
+static int create_map(int map_type, int map_flags, unsigned int size)
+{
+ int map_fd;
+
+ map_fd = bpf_create_map(map_type, sizeof(unsigned long long),
+ sizeof(unsigned long long), size, map_flags);
+
+ if (map_fd == -1)
+ perror("bpf_create_map");
+
+ return map_fd;
+}
+
+static int sched_next_online(int pid, int next_to_try)
+{
+ cpu_set_t cpuset;
+
+ if (next_to_try == nr_cpus)
+ return -1;
+
+ while (next_to_try < nr_cpus) {
+ CPU_ZERO(&cpuset);
+ CPU_SET(next_to_try++, &cpuset);
+ if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset))
+ break;
+ }
+
+ return next_to_try;
+}
+
+static void run_parallel(unsigned int tasks, void (*fn)(int i, void *data),
+ void *data)
+{
+ int next_sched_cpu = 0;
+ pid_t pid[tasks];
+ int i;
+
+ for (i = 0; i < tasks; i++) {
+ pid[i] = fork();
+ if (pid[i] == 0) {
+ next_sched_cpu = sched_next_online(0, next_sched_cpu);
+ fn(i, data);
+ exit(0);
+ } else if (pid[i] == -1) {
+ printf("couldn't spawn #%d process\n", i);
+ exit(1);
+ }
+ /* It is mostly redundant and just allow the parent
+ * process to update next_shced_cpu for the next child
+ * process
+ */
+ next_sched_cpu = sched_next_online(pid[i], next_sched_cpu);
+ }
+ for (i = 0; i < tasks; i++) {
+ int status;
+
+ assert(waitpid(pid[i], &status, 0) == pid[i]);
+ assert(status == 0);
+ }
+}
+
+static void do_test_lru_dist(int task, void *data)
+{
+ unsigned int nr_misses = 0;
+ struct pfect_lru pfect_lru;
+ unsigned long long key, value = 1234;
+ unsigned int i;
+
+ unsigned int lru_map_fd = ((unsigned int *)data)[0];
+ unsigned int lru_size = ((unsigned int *)data)[1];
+ unsigned long long key_offset = task * dist_key_counts;
+
+ pfect_lru_init(&pfect_lru, lru_size, dist_key_counts);
+
+ for (i = 0; i < dist_key_counts; i++) {
+ key = dist_keys[i] + key_offset;
+
+ pfect_lru_lookup_or_insert(&pfect_lru, key);
+
+ if (!bpf_lookup_elem(lru_map_fd, &key, &value))
+ continue;
+
+ if (bpf_update_elem(lru_map_fd, &key, &value, BPF_NOEXIST)) {
+ printf("bpf_update_elem(lru_map_fd, %llu): errno:%d\n",
+ key, errno);
+ assert(0);
+ }
+
+ nr_misses++;
+ }
+
+ printf(" task:%d BPF LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
+ task, pfect_lru.nr_unique, dist_key_counts, nr_misses,
+ dist_key_counts);
+ printf(" task:%d Perfect LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
+ task, pfect_lru.nr_unique, pfect_lru.total,
+ pfect_lru.nr_misses, pfect_lru.total);
+
+ pfect_lru_destroy(&pfect_lru);
+ close(lru_map_fd);
+}
+
+static void test_parallel_lru_dist(int map_type, int map_flags,
+ int nr_tasks, unsigned int lru_size)
+{
+ int child_data[2];
+ int lru_map_fd;
+
+ printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
+ map_flags);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ lru_map_fd = create_map(map_type, map_flags,
+ nr_cpus * lru_size);
+ else
+ lru_map_fd = create_map(map_type, map_flags,
+ nr_tasks * lru_size);
+ assert(lru_map_fd != -1);
+
+ child_data[0] = lru_map_fd;
+ child_data[1] = lru_size;
+
+ run_parallel(nr_tasks, do_test_lru_dist, child_data);
+
+ close(lru_map_fd);
+}
+
+static void test_lru_loss0(int map_type, int map_flags)
+{
+ unsigned long long key, value[nr_cpus];
+ unsigned int old_unused_losses = 0;
+ unsigned int new_unused_losses = 0;
+ unsigned int used_losses = 0;
+ int map_fd;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, 0) != -1);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ map_fd = create_map(map_type, map_flags, 900 * nr_cpus);
+ else
+ map_fd = create_map(map_type, map_flags, 900);
+
+ assert(map_fd != -1);
+
+ value[0] = 1234;
+
+ for (key = 1; key <= 1000; key++) {
+ int start_key, end_key;
+
+ assert(bpf_update_elem(map_fd, &key, value, BPF_NOEXIST) == 0);
+
+ start_key = 101;
+ end_key = min(key, 900);
+
+ while (start_key <= end_key) {
+ bpf_lookup_elem(map_fd, &start_key, value);
+ start_key++;
+ }
+ }
+
+ for (key = 1; key <= 1000; key++) {
+ if (bpf_lookup_elem(map_fd, &key, value)) {
+ if (key <= 100)
+ old_unused_losses++;
+ else if (key <= 900)
+ used_losses++;
+ else
+ new_unused_losses++;
+ }
+ }
+
+ close(map_fd);
+
+ printf("older-elem-losses:%d(/100) active-elem-losses:%d(/800) "
+ "newer-elem-losses:%d(/100)\n",
+ old_unused_losses, used_losses, new_unused_losses);
+}
+
+static void test_lru_loss1(int map_type, int map_flags)
+{
+ unsigned long long key, value[nr_cpus];
+ int map_fd;
+ unsigned int nr_losses = 0;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, 0) != -1);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ map_fd = create_map(map_type, map_flags, 1000 * nr_cpus);
+ else
+ map_fd = create_map(map_type, map_flags, 1000);
+
+ assert(map_fd != -1);
+
+ value[0] = 1234;
+
+ for (key = 1; key <= 1000; key++)
+ assert(!bpf_update_elem(map_fd, &key, value, BPF_NOEXIST));
+
+ for (key = 1; key <= 1000; key++) {
+ if (bpf_lookup_elem(map_fd, &key, value))
+ nr_losses++;
+ }
+
+ close(map_fd);
+
+ printf("nr_losses:%d(/1000)\n", nr_losses);
+}
+
+static void do_test_parallel_lru_loss(int task, void *data)
+{
+ const unsigned int nr_stable_elems = 1000;
+ const unsigned int nr_repeats = 100000;
+
+ int map_fd = *(int *)data;
+ unsigned long long stable_base;
+ unsigned long long key, value[nr_cpus];
+ unsigned long long next_ins_key;
+ unsigned int nr_losses = 0;
+ unsigned int i;
+
+ stable_base = task * nr_repeats * 2 + 1;
+ next_ins_key = stable_base;
+ value[0] = 1234;
+ for (i = 0; i < nr_stable_elems; i++) {
+ assert(bpf_update_elem(map_fd, &next_ins_key, value,
+ BPF_NOEXIST) == 0);
+ next_ins_key++;
+ }
+
+ for (i = 0; i < nr_repeats; i++) {
+ int rn;
+
+ rn = rand();
+
+ if (rn % 10) {
+ key = rn % nr_stable_elems + stable_base;
+ bpf_lookup_elem(map_fd, &key, value);
+ } else {
+ bpf_update_elem(map_fd, &next_ins_key, value,
+ BPF_NOEXIST);
+ next_ins_key++;
+ }
+ }
+
+ key = stable_base;
+ for (i = 0; i < nr_stable_elems; i++) {
+ if (bpf_lookup_elem(map_fd, &key, value))
+ nr_losses++;
+ key++;
+ }
+
+ printf(" task:%d nr_losses:%u\n", task, nr_losses);
+}
+
+static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks)
+{
+ int map_fd;
+
+ printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
+ map_flags);
+
+ /* Give 20% more than the active working set */
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ map_fd = create_map(map_type, map_flags,
+ nr_cpus * (1000 + 200));
+ else
+ map_fd = create_map(map_type, map_flags,
+ nr_tasks * (1000 + 200));
+
+ assert(map_fd != -1);
+
+ run_parallel(nr_tasks, do_test_parallel_lru_loss, &map_fd);
+
+ close(map_fd);
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int map_flags[] = {0, BPF_F_NO_COMMON_LRU};
+ const char *dist_file;
+ int nr_tasks = 1;
+ int lru_size;
+ int f;
+
+ if (argc < 4) {
+ printf("Usage: %s <dist-file> <lru-size> <nr-tasks>\n",
+ argv[0]);
+ return -1;
+ }
+
+ dist_file = argv[1];
+ lru_size = atoi(argv[2]);
+ nr_tasks = atoi(argv[3]);
+
+ setbuf(stdout, NULL);
+
+ assert(!setrlimit(RLIMIT_MEMLOCK, &r));
+
+ srand(time(NULL));
+
+ nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ assert(nr_cpus != -1);
+ printf("nr_cpus:%d\n\n", nr_cpus);
+
+ nr_tasks = min(nr_tasks, nr_cpus);
+
+ dist_key_counts = read_keys(dist_file, &dist_keys);
+ if (!dist_key_counts) {
+ printf("%s has no key\n", dist_file);
+ return -1;
+ }
+
+ for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) {
+ test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
+ test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
+ test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
+ nr_tasks);
+ test_parallel_lru_dist(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
+ nr_tasks, lru_size);
+ printf("\n");
+ }
+
+ free(dist_keys);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 4761e2d65ab8..7a5f24543a5f 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,8 +1,8 @@
-CFLAGS += -Wall -O2
+CFLAGS += -Wall -O2 -I../../../../usr/include
-test_objs = test_verifier test_maps
+test_objs = test_verifier test_maps test_lru_map
-TEST_PROGS := test_verifier test_maps test_kmod.sh
+TEST_PROGS := test_verifier test_maps test_lru_map test_kmod.sh
TEST_FILES := $(test_objs)
all: $(test_objs)
diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
new file mode 100644
index 000000000000..627757ed7836
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <sched.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <time.h>
+#include "bpf_sys.h"
+
+#define LOCAL_FREE_TARGET (128)
+#define PERCPU_FREE_TARGET (16)
+
+static int nr_cpus;
+
+static int create_map(int map_type, int map_flags, unsigned int size)
+{
+ int map_fd;
+
+ map_fd = bpf_map_create(map_type, sizeof(unsigned long long),
+ sizeof(unsigned long long), size, map_flags);
+
+ if (map_fd == -1)
+ perror("bpf_map_create");
+
+ return map_fd;
+}
+
+static int map_subset(int map0, int map1)
+{
+ unsigned long long next_key = 0;
+ unsigned long long value0[nr_cpus], value1[nr_cpus];
+ int ret;
+
+ while (!bpf_map_next_key(map1, &next_key, &next_key)) {
+ assert(!bpf_map_lookup(map1, &next_key, value1));
+ ret = bpf_map_lookup(map0, &next_key, value0);
+ if (ret) {
+ printf("key:%llu not found from map. %s(%d)\n",
+ next_key, strerror(errno), errno);
+ return 0;
+ }
+ if (value0[0] != value1[0]) {
+ printf("key:%llu value0:%llu != value1:%llu\n",
+ next_key, value0[0], value1[0]);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int map_equal(int lru_map, int expected)
+{
+ return map_subset(lru_map, expected) && map_subset(expected, lru_map);
+}
+
+static int sched_next_online(int pid, int next_to_try)
+{
+ cpu_set_t cpuset;
+
+ if (next_to_try == nr_cpus)
+ return -1;
+
+ while (next_to_try < nr_cpus) {
+ CPU_ZERO(&cpuset);
+ CPU_SET(next_to_try++, &cpuset);
+ if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset))
+ break;
+ }
+
+ return next_to_try;
+}
+
+/* Size of the LRU amp is 2
+ * Add key=1 (+1 key)
+ * Add key=2 (+1 key)
+ * Lookup Key=1
+ * Add Key=3
+ * => Key=2 will be removed by LRU
+ * Iterate map. Only found key=1 and key=3
+ */
+static void test_lru_sanity0(int map_type, int map_flags)
+{
+ unsigned long long key, value[nr_cpus];
+ int lru_map_fd, expected_map_fd;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, 0) != -1);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ lru_map_fd = create_map(map_type, map_flags, 2 * nr_cpus);
+ else
+ lru_map_fd = create_map(map_type, map_flags, 2);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ /* insert key=1 element */
+
+ key = 1;
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update(expected_map_fd, &key, value, BPF_NOEXIST));
+
+ /* BPF_NOEXIST means: add new element if it doesn't exist */
+ assert(bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST) == -1 &&
+ /* key=1 already exists */
+ errno == EEXIST);
+
+ assert(bpf_map_update(lru_map_fd, &key, value, -1) == -1 &&
+ errno == EINVAL);
+
+ /* insert key=2 element */
+
+ /* check that key=2 is not found */
+ key = 2;
+ assert(bpf_map_lookup(lru_map_fd, &key, value) == -1 &&
+ errno == ENOENT);
+
+ /* BPF_EXIST means: update existing element */
+ assert(bpf_map_update(lru_map_fd, &key, value, BPF_EXIST) == -1 &&
+ /* key=2 is not there */
+ errno == ENOENT);
+
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+ /* insert key=3 element */
+
+ /* check that key=3 is not found */
+ key = 3;
+ assert(bpf_map_lookup(lru_map_fd, &key, value) == -1 &&
+ errno == ENOENT);
+
+ /* check that key=1 can be found and mark the ref bit to
+ * stop LRU from removing key=1
+ */
+ key = 1;
+ assert(!bpf_map_lookup(lru_map_fd, &key, value));
+ assert(value[0] == 1234);
+
+ key = 3;
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update(expected_map_fd, &key, value, BPF_NOEXIST));
+
+ /* key=2 has been removed from the LRU */
+ key = 2;
+ assert(bpf_map_lookup(lru_map_fd, &key, value) == -1);
+
+ assert(map_equal(lru_map_fd, expected_map_fd));
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+/* Size of the LRU map is 1.5*tgt_free
+ * Insert 1 to tgt_free (+tgt_free keys)
+ * Lookup 1 to tgt_free/2
+ * Insert 1+tgt_free to 2*tgt_free (+tgt_free keys)
+ * => 1+tgt_free/2 to LOCALFREE_TARGET will be removed by LRU
+ */
+static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
+{
+ unsigned long long key, end_key, value[nr_cpus];
+ int lru_map_fd, expected_map_fd;
+ unsigned int batch_size;
+ unsigned int map_size;
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ /* Ther percpu lru list (i.e each cpu has its own LRU
+ * list) does not have a local free list. Hence,
+ * it will only free old nodes till there is no free
+ * from the LRU list. Hence, this test does not apply
+ * to BPF_F_NO_COMMON_LRU
+ */
+ return;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, 0) != -1);
+
+ batch_size = tgt_free / 2;
+ assert(batch_size * 2 == tgt_free);
+
+ map_size = tgt_free + batch_size;
+ lru_map_fd = create_map(map_type, map_flags, map_size);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ /* Insert 1 to tgt_free (+tgt_free keys) */
+ end_key = 1 + tgt_free;
+ for (key = 1; key < end_key; key++)
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+ /* Lookup 1 to tgt_free/2 */
+ end_key = 1 + batch_size;
+ for (key = 1; key < end_key; key++) {
+ assert(!bpf_map_lookup(lru_map_fd, &key, value));
+ assert(!bpf_map_update(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ /* Insert 1+tgt_free to 2*tgt_free
+ * => 1+tgt_free/2 to LOCALFREE_TARGET will be
+ * removed by LRU
+ */
+ key = 1 + tgt_free;
+ end_key = key + tgt_free;
+ for (; key < end_key; key++) {
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ assert(map_equal(lru_map_fd, expected_map_fd));
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+/* Size of the LRU map 1.5 * tgt_free
+ * Insert 1 to tgt_free (+tgt_free keys)
+ * Update 1 to tgt_free/2
+ * => The original 1 to tgt_free/2 will be removed due to
+ * the LRU shrink process
+ * Re-insert 1 to tgt_free/2 again and do a lookup immeidately
+ * Insert 1+tgt_free to tgt_free*3/2
+ * Insert 1+tgt_free*3/2 to tgt_free*5/2
+ * => Key 1+tgt_free to tgt_free*3/2
+ * will be removed from LRU because it has never
+ * been lookup and ref bit is not set
+ */
+static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
+{
+ unsigned long long key, value[nr_cpus];
+ unsigned long long end_key;
+ int lru_map_fd, expected_map_fd;
+ unsigned int batch_size;
+ unsigned int map_size;
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ /* Ther percpu lru list (i.e each cpu has its own LRU
+ * list) does not have a local free list. Hence,
+ * it will only free old nodes till there is no free
+ * from the LRU list. Hence, this test does not apply
+ * to BPF_F_NO_COMMON_LRU
+ */
+ return;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, 0) != -1);
+
+ batch_size = tgt_free / 2;
+ assert(batch_size * 2 == tgt_free);
+
+ map_size = tgt_free + batch_size;
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ lru_map_fd = create_map(map_type, map_flags,
+ map_size * nr_cpus);
+ else
+ lru_map_fd = create_map(map_type, map_flags, map_size);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ /* Insert 1 to tgt_free (+tgt_free keys) */
+ end_key = 1 + tgt_free;
+ for (key = 1; key < end_key; key++)
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+ /* Any bpf_map_update will require to acquire a new node
+ * from LRU first.
+ *
+ * The local list is running out of free nodes.
+ * It gets from the global LRU list which tries to
+ * shrink the inactive list to get tgt_free
+ * number of free nodes.
+ *
+ * Hence, the oldest key 1 to tgt_free/2
+ * are removed from the LRU list.
+ */
+ key = 1;
+ if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_delete(lru_map_fd, &key));
+ } else {
+ assert(bpf_map_update(lru_map_fd, &key, value, BPF_EXIST));
+ }
+
+ /* Re-insert 1 to tgt_free/2 again and do a lookup
+ * immeidately.
+ */
+ end_key = 1 + batch_size;
+ value[0] = 4321;
+ for (key = 1; key < end_key; key++) {
+ assert(bpf_map_lookup(lru_map_fd, &key, value));
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_lookup(lru_map_fd, &key, value));
+ assert(value[0] == 4321);
+ assert(!bpf_map_update(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ value[0] = 1234;
+
+ /* Insert 1+tgt_free to tgt_free*3/2 */
+ end_key = 1 + tgt_free + batch_size;
+ for (key = 1 + tgt_free; key < end_key; key++)
+ /* These newly added but not referenced keys will be
+ * gone during the next LRU shrink.
+ */
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+ /* Insert 1+tgt_free*3/2 to tgt_free*5/2 */
+ end_key = key + tgt_free;
+ for (; key < end_key; key++) {
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ assert(map_equal(lru_map_fd, expected_map_fd));
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+/* Size of the LRU map is 2*tgt_free
+ * It is to test the active/inactive list rotation
+ * Insert 1 to 2*tgt_free (+2*tgt_free keys)
+ * Lookup key 1 to tgt_free*3/2
+ * Add 1+2*tgt_free to tgt_free*5/2 (+tgt_free/2 keys)
+ * => key 1+tgt_free*3/2 to 2*tgt_free are removed from LRU
+ */
+static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
+{
+ unsigned long long key, end_key, value[nr_cpus];
+ int lru_map_fd, expected_map_fd;
+ unsigned int batch_size;
+ unsigned int map_size;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, 0) != -1);
+
+ batch_size = tgt_free / 2;
+ assert(batch_size * 2 == tgt_free);
+
+ map_size = tgt_free * 2;
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ lru_map_fd = create_map(map_type, map_flags,
+ map_size * nr_cpus);
+ else
+ lru_map_fd = create_map(map_type, map_flags, map_size);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ /* Insert 1 to 2*tgt_free (+2*tgt_free keys) */
+ end_key = 1 + (2 * tgt_free);
+ for (key = 1; key < end_key; key++)
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+ /* Lookup key 1 to tgt_free*3/2 */
+ end_key = tgt_free + batch_size;
+ for (key = 1; key < end_key; key++) {
+ assert(!bpf_map_lookup(lru_map_fd, &key, value));
+ assert(!bpf_map_update(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ /* Add 1+2*tgt_free to tgt_free*5/2
+ * (+tgt_free/2 keys)
+ */
+ key = 2 * tgt_free + 1;
+ end_key = key + batch_size;
+ for (; key < end_key; key++) {
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ assert(map_equal(lru_map_fd, expected_map_fd));
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+/* Test deletion */
+static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
+{
+ int lru_map_fd, expected_map_fd;
+ unsigned long long key, value[nr_cpus];
+ unsigned long long end_key;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, 0) != -1);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ lru_map_fd = create_map(map_type, map_flags,
+ 3 * tgt_free * nr_cpus);
+ else
+ lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0,
+ 3 * tgt_free);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ for (key = 1; key <= 2 * tgt_free; key++)
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+ key = 1;
+ assert(bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+
+ for (key = 1; key <= tgt_free; key++) {
+ assert(!bpf_map_lookup(lru_map_fd, &key, value));
+ assert(!bpf_map_update(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ for (; key <= 2 * tgt_free; key++) {
+ assert(!bpf_map_delete(lru_map_fd, &key));
+ assert(bpf_map_delete(lru_map_fd, &key));
+ }
+
+ end_key = key + 2 * tgt_free;
+ for (; key < end_key; key++) {
+ assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ assert(map_equal(lru_map_fd, expected_map_fd));
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+static void do_test_lru_sanity5(unsigned long long last_key, int map_fd)
+{
+ unsigned long long key, value[nr_cpus];
+
+ /* Ensure the last key inserted by previous CPU can be found */
+ assert(!bpf_map_lookup(map_fd, &last_key, value));
+
+ value[0] = 1234;
+
+ key = last_key + 1;
+ assert(!bpf_map_update(map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_lookup(map_fd, &key, value));
+
+ /* Cannot find the last key because it was removed by LRU */
+ assert(bpf_map_lookup(map_fd, &last_key, value));
+}
+
+/* Test map with only one element */
+static void test_lru_sanity5(int map_type, int map_flags)
+{
+ unsigned long long key, value[nr_cpus];
+ int next_sched_cpu = 0;
+ int map_fd;
+ int i;
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ return;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ map_fd = create_map(map_type, map_flags, 1);
+ assert(map_fd != -1);
+
+ value[0] = 1234;
+ key = 0;
+ assert(!bpf_map_update(map_fd, &key, value, BPF_NOEXIST));
+
+ for (i = 0; i < nr_cpus; i++) {
+ pid_t pid;
+
+ pid = fork();
+ if (pid == 0) {
+ next_sched_cpu = sched_next_online(0, next_sched_cpu);
+ if (next_sched_cpu != -1)
+ do_test_lru_sanity5(key, map_fd);
+ exit(0);
+ } else if (pid == -1) {
+ printf("couldn't spawn #%d process\n", i);
+ exit(1);
+ } else {
+ int status;
+
+ /* It is mostly redundant and just allow the parent
+ * process to update next_shced_cpu for the next child
+ * process
+ */
+ next_sched_cpu = sched_next_online(pid, next_sched_cpu);
+
+ assert(waitpid(pid, &status, 0) == pid);
+ assert(status == 0);
+ key++;
+ }
+ }
+
+ close(map_fd);
+
+ printf("Pass\n");
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int map_types[] = {BPF_MAP_TYPE_LRU_HASH,
+ BPF_MAP_TYPE_LRU_PERCPU_HASH};
+ int map_flags[] = {0, BPF_F_NO_COMMON_LRU};
+ int t, f;
+
+ setbuf(stdout, NULL);
+
+ assert(!setrlimit(RLIMIT_MEMLOCK, &r));
+
+ nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ assert(nr_cpus != -1);
+ printf("nr_cpus:%d\n\n", nr_cpus);
+
+ for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) {
+ unsigned int tgt_free = (map_flags[f] & BPF_F_NO_COMMON_LRU) ?
+ PERCPU_FREE_TARGET : LOCAL_FREE_TARGET;
+
+ for (t = 0; t < sizeof(map_types) / sizeof(*map_types); t++) {
+ test_lru_sanity0(map_types[t], map_flags[f]);
+ test_lru_sanity1(map_types[t], map_flags[f], tgt_free);
+ test_lru_sanity2(map_types[t], map_flags[f], tgt_free);
+ test_lru_sanity3(map_types[t], map_flags[f], tgt_free);
+ test_lru_sanity4(map_types[t], map_flags[f], tgt_free);
+ test_lru_sanity5(map_types[t], map_flags[f]);
+
+ printf("\n");
+ }
+ }
+
+ return 0;
+}