aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2011-01-26 20:51:05 -0800
committerDavid S. Miller <davem@davemloft.net>2011-01-26 20:51:05 -0800
commit62fa8a846d7de4b299232e330c74b7783539df76 (patch)
treee401dbdbf4b11cbd27bdc3a47d9dc8b512173c9f /net
parentMerge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6 (diff)
downloadlinux-dev-62fa8a846d7de4b299232e330c74b7783539df76.tar.xz
linux-dev-62fa8a846d7de4b299232e330c74b7783539df76.zip
net: Implement read-only protection and COW'ing of metrics.
Routing metrics are now copy-on-write. Initially a route entry points it's metrics at a read-only location. If a routing table entry exists, it will point there. Else it will point at the all zero metric place-holder called 'dst_default_metrics'. The writeability state of the metrics is stored in the low bits of the metrics pointer, we have two bits left to spare if we want to store more states. For the initial implementation, COW is implemented simply via kmalloc. However future enhancements will change this to place the writable metrics somewhere else, in order to increase sharing. Very likely this "somewhere else" will be the inetpeer cache. Note also that this means that metrics updates may transiently fail if we cannot COW the metrics successfully. But even by itself, this patch should decrease memory usage and increase cache locality especially for routing workloads. In those cases the read-only metric copies stay in place and never get written to. TCP workloads where metrics get updated, and those rare cases where PMTU triggers occur, will take a very slight performance hit. But that hit will be alleviated when the long-term writable metrics move to a more sharable location. Since the metrics storage went from a u32 array of RTAX_MAX entries to what is essentially a pointer, some retooling of the dst_entry layout was necessary. Most importantly, we need to preserve the alignment of the reference count so that it doesn't share cache lines with the read-mostly state, as per Eric Dumazet's alignment assertion checks. The only non-trivial bit here is the move of the 'flags' member into the writeable cacheline. This is OK since we are always accessing the flags around the same moment when we made a modification to the reference count. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/core/dst.c39
-rw-r--r--net/decnet/dn_route.c18
-rw-r--r--net/ipv4/route.c45
-rw-r--r--net/ipv4/xfrm4_policy.c4
-rw-r--r--net/ipv6/route.c15
-rw-r--r--net/ipv6/xfrm6_policy.c2
6 files changed, 114 insertions, 9 deletions
diff --git a/net/core/dst.c b/net/core/dst.c
index b99c7c7ffce2..578893505702 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -164,6 +164,8 @@ int dst_discard(struct sk_buff *skb)
}
EXPORT_SYMBOL(dst_discard);
+static const u32 dst_default_metrics[RTAX_MAX];
+
void *dst_alloc(struct dst_ops *ops)
{
struct dst_entry *dst;
@@ -180,6 +182,7 @@ void *dst_alloc(struct dst_ops *ops)
dst->lastuse = jiffies;
dst->path = dst;
dst->input = dst->output = dst_discard;
+ dst_init_metrics(dst, dst_default_metrics, true);
#if RT_CACHE_DEBUG >= 2
atomic_inc(&dst_total);
#endif
@@ -282,6 +285,42 @@ void dst_release(struct dst_entry *dst)
}
EXPORT_SYMBOL(dst_release);
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+
+ if (p) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ kfree(p);
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ }
+ }
+ return p;
+}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
+
+/* Caller asserts that dst_metrics_read_only(dst) is false. */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ unsigned long prev, new;
+
+ new = (unsigned long) dst_default_metrics;
+ prev = cmpxchg(&dst->_metrics, old, new);
+ if (prev == old)
+ kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+
/**
* skb_dst_set_noref - sets skb dst, without a reference
* @skb: buffer
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5e636365d33c..42c9c62d3417 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -112,6 +112,7 @@ static int dn_dst_gc(struct dst_ops *ops);
static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
static unsigned int dn_dst_default_advmss(const struct dst_entry *dst);
static unsigned int dn_dst_default_mtu(const struct dst_entry *dst);
+static void dn_dst_destroy(struct dst_entry *);
static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
static void dn_dst_link_failure(struct sk_buff *);
static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
@@ -133,11 +134,18 @@ static struct dst_ops dn_dst_ops = {
.check = dn_dst_check,
.default_advmss = dn_dst_default_advmss,
.default_mtu = dn_dst_default_mtu,
+ .cow_metrics = dst_cow_metrics_generic,
+ .destroy = dn_dst_destroy,
.negative_advice = dn_dst_negative_advice,
.link_failure = dn_dst_link_failure,
.update_pmtu = dn_dst_update_pmtu,
};
+static void dn_dst_destroy(struct dst_entry *dst)
+{
+ dst_destroy_metrics_generic(dst);
+}
+
static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
{
__u16 tmp = (__u16 __force)(src ^ dst);
@@ -814,14 +822,14 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
{
struct dn_fib_info *fi = res->fi;
struct net_device *dev = rt->dst.dev;
+ unsigned int mss_metric;
struct neighbour *n;
- unsigned int metric;
if (fi) {
if (DN_FIB_RES_GW(*res) &&
DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = DN_FIB_RES_GW(*res);
- dst_import_metrics(&rt->dst, fi->fib_metrics);
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
}
rt->rt_type = res->type;
@@ -834,10 +842,10 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu)
dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu);
- metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
- if (metric) {
+ mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
+ if (mss_metric) {
unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst));
- if (metric > mss)
+ if (mss_metric > mss)
dst_metric_set(&rt->dst, RTAX_ADVMSS, mss);
}
return 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3e5b7cc2db4f..980030d4e4ae 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -152,6 +152,36 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
{
}
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+
+ if (p) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ kfree(p);
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ } else {
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (rt->fi) {
+ fib_info_put(rt->fi);
+ rt->fi = NULL;
+ }
+ }
+ }
+ return p;
+}
+
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
@@ -159,6 +189,7 @@ static struct dst_ops ipv4_dst_ops = {
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
.default_mtu = ipv4_default_mtu,
+ .cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
@@ -1441,6 +1472,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
if (rt->peer)
atomic_inc(&rt->peer->refcnt);
+ if (rt->fi)
+ atomic_inc(&rt->fi->fib_clntref);
if (arp_bind_neighbour(&rt->dst) ||
!(rt->dst.neighbour->nud_state &
@@ -1720,6 +1753,11 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
struct rtable *rt = (struct rtable *) dst;
struct inet_peer *peer = rt->peer;
+ dst_destroy_metrics_generic(dst);
+ if (rt->fi) {
+ fib_info_put(rt->fi);
+ rt->fi = NULL;
+ }
if (peer) {
rt->peer = NULL;
inet_putpeer(peer);
@@ -1824,7 +1862,9 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
if (FIB_RES_GW(*res) &&
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = FIB_RES_GW(*res);
- dst_import_metrics(dst, fi->fib_metrics);
+ rt->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ dst_init_metrics(dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
@@ -2752,6 +2792,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
rt->peer = ort->peer;
if (rt->peer)
atomic_inc(&rt->peer->refcnt);
+ rt->fi = ort->fi;
+ if (rt->fi)
+ atomic_inc(&rt->fi->fib_clntref);
dst_free(new);
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b057d40addec..19fbdec6baaa 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -196,8 +196,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ dst_destroy_metrics_generic(dst);
+
if (likely(xdst->u.rt.peer))
inet_putpeer(xdst->u.rt.peer);
+
xfrm_dst_destroy(xdst);
}
@@ -215,6 +218,7 @@ static struct dst_ops xfrm4_dst_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1534508f6c68..45fafa018f12 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -105,6 +105,7 @@ static struct dst_ops ip6_dst_ops_template = {
.check = ip6_dst_check,
.default_advmss = ip6_default_advmss,
.default_mtu = ip6_default_mtu,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = ip6_dst_destroy,
.ifdown = ip6_dst_ifdown,
.negative_advice = ip6_negative_advice,
@@ -125,6 +126,10 @@ static struct dst_ops ip6_dst_blackhole_ops = {
.update_pmtu = ip6_rt_blackhole_update_pmtu,
};
+static const u32 ip6_template_metrics[RTAX_MAX] = {
+ [RTAX_HOPLIMIT - 1] = 255,
+};
+
static struct rt6_info ip6_null_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
@@ -193,6 +198,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
rt->rt6i_idev = NULL;
in6_dev_put(idev);
}
+ dst_destroy_metrics_generic(dst);
if (peer) {
BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
rt->rt6i_peer = NULL;
@@ -2681,7 +2687,8 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_null_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_null_entry;
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
+ dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
+ ip6_template_metrics, true);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
@@ -2692,7 +2699,8 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_prohibit_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
+ dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
+ ip6_template_metrics, true);
net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -2702,7 +2710,8 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_blk_hole_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
+ dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
+ ip6_template_metrics, true);
#endif
net->ipv6.sysctl.flush_delay = 0;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index da87428681cc..834dc02f1d4f 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -220,6 +220,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
if (likely(xdst->u.rt6.rt6i_idev))
in6_dev_put(xdst->u.rt6.rt6i_idev);
+ dst_destroy_metrics_generic(dst);
if (likely(xdst->u.rt6.rt6i_peer))
inet_putpeer(xdst->u.rt6.rt6i_peer);
xfrm_dst_destroy(xdst);
@@ -257,6 +258,7 @@ static struct dst_ops xfrm6_dst_ops = {
.protocol = cpu_to_be16(ETH_P_IPV6),
.gc = xfrm6_garbage_collect,
.update_pmtu = xfrm6_update_pmtu,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm6_dst_destroy,
.ifdown = xfrm6_dst_ifdown,
.local_out = __ip6_local_out,