about summary refs log tree commit diff stats
path: root/net/packet
diff options
context:
space:
mode:
Diffstat (limited to 'net/packet')
-rw-r--r-- net/packet/Kconfig 4
-rw-r--r-- net/packet/af_packet.c 479
-rw-r--r-- net/packet/internal.h 14
3 files changed, 299 insertions, 198 deletions
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
index b4abad135294..2997382d597c 100644
--- a/net/packet/Kconfig
+++ b/net/packet/Kconfig
@@ -5,7 +5,7 @@
config PACKET
tristate "Packet socket"
- ---help---
+ help
The Packet protocol is used by applications which communicate
directly with network devices without an intermediate network
protocol implemented in the kernel, e.g. tcpdump. If you want them
@@ -20,6 +20,6 @@ config PACKET_DIAG
tristate "Packet: sockets monitoring interface"
depends on PACKET
default n
- ---help---
+ help
Support for PF_PACKET sockets monitoring interface used by the ss tool.
If unsure, say Y.
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e5b0986215d2..6ce8dd19f33c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -46,6 +46,10 @@
* Copyright (C) 2011, <lokec@ccs.neu.edu>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/ethtool.h>
+#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
@@ -88,57 +92,62 @@
#endif
#include <linux/bpf.h>
#include <net/compat.h>
+#include <linux/netfilter_netdev.h>
#include "internal.h"
/*
Assumptions:
- - if device has no dev->hard_header routine, it adds and removes ll header
- inside itself. In this case ll header is invisible outside of device,
- but higher levels still should reserve dev->hard_header_len.
- Some devices are enough clever to reallocate skb, when header
- will not fit to reserved space (tunnel), another ones are silly
- (PPP).
+ - If the device has no dev->header_ops->create, there is no LL header
+ visible above the device. In this case, its hard_header_len should be 0.
+ The device may prepend its own header internally. In this case, its
+ needed_headroom should be set to the space needed for it to add its
+ internal header.
+ For example, a WiFi driver pretending to be an Ethernet driver should
+ set its hard_header_len to be the Ethernet header length, and set its
+ needed_headroom to be (the real WiFi header length - the fake Ethernet
+ header length).
- packet socket receives packets with pulled ll header,
so that SOCK_RAW should push it back.
On receive:
-----------
-Incoming, dev->hard_header!=NULL
+Incoming, dev_has_header(dev) == true
mac_header -> ll header
data -> data
-Outgoing, dev->hard_header!=NULL
+Outgoing, dev_has_header(dev) == true
mac_header -> ll header
data -> ll header
-Incoming, dev->hard_header==NULL
- mac_header -> UNKNOWN position. It is very likely, that it points to ll
- header. PPP makes it, that is wrong, because introduce
- assymetry between rx and tx paths.
+Incoming, dev_has_header(dev) == false
+ mac_header -> data
+ However drivers often make it point to the ll header.
+ This is incorrect because the ll header should be invisible to us.
data -> data
-Outgoing, dev->hard_header==NULL
- mac_header -> data. ll header is still not built!
+Outgoing, dev_has_header(dev) == false
+ mac_header -> data. ll header is invisible to us.
data -> data
Resume
- If dev->hard_header==NULL we are unlikely to restore sensible ll header.
+ If dev_has_header(dev) == false we are unable to restore the ll header,
+ because it is invisible to us.
On transmit:
------------
-dev->hard_header != NULL
+dev_has_header(dev) == true
mac_header -> ll header
data -> ll header
-dev->hard_header == NULL (ll header is added by device, we cannot control it)
+dev_has_header(dev) == false (ll header is invisible to us)
mac_header -> data
data -> data
- We should set nh.raw on output to correct posistion,
+ We should set network_header on output to the correct position,
packet classifier depends on it.
*/
@@ -177,7 +186,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
-#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -235,8 +243,42 @@ struct packet_skb_cb {
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);
+#ifdef CONFIG_NETFILTER_EGRESS
+static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
+{
+ struct sk_buff *next, *head = NULL, *tail;
+ int rc;
+
+ rcu_read_lock();
+ for (; skb != NULL; skb = next) {
+ next = skb->next;
+ skb_mark_not_on_list(skb);
+
+ if (!nf_hook_egress(skb, &rc, skb->dev))
+ continue;
+
+ if (!head)
+ head = skb;
+ else
+ tail->next = skb;
+
+ tail = skb;
+ }
+ rcu_read_unlock();
+
+ return head;
+}
+#endif
+
static int packet_direct_xmit(struct sk_buff *skb)
{
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (nf_hook_egress_active()) {
+ skb = nf_hook_direct_egress(skb);
+ if (!skb)
+ return NET_XMIT_DROP;
+ }
+#endif
return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}
@@ -246,8 +288,7 @@ static struct net_device *packet_cached_dev_get(struct packet_sock *po)
rcu_read_lock();
dev = rcu_dereference(po->cached_dev);
- if (likely(dev))
- dev_hold(dev);
+ dev_hold(dev);
rcu_read_unlock();
return dev;
@@ -418,7 +459,8 @@ static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
return TP_STATUS_TS_RAW_HARDWARE;
- if (ktime_to_timespec64_cond(skb->tstamp, ts))
+ if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
+ ktime_to_timespec64_cond(skb_tstamp(skb), ts))
return TP_STATUS_TS_SOFTWARE;
return 0;
@@ -593,6 +635,7 @@ static void init_prb_bdqc(struct packet_sock *po,
req_u->req3.tp_block_size);
p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
+ rwlock_init(&p1->blk_fill_in_prog_lock);
p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
prb_init_ft_ops(p1, req_u);
@@ -659,10 +702,9 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
*
*/
if (BLOCK_NUM_PKTS(pbd)) {
- while (atomic_read(&pkc->blk_fill_in_prog)) {
- /* Waiting for skb_copy_bits to finish... */
- cpu_relax();
- }
+ /* Waiting for skb_copy_bits to finish... */
+ write_lock(&pkc->blk_fill_in_prog_lock);
+ write_unlock(&pkc->blk_fill_in_prog_lock);
}
if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
@@ -921,10 +963,9 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
* the timer-handler already handled this case.
*/
if (!(status & TP_STATUS_BLK_TMO)) {
- while (atomic_read(&pkc->blk_fill_in_prog)) {
- /* Waiting for skb_copy_bits to finish... */
- cpu_relax();
- }
+ /* Waiting for skb_copy_bits to finish... */
+ write_lock(&pkc->blk_fill_in_prog_lock);
+ write_unlock(&pkc->blk_fill_in_prog_lock);
}
prb_close_block(pkc, pbd, po, status);
return;
@@ -942,9 +983,11 @@ static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
}
static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
+ __releases(&pkc->blk_fill_in_prog_lock)
{
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
- atomic_dec(&pkc->blk_fill_in_prog);
+
+ read_unlock(&pkc->blk_fill_in_prog_lock);
}
static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
@@ -989,6 +1032,7 @@ static void prb_fill_curr_block(char *curr,
struct tpacket_kbdq_core *pkc,
struct tpacket_block_desc *pbd,
unsigned int len)
+ __acquires(&pkc->blk_fill_in_prog_lock)
{
struct tpacket3_hdr *ppd;
@@ -998,7 +1042,7 @@ static void prb_fill_curr_block(char *curr,
pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
BLOCK_NUM_PKTS(pbd) += 1;
- atomic_inc(&pkc->blk_fill_in_prog);
+ read_lock(&pkc->blk_fill_in_prog_lock);
prb_run_all_ft_ops(pkc, ppd);
}
@@ -1306,7 +1350,7 @@ static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
if (READ_ONCE(history[i]) == rxhash)
count++;
- victim = prandom_u32() % ROLLOVER_HLEN;
+ victim = prandom_u32_max(ROLLOVER_HLEN);
/* Avoid dirtying the cache line if possible */
if (READ_ONCE(history[victim]) != rxhash)
@@ -1353,7 +1397,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
struct packet_sock *po, *po_next, *po_skip = NULL;
unsigned int i, j, room = ROOM_NONE;
- po = pkt_sk(f->arr[idx]);
+ po = pkt_sk(rcu_dereference(f->arr[idx]));
if (try_self) {
room = packet_rcv_has_room(po, skb);
@@ -1365,7 +1409,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
i = j = min_t(int, po->rollover->sock, num - 1);
do {
- po_next = pkt_sk(f->arr[i]);
+ po_next = pkt_sk(rcu_dereference(f->arr[i]));
if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j)
@@ -1460,7 +1504,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
idx = fanout_demux_rollover(f, skb, idx, true, num);
- po = pkt_sk(f->arr[idx]);
+ po = pkt_sk(rcu_dereference(f->arr[idx]));
return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
@@ -1474,7 +1518,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po)
struct packet_fanout *f = po->fanout;
spin_lock(&f->lock);
- f->arr[f->num_members] = sk;
+ rcu_assign_pointer(f->arr[f->num_members], sk);
smp_wmb();
f->num_members++;
if (f->num_members == 1)
@@ -1489,11 +1533,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
spin_lock(&f->lock);
for (i = 0; i < f->num_members; i++) {
- if (f->arr[i] == sk)
+ if (rcu_dereference_protected(f->arr[i],
+ lockdep_is_held(&f->lock)) == sk)
break;
}
BUG_ON(i >= f->num_members);
- f->arr[i] = f->arr[f->num_members - 1];
+ rcu_assign_pointer(f->arr[i],
+ rcu_dereference_protected(f->arr[f->num_members - 1],
+ lockdep_is_held(&f->lock)));
f->num_members--;
if (f->num_members == 0)
__dev_remove_pack(&f->prot_hook);
@@ -1536,7 +1583,7 @@ static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
}
}
-static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
+static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
unsigned int len)
{
struct bpf_prog *new;
@@ -1545,10 +1592,10 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
return -EPERM;
- if (len != sizeof(fprog))
- return -EINVAL;
- if (copy_from_user(&fprog, data, len))
- return -EFAULT;
+
+ ret = copy_bpf_fprog_from_user(&fprog, data, len);
+ if (ret)
+ return ret;
ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
if (ret)
@@ -1558,7 +1605,7 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
return 0;
}
-static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
+static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
unsigned int len)
{
struct bpf_prog *new;
@@ -1568,7 +1615,7 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
return -EPERM;
if (len != sizeof(fd))
return -EINVAL;
- if (copy_from_user(&fd, data, len))
+ if (copy_from_sockptr(&fd, data, len))
return -EFAULT;
new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
@@ -1579,7 +1626,7 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
return 0;
}
-static int fanout_set_data(struct packet_sock *po, char __user *data,
+static int fanout_set_data(struct packet_sock *po, sockptr_t data,
unsigned int len)
{
switch (po->fanout->type) {
@@ -1631,19 +1678,22 @@ static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
return false;
}
-static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
+static int fanout_add(struct sock *sk, struct fanout_args *args)
{
struct packet_rollover *rollover = NULL;
struct packet_sock *po = pkt_sk(sk);
+ u16 type_flags = args->type_flags;
struct packet_fanout *f, *match;
u8 type = type_flags & 0xff;
u8 flags = type_flags >> 8;
+ u16 id = args->id;
int err;
switch (type) {
case PACKET_FANOUT_ROLLOVER:
if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
return -EINVAL;
+ break;
case PACKET_FANOUT_HASH:
case PACKET_FANOUT_LB:
case PACKET_FANOUT_CPU:
@@ -1695,11 +1745,21 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
}
}
err = -EINVAL;
- if (match && match->flags != flags)
- goto out;
- if (!match) {
+ if (match) {
+ if (match->flags != flags)
+ goto out;
+ if (args->max_num_members &&
+ args->max_num_members != match->max_num_members)
+ goto out;
+ } else {
+ if (args->max_num_members > PACKET_FANOUT_MAX)
+ goto out;
+ if (!args->max_num_members)
+ /* legacy PACKET_FANOUT_MAX */
+ args->max_num_members = 256;
err = -ENOMEM;
- match = kzalloc(sizeof(*match), GFP_KERNEL);
+ match = kvzalloc(struct_size(match, arr, args->max_num_members),
+ GFP_KERNEL);
if (!match)
goto out;
write_pnet(&match->net, sock_net(sk));
@@ -1714,7 +1774,9 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->prot_hook.dev = po->prot_hook.dev;
match->prot_hook.func = packet_rcv_fanout;
match->prot_hook.af_packet_priv = match;
+ match->prot_hook.af_packet_net = read_pnet(&match->net);
match->prot_hook.id_match = match_fanout_group;
+ match->max_num_members = args->max_num_members;
list_add(&match->list, &fanout_list);
}
err = -EINVAL;
@@ -1725,9 +1787,12 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->prot_hook.type == po->prot_hook.type &&
match->prot_hook.dev == po->prot_hook.dev) {
err = -ENOSPC;
- if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
+ if (refcount_read(&match->sk_ref) < match->max_num_members) {
__dev_remove_pack(&po->prot_hook);
- po->fanout = match;
+
+ /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
+ WRITE_ONCE(po->fanout, match);
+
po->rollover = rollover;
rollover = NULL;
refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
@@ -1739,7 +1804,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
if (err && !refcount_read(&match->sk_ref)) {
list_del(&match->list);
- kfree(match);
+ kvfree(match);
}
out:
@@ -1840,7 +1905,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
*/
spkt->spkt_family = dev->type;
- strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
+ strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
spkt->spkt_protocol = skb->protocol;
/*
@@ -1859,12 +1924,22 @@ oom:
static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
{
+ int depth;
+
if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
sock->type == SOCK_RAW) {
skb_reset_mac_header(skb);
skb->protocol = dev_parse_header_protocol(skb);
}
+ /* Move network header to the right position for VLAN tagged packets */
+ if (likely(skb->dev->type == ARPHRD_ETHER) &&
+ eth_type_vlan(skb->protocol) &&
+ __vlan_get_protocol(skb, skb->protocol, &depth) != 0) {
+ if (pskb_may_pull(skb, depth))
+ skb_set_network_header(skb, depth);
+ }
+
skb_probe_transport_header(skb);
}
@@ -2038,7 +2113,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
* and skb->cb are mangled. It works because (and until) packets
* falling here are owned by current CPU. Output packets are cloned
* by dev_queue_xmit_nit(), input packets are processed by net_bh
- * sequencially, so that if we return skb to original state on exit,
+ * sequentially, so that if we return skb to original state on exit,
* we will not harm anyone.
*/
@@ -2064,7 +2139,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
- if (dev->header_ops) {
+ if (dev_has_header(dev)) {
/* The device has an explicit notion of ll header,
* exported to higher levels.
*
@@ -2134,6 +2209,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
spin_lock(&sk->sk_receive_queue.lock);
po->stats.stats1.tp_packets++;
sock_skb_set_dropcount(sk, skb);
+ skb_clear_delivery_time(skb);
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk);
@@ -2168,11 +2244,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
int skb_len = skb->len;
unsigned int snaplen, res;
unsigned long status = TP_STATUS_USER;
- unsigned short macoff, netoff, hdrlen;
+ unsigned short macoff, hdrlen;
+ unsigned int netoff;
struct sk_buff *copy_skb = NULL;
struct timespec64 ts;
__u32 ts_status;
bool is_drop_n_account = false;
+ unsigned int slot_id = 0;
bool do_vnet = false;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
@@ -2191,7 +2269,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!net_eq(dev_net(dev), sock_net(sk)))
goto drop;
- if (dev->header_ops) {
+ if (dev_has_header(dev)) {
if (sk->sk_type != SOCK_DGRAM)
skb_push(skb, skb->data - skb_mac_header(skb));
else if (skb->pkt_type == PACKET_OUTGOING) {
@@ -2236,6 +2314,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
}
macoff = netoff - maclen;
}
+ if (netoff > USHRT_MAX) {
+ atomic_inc(&po->tp_drops);
+ goto drop_n_restore;
+ }
if (po->tp_version <= TPACKET_V2) {
if (macoff + snaplen > po->rx_ring.frame_size) {
if (po->copy_thresh &&
@@ -2246,8 +2328,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
copy_skb = skb_get(skb);
skb_head = skb->data;
}
- if (copy_skb)
+ if (copy_skb) {
+ memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
+ sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
skb_set_owner_r(copy_skb, sk);
+ }
}
snaplen = po->rx_ring.frame_size - macoff;
if ((int)snaplen < 0) {
@@ -2275,11 +2360,21 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!h.raw)
goto drop_n_account;
+ if (po->tp_version <= TPACKET_V2) {
+ slot_id = po->rx_ring.head;
+ if (test_bit(slot_id, po->rx_ring.rx_owner_map))
+ goto drop_n_account;
+ __set_bit(slot_id, po->rx_ring.rx_owner_map);
+ }
+
if (do_vnet &&
virtio_net_hdr_from_skb(skb, h.raw + macoff -
sizeof(struct virtio_net_hdr),
- vio_le(), true, 0))
+ vio_le(), true, 0)) {
+ if (po->tp_version == TPACKET_V3)
+ prb_clear_blk_fill_status(&po->rx_ring);
goto drop_n_account;
+ }
if (po->tp_version <= TPACKET_V2) {
packet_increment_rx_head(po, &po->rx_ring);
@@ -2296,13 +2391,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
po->stats.stats1.tp_packets++;
if (copy_skb) {
status |= TP_STATUS_COPY;
+ skb_clear_delivery_time(copy_skb);
__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
}
spin_unlock(&sk->sk_receive_queue.lock);
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
- if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+ /* Always timestamp; prefer an existing software timestamp taken
+ * closer to the time of capture.
+ */
+ ts_status = tpacket_get_timestamp(skb, &ts,
+ po->tp_tstamp | SOF_TIMESTAMPING_SOFTWARE);
+ if (!ts_status)
ktime_get_real_ts64(&ts);
status |= ts_status;
@@ -2380,9 +2481,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
#endif
if (po->tp_version <= TPACKET_V2) {
+ spin_lock(&sk->sk_receive_queue.lock);
__packet_set_status(po, h.raw, status);
+ __clear_bit(slot_id, po->rx_ring.rx_owner_map);
+ spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk);
- } else {
+ } else if (po->tp_version == TPACKET_V3) {
prb_clear_blk_fill_status(&po->rx_ring);
}
@@ -2636,7 +2740,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
}
if (likely(saddr == NULL)) {
dev = packet_cached_dev_get(po);
- proto = po->num;
+ proto = READ_ONCE(po->num);
} else {
err = -EINVAL;
if (msg->msg_namelen < sizeof(struct sockaddr_ll))
@@ -2764,8 +2868,9 @@ tpacket_error:
status = TP_STATUS_SEND_REQUEST;
err = po->xmit(skb);
- if (unlikely(err > 0)) {
- err = net_xmit_errno(err);
+ if (unlikely(err != 0)) {
+ if (err > 0)
+ err = net_xmit_errno(err);
if (err && __packet_get_status(po, ph) ==
TP_STATUS_AVAILABLE) {
/* skb was destructed already */
@@ -2849,7 +2954,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (likely(saddr == NULL)) {
dev = packet_cached_dev_get(po);
- proto = po->num;
+ proto = READ_ONCE(po->num);
} else {
err = -EINVAL;
if (msg->msg_namelen < sizeof(struct sockaddr_ll))
@@ -2932,8 +3037,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (err)
goto out_free;
- if (sock->type == SOCK_RAW &&
- !dev_validate_header(dev, skb->data, len)) {
+ if ((sock->type == SOCK_RAW &&
+ !dev_validate_header(dev, skb->data, len)) || !skb->len) {
err = -EINVAL;
goto out_free;
}
@@ -2952,6 +3057,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
skb->mark = sockc.mark;
skb->tstamp = sockc.transmit_time;
+ if (unlikely(extra_len == 4))
+ skb->no_fcs = 1;
+
+ packet_parse_headers(skb, sock);
+
if (has_vnet_hdr) {
err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
if (err)
@@ -2960,14 +3070,13 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
virtio_net_hdr_set_proto(skb, &vnet_hdr);
}
- packet_parse_headers(skb, sock);
-
- if (unlikely(extra_len == 4))
- skb->no_fcs = 1;
-
err = po->xmit(skb);
- if (err > 0 && (err = net_xmit_errno(err)) != 0)
- goto out_unlock;
+ if (unlikely(err != 0)) {
+ if (err > 0)
+ err = net_xmit_errno(err);
+ if (err)
+ goto out_unlock;
+ }
dev_put(dev);
@@ -2976,8 +3085,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
out_free:
kfree_skb(skb);
out_unlock:
- if (dev)
- dev_put(dev);
+ dev_put(dev);
out:
return err;
}
@@ -2987,10 +3095,13 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
- if (po->tx_ring.pg_vec)
+ /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy.
+ * tpacket_snd() will redo the check safely.
+ */
+ if (data_race(po->tx_ring.pg_vec))
return tpacket_snd(po, msg);
- else
- return packet_snd(sock, msg, len);
+
+ return packet_snd(sock, msg, len);
}
/*
@@ -3016,16 +3127,14 @@ static int packet_release(struct socket *sock)
sk_del_node_init_rcu(sk);
mutex_unlock(&net->packet.sklist_lock);
- preempt_disable();
sock_prot_inuse_add(net, sk->sk_prot, -1);
- preempt_enable();
spin_lock(&po->bind_lock);
unregister_prot_hook(sk, false);
packet_cached_dev_reset(po);
if (po->prot_hook.dev) {
- dev_put(po->prot_hook.dev);
+ netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
po->prot_hook.dev = NULL;
}
spin_unlock(&po->bind_lock);
@@ -3051,7 +3160,7 @@ static int packet_release(struct socket *sock)
kfree(po->rollover);
if (f) {
fanout_release_data(f);
- kfree(f);
+ kvfree(f);
}
/*
* Now the socket is dead. No more input will appear.
@@ -3077,12 +3186,10 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
__be16 proto)
{
struct packet_sock *po = pkt_sk(sk);
- struct net_device *dev_curr;
- __be16 proto_curr;
- bool need_rehook;
struct net_device *dev = NULL;
- int ret = 0;
bool unlisted = false;
+ bool need_rehook;
+ int ret = 0;
lock_sock(sk);
spin_lock(&po->bind_lock);
@@ -3107,46 +3214,42 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
}
}
- if (dev)
- dev_hold(dev);
-
- proto_curr = po->prot_hook.type;
- dev_curr = po->prot_hook.dev;
-
- need_rehook = proto_curr != proto || dev_curr != dev;
+ need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;
if (need_rehook) {
+ dev_hold(dev);
if (po->running) {
rcu_read_unlock();
/* prevents packet_notifier() from calling
* register_prot_hook()
*/
- po->num = 0;
+ WRITE_ONCE(po->num, 0);
__unregister_prot_hook(sk, true);
rcu_read_lock();
- dev_curr = po->prot_hook.dev;
if (dev)
unlisted = !dev_get_by_index_rcu(sock_net(sk),
dev->ifindex);
}
BUG_ON(po->running);
- po->num = proto;
+ WRITE_ONCE(po->num, proto);
po->prot_hook.type = proto;
+ netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
+
if (unlikely(unlisted)) {
- dev_put(dev);
po->prot_hook.dev = NULL;
- po->ifindex = -1;
+ WRITE_ONCE(po->ifindex, -1);
packet_cached_dev_reset(po);
} else {
+ netdev_hold(dev, &po->prot_hook.dev_tracker,
+ GFP_ATOMIC);
po->prot_hook.dev = dev;
- po->ifindex = dev ? dev->ifindex : 0;
+ WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
packet_cached_dev_assign(po, dev);
}
+ dev_put(dev);
}
- if (dev_curr)
- dev_put(dev_curr);
if (proto == 0 || !need_rehook)
goto out_unlock;
@@ -3156,7 +3259,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
} else {
sk->sk_err = ENETDOWN;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
}
out_unlock:
@@ -3274,6 +3377,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
po->prot_hook.func = packet_rcv_spkt;
po->prot_hook.af_packet_priv = sk;
+ po->prot_hook.af_packet_net = sock_net(sk);
if (proto) {
po->prot_hook.type = proto;
@@ -3284,9 +3388,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
sk_add_node_tail_rcu(sk, &net->packet.sklist);
mutex_unlock(&net->packet.sklist_lock);
- preempt_disable();
sock_prot_inuse_add(net, &packet_proto, 1);
- preempt_enable();
return 0;
out2:
@@ -3334,7 +3436,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
* but then it will block.
*/
- skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
/*
* An error occurred so return it. Because skb_recv_datagram()
@@ -3377,9 +3479,11 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
sll->sll_protocol = skb->protocol;
}
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (msg->msg_name) {
+ const size_t max_len = min(sizeof(skb->cb),
+ sizeof(struct sockaddr_storage));
int copy_len;
/* If the address length field is there to be filled
@@ -3402,6 +3506,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
msg->msg_namelen = sizeof(struct sockaddr_ll);
}
}
+ if (WARN_ON_ONCE(copy_len > max_len)) {
+ copy_len = max_len;
+ msg->msg_namelen = copy_len;
+ }
memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
}
@@ -3455,9 +3563,9 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
uaddr->sa_family = AF_PACKET;
memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
rcu_read_lock();
- dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
+ dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
if (dev)
- strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
+ strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
rcu_read_unlock();
return sizeof(*uaddr);
@@ -3470,16 +3578,18 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
+ int ifindex;
if (peer)
return -EOPNOTSUPP;
+ ifindex = READ_ONCE(po->ifindex);
sll->sll_family = AF_PACKET;
- sll->sll_ifindex = po->ifindex;
- sll->sll_protocol = po->num;
+ sll->sll_ifindex = ifindex;
+ sll->sll_protocol = READ_ONCE(po->num);
sll->sll_pkttype = 0;
rcu_read_lock();
- dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
+ dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
if (dev) {
sll->sll_hatype = dev->type;
sll->sll_halen = dev->addr_len;
@@ -3641,7 +3751,8 @@ static void packet_flush_mclist(struct sock *sk)
}
static int
-packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
@@ -3661,7 +3772,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
return -EINVAL;
if (len > sizeof(mreq))
len = sizeof(mreq);
- if (copy_from_user(&mreq, optval, len))
+ if (copy_from_sockptr(&mreq, optval, len))
return -EFAULT;
if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
return -EINVAL;
@@ -3692,7 +3803,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen < len) {
ret = -EINVAL;
} else {
- if (copy_from_user(&req_u.req, optval, len))
+ if (copy_from_sockptr(&req_u.req, optval, len))
ret = -EFAULT;
else
ret = packet_set_ring(sk, &req_u, 0,
@@ -3707,7 +3818,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
pkt_sk(sk)->copy_thresh = val;
@@ -3719,7 +3830,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
switch (val) {
case TPACKET_V1:
@@ -3745,7 +3856,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
if (val > INT_MAX)
return -EINVAL;
@@ -3765,7 +3876,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
lock_sock(sk);
@@ -3784,7 +3895,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen < sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
lock_sock(sk);
@@ -3798,7 +3909,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen < sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
lock_sock(sk);
@@ -3814,7 +3925,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
return -EINVAL;
if (optlen < sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
lock_sock(sk);
@@ -3833,7 +3944,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
po->tp_tstamp = val;
@@ -3841,18 +3952,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
}
case PACKET_FANOUT:
{
- int val;
+ struct fanout_args args = { 0 };
- if (optlen != sizeof(val))
+ if (optlen != sizeof(int) && optlen != sizeof(args))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&args, optval, optlen))
return -EFAULT;
- return fanout_add(sk, val & 0xffff, val >> 16);
+ return fanout_add(sk, &args);
}
case PACKET_FANOUT_DATA:
{
- if (!po->fanout)
+ /* Paired with the WRITE_ONCE() in fanout_add() */
+ if (!READ_ONCE(po->fanout))
return -EINVAL;
return fanout_set_data(po, optval, optlen);
@@ -3863,7 +3975,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
if (val < 0 || val > 1)
return -EINVAL;
@@ -3877,16 +3989,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
lock_sock(sk);
- if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
- ret = -EBUSY;
- } else {
+ if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
po->tp_tx_has_off = !!val;
- ret = 0;
- }
+
release_sock(sk);
return 0;
}
@@ -3896,7 +4005,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(val)))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
@@ -4029,28 +4138,6 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
return 0;
}
-
-#ifdef CONFIG_COMPAT
-static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- struct packet_sock *po = pkt_sk(sock->sk);
-
- if (level != SOL_PACKET)
- return -ENOPROTOOPT;
-
- if (optname == PACKET_FANOUT_DATA &&
- po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
- optval = (char __user *)get_compat_bpf_fprog(optval);
- if (!optval)
- return -EFAULT;
- optlen = sizeof(struct sock_fprog);
- }
-
- return packet_setsockopt(sock, level, optname, optval, optlen);
-}
-#endif
-
static int packet_notifier(struct notifier_block *this,
unsigned long msg, void *ptr)
{
@@ -4066,7 +4153,7 @@ static int packet_notifier(struct notifier_block *this,
case NETDEV_UNREGISTER:
if (po->mclist)
packet_dev_mclist_delete(dev, &po->mclist);
- /* fallthrough */
+ fallthrough;
case NETDEV_DOWN:
if (dev->ifindex == po->ifindex) {
@@ -4075,13 +4162,13 @@ static int packet_notifier(struct notifier_block *this,
__unregister_prot_hook(sk, false);
sk->sk_err = ENETDOWN;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
}
if (msg == NETDEV_UNREGISTER) {
packet_cached_dev_reset(po);
- po->ifindex = -1;
- if (po->prot_hook.dev)
- dev_put(po->prot_hook.dev);
+ WRITE_ONCE(po->ifindex, -1);
+ netdev_put(po->prot_hook.dev,
+ &po->prot_hook.dev_tracker);
po->prot_hook.dev = NULL;
}
spin_unlock(&po->bind_lock);
@@ -4277,11 +4364,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
{
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
+ unsigned long *rx_owner_map = NULL;
int was_running, order = 0;
struct packet_ring_buffer *rb;
struct sk_buff_head *rb_queue;
__be16 num;
- int err = -EINVAL;
+ int err;
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;
@@ -4362,6 +4450,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
}
break;
default:
+ if (!tx_ring) {
+ rx_owner_map = bitmap_alloc(req->tp_frame_nr,
+ GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
+ if (!rx_owner_map)
+ goto out_free_pg_vec;
+ }
break;
}
}
@@ -4378,7 +4472,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
was_running = po->running;
num = po->num;
if (was_running) {
- po->num = 0;
+ WRITE_ONCE(po->num, 0);
__unregister_prot_hook(sk, false);
}
spin_unlock(&po->bind_lock);
@@ -4391,6 +4485,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
err = 0;
spin_lock_bh(&rb_queue->lock);
swap(rb->pg_vec, pg_vec);
+ if (po->tp_version <= TPACKET_V2)
+ swap(rb->rx_owner_map, rx_owner_map);
rb->frame_max = (req->tp_frame_nr - 1);
rb->head = 0;
rb->frame_size = req->tp_frame_size;
@@ -4411,7 +4507,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
spin_lock(&po->bind_lock);
if (was_running) {
- po->num = num;
+ WRITE_ONCE(po->num, num);
register_prot_hook(sk);
}
spin_unlock(&po->bind_lock);
@@ -4422,8 +4518,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
}
out_free_pg_vec:
- if (pg_vec)
+ if (pg_vec) {
+ bitmap_free(rx_owner_map);
free_pg_vec(pg_vec, order, req->tp_block_nr);
+ }
out:
return err;
}
@@ -4504,8 +4602,6 @@ static const struct proto_ops packet_ops_spkt = {
.gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.sendmsg = packet_sendmsg_spkt,
.recvmsg = packet_recvmsg,
.mmap = sock_no_mmap,
@@ -4528,9 +4624,6 @@ static const struct proto_ops packet_ops = {
.shutdown = sock_no_shutdown,
.setsockopt = packet_setsockopt,
.getsockopt = packet_getsockopt,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_packet_setsockopt,
-#endif
.sendmsg = packet_sendmsg,
.recvmsg = packet_recvmsg,
.mmap = packet_mmap,
@@ -4573,7 +4666,9 @@ static void packet_seq_stop(struct seq_file *seq, void *v)
static int packet_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
- seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
+ seq_printf(seq,
+ "%*sRefCnt Type Proto Iface R Rmem User Inode\n",
+ IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
else {
struct sock *s = sk_entry(v);
const struct packet_sock *po = pkt_sk(s);
@@ -4583,8 +4678,8 @@ static int packet_seq_show(struct seq_file *seq, void *v)
s,
refcount_read(&s->sk_refcnt),
s->sk_type,
- ntohs(po->num),
- po->ifindex,
+ ntohs(READ_ONCE(po->num)),
+ READ_ONCE(po->ifindex),
po->running,
atomic_read(&s->sk_rmem_alloc),
from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
@@ -4607,9 +4702,11 @@ static int __net_init packet_net_init(struct net *net)
mutex_init(&net->packet.sklist_lock);
INIT_HLIST_HEAD(&net->packet.sklist);
+#ifdef CONFIG_PROC_FS
if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
sizeof(struct seq_net_private)))
return -ENOMEM;
+#endif /* CONFIG_PROC_FS */
return 0;
}
@@ -4628,37 +4725,37 @@ static struct pernet_operations packet_net_ops = {
static void __exit packet_exit(void)
{
- unregister_netdevice_notifier(&packet_netdev_notifier);
- unregister_pernet_subsys(&packet_net_ops);
sock_unregister(PF_PACKET);
proto_unregister(&packet_proto);
+ unregister_netdevice_notifier(&packet_netdev_notifier);
+ unregister_pernet_subsys(&packet_net_ops);
}
static int __init packet_init(void)
{
int rc;
- rc = proto_register(&packet_proto, 0);
- if (rc)
- goto out;
- rc = sock_register(&packet_family_ops);
- if (rc)
- goto out_proto;
rc = register_pernet_subsys(&packet_net_ops);
if (rc)
- goto out_sock;
+ goto out;
rc = register_netdevice_notifier(&packet_netdev_notifier);
if (rc)
goto out_pernet;
+ rc = proto_register(&packet_proto, 0);
+ if (rc)
+ goto out_notifier;
+ rc = sock_register(&packet_family_ops);
+ if (rc)
+ goto out_proto;
return 0;
-out_pernet:
- unregister_pernet_subsys(&packet_net_ops);
-out_sock:
- sock_unregister(PF_PACKET);
out_proto:
proto_unregister(&packet_proto);
+out_notifier:
+ unregister_netdevice_notifier(&packet_netdev_notifier);
+out_pernet:
+ unregister_pernet_subsys(&packet_net_ops);
out:
return rc;
}
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 82fb2b10f790..48af35b1aed2 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -39,7 +39,7 @@ struct tpacket_kbdq_core {
char *nxt_offset;
struct sk_buff *skb;
- atomic_t blk_fill_in_prog;
+ rwlock_t blk_fill_in_prog_lock;
/* Default is set to 8ms */
#define DEFAULT_PRB_RETIRE_TOV (8)
@@ -70,15 +70,19 @@ struct packet_ring_buffer {
unsigned int __percpu *pending_refcnt;
- struct tpacket_kbdq_core prb_bdqc;
+ union {
+ unsigned long *rx_owner_map;
+ struct tpacket_kbdq_core prb_bdqc;
+ };
};
extern struct mutex fanout_mutex;
-#define PACKET_FANOUT_MAX 256
+#define PACKET_FANOUT_MAX (1 << 16)
struct packet_fanout {
possible_net_t net;
unsigned int num_members;
+ u32 max_num_members;
u16 id;
u8 type;
u8 flags;
@@ -87,10 +91,10 @@ struct packet_fanout {
struct bpf_prog __rcu *bpf_prog;
};
struct list_head list;
- struct sock *arr[PACKET_FANOUT_MAX];
spinlock_t lock;
refcount_t sk_ref;
struct packet_type prot_hook ____cacheline_aligned_in_smp;
+ struct sock __rcu *arr[];
};
struct packet_rollover {
@@ -135,7 +139,7 @@ struct packet_sock {
atomic_t tp_drops ____cacheline_aligned_in_smp;
};
-static struct packet_sock *pkt_sk(struct sock *sk)
+static inline struct packet_sock *pkt_sk(struct sock *sk)
{
return (struct packet_sock *)sk;
}