diff options
author | 2019-02-28 03:27:57 +0000 | |
---|---|---|
committer | 2019-02-28 03:27:57 +0000 | |
commit | 912623fe26bc18905d10318fcec5061c20013a07 (patch) | |
tree | 93e07ad25c99871ec2ff723f09d0485de10a11e4 | |
parent | dup stdout/in for proxycommand=-, otherwise stdout might be (diff) | |
download | wireguard-openbsd-912623fe26bc18905d10318fcec5061c20013a07.tar.xz wireguard-openbsd-912623fe26bc18905d10318fcec5061c20013a07.zip |
Add mpip(4), an IP tunnel interface for "IP Layer 2" over MPLS pseudowires
This is basically mpw(4), but it carries IP directly instead of
Ethernet. On the wire it can look the same as what IP over MPLS
looks like, but because it is a pseudowire you can configure a
control word or the FAT label to improve load balancing. It can
be used to quickly set up an IP tunnel over an MPLS fabric
without the need to configure bgpd and mpe(4) interfaces.
Because it implements the same pwe3 ioctls that mpw(4) uses, ifconfig
already supports configuration of mpip(4) interfaces. ldpd will
grow support for this in the near future.
This is not hooked up to the build yet
discussed with claudio@ at a2k19
ok claudio@
-rw-r--r-- | sys/net/if_mpip.c | 706 |
1 files changed, 706 insertions, 0 deletions
diff --git a/sys/net/if_mpip.c b/sys/net/if_mpip.c new file mode 100644 index 00000000000..1b17a4f91c4 --- /dev/null +++ b/sys/net/if_mpip.c @@ -0,0 +1,706 @@ +/* $OpenBSD: if_mpip.c,v 1.1 2019/02/28 03:27:57 dlg Exp $ */ + +/* + * Copyright (c) 2015 Rafael Zalamena <rzalamena@openbsd.org> + * Copyright (c) 2019 David Gwynne <dlg@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "bpfilter.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/errno.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/ip.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif + +#include <netmpls/mpls.h> + +#if NBPFILTER > 0 +#include <net/bpf.h> +#endif /* NBPFILTER */ + +struct mpip_neighbor { + struct shim_hdr n_rshim; + struct sockaddr_storage n_nexthop; +}; + +struct mpip_softc { + struct ifnet sc_if; + unsigned int sc_dead; + uint32_t sc_flow; /* xor for mbuf flowid */ + + struct ifaddr sc_ifa; + struct sockaddr_mpls sc_smpls; /* Local label */ + unsigned int sc_rdomain; + struct mpip_neighbor *sc_neighbor; + + unsigned int sc_cword; /* control word */ + unsigned int sc_fword; /* flow-aware transport */ + int sc_ttl; +}; + +void mpipattach(int); +int mpip_clone_create(struct if_clone *, int); +int mpip_clone_destroy(struct ifnet *); +int mpip_ioctl(struct ifnet *, u_long, caddr_t); +int mpip_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +void mpip_start(struct ifnet *); + +struct if_clone mpip_cloner = + IF_CLONE_INITIALIZER("mpip", mpip_clone_create, mpip_clone_destroy); + +void +mpipattach(int n) +{ + if_clone_attach(&mpip_cloner); +} + +int +mpip_clone_create(struct if_clone *ifc, int unit) +{ + struct mpip_softc *sc; + struct ifnet *ifp; + + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); + if (sc == NULL) + return (ENOMEM); + + sc->sc_neighbor = 0; + sc->sc_cword = 0; /* default to no control word */ + sc->sc_fword = 0; /* both sides have to agree on FAT first */ + sc->sc_flow = arc4random() & 0xfffff; + sc->sc_smpls.smpls_len = sizeof(sc->sc_smpls); + sc->sc_smpls.smpls_family = AF_MPLS; + sc->sc_ttl = -1; + + ifp = &sc->sc_if; + snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d", + 
ifc->ifc_name, unit); + ifp->if_softc = sc; + ifp->if_type = IFT_TUNNEL; + ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; + ifp->if_xflags = IFXF_CLONED; + ifp->if_ioctl = mpip_ioctl; + ifp->if_output = mpip_output; + ifp->if_start = mpip_start; + ifp->if_rtrequest = p2p_rtrequest; + ifp->if_mtu = 1500; + ifp->if_hardmtu = 65535; + IFQ_SET_MAXLEN(&ifp->if_snd, IFQ_MAXLEN); + + if_attach(ifp); + if_counters_alloc(ifp); + if_alloc_sadl(ifp); + +#if NBPFILTER > 0 + bpfattach(&ifp->if_bpf, ifp, DLT_LOOP, sizeof(uint32_t)); +#endif + + sc->sc_ifa.ifa_ifp = ifp; + sc->sc_ifa.ifa_addr = sdltosa(ifp->if_sadl); + + return (0); +} + +int +mpip_clone_destroy(struct ifnet *ifp) +{ + struct mpip_softc *sc = ifp->if_softc; + + NET_LOCK(); + ifp->if_flags &= ~IFF_RUNNING; + sc->sc_dead = 1; + + if (sc->sc_smpls.smpls_label) { + rt_ifa_del(&sc->sc_ifa, RTF_LOCAL | RTF_MPLS, + smplstosa(&sc->sc_smpls), 0); + } + NET_UNLOCK(); + + ifq_barrier(&ifp->if_snd); + + if_detach(ifp); + + free(sc->sc_neighbor, M_DEVBUF, sizeof(*sc->sc_neighbor)); + free(sc, M_DEVBUF, sizeof(*sc)); + + return (0); +} + +static int +mpip_set_route(struct mpip_softc *sc, uint32_t shim, unsigned int rdomain) +{ + int error; + + rt_ifa_del(&sc->sc_ifa, RTF_MPLS | RTF_LOCAL, + smplstosa(&sc->sc_smpls), 0); + + sc->sc_smpls.smpls_label = shim; + sc->sc_rdomain = rdomain; + + error = rt_ifa_add(&sc->sc_ifa, RTF_MPLS | RTF_LOCAL, + smplstosa(&sc->sc_smpls), 0); + if (error) { + sc->sc_smpls.smpls_label = MPLS_LABEL2SHIM(0); + return (error); + } + + return (0); +} + +static int +mpip_set_label(struct mpip_softc *sc, struct ifreq *ifr) +{ + struct shim_hdr label; + uint32_t shim; + int error; + + error = copyin(ifr->ifr_data, &label, sizeof(label)); + if (error != 0) + return (error); + + if (label.shim_label > MPLS_LABEL_MAX || + label.shim_label <= MPLS_LABEL_RESERVED_MAX) + return (EINVAL); + + shim = MPLS_LABEL2SHIM(label.shim_label); + + if (sc->sc_smpls.smpls_label == shim) + return (0); + + return 
(mpip_set_route(sc, shim, sc->sc_rdomain)); +} + +static int +mpip_get_label(struct mpip_softc *sc, struct ifreq *ifr) +{ + struct shim_hdr label; + + label.shim_label = MPLS_SHIM2LABEL(sc->sc_smpls.smpls_label); + + if (label.shim_label == MPLS_LABEL2SHIM(0)) + return (EADDRNOTAVAIL); + + return (copyout(&label, ifr->ifr_data, sizeof(label))); +} + +static int +mpip_del_label(struct mpip_softc *sc) +{ + if (sc->sc_smpls.smpls_label != MPLS_LABEL2SHIM(0)) { + rt_ifa_del(&sc->sc_ifa, RTF_MPLS | RTF_LOCAL, + smplstosa(&sc->sc_smpls), 0); + } + + sc->sc_smpls.smpls_label = MPLS_LABEL2SHIM(0); + + return (0); +} + +static int +mpip_set_neighbor(struct mpip_softc *sc, struct if_laddrreq *req) +{ + struct mpip_neighbor *n, *o; + struct sockaddr *sa = (struct sockaddr *)&req->addr; + struct sockaddr_mpls *smpls = (struct sockaddr_mpls *)&req->dstaddr; + uint32_t label; + + if (smpls->smpls_family != AF_MPLS) + return (EINVAL); + label = smpls->smpls_label; + if (label > MPLS_LABEL_MAX || label <= MPLS_LABEL_RESERVED_MAX) + return (EINVAL); + + switch (sa->sa_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)sa; + + if (in_nullhost(sin->sin_addr) || + IN_MULTICAST(sin->sin_addr.s_addr)) + return (EINVAL); + + break; + } +#ifdef INET6 + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; + + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || + IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + return (EINVAL); + + /* check scope */ + + break; + } +#endif + default: + return (EAFNOSUPPORT); + } + + if (sc->sc_dead) + return (ENXIO); + + n = malloc(sizeof(*n), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); + if (n == NULL) + return (ENOMEM); + + n->n_rshim.shim_label = MPLS_LABEL2SHIM(label); + n->n_nexthop = req->addr; + + o = sc->sc_neighbor; + sc->sc_neighbor = n; + + NET_UNLOCK(); + ifq_barrier(&sc->sc_if.if_snd); + NET_LOCK(); + + free(o, M_DEVBUF, sizeof(*o)); + + return (0); +} + +static int +mpip_get_neighbor(struct mpip_softc *sc, 
struct if_laddrreq *req) +{ + struct sockaddr_mpls *smpls = (struct sockaddr_mpls *)&req->dstaddr; + struct mpip_neighbor *n = sc->sc_neighbor; + + if (n == NULL) + return (EADDRNOTAVAIL); + + smpls->smpls_len = sizeof(*smpls); + smpls->smpls_family = AF_MPLS; + smpls->smpls_label = MPLS_SHIM2LABEL(n->n_rshim.shim_label); + req->addr = n->n_nexthop; + + return (0); +} + +static int +mpip_del_neighbor(struct mpip_softc *sc, struct ifreq *req) +{ + struct mpip_neighbor *o; + + if (sc->sc_dead) + return (ENXIO); + + o = sc->sc_neighbor; + sc->sc_neighbor = NULL; + + NET_UNLOCK(); + ifq_barrier(&sc->sc_if.if_snd); + NET_LOCK(); + + free(o, M_DEVBUF, sizeof(*o)); + + return (0); +} + +int +mpip_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct mpip_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + int error = 0; + + switch (cmd) { + case SIOCSIFADDR: + break; + case SIOCSIFFLAGS: + if ((ifp->if_flags & IFF_UP)) + ifp->if_flags |= IFF_RUNNING; + else + ifp->if_flags &= ~IFF_RUNNING; + break; + case SIOCSIFMTU: + if (ifr->ifr_mtu < 60 || /* XXX */ + ifr->ifr_mtu > 65536) /* XXX */ + error = EINVAL; + else + ifp->if_mtu = ifr->ifr_mtu; + break; + + case SIOCGPWE3: + ifr->ifr_pwe3 = IF_PWE3_IP; + break; + case SIOCSPWE3CTRLWORD: + sc->sc_cword = ifr->ifr_pwe3 ? 1 : 0; + break; + case SIOCGPWE3CTRLWORD: + ifr->ifr_pwe3 = sc->sc_cword; + break; + case SIOCSPWE3FAT: + sc->sc_fword = ifr->ifr_pwe3 ? 
1 : 0; + break; + case SIOCGPWE3FAT: + ifr->ifr_pwe3 = sc->sc_fword; + break; + + case SIOCSETLABEL: + error = mpip_set_label(sc, ifr); + break; + case SIOCGETLABEL: + error = mpip_get_label(sc, ifr); + break; + case SIOCDELLABEL: + error = mpip_del_label(sc); + break; + + case SIOCSPWE3NEIGHBOR: + error = mpip_set_neighbor(sc, (struct if_laddrreq *)data); + break; + case SIOCGPWE3NEIGHBOR: + error = mpip_get_neighbor(sc, (struct if_laddrreq *)data); + break; + case SIOCDPWE3NEIGHBOR: + error = mpip_del_neighbor(sc, ifr); + break; + + case SIOCSLIFPHYRTABLE: + if (ifr->ifr_rdomainid < 0 || + ifr->ifr_rdomainid > RT_TABLEID_MAX || + !rtable_exists(ifr->ifr_rdomainid) || + ifr->ifr_rdomainid != rtable_l2(ifr->ifr_rdomainid)) { + error = EINVAL; + break; + } + if (sc->sc_rdomain != ifr->ifr_rdomainid) { + error = mpip_set_route(sc, sc->sc_smpls.smpls_label, + ifr->ifr_rdomainid); + } + break; + case SIOCGLIFPHYRTABLE: + ifr->ifr_rdomainid = sc->sc_rdomain; + break; + + case SIOCSLIFPHYTTL: + if (ifr->ifr_ttl != -1 && + (ifr->ifr_ttl < 1 || ifr->ifr_ttl > 0xff)) { + error = EINVAL; + break; + } + + /* commit */ + sc->sc_ttl = ifr->ifr_ttl; + break; + case SIOCGLIFPHYTTL: + ifr->ifr_ttl = sc->sc_ttl; + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + + default: + error = ENOTTY; + break; + } + + return (error); +} + +static void +mpip_input(struct mpip_softc *sc, struct mbuf *m) +{ + struct ifnet *ifp = &sc->sc_if; + uint32_t shim; + struct mbuf *n; + uint8_t ttl; + void (*input)(struct ifnet *, struct mbuf *); + + if (!ISSET(ifp->if_flags, IFF_RUNNING)) + goto drop; + + shim = *mtod(m, uint32_t *); + m_adj(m, sizeof(shim)); + + ttl = ntohl(shim & MPLS_TTL_MASK); + + if (sc->sc_fword) { + uint32_t label; + + if (MPLS_BOS_ISSET(shim)) + goto drop; + + if (m->m_len < sizeof(shim)) { + m = m_pullup(m, sizeof(shim)); + if (m == NULL) + return; + } + + shim = *mtod(m, uint32_t *); + if (!MPLS_BOS_ISSET(shim)) + goto drop; + + label = MPLS_SHIM2LABEL(shim); + if 
(label <= MPLS_LABEL_RESERVED_MAX) { + counters_inc(ifp->if_counters, ifc_noproto); /* ? */ + goto drop; + } + + label -= MPLS_LABEL_RESERVED_MAX + 1; + label ^= sc->sc_flow; + m->m_pkthdr.ph_flowid = M_FLOWID_VALID | label; + + m_adj(m, sizeof(shim)); + } else if (!MPLS_BOS_ISSET(shim)) + goto drop; + + if (sc->sc_cword) { + if (m->m_len < sizeof(shim)) { + m = m_pullup(m, sizeof(shim)); + if (m == NULL) + return; + } + shim = *mtod(m, uint32_t *); + + /* + * The first 4 bits identifies that this packet is a + * control word. If the control word is configured and + * we received an IP datagram we shall drop it. + */ + if (shim & CW_ZERO_MASK) { + counters_inc(ifp->if_counters, ifc_ierrors); + goto drop; + } + + /* We don't support fragmentation just yet. */ + if (shim & CW_FRAG_MASK) { + counters_inc(ifp->if_counters, ifc_ierrors); + goto drop; + } + + m_adj(m, sizeof(shim)); + } + + n = m; + while (n->m_len == 0) { + n = n->m_next; + if (n == NULL) + goto drop; + } + + switch (*mtod(n, uint8_t *) >> 4) { + case 4: + if (sc->sc_ttl == -1) { + m = mpls_ip_adjttl(m, ttl); + if (m == NULL) + return; + } + input = ipv4_input; + m->m_pkthdr.ph_family = AF_INET; + break; +#ifdef INET6 + case 6: + if (sc->sc_ttl == -1) { + m = mpls_ip6_adjttl(m, ttl); + if (m == NULL) + return; + } + input = ipv6_input; + m->m_pkthdr.ph_family = AF_INET6; + break; +#endif /* INET6 */ + default: + counters_inc(ifp->if_counters, ifc_noproto); + goto drop; + } + + m->m_pkthdr.ph_ifidx = ifp->if_index; + m->m_pkthdr.ph_rtableid = ifp->if_rdomain; + +#if NBPFILTER > 0 + { + caddr_t if_bpf = ifp->if_bpf; + if (if_bpf) { + bpf_mtap_af(if_bpf, m->m_pkthdr.ph_family, + m, BPF_DIRECTION_IN); + } + } +#endif + + (*input)(ifp, m); + return; +drop: + m_freem(m); +} + +int +mpip_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct rtentry *rt) +{ + struct mpip_softc *sc = ifp->if_softc; + int error; + + if (dst->sa_family == AF_LINK && + rt != NULL && ISSET(rt->rt_flags, 
RTF_LOCAL)) { + mpip_input(sc, m); + return (0); + } + + if (!ISSET(ifp->if_flags, IFF_RUNNING)) { + error = ENETDOWN; + goto drop; + } + + switch (dst->sa_family) { + case AF_INET: +#ifdef INET6 + case AF_INET6: +#endif + break; + default: + error = EAFNOSUPPORT; + goto drop; + } + + m->m_pkthdr.ph_family = dst->sa_family; + + error = if_enqueue(ifp, m); + if (error) + counters_inc(ifp->if_counters, ifc_oerrors); + return (error); + +drop: + m_freem(m); + return (error); +} + +void +mpip_start(struct ifnet *ifp) +{ + struct mpip_softc *sc = ifp->if_softc; + struct mpip_neighbor *n = sc->sc_neighbor; + struct rtentry *rt; + struct ifnet *ifp0; + struct mbuf *m; + uint32_t shim; + struct sockaddr_mpls smpls = { + .smpls_len = sizeof(smpls), + .smpls_family = AF_MPLS, + }; + uint32_t bos; + uint8_t ttl; + + if (!ISSET(ifp->if_flags, IFF_RUNNING) || n == NULL) { + IFQ_PURGE(&ifp->if_snd); + return; + } + + rt = rtalloc(sstosa(&n->n_nexthop), RT_RESOLVE, 0); + if (!rtisvalid(rt)) { + IFQ_PURGE(&ifp->if_snd); + goto rtfree; + } + + ifp0 = if_get(rt->rt_ifidx); + if (ifp0 == NULL) { + IFQ_PURGE(&ifp->if_snd); + goto rtfree; + } + + while ((m = ifq_dequeue(&ifp->if_snd)) != NULL) { +#if NBPFILTER > 0 + caddr_t if_bpf = sc->sc_if.if_bpf; + if (if_bpf) { + bpf_mtap_af(if_bpf, m->m_pkthdr.ph_family, + m, BPF_DIRECTION_OUT); + } +#endif /* NBPFILTER */ + + if (sc->sc_ttl == -1) { + switch (m->m_pkthdr.ph_family) { + case AF_INET: { + struct ip *ip; + ip = mtod(m, struct ip *); + ttl = ip->ip_ttl; + break; + } +#ifdef INET6 + case AF_INET6: { + struct ip6_hdr *ip6; + ip6 = mtod(m, struct ip6_hdr *); + ttl = ip6->ip6_hlim; + break; + } +#endif + default: + unhandled_af(m->m_pkthdr.ph_family); + } + } else + ttl = mpls_defttl; + + if (sc->sc_cword) { + m = m_prepend(m, sizeof(shim), M_NOWAIT); + if (m == NULL) + continue; + + *mtod(m, uint32_t *) = 0; + } + + bos = MPLS_BOS_MASK; + + if (sc->sc_fword) { + uint32_t flow = 0; + m = m_prepend(m, sizeof(shim), M_NOWAIT); + if (m == 
NULL) + continue; + + if (ISSET(m->m_pkthdr.ph_flowid, M_FLOWID_VALID)) + flow = m->m_pkthdr.ph_flowid & M_FLOWID_MASK; + flow ^= sc->sc_flow; + flow += MPLS_LABEL_RESERVED_MAX + 1; + + shim = htonl(1) & MPLS_TTL_MASK; + shim |= htonl(flow << MPLS_LABEL_OFFSET) & + MPLS_LABEL_MASK; + shim |= bos; + *mtod(m, uint32_t *) = shim; + + bos = 0; + } + + m = m_prepend(m, sizeof(shim), M_NOWAIT); + if (m == NULL) + continue; + + shim = htonl(ttl) & MPLS_TTL_MASK; + shim |= n->n_rshim.shim_label; + shim |= bos; + *mtod(m, uint32_t *) = shim; + + mpls_output(ifp0, m, (struct sockaddr *)&smpls, rt); + } + + if_put(ifp0); +rtfree: + rtfree(rt); +} |