/* Cluster IP hashmark target
 * (C) 2003-2004 by Harald Welte
 * based on ideas of Fabio Olive Leite
 *
 * Development of this code funded by SuSE Linux AG, http://www.suse.com/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/* NOTE(review): the header names of the #include directives below are missing
 * in this copy of the file (they look like they were stripped together with
 * their <...> delimiters) -- restore them from the upstream source before
 * building. The author string in MODULE_AUTHOR() below appears to have lost
 * its <...> part the same way -- confirm.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#define CLUSTERIP_VERSION "0.8"

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte ");
MODULE_DESCRIPTION("Xtables: CLUSTERIP target");

/* One cluster IP configuration. Configs are kept on the per-netns list
 * clusterip_net.configs and are looked up by their cluster IP address.
 *
 * Two counters are kept: "refcount" controls the lifetime of the structure
 * (it is freed via call_rcu() when it drops to zero), "entries" counts the
 * rules using the config (the config is unlinked from the list and its proc
 * entry removed when it drops to zero).
 */
struct clusterip_config {
	struct list_head list;			/* list of all configs */
	refcount_t refcount;			/* reference count */
	refcount_t entries;			/* number of entries/rules
						 * referencing us */

	__be32 clusterip;			/* the IP address */
	u_int8_t clustermac[ETH_ALEN];		/* the MAC address */
	int ifindex;				/* device ifindex */
	u_int16_t num_total_nodes;		/* total number of nodes */
	unsigned long local_nodes;		/* bitmap of local node numbers;
						 * bit (n - 1) is set when node
						 * n is local */

#ifdef CONFIG_PROC_FS
	struct proc_dir_entry *pde;		/* proc dir entry */
#endif
	enum clusterip_hashmode hash_mode;	/* which hashing mode */
	u_int32_t hash_initval;			/* hash initialization */
	struct rcu_head rcu;			/* for call_rcu() */
	struct net *net;			/* netns for pernet list */
	char ifname[IFNAMSIZ];			/* device ifname */
};

#ifdef CONFIG_PROC_FS
/* forward declaration; the definition follows the proc handlers below */
static const struct file_operations clusterip_proc_fops;
#endif

/* Per network namespace state of this module. */
struct clusterip_net {
	struct list_head configs;
	/* lock protects the configs list */
	spinlock_t lock;

#ifdef CONFIG_PROC_FS
	struct proc_dir_entry *procdir;
	/* mutex protects the config->pde*/
	struct mutex mutex;
#endif
};

/* pernet id, filled in through clusterip_net_ops.id */
static unsigned int clusterip_net_id __read_mostly;

/* Return the clusterip_net private area of the given network namespace. */
static inline struct clusterip_net *clusterip_pernet(struct net *net)
{
	return net_generic(net, clusterip_net_id);
}

/* Take an additional lifetime reference on a config. */
static inline void
clusterip_config_get(struct clusterip_config *c)
{
	refcount_inc(&c->refcount);
}

/* RCU callback queued by clusterip_config_put(): remove the cluster MAC
 * from the device named config->ifname (if such a device can still be found
 * in config->net) and free the config.
 */
static void clusterip_config_rcu_free(struct rcu_head *head)
{
	struct clusterip_config *config;
	struct net_device *dev;

	config = container_of(head, struct clusterip_config, rcu);
	dev = dev_get_by_name(config->net, config->ifname);
	if (dev) {
		dev_mc_del(dev, config->clustermac);
		/* drop the device reference taken by dev_get_by_name() */
		dev_put(dev);
	}
	kfree(config);
}

/* Drop a lifetime reference; the last put schedules the RCU-deferred free. */
static inline void
clusterip_config_put(struct clusterip_config *c)
{
	if (refcount_dec_and_test(&c->refcount))
		call_rcu(&c->rcu, clusterip_config_rcu_free);
}

/* decrease the count of entries using/referencing this config.  If last
 * entry(rule) is removed, remove the config from lists, but don't free it
 * yet, since proc-files could still be holding references */
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
	struct clusterip_net *cn = clusterip_pernet(c->net);

	/* cn->lock is taken with BHs disabled everywhere else in this file,
	 * so disable them around refcount_dec_and_lock() as well */
	local_bh_disable();
	if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
		/* last entry gone: cn->lock is held here, unlink the config */
		list_del_rcu(&c->list);
		spin_unlock(&cn->lock);
		local_bh_enable();
		/* In case anyone still accesses the file, the open/close
		 * functions are also incrementing the refcount on their own,
		 * so it's safe to remove the entry even if it's in use. */
#ifdef CONFIG_PROC_FS
		mutex_lock(&cn->mutex);
		/* clusterip_net_exit() sets procdir to NULL after removing
		 * the whole directory; skip the per-config removal then */
		if (cn->procdir)
			proc_remove(c->pde);
		mutex_unlock(&cn->mutex);
#endif
		return;
	}
	local_bh_enable();
}

/* Walk the per-netns config list and return the config whose cluster IP
 * equals @clusterip, or NULL if there is none. No reference is taken.
 * The callers in this file hold either rcu_read_lock_bh() or cn->lock.
 */
static struct clusterip_config *
__clusterip_config_find(struct net *net, __be32 clusterip)
{
	struct clusterip_config *c;
	struct clusterip_net *cn = clusterip_pernet(net);

	list_for_each_entry_rcu(c, &cn->configs, list) {
		if (c->clusterip == clusterip)
			return c;
	}

	return NULL;
}

/* Look up the config for @clusterip and take a lifetime reference on it.
 * If @entry is non-zero, the entries count is incremented as well.
 *
 * Returns the config, or NULL when no config exists, when (with
 * CONFIG_PROC_FS) its proc entry pointer is not set, or when one of the
 * counters had already dropped to zero.
 */
static inline struct clusterip_config *
clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
{
	struct clusterip_config *c;

	rcu_read_lock_bh();
	c = __clusterip_config_find(net, clusterip);
	if (c) {
#ifdef CONFIG_PROC_FS
		/* clusterip_config_init() links the config into the list
		 * before it creates the proc entry; a config without pde is
		 * treated as not found */
		if (!c->pde)
			c = NULL;
		else
#endif
		if (unlikely(!refcount_inc_not_zero(&c->refcount)))
			c = NULL;
		else if (entry) {
			if (unlikely(!refcount_inc_not_zero(&c->entries))) {
				/* no entries left: give back the lifetime
				 * reference taken just above */
				clusterip_config_put(c);
				c = NULL;
			}
		}
	}
	rcu_read_unlock_bh();

	return c;
}

/* Set one bit in c->local_nodes for each of the first i->num_local_nodes
 * node numbers in i->local_nodes[]; node number n maps to bit (n - 1).
 * The values are range-checked in clusterip_tg_check().
 */
static void
clusterip_config_init_nodelist(struct clusterip_config *c,
			       const struct ipt_clusterip_tgt_info *i)
{
	int n;

	for (n = 0; n < i->num_local_nodes; n++)
		set_bit(i->local_nodes[n] - 1, &c->local_nodes);
}

/* Netdevice notifier: for every config of the device's network namespace,
 * keep c->ifindex and the device's cluster MAC subscription in sync when a
 * device is registered, unregistered or renamed. Always returns NOTIFY_DONE.
 */
static int
clusterip_netdev_event(struct notifier_block *this, unsigned long event,
		       void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct clusterip_net *cn = clusterip_pernet(net);
	struct clusterip_config *c;

	spin_lock_bh(&cn->lock);
	list_for_each_entry_rcu(c, &cn->configs, list) {
		switch (event) {
		case NETDEV_REGISTER:
			/* a device with the configured name appeared */
			if (!strcmp(dev->name, c->ifname)) {
				c->ifindex = dev->ifindex;
				dev_mc_add(dev, c->clustermac);
			}
			break;
		case NETDEV_UNREGISTER:
			/* the device the config was bound to goes away */
			if (dev->ifindex == c->ifindex) {
				dev_mc_del(dev, c->clustermac);
				c->ifindex = -1;
			}
			break;
		case NETDEV_CHANGENAME:
			/* renamed to the configured name: bind to it;
			 * the bound device got another name: unbind */
			if (!strcmp(dev->name, c->ifname)) {
				c->ifindex = dev->ifindex;
				dev_mc_add(dev, c->clustermac);
			} else if (dev->ifindex == c->ifindex) {
				dev_mc_del(dev, c->clustermac);
				c->ifindex = -1;
			}
			break;
		}
	}
	spin_unlock_bh(&cn->lock);

	return NOTIFY_DONE;
}
static struct clusterip_config * clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, __be32 ip, const char *iniface) { struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; struct net_device *dev; int err; if (iniface[0] == '\0') { pr_info("Please specify an interface name\n"); return ERR_PTR(-EINVAL); } c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return ERR_PTR(-ENOMEM); dev = dev_get_by_name(net, iniface); if (!dev) { pr_info("no such interface %s\n", iniface); kfree(c); return ERR_PTR(-ENOENT); } c->ifindex = dev->ifindex; strcpy(c->ifname, dev->name); memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); dev_mc_add(dev, c->clustermac); dev_put(dev); c->clusterip = ip; c->num_total_nodes = i->num_total_nodes; clusterip_config_init_nodelist(c, i); c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; c->net = net; refcount_set(&c->refcount, 1); spin_lock_bh(&cn->lock); if (__clusterip_config_find(net, ip)) { err = -EBUSY; goto out_config_put; } list_add_rcu(&c->list, &cn->configs); spin_unlock_bh(&cn->lock); #ifdef CONFIG_PROC_FS { char buffer[16]; /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); mutex_lock(&cn->mutex); c->pde = proc_create_data(buffer, 0600, cn->procdir, &clusterip_proc_fops, c); mutex_unlock(&cn->mutex); if (!c->pde) { err = -ENOMEM; goto err; } } #endif refcount_set(&c->entries, 1); return c; #ifdef CONFIG_PROC_FS err: #endif spin_lock_bh(&cn->lock); list_del_rcu(&c->list); out_config_put: spin_unlock_bh(&cn->lock); clusterip_config_put(c); return ERR_PTR(err); } #ifdef CONFIG_PROC_FS static int clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) { if (nodenum == 0 || nodenum > c->num_total_nodes) return 1; /* check if we already have this number in our bitfield */ if (test_and_set_bit(nodenum - 1, &c->local_nodes)) return 1; return 0; } static bool clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) { if (nodenum == 0 || nodenum > 
c->num_total_nodes) return true; if (test_and_clear_bit(nodenum - 1, &c->local_nodes)) return false; return true; } #endif static inline u_int32_t clusterip_hashfn(const struct sk_buff *skb, const struct clusterip_config *config) { const struct iphdr *iph = ip_hdr(skb); unsigned long hashval; u_int16_t sport = 0, dport = 0; int poff; poff = proto_ports_offset(iph->protocol); if (poff >= 0) { const u_int16_t *ports; u16 _ports[2]; ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); if (ports) { sport = ports[0]; dport = ports[1]; } } else { net_info_ratelimited("unknown protocol %u\n", iph->protocol); } switch (config->hash_mode) { case CLUSTERIP_HASHMODE_SIP: hashval = jhash_1word(ntohl(iph->saddr), config->hash_initval); break; case CLUSTERIP_HASHMODE_SIP_SPT: hashval = jhash_2words(ntohl(iph->saddr), sport, config->hash_initval); break; case CLUSTERIP_HASHMODE_SIP_SPT_DPT: hashval = jhash_3words(ntohl(iph->saddr), sport, dport, config->hash_initval); break; default: /* to make gcc happy */ hashval = 0; /* This cannot happen, unless the check function wasn't called * at rule load time */ pr_info("unknown mode %u\n", config->hash_mode); BUG(); break; } /* node numbers are 1..n, not 0..n */ return reciprocal_scale(hashval, config->num_total_nodes) + 1; } static inline int clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) { return test_bit(hash - 1, &config->local_nodes); } /*********************************************************************** * IPTABLES TARGET ***********************************************************************/ static unsigned int clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int32_t hash; /* don't need to clusterip_config_get() here, since refcount * is only decremented by destroy() - and ip_tables guarantees * that the ->target() function isn't called after 
->destroy() */ ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return NF_DROP; /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ if (ip_hdr(skb)->protocol == IPPROTO_ICMP && (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY)) return XT_CONTINUE; /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here * on, which all have an ID field [relevant for hashing]. */ hash = clusterip_hashfn(skb, cipinfo->config); switch (ctinfo) { case IP_CT_NEW: ct->mark = hash; break; case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* FIXME: we don't handle expectations at the moment. * They can arrive on a different node than * the master connection (e.g. FTP passive mode) */ case IP_CT_ESTABLISHED: case IP_CT_ESTABLISHED_REPLY: break; default: /* Prevent gcc warnings */ break; } #ifdef DEBUG nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif pr_debug("hash=%u ct_hash=%u ", hash, ct->mark); if (!clusterip_responsible(cipinfo->config, hash)) { pr_debug("not responsible\n"); return NF_DROP; } pr_debug("responsible\n"); /* despite being received via linklayer multicast, this is * actually a unicast IP packet. 
TCP doesn't like PACKET_MULTICAST */ skb->pkt_type = PACKET_HOST; return XT_CONTINUE; } static int clusterip_tg_check(const struct xt_tgchk_param *par) { struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; const struct ipt_entry *e = par->entryinfo; struct clusterip_config *config; int ret, i; if (par->nft_compat) { pr_err("cannot use CLUSTERIP target from nftables compat\n"); return -EOPNOTSUPP; } if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { pr_info("unknown mode %u\n", cipinfo->hash_mode); return -EINVAL; } if (e->ip.dmsk.s_addr != htonl(0xffffffff) || e->ip.dst.s_addr == 0) { pr_info("Please specify destination IP\n"); return -EINVAL; } if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes)) { pr_info("bad num_local_nodes %u\n", cipinfo->num_local_nodes); return -EINVAL; } for (i = 0; i < cipinfo->num_local_nodes; i++) { if (cipinfo->local_nodes[i] - 1 >= sizeof(config->local_nodes) * 8) { pr_info("bad local_nodes[%d] %u\n", i, cipinfo->local_nodes[i]); return -EINVAL; } } config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1); if (!config) { if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { pr_info("no config found for %pI4, need 'new'\n", &e->ip.dst.s_addr); return -EINVAL; } else { config = clusterip_config_init(par->net, cipinfo, e->ip.dst.s_addr, e->ip.iniface); if (IS_ERR(config)) return PTR_ERR(config); } } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info("cannot load conntrack support for proto=%u\n", par->family); clusterip_config_entry_put(config); clusterip_config_put(config); return ret; } if (!par->net->xt.clusterip_deprecated_warning) { pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " "use xt_cluster instead\n"); par->net->xt.clusterip_deprecated_warning = true; } cipinfo->config = config; return 
ret; } /* drop reference count of cluster config when rule is deleted */ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) { const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ clusterip_config_entry_put(cipinfo->config); clusterip_config_put(cipinfo->config); nf_ct_netns_put(par->net, par->family); } #ifdef CONFIG_COMPAT struct compat_ipt_clusterip_tgt_info { u_int32_t flags; u_int8_t clustermac[6]; u_int16_t num_total_nodes; u_int16_t num_local_nodes; u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; u_int32_t hash_mode; u_int32_t hash_initval; compat_uptr_t config; }; #endif /* CONFIG_COMPAT */ static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", .family = NFPROTO_IPV4, .target = clusterip_tg, .checkentry = clusterip_tg_check, .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), .usersize = offsetof(struct ipt_clusterip_tgt_info, config), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), #endif /* CONFIG_COMPAT */ .me = THIS_MODULE }; /*********************************************************************** * ARP MANGLING CODE ***********************************************************************/ /* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ struct arp_payload { u_int8_t src_hw[ETH_ALEN]; __be32 src_ip; u_int8_t dst_hw[ETH_ALEN]; __be32 dst_ip; } __packed; #ifdef DEBUG static void arp_print(struct arp_payload *payload) { #define HBUFFERLEN 30 char hbuffer[HBUFFERLEN]; int j, k; for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) { hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); hbuffer[k++] = ':'; } hbuffer[--k] = '\0'; pr_debug("src %pI4@%s, dst %pI4\n", &payload->src_ip, hbuffer, &payload->dst_ip); } #endif static unsigned int arp_mangle(void *priv, struct sk_buff *skb, const 
struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; struct net *net = state->net; /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) return NF_ACCEPT; /* we only want to mangle arp requests and replies */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) return NF_ACCEPT; payload = (void *)(arp+1); /* if there is no clusterip configuration for the arp reply's * source ip, we don't want to mangle it */ c = clusterip_config_find_get(net, payload->src_ip, 0); if (!c) return NF_ACCEPT; /* normally the linux kernel always replies to arp queries of * addresses on different interfacs. However, in the CLUSTERIP case * this wouldn't work, since we didn't subscribe the mcast group on * other interfaces */ if (c->ifindex != state->out->ifindex) { pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n", c->ifindex, state->out->ifindex); clusterip_config_put(c); return NF_ACCEPT; } /* mangle reply hardware address */ memcpy(payload->src_hw, c->clustermac, arp->ar_hln); #ifdef DEBUG pr_debug("mangled arp reply: "); arp_print(payload); #endif clusterip_config_put(c); return NF_ACCEPT; } static const struct nf_hook_ops cip_arp_ops = { .hook = arp_mangle, .pf = NFPROTO_ARP, .hooknum = NF_ARP_OUT, .priority = -1 }; /*********************************************************************** * PROC DIR HANDLING ***********************************************************************/ #ifdef CONFIG_PROC_FS struct clusterip_seq_position { unsigned int pos; /* position */ unsigned int weight; /* number of bits set == size */ unsigned int bit; /* current bit */ unsigned long val; /* current value */ }; static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) { struct clusterip_config *c = s->private; unsigned int weight; u_int32_t local_nodes; 
struct clusterip_seq_position *idx; /* FIXME: possible race */ local_nodes = c->local_nodes; weight = hweight32(local_nodes); if (*pos >= weight) return NULL; idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL); if (!idx) return ERR_PTR(-ENOMEM); idx->pos = *pos; idx->weight = weight; idx->bit = ffs(local_nodes); idx->val = local_nodes; clear_bit(idx->bit - 1, &idx->val); return idx; } static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct clusterip_seq_position *idx = v; *pos = ++idx->pos; if (*pos >= idx->weight) { kfree(v); return NULL; } idx->bit = ffs(idx->val); clear_bit(idx->bit - 1, &idx->val); return idx; } static void clusterip_seq_stop(struct seq_file *s, void *v) { if (!IS_ERR(v)) kfree(v); } static int clusterip_seq_show(struct seq_file *s, void *v) { struct clusterip_seq_position *idx = v; if (idx->pos != 0) seq_putc(s, ','); seq_printf(s, "%u", idx->bit); if (idx->pos == idx->weight - 1) seq_putc(s, '\n'); return 0; } static const struct seq_operations clusterip_seq_ops = { .start = clusterip_seq_start, .next = clusterip_seq_next, .stop = clusterip_seq_stop, .show = clusterip_seq_show, }; static int clusterip_proc_open(struct inode *inode, struct file *file) { int ret = seq_open(file, &clusterip_seq_ops); if (!ret) { struct seq_file *sf = file->private_data; struct clusterip_config *c = PDE_DATA(inode); sf->private = c; clusterip_config_get(c); } return ret; } static int clusterip_proc_release(struct inode *inode, struct file *file) { struct clusterip_config *c = PDE_DATA(inode); int ret; ret = seq_release(inode, file); if (!ret) clusterip_config_put(c); return ret; } static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { struct clusterip_config *c = PDE_DATA(file_inode(file)); #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; unsigned long nodenum; int rc; if (size > PROC_WRITELEN) return -EIO; if (copy_from_user(buffer, input, size)) return -EFAULT; 
buffer[size] = 0; if (*buffer == '+') { rc = kstrtoul(buffer+1, 10, &nodenum); if (rc) return rc; if (clusterip_add_node(c, nodenum)) return -ENOMEM; } else if (*buffer == '-') { rc = kstrtoul(buffer+1, 10, &nodenum); if (rc) return rc; if (clusterip_del_node(c, nodenum)) return -ENOENT; } else return -EIO; return size; } static const struct file_operations clusterip_proc_fops = { .open = clusterip_proc_open, .read = seq_read, .write = clusterip_proc_write, .llseek = seq_lseek, .release = clusterip_proc_release, }; #endif /* CONFIG_PROC_FS */ static int clusterip_net_init(struct net *net) { struct clusterip_net *cn = clusterip_pernet(net); int ret; INIT_LIST_HEAD(&cn->configs); spin_lock_init(&cn->lock); ret = nf_register_net_hook(net, &cip_arp_ops); if (ret < 0) return ret; #ifdef CONFIG_PROC_FS cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); if (!cn->procdir) { nf_unregister_net_hook(net, &cip_arp_ops); pr_err("Unable to proc dir entry\n"); return -ENOMEM; } mutex_init(&cn->mutex); #endif /* CONFIG_PROC_FS */ return 0; } static void clusterip_net_exit(struct net *net) { struct clusterip_net *cn = clusterip_pernet(net); #ifdef CONFIG_PROC_FS mutex_lock(&cn->mutex); proc_remove(cn->procdir); cn->procdir = NULL; mutex_unlock(&cn->mutex); #endif nf_unregister_net_hook(net, &cip_arp_ops); } static struct pernet_operations clusterip_net_ops = { .init = clusterip_net_init, .exit = clusterip_net_exit, .id = &clusterip_net_id, .size = sizeof(struct clusterip_net), }; struct notifier_block cip_netdev_notifier = { .notifier_call = clusterip_netdev_event }; static int __init clusterip_tg_init(void) { int ret; ret = register_pernet_subsys(&clusterip_net_ops); if (ret < 0) return ret; ret = xt_register_target(&clusterip_tg_reg); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&cip_netdev_notifier); if (ret < 0) goto unregister_target; pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION); return 0; unregister_target: 
xt_unregister_target(&clusterip_tg_reg); cleanup_subsys: unregister_pernet_subsys(&clusterip_net_ops); return ret; } static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); unregister_netdevice_notifier(&cip_netdev_notifier); xt_unregister_target(&clusterip_tg_reg); unregister_pernet_subsys(&clusterip_net_ops); /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */ rcu_barrier(); } module_init(clusterip_tg_init); module_exit(clusterip_tg_exit);